# MLlib Basics - Data Transformation


In [None]:
# Install PySpark; run this cell when working on Google Colab
%pip install pyspark



In [None]:
#import necessary packages
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler


In [None]:
# Build the Spark session for this notebook; getOrCreate() hands back an
# already-running session if there is one instead of starting a second.
session_builder = SparkSession.builder.appName('MlLib Basics')
spark = session_builder.getOrCreate()

In [None]:
# Tiny demo frame: a numeric id plus one categorical column (gender).
gender_rows = [
    (0, "Male"),
    (1, "Male"),
    (2, "Female"),
    (3, "Female"),
    (4, "Female"),
    (5, "Male"),
]
df = spark.createDataFrame(gender_rows, ["id", "gender"])
df.show()

# Step 1 - StringIndexer: fit on the data to learn the labels, then add a
# numeric "genderIndex" column holding each row's label index.
indexer = StringIndexer(inputCol="gender", outputCol="genderIndex")
indexer_model = indexer.fit(df)
indexed = indexer_model.transform(df)
indexed.show()

# Step 2 - OneHotEncoder: convert the numeric index into the vector column
# "genderEncoded".
encoder = OneHotEncoder(
    inputCols=["genderIndex"],
    outputCols=["genderEncoded"],
)
encoder_model = encoder.fit(indexed)
encoded = encoder_model.transform(indexed)
encoded.show()

+---+------+
| id|gender|
+---+------+
|  0|  Male|
|  1|  Male|
|  2|Female|
|  3|Female|
|  4|Female|
|  5|  Male|
+---+------+

+---+------+-----------+
| id|gender|genderIndex|
+---+------+-----------+
|  0|  Male|        1.0|
|  1|  Male|        1.0|
|  2|Female|        0.0|
|  3|Female|        0.0|
|  4|Female|        0.0|
|  5|  Male|        1.0|
+---+------+-----------+

+---+------+-----------+-------------+
| id|gender|genderIndex|genderEncoded|
+---+------+-----------+-------------+
|  0|  Male|        1.0|    (1,[],[])|
|  1|  Male|        1.0|    (1,[],[])|
|  2|Female|        0.0|(1,[0],[1.0])|
|  3|Female|        0.0|(1,[0],[1.0])|
|  4|Female|        0.0|(1,[0],[1.0])|
|  5|  Male|        1.0|    (1,[],[])|
+---+------+-----------+-------------+



In [None]:
# Sample data: two categorical columns (gender, color), two numeric columns
# (num1, num2) and a target column.
sample_rows = [
    ("Female", "Blue", 300, 0.0, 0),
    ("Female", "Black", 200, 15.1, 1),
    ("Male", "Red", 100, 12.4, 0),
    ("Female", "Green", 100, 0.0, 1),
    ("Female", "Blue", 200, 0.0, 0),
    ("Male", "Green", 400, 20.0, 1),
    ("Male", "Yellow", 400, 20.0, 1),
]
sample_columns = ["gender", "color", "num1", "num2", "target"]
df = spark.createDataFrame(sample_rows, sample_columns)
df.show()

# Index the gender labels into "genderIndex".
indexer_1 = StringIndexer(inputCol="gender", outputCol="genderIndex")
gender_index_model = indexer_1.fit(df)
indexed_1 = gender_index_model.transform(df)
indexed_1.show()

# Index the color labels into "colorIndex", building on the previous result
# so both index columns end up in the same frame.
indexer_2 = StringIndexer(inputCol="color", outputCol="colorIndex")
color_index_model = indexer_2.fit(indexed_1)
indexed_2 = color_index_model.transform(indexed_1)
indexed_2.show()

# One-hot encode the gender index into "genderEncoded".
encoder_1 = OneHotEncoder(
    inputCols=["genderIndex"],
    outputCols=["genderEncoded"],
)
gender_encoder_model = encoder_1.fit(indexed_2)
encoded_1 = gender_encoder_model.transform(indexed_2)
encoded_1.show()

# One-hot encode the color index into "colorEncoded".
encoder_2 = OneHotEncoder(
    inputCols=["colorIndex"],
    outputCols=["colorEncoded"],
)
color_encoder_model = encoder_2.fit(encoded_1)
encoded_2 = color_encoder_model.transform(encoded_1)
encoded_2.show()

+------+------+----+----+------+
|gender| color|num1|num2|target|
+------+------+----+----+------+
|Female|  Blue| 300| 0.0|     0|
|Female| Black| 200|15.1|     1|
|  Male|   Red| 100|12.4|     0|
|Female| Green| 100| 0.0|     1|
|Female|  Blue| 200| 0.0|     0|
|  Male| Green| 400|20.0|     1|
|  Male|Yellow| 400|20.0|     1|
+------+------+----+----+------+

+------+------+----+----+------+-----------+
|gender| color|num1|num2|target|genderIndex|
+------+------+----+----+------+-----------+
|Female|  Blue| 300| 0.0|     0|        0.0|
|Female| Black| 200|15.1|     1|        0.0|
|  Male|   Red| 100|12.4|     0|        1.0|
|Female| Green| 100| 0.0|     1|        0.0|
|Female|  Blue| 200| 0.0|     0|        0.0|
|  Male| Green| 400|20.0|     1|        1.0|
|  Male|Yellow| 400|20.0|     1|        1.0|
+------+------+----+----+------+-----------+

+------+------+----+----+------+-----------+----------+
|gender| color|num1|num2|target|genderIndex|colorIndex|
+------+------+----+----+---

In [None]:
# Combine the encoded categorical vectors and the raw numeric columns into a
# single "features" vector column using the VectorAssembler transformer.
feature_inputs = ["genderEncoded", "colorEncoded", "num1", "num2"]
assembler = VectorAssembler(inputCols=feature_inputs, outputCol="features")

output = assembler.transform(encoded_2)
# truncate=False prints the full vector contents instead of cutting them off
output.show(truncate=False)

+------+------+----+----+------+-----------+----------+-------------+-------------+--------------------------------+
|gender|color |num1|num2|target|genderIndex|colorIndex|genderEncoded|colorEncoded |features                        |
+------+------+----+----+------+-----------+----------+-------------+-------------+--------------------------------+
|Female|Blue  |300 |0.0 |0     |0.0        |0.0       |(1,[0],[1.0])|(4,[0],[1.0])|(7,[0,1,5],[1.0,1.0,300.0])     |
|Female|Black |200 |15.1|1     |0.0        |2.0       |(1,[0],[1.0])|(4,[2],[1.0])|[1.0,0.0,0.0,1.0,0.0,200.0,15.1]|
|Male  |Red   |100 |12.4|0     |1.0        |3.0       |(1,[],[])    |(4,[3],[1.0])|(7,[4,5,6],[1.0,100.0,12.4])    |
|Female|Green |100 |0.0 |1     |0.0        |1.0       |(1,[0],[1.0])|(4,[1],[1.0])|(7,[0,2,5],[1.0,1.0,100.0])     |
|Female|Blue  |200 |0.0 |0     |0.0        |0.0       |(1,[0],[1.0])|(4,[0],[1.0])|(7,[0,1,5],[1.0,1.0,200.0])     |
|Male  |Green |400 |20.0|1     |1.0        |1.0       |(1,[],[])

In [None]:
# Rebuild the same index -> encode -> assemble flow as a single Pipeline.

# Stage group 1: one StringIndexer per categorical column.
indexer_1 = StringIndexer(inputCol="gender", outputCol="genderIndex")
indexer_2 = StringIndexer(inputCol="color", outputCol="colorIndex")
indexers = [indexer_1, indexer_2]

# Stage group 2: one OneHotEncoder per index column.
encoder_1 = OneHotEncoder(
    inputCols=["genderIndex"],
    outputCols=["genderEncoded"],
)
encoder_2 = OneHotEncoder(
    inputCols=["colorIndex"],
    outputCols=["colorEncoded"],
)
encoders = [encoder_1, encoder_2]

# Final stage: gather encoded and numeric columns into "features".
assembler = VectorAssembler(
    inputCols=["genderEncoded", "colorEncoded", "num1", "num2"],
    outputCol="features",
)

# Stage order matters: indexers first, then encoders, then the assembler.
pipeline_stages = [*indexers, *encoders, assembler]
pipeline = Pipeline(stages=pipeline_stages)

# Fit every stage on df, then apply the fitted pipeline to the same data.
model = pipeline.fit(df)
data = model.transform(df)

data.show(truncate=False)

+------+------+----+----+------+-----------+----------+-------------+-------------+--------------------------------+
|gender|color |num1|num2|target|genderIndex|colorIndex|genderEncoded|colorEncoded |features                        |
+------+------+----+----+------+-----------+----------+-------------+-------------+--------------------------------+
|Female|Blue  |300 |0.0 |0     |0.0        |0.0       |(1,[0],[1.0])|(4,[0],[1.0])|(7,[0,1,5],[1.0,1.0,300.0])     |
|Female|Black |200 |15.1|1     |0.0        |2.0       |(1,[0],[1.0])|(4,[2],[1.0])|[1.0,0.0,0.0,1.0,0.0,200.0,15.1]|
|Male  |Red   |100 |12.4|0     |1.0        |3.0       |(1,[],[])    |(4,[3],[1.0])|(7,[4,5,6],[1.0,100.0,12.4])    |
|Female|Green |100 |0.0 |1     |0.0        |1.0       |(1,[0],[1.0])|(4,[1],[1.0])|(7,[0,2,5],[1.0,1.0,100.0])     |
|Female|Blue  |200 |0.0 |0     |0.0        |0.0       |(1,[0],[1.0])|(4,[0],[1.0])|(7,[0,1,5],[1.0,1.0,200.0])     |
|Male  |Green |400 |20.0|1     |1.0        |1.0       |(1,[],[])