# Hands-on MLlib Basic - Data Transformation


In [1]:
#do this for Google Colab
%pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=70d54cefb3bfbd9446f668670e381ccfae699e4967e8ffbd9a3f8fbf08ec4bf9
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
#import necessary packages
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler


In [None]:
# Obtain the Spark session used by every cell below
# (getOrCreate returns an existing session if there is one).
spark = (
    SparkSession.builder
    .appName('MlLib Basics')
    .getOrCreate()
)

Proses untuk melakukan one-hot encoding dari variabel kategorikal yaitu
1. StringIndexer : mengubah string ke dalam integer
2. OneHotEncoder : mengubah integer menjadi vektor one-hot encoding dalam bentuk *sparse vector format*
3. VectorAssembler : menyatukan seluruh kolom input menjadi 1 vektor feature

Bentuk sparse vector format : `[Vector size, [Index of nonzero elements], [Values of nonzero elements]]`

Sehingga untuk hasil encoding gender yaitu `genderEncoded`, vektor `[0]` dinyatakan dengan `[1, [], []]` dan vektor `[1]` dinyatakan dengan `[1, [0], [1]]` (ukuran vektor adalah 1 dan indeks elemen dimulai dari 0).



In [None]:
# Small demo frame: the "id" column is simply the row position (0..5),
# so the (id, gender) rows are produced by enumerating the gender values.
df = spark.createDataFrame(
    list(enumerate(["Male", "Male", "Female", "Female", "Female", "Male"])),
    schema=["id", "gender"],
)

df.show()

# StringIndexer: fit on the frame to learn the label -> index mapping,
# then transform to append the numeric "genderIndex" column.
print("Hasil indexer")
indexer = StringIndexer(
    inputCol="gender",
    outputCol="genderIndex",
)
indexed = (
    indexer
    .fit(df)
    .transform(df)
)
indexed.show()

In [None]:
# OneHotEncoder: turn the numeric "genderIndex" column into a one-hot vector
# column "genderEncoded" (displayed by Spark in sparse vector form).
print("Hasil encoder dalam bentuk sparse")
encoder = OneHotEncoder(inputCols=["genderIndex"], outputCols=["genderEncoded"])
encoded = (
    encoder
    .fit(indexed)
    .transform(indexed)
)
encoded.show()

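Untuk `colorEncoded` ada 5 nilai: blue, black, red, green, yellow. Karena `OneHotEncoder` secara default menggunakan `dropLast=True`, kategori dengan indeks terakhir tidak mendapat kolom sendiri dan dinyatakan dengan vektor nol, sehingga hasil one-hot encodingnya ada 4 kolom: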
- `[0,0,0,0]` sparse vector : `[4,[],[]]`
- `[1,0,0,0]` sparse vector : `[4,[0],[1]]`
- `[0,1,0,0]` sparse vector : `[4,[1],[1]]`
- `[0,0,1,0]` sparse vector : `[4,[2],[1]]`
- `[0,0,0,1]` sparse vector : `[4,[3],[1]]`


In [None]:
# Example data set: two categorical columns (gender, color), two numeric
# columns (num1, num2) and a target column.
df = spark.createDataFrame(
    [
        ("Female", "Blue", 300, 0.0, 0),
        ("Female", "Black", 200, 15.1, 1),
        ("Male", "Red", 100, 12.4, 0),
        ("Female", "Green", 100, 0.0, 1),
        ("Female", "Blue", 200, 0.0, 0),
        ("Male", "Green", 400, 20.0, 1),
        ("Male", "Yellow", 400, 20.0, 1),
    ],
    schema=["gender", "color", "num1", "num2", "target"],
)

df.show()

# Step 1a: index the "gender" strings into the numeric column "genderIndex".
indexer_1 = StringIndexer(
    inputCol="gender",
    outputCol="genderIndex",
)
indexed_1 = (
    indexer_1
    .fit(df)
    .transform(df)
)



In [None]:
# Display the frame with the new "genderIndex" column
# (n=20 rows and truncate=True are show()'s defaults, spelled out here).
indexed_1.show(n=20, truncate=True)

In [None]:
# Step 1b: index the "color" strings into the numeric column "colorIndex",
# continuing from the frame that already holds "genderIndex".
indexer_2 = StringIndexer(
    inputCol="color",
    outputCol="colorIndex",
)
indexed_2 = (
    indexer_2
    .fit(indexed_1)
    .transform(indexed_1)
)
indexed_2.show()

In [None]:
# Step 2a: one-hot encode "genderIndex" into the vector column "genderEncoded".
encoder_1 = OneHotEncoder(inputCols=["genderIndex"], outputCols=["genderEncoded"])
encoded_1 = (
    encoder_1
    .fit(indexed_2)
    .transform(indexed_2)
)
encoded_1.show()

In [None]:
# Step 2b: one-hot encode "colorIndex" into the vector column "colorEncoded".
encoder_2 = OneHotEncoder(inputCols=["colorIndex"], outputCols=["colorEncoded"])
encoded_2 = (
    encoder_2
    .fit(encoded_1)
    .transform(encoded_1)
)
encoded_2.show()

In [None]:
# Step 3: use the VectorAssembler transformer to concatenate the encoded
# categorical vectors and the numeric columns into one "features" vector.
assembler = VectorAssembler(
    inputCols=[
        "genderEncoded",
        "colorEncoded",
        "num1",
        "num2",
    ],
    outputCol="features",
)

output = assembler.transform(encoded_2)
# truncate=False so the full feature vectors are printed.
output.show(truncate=False)

In [None]:
# The same index -> encode -> assemble process, expressed as one Pipeline.

# Indexing stages: categorical string column -> numeric index column.
indexers = [
    StringIndexer(inputCol="gender", outputCol="genderIndex"),
    StringIndexer(inputCol="color", outputCol="colorIndex"),
]
indexer_1, indexer_2 = indexers

# Encoding stages: numeric index column -> one-hot vector column.
encoders = [
    OneHotEncoder(inputCols=["genderIndex"], outputCols=["genderEncoded"]),
    OneHotEncoder(inputCols=["colorIndex"], outputCols=["colorEncoded"]),
]
encoder_1, encoder_2 = encoders

# Final stage: merge encoded and numeric columns into a single "features" vector.
assembler = VectorAssembler(
    inputCols=["genderEncoded", "colorEncoded", "num1", "num2"],
    outputCol="features",
)

# Stages run in list order: both indexers, then both encoders, then the assembler.
pipeline = Pipeline(stages=[*indexers, *encoders, assembler])

# Fit all stages on df, then apply the fitted pipeline to the same frame.
model = pipeline.fit(df)
data = model.transform(df)

data.show(truncate=False)