<a href="https://colab.research.google.com/github/vu-topics-in-big-data-2023/examples/blob/main/spark-ml/Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
#Binarization is the process of thresholding numerical features to binary (0/1) features.
#https://spark.apache.org/docs/3.1.1/ml-features.html#binarizer

In [5]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
#install spark. we are using the one that uses hadoop as the underlying scheduler.
!wget -q https://downloads.apache.org/spark//spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!ls -l
os.environ["SPARK_HOME"] = "spark-3.2.1-bin-hadoop3.2"
!pip install -q findspark
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Learning_Spark") \
    .getOrCreate()

total 587852
drwxr-xr-x  1 root root      4096 Apr 25 13:46 sample_data
drwxr-xr-x 13  501 1000      4096 Jan 20 20:10 spark-3.2.1-bin-hadoop3.2
-rw-r--r--  1 root root 300971569 Jan 20 21:37 spark-3.2.1-bin-hadoop3.2.tgz
-rw-r--r--  1 root root 300971569 Jan 20 21:37 spark-3.2.1-bin-hadoop3.2.tgz.1


In [6]:
from pyspark.ml.feature import Binarizer

continuousDataFrame = spark.createDataFrame([
    (0, 0.1),
    (1, 0.8),
    (2, 0.2)
], ["id", "feature"])

binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")

binarizedDataFrame = binarizer.transform(continuousDataFrame)

print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
binarizedDataFrame.show()

Binarizer output with Threshold = 0.500000
+---+-------+-----------------+
| id|feature|binarized_feature|
+---+-------+-----------------+
|  0|    0.1|              0.0|
|  1|    0.8|              1.0|
|  2|    0.2|              0.0|
+---+-------+-----------------+



In [7]:
#Another interesting transformer is PCA
#PCA is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables called principal components

In [8]:
from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

static sparse(size, *args)[source]
Create a sparse vector, using either a dictionary, a list of (index, value) pairs, or two separate arrays of indices and values (sorted by index).

Parameters
sizeint
Size of the vector.

args
Non-zero entries, as a dictionary, list of tuples, or two sorted lists containing indices and values.

In [9]:
data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
            (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
            (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
df = spark.createDataFrame(data, ["features"])

In [10]:
df.show(truncate=False)

+---------------------+
|features             |
+---------------------+
|(5,[1,3],[1.0,7.0])  |
|[2.0,0.0,3.0,4.0,5.0]|
|[4.0,0.0,0.0,6.0,7.0]|
+---------------------+



In [11]:
pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")

In [12]:
model = pca.fit(df)

In [13]:
#select specific output and transform the dataframe
result = model.transform(df).select(["features","pcaFeatures"])
result.explain()

== Physical Plan ==
*(1) Project [features#54, UDF(features#54) AS pcaFeatures#66]
+- *(1) Scan ExistingRDD[features#54]




In [14]:
result.show(truncate=False)

+---------------------+------------------------------------------------------------+
|features             |pcaFeatures                                                 |
+---------------------+------------------------------------------------------------+
|(5,[1,3],[1.0,7.0])  |[1.6485728230883814,-4.0132827005162985,-1.0091435193998504]|
|[2.0,0.0,3.0,4.0,5.0]|[-4.645104331781533,-1.1167972663619048,-1.0091435193998501]|
|[4.0,0.0,0.0,6.0,7.0]|[-6.428880535676488,-5.337951427775359,-1.009143519399851]  |
+---------------------+------------------------------------------------------------+



In [15]:
# An important transformer is discrete consine transofm
#The Discrete Cosine Transform transforms a length N real-valued sequence in the time domain into another length N real-valued sequence in the frequency domain. 

In [16]:
from pyspark.ml.feature import DCT
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([
    (Vectors.dense([0.0, 1.0, -2.0, 3.0]),),
    (Vectors.dense([-1.0, 2.0, 4.0, -7.0]),),
    (Vectors.dense([14.0, -2.0, -5.0, 1.0]),)], ["features"])

dct = DCT(inverse=False, inputCol="features", outputCol="featuresDCT")

dctDf = dct.transform(df)

dctDf.select(["features","featuresDCT"]).show(truncate=False)

+--------------------+----------------------------------------------------------------+
|features            |featuresDCT                                                     |
+--------------------+----------------------------------------------------------------+
|[0.0,1.0,-2.0,3.0]  |[1.0,-1.1480502970952693,2.0000000000000004,-2.7716385975338604]|
|[-1.0,2.0,4.0,-7.0] |[-1.0,3.378492794482933,-7.000000000000001,2.9301512653149677]  |
|[14.0,-2.0,-5.0,1.0]|[4.0,9.304453421915744,11.000000000000002,1.5579302036357163]   |
+--------------------+----------------------------------------------------------------+



In [17]:
#Another useful thing is string Indexer. 
from pyspark.ml.feature import StringIndexer, IndexToString

df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "category"])

indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
indexed = indexer.fit(df).transform(df)
indexed.show()

+---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
|  0|       a|          0.0|
|  1|       b|          2.0|
|  2|       c|          1.0|
|  3|       a|          0.0|
|  4|       a|          0.0|
|  5|       c|          1.0|
+---+--------+-------------+



In [18]:
#Index to String is similar
#Applying IndexToString with categoryIndex as the input column, originalCategory as the output column, we are able to retrieve our original labels (they will be inferred from the columns’ metadata):
print("Transformed string column '%s' to indexed column '%s'"
      % (indexer.getInputCol(), indexer.getOutputCol()))
indexed.show()

print("StringIndexer stores labels in output column metadata\n")

converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
converted = converter.transform(indexed)

Transformed string column 'category' to indexed column 'categoryIndex'
+---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
|  0|       a|          0.0|
|  1|       b|          2.0|
|  2|       c|          1.0|
|  3|       a|          0.0|
|  4|       a|          0.0|
|  5|       c|          1.0|
+---+--------+-------------+

StringIndexer stores labels in output column metadata



In [19]:

converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
converted = converter.transform(indexed)

print("Transformed indexed column '%s' back to original string column '%s' using "
      "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
converted.select("id", "categoryIndex", "originalCategory").show()


Transformed indexed column 'categoryIndex' back to original string column 'originalCategory' using labels in metadata
+---+-------------+----------------+
| id|categoryIndex|originalCategory|
+---+-------------+----------------+
|  0|          0.0|               a|
|  1|          2.0|               b|
|  2|          1.0|               c|
|  3|          0.0|               a|
|  4|          0.0|               a|
|  5|          1.0|               c|
+---+-------------+----------------+



In [20]:
#One Hot Encoding is another useful transformer
#One-hot encoding maps a categorical feature, represented as a label index, to a binary vector with at most a single one-value 
#indicating the presence of a specific feature value from among the set of all feature values.
#This encoding allows algorithms which expect continuous features, such as Logistic Regression, to use categorical features. 
#For string type input data, it is common to encode categorical features using StringIndexer first.

In [21]:
from pyspark.ml.feature import OneHotEncoder

df = spark.createDataFrame([
    (0.0, 1.0),
    (1.0, 0.0),
    (2.0, 1.0),
    (0.0, 2.0),
    (0.0, 1.0),
    (2.0, 0.0)
], ["categoryIndex1", "categoryIndex2"])

encoder = OneHotEncoder(inputCols=["categoryIndex1", "categoryIndex2"],
                        outputCols=["categoryVec1", "categoryVec2"])
model = encoder.fit(df)
encoded = model.transform(df)
encoded.show()
#For those not familiar with vector types in spark, the sparse vector seen in the third and fourth column below has 3 parts.
#The first component which is a 0 indicates that it is a sparse vector. Not shown in the output
#The second component talks about the size of the vector. The third component talks about the indices where the vector is populated while the fourth component talks about what values these are. 
#This truncates the vector and is really efficient when you have really large vector representations.

+--------------+--------------+-------------+-------------+
|categoryIndex1|categoryIndex2| categoryVec1| categoryVec2|
+--------------+--------------+-------------+-------------+
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           1.0|           0.0|(2,[1],[1.0])|(2,[0],[1.0])|
|           2.0|           1.0|    (2,[],[])|(2,[1],[1.0])|
|           0.0|           2.0|(2,[0],[1.0])|    (2,[],[])|
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           2.0|           0.0|    (2,[],[])|(2,[0],[1.0])|
+--------------+--------------+-------------+-------------+



In [22]:
from pyspark.ml.linalg import DenseVector
from pyspark.sql.types import FloatType,ArrayType
from pyspark.sql.functions import UserDefinedFunction
def toDense(v):
  v = DenseVector(v)
  new_array = list([float(x) for x in v])
  return new_array 
denseudf=UserDefinedFunction(toDense, ArrayType(FloatType()))
encoded.withColumn('categoryVecDense',denseudf('categoryVec1')).show()

+--------------+--------------+-------------+-------------+----------------+
|categoryIndex1|categoryIndex2| categoryVec1| categoryVec2|categoryVecDense|
+--------------+--------------+-------------+-------------+----------------+
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|      [1.0, 0.0]|
|           1.0|           0.0|(2,[1],[1.0])|(2,[0],[1.0])|      [0.0, 1.0]|
|           2.0|           1.0|    (2,[],[])|(2,[1],[1.0])|      [0.0, 0.0]|
|           0.0|           2.0|(2,[0],[1.0])|    (2,[],[])|      [1.0, 0.0]|
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|      [1.0, 0.0]|
|           2.0|           0.0|    (2,[],[])|(2,[0],[1.0])|      [0.0, 0.0]|
+--------------+--------------+-------------+-------------+----------------+



In [23]:
#in the above 

In [24]:
#Normalizer is used transforms a dataset of Vector rows, normalizing each Vector to have unit norm.
# It takes parameter p, which specifies the p-norm used for normalization. (p=2 by default.) 
# This normalization can help standardize your input data and improve the behavior of learning algorithms.

In [25]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

dataFrame = spark.createDataFrame([
    (0, Vectors.dense([1.0, 0.5, -1.0]),),
    (1, Vectors.dense([2.0, 1.0, 1.0]),),
    (2, Vectors.dense([4.0, 10.0, 2.0]),)
], ["id", "features"])

# Normalize each Vector using $L^1$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(dataFrame)
print("Normalized using L^1 norm")
l1NormData.show(truncate=False)

# Normalize each Vector using $L^\infty$ norm.
lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
print("Normalized using L^inf norm")
lInfNormData.show(truncate=False)


# Normalize each Vector using $L^2$ norm.
l2NormData = normalizer.transform(dataFrame, {normalizer.p: float(2)})
print("Normalized using L^2 norm")
l2NormData.show(truncate=False)


Normalized using L^1 norm
+---+--------------+------------------+
|id |features      |normFeatures      |
+---+--------------+------------------+
|0  |[1.0,0.5,-1.0]|[0.4,0.2,-0.4]    |
|1  |[2.0,1.0,1.0] |[0.5,0.25,0.25]   |
|2  |[4.0,10.0,2.0]|[0.25,0.625,0.125]|
+---+--------------+------------------+

Normalized using L^inf norm
+---+--------------+--------------+
|id |features      |normFeatures  |
+---+--------------+--------------+
|0  |[1.0,0.5,-1.0]|[1.0,0.5,-1.0]|
|1  |[2.0,1.0,1.0] |[1.0,0.5,0.5] |
|2  |[4.0,10.0,2.0]|[0.4,1.0,0.2] |
+---+--------------+--------------+

Normalized using L^2 norm
+---+--------------+-----------------------------------------------------------+
|id |features      |normFeatures                                               |
+---+--------------+-----------------------------------------------------------+
|0  |[1.0,0.5,-1.0]|[0.6666666666666666,0.3333333333333333,-0.6666666666666666]|
|1  |[2.0,1.0,1.0] |[0.8164965809277261,0.4082482904638631,0.4

In [26]:
#Bucketizer: Bucketizer transforms a column of continuous features to a column of feature buckets
# it usesthe split parameter for mapping continuous features into buckets

In [27]:
from pyspark.ml.feature import Bucketizer

splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

data = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]
dataFrame = spark.createDataFrame(data, ["features"])

bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

# Transform original data into its bucket index.
bucketedData = bucketizer.transform(dataFrame)

print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))
bucketedData.show()

Bucketizer output with 4 buckets
+--------+----------------+
|features|bucketedFeatures|
+--------+----------------+
|  -999.9|             0.0|
|    -0.5|             1.0|
|    -0.3|             1.0|
|     0.0|             2.0|
|     0.2|             2.0|
|   999.9|             3.0|
+--------+----------------+

