<a href="https://colab.research.google.com/github/vu-topics-in-big-data-2023/examples/blob/main/spark-ml/Feature_Selector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
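# A common problem in machine learning is feature selection: deciding which input features to keep. The cells below demonstrate feature selectors from Spark ML.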

In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
#install spark. we are using the one that uses hadoop as the underlying scheduler.
!wget -q https://downloads.apache.org/spark//spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!ls -l
os.environ["SPARK_HOME"] = "spark-3.2.1-bin-hadoop3.2"
!pip install -q findspark
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Learning_Spark") \
    .getOrCreate()

total 293932
drwxr-xr-x  1 root root      4096 Apr 25 13:46 sample_data
drwxr-xr-x 13  501 1000      4096 Jan 20 20:10 spark-3.2.1-bin-hadoop3.2
-rw-r--r--  1 root root 300971569 Jan 20 21:37 spark-3.2.1-bin-hadoop3.2.tgz


In [3]:
# VectorSlicer is a transformer that takes a feature vector and outputs a new
# feature vector containing a sub-array of the original features. It is useful
# for extracting features from a vector column.

from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import Row

# Two rows holding the same three values, once as a sparse vector and once as
# a dense vector.
sparse_row = Row(userFeatures=Vectors.sparse(3, {0: -2.0, 1: 2.3}))
dense_row = Row(userFeatures=Vectors.dense([-2.0, 2.3, 0.0]))
df = spark.createDataFrame([sparse_row, dense_row])

# Keep only the feature at index 1 of every input vector.
slicer = VectorSlicer(
    inputCol="userFeatures",
    outputCol="features",
    indices=[1],
)
output = slicer.transform(df)

# Show the original vectors next to their sliced counterparts.
output.select("userFeatures", "features").show()

+--------------------+-------------+
|        userFeatures|     features|
+--------------------+-------------+
|(3,[0,1],[-2.0,2.3])|(1,[0],[2.3])|
|      [-2.0,2.3,0.0]|        [2.3]|
+--------------------+-------------+



In [4]:
# VarianceThresholdSelector is a selector that removes low-variance features.
# Any feature whose variance is not greater than varianceThreshold is dropped.
# When varianceThreshold is not set it defaults to 0, so only features with
# variance 0 (the same value in every sample) are removed.

from pyspark.ml.feature import VarianceThresholdSelector
from pyspark.ml.linalg import Vectors

# Six samples with six features each; ids are assigned 1..6 in order.
feature_rows = [
    [6.0, 7.0, 0.0, 7.0, 6.0, 0.0],
    [0.0, 9.0, 6.0, 0.0, 5.0, 9.0],
    [0.0, 9.0, 3.0, 0.0, 5.0, 5.0],
    [0.0, 9.0, 8.0, 5.0, 6.0, 4.0],
    [8.0, 9.0, 6.0, 5.0, 4.0, 4.0],
    [8.0, 9.0, 6.0, 0.0, 0.0, 0.0],
]
df = spark.createDataFrame(
    [(row_id, Vectors.dense(values))
     for row_id, values in enumerate(feature_rows, start=1)],
    ["id", "features"],
)

selector = VarianceThresholdSelector(
    varianceThreshold=8.0,
    outputCol="selectedFeatures",
)

# Fit on the data to learn which features pass the threshold, then apply the
# fitted model to the same DataFrame.
selector_model = selector.fit(df)
result = selector_model.transform(df)

threshold = selector.getVarianceThreshold()
print("Output: Features with variance lower than %f are removed." % threshold)
result.show()

Output: Features with variance lower than 8.000000 are removed.
+---+--------------------+-----------------+
| id|            features| selectedFeatures|
+---+--------------------+-----------------+
|  1|[6.0,7.0,0.0,7.0,...|[6.0,0.0,7.0,0.0]|
|  2|[0.0,9.0,6.0,0.0,...|[0.0,6.0,0.0,9.0]|
|  3|[0.0,9.0,3.0,0.0,...|[0.0,3.0,0.0,5.0]|
|  4|[0.0,9.0,8.0,5.0,...|[0.0,8.0,5.0,4.0]|
|  5|[8.0,9.0,6.0,5.0,...|[8.0,6.0,5.0,4.0]|
|  6|[8.0,9.0,6.0,0.0,...|[8.0,6.0,0.0,0.0]|
+---+--------------------+-----------------+

