In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
 

In [2]:
# Build the SparkSession used by every cell below, under the app name "nlp".
# getOrCreate() returns the already-active session when one exists, so
# re-running this cell reuses it instead of starting a second session.
spark = (
    SparkSession.builder
    .appName("nlp")
    .getOrCreate()
)

In [27]:
# Load the input CSV into a Spark DataFrame.
# header=True takes the column names from the first row; inferSchema=True asks
# Spark to infer each column's type instead of reading everything as strings.
# An earlier version of this cell read "./InputData/winemag-data-130k-v2.csv"
# instead of "test.csv".
df = spark.read.csv(
    "test.csv",
    header=True,
    inferSchema=True,
)

In [28]:
# Configure a Tokenizer stage: it reads the "description" column and writes
# the resulting list of tokens to a new "words" column.
tokened = Tokenizer(
    outputCol="words",
    inputCol="description",
)
# Bare expression so the notebook echoes the stage's repr.
tokened

Tokenizer_48138d3e96ad3d9dce05

In [29]:
# Run the tokenizer over the loaded rows, then print the new "words" column
# with truncation disabled so the full token lists are visible.
tokened_transformed = tokened.transform(df)
(
    tokened_transformed
    .select("words")
    .show(truncate=False)
)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|words                                                                                                                                                                                                                                                                                                  |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[aromas, include, tropical, fruit, broom, brimstone, and, dried, herb, the, palate, isn't, overly, expres

In [30]:
# Custom stop-word list: the only token removed from "words" is "box".
# NOTE(review): passing stopWords replaces the remover's built-in default list,
# and the hashed_df preview further down still shows words such as "and", "the"
# and "is" in `filtered` -- confirm that keeping them is intended.
stop_list = ["box"]
remover = StopWordsRemover(
    stopWords=stop_list,
    inputCol="words",
    outputCol="filtered",
)

# Add the "filtered" column and print every column without truncation.
removed_frame = remover.transform(tokened_transformed)
removed_frame.show(truncate=False)

+---+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|_c0|description                                                                                                                  

In [31]:
# Configure term-frequency hashing: the token lists in "filtered" are turned
# into sparse vectors stored in "hashedValues". numFeatures is not set here,
# so the library default applies (the output below shows size 262144).
hashing = HashingTF(
    outputCol="hashedValues",
    inputCol="filtered",
)
# Bare expression so the notebook echoes the stage's repr.
hashing

HashingTF_4b75b5e1090fe0a877ec

In [32]:
# Hash each row's filtered tokens, then preview the original text next to every
# intermediate column. show() is called with its defaults, so long cells are
# truncated in the printed table.
hashed_df = hashing.transform(removed_frame)
(
    hashed_df
    .select("description", "words", "filtered", "hashedValues")
    .show()
)

+--------------------+--------------------+--------------------+--------------------+
|         description|               words|            filtered|        hashedValues|
+--------------------+--------------------+--------------------+--------------------+
|Aromas include tr...|[aromas, include,...|[aromas, include,...|(262144,[11076,21...|
|This is ripe and ...|[this, is, ripe, ...|[this, is, ripe, ...|(262144,[5460,158...|
|Tart and snappy t...|[tart, and, snapp...|[tart, and, snapp...|(262144,[2650,963...|
|Pineapple rind le...|[pineapple, rind,...|[pineapple, rind,...|(262144,[9639,158...|
|Much like the reg...|[much, like, the,...|[much, like, the,...|(262144,[9639,164...|
|Blackberry and ra...|[blackberry, and,...|[blackberry, and,...|(262144,[3521,963...|
|Here's a bright i...|[here's, a, brigh...|[here's, a, brigh...|(262144,[9639,110...|
|This dry and rest...|[this, dry, and, ...|[this, dry, and, ...|(262144,[16332,21...|
|Savory dried thym...|[savory, dried, t...|[savory, dr

In [33]:
# Set up the IDF stage ("hashedValues" in, "features" out) and fit it on the
# hashed term-frequency rows. fit() returns the fitted model object.
idf = IDF(
    outputCol="features",
    inputCol="hashedValues",
)
idfModel = idf.fit(hashed_df)

In [34]:
# Apply the already-fitted IDF model to the hashed rows; the result carries an
# additional "features" column alongside the existing ones.
rescaledData = idfModel.transform(
    hashed_df,
)

In [35]:
# Print each description beside its "features" vector, with truncation off.
(
    rescaledData
    .select("description", "features")
    .show(truncate=False)
)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Feature Transformations

In [None]:
start_data = 