In [4]:
import pyspark
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session used by every cell below.
spark = SparkSession.builder.appName('9_NPL_1').getOrCreate()

# Small hand-written corpus: one (user_id, review) pair per row.
review_rows = [
    (1, 'I really liked Bitcoin '),
    (2, 'I would recommend this coin to my friends'),
    (3, 'Crytocurencies was alright but I actually afraid of them'),
    (4, 'We believe in BNB and CZ'),
    (5, 'I am never buy that coin ever again'),
]
review_columns = ['user_id', 'review']
df = spark.createDataFrame(review_rows, review_columns)

# Print up to 5 rows with full (untruncated) column contents.
df.show(n=5, truncate=False)

+-------+--------------------------------------------------------+
|user_id|review                                                  |
+-------+--------------------------------------------------------+
|1      |I really liked Bitcoin                                  |
|2      |I would recommend this coin to my friends               |
|3      |Crytocurencies was alright but I actually afraid of them|
|4      |We believe in BNB and CZ                                |
|5      |I am never buy that coin ever again                     |
+-------+--------------------------------------------------------+



In [3]:
from pyspark.ml.feature import Tokenizer
# Split each review into a list of tokens stored in a new 'tokens' column
# (the printed output shows the tokens lower-cased).
tokenization = Tokenizer(
    inputCol='review',
    outputCol='tokens',
)
tokenized_df = tokenization.transform(df)

# Print up to 5 rows with full (untruncated) column contents.
tokenized_df.show(n=5, truncate=False)

[Stage 1:>                                                          (0 + 1) / 1]

+-------+--------------------------------------------------------+------------------------------------------------------------------+
|user_id|review                                                  |tokens                                                            |
+-------+--------------------------------------------------------+------------------------------------------------------------------+
|1      |I really liked Bitcoin                                  |[i, really, liked, bitcoin]                                       |
|2      |I would recommend this coin to my friends               |[i, would, recommend, this, coin, to, my, friends]                |
|3      |Crytocurencies was alright but I actually afraid of them|[crytocurencies, was, alright, but, i, actually, afraid, of, them]|
|4      |We believe in BNB and CZ                                |[we, believe, in, bnb, and, cz]                                   |
|5      |I am never buy that coin ever again                  

                                                                                

In [5]:
from pyspark.ml.feature import StopWordsRemover
# Transformer that reads 'tokens' and writes the stop-word-filtered list to
# 'refined_tokens'; it is only configured here and applied in a later cell.
stopword_removal = StopWordsRemover(
    inputCol='tokens',
    outputCol='refined_tokens',
)

In [7]:
# Apply the stop-word filter to the tokenized reviews.
refined_df = stopword_removal.transform(tokenized_df)

# Compare the raw and filtered token lists side by side.
token_comparison = refined_df.select('user_id', 'tokens', 'refined_tokens')
token_comparison.show(n=5, truncate=False)

+-------+------------------------------------------------------------------+-------------------------------------------+
|user_id|tokens                                                            |refined_tokens                             |
+-------+------------------------------------------------------------------+-------------------------------------------+
|1      |[i, really, liked, bitcoin]                                       |[really, liked, bitcoin]                   |
|2      |[i, would, recommend, this, coin, to, my, friends]                |[recommend, coin, friends]                 |
|3      |[crytocurencies, was, alright, but, i, actually, afraid, of, them]|[crytocurencies, alright, actually, afraid]|
|4      |[we, believe, in, bnb, and, cz]                                   |[believe, bnb, cz]                         |
|5      |[i, am, never, buy, that, coin, ever, again]                      |[never, buy, coin, ever]                   |
+-------+-----------------------

In [26]:
from pyspark.ml.feature import CountVectorizer
# Bag-of-words counts: learn a vocabulary from 'refined_tokens', then encode
# each review as a sparse count vector in the 'features' column.
count_vec = CountVectorizer(
    inputCol='refined_tokens',
    outputCol='features',
)
count_vec_model = count_vec.fit(refined_df)
cv_df = count_vec_model.transform(refined_df)

# Show the filtered tokens next to their count vectors.
cv_df.select('user_id', 'refined_tokens', 'features').show(n=5, truncate=False)

+-------+-------------------------------------------+----------------------------------+
|user_id|refined_tokens                             |features                          |
+-------+-------------------------------------------+----------------------------------+
|1      |[really, liked, bitcoin]                   |(16,[2,8,9],[1.0,1.0,1.0])        |
|2      |[recommend, coin, friends]                 |(16,[0,3,14],[1.0,1.0,1.0])       |
|3      |[crytocurencies, alright, actually, afraid]|(16,[4,5,10,13],[1.0,1.0,1.0,1.0])|
|4      |[believe, bnb, cz]                         |(16,[1,11,12],[1.0,1.0,1.0])      |
|5      |[never, buy, coin, ever]                   |(16,[0,6,7,15],[1.0,1.0,1.0,1.0]) |
+-------+-------------------------------------------+----------------------------------+



In [24]:
# Fit the count vectorizer on the refined tokens and display the learned
# vocabulary; a term's position in this list is its index in 'features'.
vocabulary_model = count_vec.fit(refined_df)
vocabulary_model.vocabulary

['coin',
 'believe',
 'liked',
 'recommend',
 'actually',
 'afraid',
 'buy',
 'never',
 'bitcoin',
 'really',
 'crytocurencies',
 'bnb',
 'cz',
 'alright',
 'friends',
 'ever']

In [27]:
from pyspark.ml.feature import HashingTF,IDF
# Term frequencies via the hashing trick: each token is hashed to an index in
# a fixed-size sparse vector (262144 entries in the printed output).
hashing_vec = HashingTF(
    inputCol='refined_tokens',
    outputCol='tf_features',
)
hashing_df = hashing_vec.transform(refined_df)

# Show the filtered tokens next to their hashed term-frequency vectors.
hashing_df.select('user_id', 'refined_tokens', 'tf_features').show(n=5, truncate=False)

+-------+-------------------------------------------+--------------------------------------------------------+
|user_id|refined_tokens                             |tf_features                                             |
+-------+-------------------------------------------+--------------------------------------------------------+
|1      |[really, liked, bitcoin]                   |(262144,[99172,132778,229264],[1.0,1.0,1.0])            |
|2      |[recommend, coin, friends]                 |(262144,[68228,130047,163906],[1.0,1.0,1.0])            |
|3      |[crytocurencies, alright, actually, afraid]|(262144,[126078,132975,171118,194136],[1.0,1.0,1.0,1.0])|
|4      |[believe, bnb, cz]                         |(262144,[191454,211748,225157],[1.0,1.0,1.0])           |
|5      |[never, buy, coin, ever]                   |(262144,[113673,163906,203802,213760],[1.0,1.0,1.0,1.0])|
+-------+-------------------------------------------+--------------------------------------------------------+



In [28]:
# Rescale the hashed term frequencies by inverse document frequency; in the
# printed output the one token shared by two reviews gets a lower weight.
tf_idf_vec = IDF(
    inputCol='tf_features',
    outputCol='tf_idf_features',
)
tf_idf_model = tf_idf_vec.fit(hashing_df)
tf_idf_df = tf_idf_model.transform(hashing_df)

# Show the TF-IDF vector for each user.
tf_idf_df.select('user_id', 'tf_idf_features').show(n=5, truncate=False)

                                                                                

22/10/27 11:57:16 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
+-------+--------------------------------------------------------------------------------------------------------------------+
|user_id|tf_idf_features                                                                                                     |
+-------+--------------------------------------------------------------------------------------------------------------------+
|1      |(262144,[99172,132778,229264],[1.0986122886681098,1.0986122886681098,1.0986122886681098])                           |
|2      |(262144,[68228,130047,163906],[1.0986122886681098,1.0986122886681098,0.6931471805599453])                           |
|3      |(262144,[126078,132975,171118,194136],[1.0986122886681098,1.0986122886681098,1.0986122886681098,1.0986122886681098])|
|4      |(262144,[191454,211748,225157],[1.0986122886681098,1.0986122886681098,1.0986122886681098])                          |
|5      |(262144,[113673,

In [34]:
# Display only the TF-IDF vectors, untruncated.
tf_idf_only = tf_idf_df.select('tf_idf_features')
tf_idf_only.show(n=5, truncate=False)

22/10/27 11:58:52 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
+--------------------------------------------------------------------------------------------------------------------+
|tf_idf_features                                                                                                     |
+--------------------------------------------------------------------------------------------------------------------+
|(262144,[99172,132778,229264],[1.0986122886681098,1.0986122886681098,1.0986122886681098])                           |
|(262144,[68228,130047,163906],[1.0986122886681098,1.0986122886681098,0.6931471805599453])                           |
|(262144,[126078,132975,171118,194136],[1.0986122886681098,1.0986122886681098,1.0986122886681098,1.0986122886681098])|
|(262144,[191454,211748,225157],[1.0986122886681098,1.0986122886681098,1.0986122886681098])                          |
|(262144,[113673,163906,203802,213760],[1.0986122886681098,0.6931471805599453,1.0