# NATURAL LANGUAGE PROCESSING

In [1]:
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [2]:
# Create the session through the public builder API. `getOrCreate()` reuses a
# session/context that is already running, so re-executing this cell does not
# fail with "Cannot run multiple SparkContexts at once".
ss = (
    SparkSession.builder
    .master('local')
    .appName('Chapter 11 - Demo NLP')
    .getOrCreate()
)
# Keep the low-level context available under its original name; later cells
# use it to reach the JVM gateway.
sc = ss.sparkContext

In [3]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [5]:
# Toy corpus: two whitespace-separated sentences and one comma-separated one,
# used to contrast the tokenizers in the cells below.
sentence_rows = [
    (0, 'I head about Pysparl'),
    (1, 'I know Spark can work well with NLP'),
    (2, 'logistic,regression,model,are,supervised'),
]
sentence_schema = ['id', 'sentence']
sentence_df = ss.createDataFrame(sentence_rows, schema=sentence_schema)

In [18]:
# Tokenizer: per the output below, it lower-cases each sentence and splits on
# whitespace only, so the comma-separated row stays a single token.
from pyspark.sql.functions import size

tokenizer = Tokenizer(inputCol='sentence', outputCol='word')


def count_token(word):
    """Return a Column holding the number of tokens in an array column.

    Uses the built-in ``size`` function instead of a Python UDF around
    ``len``, so the count is computed by Spark itself rather than by
    serialising every row to a Python worker.

    Args:
        word: Column (or column name) containing an array of tokens.

    Returns:
        Integer Column with the length of each array.
    """
    return size(word)


token_word = tokenizer.transform(sentence_df)
(
    token_word
    .select('sentence', 'word')
    .withColumn('tokens', count_token(col('word')))
    .show(3, False)
)

+----------------------------------------+--------------------------------------------+------+
|sentence                                |word                                        |tokens|
+----------------------------------------+--------------------------------------------+------+
|I head about Pysparl                    |[i, head, about, pysparl]                   |4     |
|I know Spark can work well with NLP     |[i, know, spark, can, work, well, with, nlp]|8     |
|logistic,regression,model,are,supervised|[logistic,regression,model,are,supervised]  |1     |
+----------------------------------------+--------------------------------------------+------+



In [39]:
# RegexTokenizer: the pattern matches non-word characters; per the output
# below, the comma-separated row is now split into five tokens.
regex_tokenizer = RegexTokenizer(
    pattern='\\W',
    inputCol='sentence',
    outputCol='word',
)
token_word = regex_tokenizer.transform(sentence_df)
(
    token_word
    .select('sentence', 'word')
    .withColumn('tokens', count_token(col('word')))
    .show(n=3, truncate=False)
)

+----------------------------------------+----------------------------------------------+------+
|sentence                                |word                                          |tokens|
+----------------------------------------+----------------------------------------------+------+
|I head about Pysparl                    |[i, head, about, pysparl]                     |4     |
|I know Spark can work well with NLP     |[i, know, spark, can, work, well, with, nlp]  |8     |
|logistic,regression,model,are,supervised|[logistic, regression, model, are, supervised]|5     |
+----------------------------------------+----------------------------------------------+------+



In [31]:
# StopWordsRemover: drop common stop words from the token arrays in `word`.
from pyspark.ml.feature import StopWordsRemover
# Force the JVM-wide default Locale to en-US through the py4j gateway before the
# remover is constructed.
# NOTE(review): presumably a workaround for StopWordsRemover rejecting the
# machine's default locale -- confirm it is still needed on the Spark version in use.
locale = sc._jvm.java.util.Locale
locale.setDefault(locale.forLanguageTag('en-US'))

# The filtered tokens are written to a column named 'filltered' (sic).
stopwords_remover= StopWordsRemover(inputCol= 'word', outputCol= 'filltered')
# NOTE(review): `token_word` is whatever the most recently executed tokenizer cell
# produced. The recorded output below shows the comma-separated row unsplit, i.e.
# the Tokenizer result; a top-to-bottom run would use the RegexTokenizer result.
token_word_filltered= stopwords_remover.transform(token_word)
token_word_filltered.show()

+---+--------------------+--------------------+--------------------+
| id|            sentence|                word|           filltered|
+---+--------------------+--------------------+--------------------+
|  0|I head about Pysparl|[i, head, about, ...|     [head, pysparl]|
|  1|I know Spark can ...|[i, know, spark, ...|[know, spark, wor...|
|  2|logistic,regressi...|[logistic,regress...|[logistic,regress...|
+---+--------------------+--------------------+--------------------+



In [40]:
# NGram: build bigrams (n=2) from the token arrays. The n-grams are written to
# a column that is (confusingly) also named 'filltered'.
from pyspark.ml.feature import NGram

ngram = NGram(n=2, inputCol='word', outputCol='filltered')
ngram_word_filltered = ngram.transform(token_word)
(
    ngram_word_filltered
    .select('word', 'filltered')
    .show(truncate=False)
)


+----------------------------------------------+-------------------------------------------------------------------------+
|word                                          |filltered                                                                |
+----------------------------------------------+-------------------------------------------------------------------------+
|[i, head, about, pysparl]                     |[i head, head about, about pysparl]                                      |
|[i, know, spark, can, work, well, with, nlp]  |[i know, know spark, spark can, can work, work well, well with, with nlp]|
|[logistic, regression, model, are, supervised]|[logistic regression, regression model, model are, are supervised]       |
+----------------------------------------------+-------------------------------------------------------------------------+



In [45]:
# TF-IDF input: a tiny corpus of single-letter "terms", tokenised with the
# regex tokenizer defined earlier so the commas and spaces are stripped.
tfidf_rows = [
    (0, 'a, b, c'),
    (1, 'a, b, c, a'),
    (2, 'a, b, d, d, a, c, c'),
]
sen_df = ss.createDataFrame(tfidf_rows, schema=['id', 'sentence'])
token_sen_df = regex_tokenizer.transform(sen_df)
token_sen_df.show(truncate=False)

+---+-------------------+---------------------+
|id |sentence           |word                 |
+---+-------------------+---------------------+
|0  |a, b, c            |[a, b, c]            |
|1  |a, b, c, a         |[a, b, c, a]         |
|2  |a, b, d, d, a, c, c|[a, b, d, d, a, c, c]|
+---+-------------------+---------------------+



In [50]:
from pyspark.ml.feature import HashingTF, IDF

# Term frequencies via feature hashing into a 10-slot sparse vector.
hashing_TF = HashingTF(
    numFeatures=10,
    inputCol='word',
    outputCol='rowfeature',
)
featurized_data = hashing_TF.transform(token_sen_df)
featurized_data.show(truncate=False)

+---+-------------------+---------------------+--------------------------------+
|id |sentence           |word                 |rowfeature                      |
+---+-------------------+---------------------+--------------------------------+
|0  |a, b, c            |[a, b, c]            |(10,[0,1,2],[1.0,1.0,1.0])      |
|1  |a, b, c, a         |[a, b, c, a]         |(10,[0,1,2],[2.0,1.0,1.0])      |
|2  |a, b, d, d, a, c, c|[a, b, d, d, a, c, c]|(10,[0,1,2,4],[2.0,1.0,2.0,2.0])|
+---+-------------------+---------------------+--------------------------------+



In [52]:
# Fit IDF weights on the hashed term frequencies, then rescale them.
idf = IDF(inputCol='rowfeature', outputCol='features')
idf_model = idf.fit(featurized_data)
rescale_data = idf_model.transform(featurized_data)
(
    rescale_data
    .select('rowfeature', 'features')
    .show(truncate=False)
)

+--------------------------------+-----------------------------------------------+
|rowfeature                      |features                                       |
+--------------------------------+-----------------------------------------------+
|(10,[0,1,2],[1.0,1.0,1.0])      |(10,[0,1,2],[0.0,0.0,0.0])                     |
|(10,[0,1,2],[2.0,1.0,1.0])      |(10,[0,1,2],[0.0,0.0,0.0])                     |
|(10,[0,1,2,4],[2.0,1.0,2.0,2.0])|(10,[0,1,2,4],[0.0,0.0,0.0,1.3862943611198906])|
+--------------------------------+-----------------------------------------------+



In [56]:
# CountVectorizer
from pyspark.ml.feature import CountVectorizer
count_vectorizer= CountVectorizer(inputCol= 'word', outputCol= 'features')
count_vectorizer_model= count_vectorizer.fit(token_sen_df)
count_vectorizer_model.transform(token_sen_df).show(truncate= False)

+---+-------------------+---------------------+-------------------------------+
|id |sentence           |word                 |features                       |
+---+-------------------+---------------------+-------------------------------+
|0  |a, b, c            |[a, b, c]            |(4,[0,1,2],[1.0,1.0,1.0])      |
|1  |a, b, c, a         |[a, b, c, a]         |(4,[0,1,2],[2.0,1.0,1.0])      |
|2  |a, b, d, d, a, c, c|[a, b, d, d, a, c, c]|(4,[0,1,2,3],[2.0,2.0,1.0,2.0])|
+---+-------------------+---------------------+-------------------------------+

