# DEMO - NATURAL LANGUAGE PROCESSING (NLP)

In [1]:
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [None]:
# Build (or reuse) the Spark session through the builder so this cell can be
# re-run safely: constructing SparkContext directly raises an error when a
# context is already active in the kernel.
ss = (
    SparkSession.builder
    .master('local')
    .appName('Chapter 11 - Demo NLP')
    .getOrCreate()
)
# Keep `sc` available: a later cell reaches the JVM through `sc._jvm`.
sc = ss.sparkContext

In [167]:
# Toy corpus: three short "sentences" whose tokens are separated by spaces,
# commas and periods, used to compare the tokenizers below.
sen_df = ss.createDataFrame(
    data=[
        (0, 'a b c'),
        (1, 'a, c, a'),
        (2, 'a b d, d a. c c b'),
    ],
    schema=['id', 'sentence'],
)
sen_df.show()

+---+-----------------+
| id|         sentence|
+---+-----------------+
|  0|            a b c|
|  1|          a, c, a|
|  2|a b d, d a. c c b|
+---+-----------------+



## Tokenizer

Chuyển văn bản về chữ thường rồi tách thành danh sách các từ theo khoảng trắng (dấu câu vẫn dính vào từ).

In [169]:
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

def count_token(words):
    """Return a column holding the number of elements of an array column.

    Uses Spark's built-in ``size`` function instead of a Python UDF, so rows
    are not shipped to a Python worker just to call ``len``. A null array no
    longer raises inside a UDF; Spark returns -1 or null for it depending on
    its configuration.

    Args:
        words: Array-typed Column (or column name), e.g. ``col('words')``.

    Returns:
        An integer Column with the number of tokens in each row.
    """
    # Local import keeps this definition self-contained within the cell.
    from pyspark.sql.functions import size

    return size(words)

# Plain Tokenizer: punctuation stays attached to the tokens
# (e.g. 'a,' and 'a.' in the output below).
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')

# Tokenize, then append the number of tokens per row for inspection.
sen_df_by_tokenizer = (
    tokenizer
    .transform(sen_df)
    .withColumn('count', count_token(col('words')))
)
sen_df_by_tokenizer.show(truncate=False)

+---+-----------------+--------------------------+-----+
|id |sentence         |words                     |count|
+---+-----------------+--------------------------+-----+
|0  |a b c            |[a, b, c]                 |3    |
|1  |a, c, a          |[a,, c,, a]               |3    |
|2  |a b d, d a. c c b|[a, b, d,, d, a., c, c, b]|8    |
+---+-----------------+--------------------------+-----+



## RegexTokenizer

In [170]:
from pyspark.ml.feature import RegexTokenizer
# gaps=True means the pattern describes the separators (any non-word
# character), not the tokens themselves; tokens are lower-cased.
regex_tokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
    gaps=True,
    toLowercase=True,
)

# Tokenize, then append the number of tokens per row for inspection.
sen_df_by_regextokenizer = (
    regex_tokenizer
    .transform(sen_df)
    .withColumn('count', count_token(col('words')))
)
sen_df_by_regextokenizer.show(truncate=False)

+---+-----------------+------------------------+-----+
|id |sentence         |words                   |count|
+---+-----------------+------------------------+-----+
|0  |a b c            |[a, b, c]               |3    |
|1  |a, c, a          |[a, c, a]               |3    |
|2  |a b d, d a. c c b|[a, b, d, d, a, c, c, b]|8    |
+---+-----------------+------------------------+-----+



## StopWordsRemover

In [96]:
from pyspark.ml.feature import StopWordsRemover
# Built-in English stop-word list shipped with Spark ML.
en_stopwords= StopWordsRemover.loadDefaultStopWords('english')
# Force the JVM-wide default locale to en-US through the py4j gateway.
# NOTE(review): presumably a workaround for StopWordsRemover rejecting the
# machine's default locale -- confirm it is still needed on the Spark version in use.
# This must run before the StopWordsRemover below is constructed.
locale = sc._jvm.java.util.Locale
locale.setDefault(locale.forLanguageTag("en-US"))
# Reads the token array from 'words' and writes the filtered array to 'words_filtered'.
stopwords_remover= StopWordsRemover(inputCol= 'words', outputCol= 'words_filtered', stopWords= en_stopwords)

In [171]:
# Drop English stop words from the regex-tokenized sentences
# (the token 'a' disappears from every row in the output below).
sen_df_stopwords_removed = stopwords_remover.transform(
    sen_df_by_regextokenizer
)
sen_df_stopwords_removed.show(truncate=False)

+---+-----------------+------------------------+-----+------------------+
|id |sentence         |words                   |count|words_filtered    |
+---+-----------------+------------------------+-----+------------------+
|0  |a b c            |[a, b, c]               |3    |[b, c]            |
|1  |a, c, a          |[a, c, a]               |3    |[c]               |
|2  |a b d, d a. c c b|[a, b, d, d, a, c, c, b]|8    |[b, d, d, c, c, b]|
+---+-----------------+------------------------+-----+------------------+



## NGram

In [172]:
from pyspark.ml.feature import NGram
# Bigrams (n=2) built from the regex-tokenized words; stop words are kept
# because the input column is 'words', not 'words_filtered'.
ngram = NGram(n=2, inputCol='words', outputCol='n_grams')

sen_df_ngrams = ngram.transform(sen_df_by_regextokenizer)
sen_df_ngrams.show()

+---+-----------------+--------------------+-----+--------------------+
| id|         sentence|               words|count|             n_grams|
+---+-----------------+--------------------+-----+--------------------+
|  0|            a b c|           [a, b, c]|    3|          [a b, b c]|
|  1|          a, c, a|           [a, c, a]|    3|          [a c, c a]|
|  2|a b d, d a. c c b|[a, b, d, d, a, c...|    8|[a b, b d, d d, d...|
+---+-----------------+--------------------+-----+--------------------+



## CountVectorizer  
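Tương đương với TF (đếm số lần xuất hiện của từng từ), nhưng chỉ số của mỗi từ được lấy từ bộ từ vựng học trên toàn bộ tập dữ liệu thay vì dùng hàm băm.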

In [189]:
from pyspark.ml.feature import CountVectorizer
# Learn a vocabulary from the stop-word-filtered tokens, then encode each row
# as a sparse vector of per-term counts.
count_vectorizer = CountVectorizer(
    inputCol='words_filtered',
    outputCol='tf_word',
)
count_vectorizer_model = count_vectorizer.fit(sen_df_stopwords_removed)
count_vectorized = count_vectorizer_model.transform(sen_df_stopwords_removed)

(
    count_vectorized
    .select('words_filtered', 'tf_word')
    .show(truncate=False)
)

+------------------+-------------------------+
|words_filtered    |tf_word                  |
+------------------+-------------------------+
|[b, c]            |(3,[0,1],[1.0,1.0])      |
|[c]               |(3,[0],[1.0])            |
|[b, d, d, c, c, b]|(3,[0,1,2],[2.0,2.0,2.0])|
+------------------+-------------------------+



## TF-IDF

### HashingTF  
Vì CountVectorizer đã trả về ma trận số lần xuất hiện của từ, được lập chỉ mục theo bộ từ vựng của toàn tập dữ liệu, nên có thể dùng luôn kết quả từ CountVectorizer mà không cần tính TF bằng HashingTF nữa.

In [185]:
from pyspark.ml.feature import  HashingTF
# Term frequencies via the hashing trick: no vocabulary is fitted, each term is
# hashed straight to a vector index. binary=False keeps the actual counts.
# Shown for comparison only; the IDF cell fits on the CountVectorizer output.
hashing_tf = HashingTF(
    inputCol='words_filtered',
    outputCol='tf_word',
    binary=False,
)
tf_df = hashing_tf.transform(sen_df_stopwords_removed)

(
    tf_df
    .select('words_filtered', 'tf_word')
    .show(truncate=False)
)

+------------------+------------------------------------------+
|words_filtered    |tf_word                                   |
+------------------+------------------------------------------+
|[b, c]            |(262144,[28698,30913],[1.0,1.0])          |
|[c]               |(262144,[28698],[1.0])                    |
|[b, d, d, c, c, b]|(262144,[27526,28698,30913],[2.0,2.0,2.0])|
+------------------+------------------------------------------+



### IDF

In [190]:
from pyspark.ml.feature import IDF
# Fit inverse document frequencies on the CountVectorizer term counts and
# rescale them. Despite its name, 'idf_word' holds the TF-IDF weights: a term
# present in every row gets weight 0.0 (see the output below).
idf = IDF(inputCol='tf_word', outputCol='idf_word')
tfidf_model = idf.fit(count_vectorized)
tfidf_df = tfidf_model.transform(count_vectorized)

(
    tfidf_df
    .select('idf_word')
    .show(truncate=False)
)

+-------------------------------------------------------+
|idf_word                                               |
+-------------------------------------------------------+
|(3,[0,1],[0.0,0.28768207245178085])                    |
|(3,[0],[0.0])                                          |
|(3,[0,1,2],[0.0,0.5753641449035617,1.3862943611198906])|
+-------------------------------------------------------+

