In [1]:
import os

import findspark

# Honour an existing SPARK_HOME so the notebook is not tied to one machine;
# fall back to the original install location when it is unset or empty.
spark_home = os.environ.get('SPARK_HOME') or '/home/ubuntu/spark-2.2.0-bin-hadoop2.7'
findspark.init(spark_home)

from pyspark.sql import SparkSession

# Obtain the session used by every cell below, registered under the app name 'nlp'.
spark = SparkSession.builder.appName('nlp').getOrCreate()

# Import / Create Data

In [2]:
# Demo data: three (id, sentence) rows. The third sentence is comma-joined
# with no spaces, which matters for the tokenizer comparison later on.
sentence_rows = [
    (0, 'Hi I heard about Spark'),
    (1, 'I wish java could use case classes'),
    (2, 'Logistic,regression,models,are,neat'),
]
sen_df = spark.createDataFrame(sentence_rows, schema=['id', 'sentence'])

In [3]:
sen_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish java could...|
|  2|Logistic,regressi...|
+---+--------------------+



# Tokenize Data

In [4]:
# Tokenizer classes from Spark ML's feature module
from pyspark.ml.feature import Tokenizer, RegexTokenizer


# Step 1: a plain Tokenizer that reads the 'sentence' column and writes its
# token array to a new 'words' column.
tokenizer = Tokenizer(outputCol='words', inputCol='sentence')



In [5]:
# Step 2: a RegexTokenizer over the same columns.
# The pattern \W (any non-word character) acts as the delimiter between
# tokens, so commas split a sentence just like spaces do.
regex_tokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)

In [6]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType


def _token_count(words):
    """Return the number of tokens in ``words``, or None when ``words`` is null."""
    # A null array column reaches Python as None; len(None) would raise
    # TypeError and fail the whole job, so propagate null instead.
    if words is None:
        return None
    return len(words)


# Step 3: wrap the helper as a user-defined function (UDF) that takes an array
# column of tokens and yields an integer column with the token count per row.
count_tokens = udf(_token_count, IntegerType())

In [7]:
# Tokenize the sentences, then display them with an extra 'tokens' column
# holding the number of entries in each 'words' array.
tokenized = tokenizer.transform(sen_df)
word_count_col = count_tokens(col('words'))
tokenized.withColumn('tokens', word_count_col).show()

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



In [8]:
# Run the regex tokenizer on the same sentences so the comma-joined third
# sentence is split too, and attach the per-row token count.
rg_tokenized = (
    regex_tokenizer
    .transform(sen_df)
    .withColumn('tokens', count_tokens(col('words')))
)
rg_tokenized.show()

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



# Remove Stop Words

In [9]:
# Create a new DataFrame of already-tokenized sentences for stop-word removal
token_rows = [
    (0, ['I', 'saw', 'the', 'green', 'horse']),
    (1, ['Mary', 'had', 'a', 'little', 'lamb']),
]
df_sen = spark.createDataFrame(token_rows, schema=['id', 'tokens'])
df_sen.show()

+---+--------------------+
| id|              tokens|
+---+--------------------+
|  0|[I, saw, the, gre...|
|  1|[Mary, had, a, li...|
+---+--------------------+



In [10]:
from pyspark.ml.feature import StopWordsRemover

# Filter the 'tokens' arrays into 'words_stp', then count what remains per row.
stop_worder = StopWordsRemover(inputCol='tokens', outputCol='words_stp')
df_sen_removed = (
    stop_worder
    .transform(df_sen)
    .withColumn('token_count', count_tokens(col('words_stp')))
)
df_sen_removed.show()

+---+--------------------+--------------------+-----------+
| id|              tokens|           words_stp|token_count|
+---+--------------------+--------------------+-----------+
|  0|[I, saw, the, gre...| [saw, green, horse]|          3|
|  1|[Mary, had, a, li...|[Mary, little, lamb]|          3|
+---+--------------------+--------------------+-----------+



# N-Gram

In [11]:
# NGram takes a sequence of strings as input (e.g. the output of a tokenizer).
# The parameter 'n' sets how many consecutive tokens go into each n-gram.
# The output is a sequence of n-grams, each a space-joined string.
from pyspark.ml.feature import NGram

rg_tokenized.show()

ngram = NGram(n=2, inputCol='words', outputCol='grams')
bigrams = ngram.transform(rg_tokenized)
bigrams.select('grams').show(truncate=False)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+

+------------------------------------------------------------------+
|grams                                                             |
+------------------------------------------------------------------+
|[hi i, i heard, heard about, about spark]                         |
|[i wish, wish java, java could, could use, use case, case classes]|
|[logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+



# TF-IDF

In [12]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [13]:
# Labelled corpus for the TF-IDF example.
# NOTE: this rebinds `sen_df`, replacing the (id, sentence) frame created at
# the top of the notebook with a (label, sentence) frame.
labelled_rows = [
    (0.0, 'Hi I heard about Spark'),
    (0.0, 'I wish java could use case classes'),
    (1.0, 'Logistic, regression, models, are, neat'),
]
sen_df = spark.createDataFrame(labelled_rows, schema=['label', 'sentence'])
sen_df.show()

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|  0.0|Hi I heard about ...|
|  0.0|I wish java could...|
|  1.0|Logistic, regress...|
+-----+--------------------+



In [14]:
# Tokenize the labelled corpus.
# NOTE: this rebinds `tokenizer` to an instance that writes to 'tokens'
# rather than 'words'.
tokenizer = Tokenizer(outputCol='tokens', inputCol='sentence')
word_data = tokenizer.transform(sen_df)
word_data.show()

+-----+--------------------+--------------------+
|label|            sentence|              tokens|
+-----+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|
|  0.0|I wish java could...|[i, wish, java, c...|
|  1.0|Logistic, regress...|[logistic,, regre...|
+-----+--------------------+--------------------+



In [15]:
# Term frequency: hash each row's token list into a sparse 'rawFeatures' vector
hasing_tf = HashingTF(outputCol='rawFeatures', inputCol='tokens')
featurized_data = hasing_tf.transform(word_data)
featurized_data.show()

+-----+--------------------+--------------------+--------------------+
|label|            sentence|              tokens|         rawFeatures|
+-----+--------------------+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|(262144,[24417,49...|
|  0.0|I wish java could...|[i, wish, java, c...|(262144,[20719,24...|
|  1.0|Logistic, regress...|[logistic,, regre...|(262144,[22467,91...|
+-----+--------------------+--------------------+--------------------+



In [16]:
# Inverse document frequency: fit the IDF model on the raw term-frequency
# vectors, then use it to rescale those same vectors into 'features'.
idf = IDF(outputCol='features', inputCol='rawFeatures')
idf_fitted = idf.fit(featurized_data)
rescaled_data = idf_fitted.transform(featurized_data)
tfidf_view = rescaled_data.select('label', 'features')
tfidf_view.show(truncate=False)

+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                        |
+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(262144,[24417,49304,73197,91137,234657],[0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453])                                                     |
|0.0  |(262144,[20719,24417,55551,116873,147765,162369,192310],[0.6931471805599453,0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.693147180559945

# Count Vectorizer

In [28]:
# Two tiny documents, each already split into single-letter tokens
docs = [
    (0, ['a', 'b', 'c']),
    (1, ['a', 'b', 'b', 'b', 'c', 'a']),
]
df = spark.createDataFrame(docs, schema=['id', 'words'])
df.show()

+---+------------------+
| id|             words|
+---+------------------+
|  0|         [a, b, c]|
|  1|[a, b, b, b, c, a]|
+---+------------------+



In [29]:
from pyspark.ml.feature import CountVectorizer

# Count-based vectorizer: vocabSize caps the vocabulary at 3 terms, and
# minDF=2.0 is the document-frequency threshold a term must meet to be kept
# (see the CountVectorizer docs for how the value is interpreted).
cv = CountVectorizer(
    inputCol='words',
    outputCol='features',
    minDF=2.0,
    vocabSize=3,
)
cv_fitted = cv.fit(df)
df_cv = cv_fitted.transform(df)
df_cv.show(truncate=False)

+---+------------------+-------------------------+
|id |words             |features                 |
+---+------------------+-------------------------+
|0  |[a, b, c]         |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, b, c, a]|(3,[0,1,2],[3.0,2.0,1.0])|
+---+------------------+-------------------------+

