## Imports and Spark session

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("basicNLP")
    .getOrCreate()
)

In [3]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf  #get a column and user-define function
from pyspark.sql.types import IntegerType

## Create data

In [4]:
# Three sample sentences; the last one is comma-separated with no spaces.
sen_df = spark.createDataFrame(
    data=[
        (0, "Hi I heard about Spark"),
        (1, "I wish java could use case classes"),
        (2, "Logistics,regression,models,are,neat"),
    ],
    schema=["id", "sentence"],
)

In [5]:
# Print the sample DataFrame as a text table (long cell values are cut off with "...").
sen_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish java could...|
|  2|Logistics,regress...|
+---+--------------------+



## Tokenizing

In [7]:
# Two tokenizers, both reading "sentence" and writing "words":
#   - tokenizer:     plain Tokenizer, no pattern supplied
#   - reg_tokenizer: RegexTokenizer splitting on the regex \W
#                    (capital W = any non-word character)
tokenizer = Tokenizer(outputCol="words", inputCol="sentence")
reg_tokenizer = RegexTokenizer(
    pattern="\\W",
    inputCol="sentence",
    outputCol="words",
)

In [9]:
# User-defined function: takes the array of tokens in a row and returns how many
# tokens it holds, as an integer column.
# A null array yields null rather than raising TypeError from len(None) inside
# the UDF, so one missing value does not fail the whole job.
count_tokens = udf(
    lambda words: len(words) if words is not None else None,
    IntegerType(),
)

In [10]:
# Run the plain tokenizer over the sample sentences; the result gains a "words" column.
tokenized = tokenizer.transform(dataset=sen_df)

In [11]:
# Print the tokenized rows, including the new "words" column.
tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish java could...|[i, wish, java, c...|
|  2|Logistics,regress...|[logistics,regres...|
+---+--------------------+--------------------+



In [12]:
# Add a per-row token count for the plain tokenizer's output and display it.
(
    tokenized
    .withColumn("word_count", count_tokens(col("words")))
    .show()
)

+---+--------------------+--------------------+----------+
| id|            sentence|               words|word_count|
+---+--------------------+--------------------+----------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|         5|
|  1|I wish java could...|[i, wish, java, c...|         7|
|  2|Logistics,regress...|[logistics,regres...|         1|
+---+--------------------+--------------------+----------+



In [13]:
# Same sentences, this time split by the regex tokenizer.
reg_tokenized = reg_tokenizer.transform(dataset=sen_df)

In [14]:
# Add a per-row token count for the regex tokenizer's output and display it.
(
    reg_tokenized
    .withColumn("word_count", count_tokens(col("words")))
    .show()
)

+---+--------------------+--------------------+----------+
| id|            sentence|               words|word_count|
+---+--------------------+--------------------+----------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|         5|
|  1|I wish java could...|[i, wish, java, c...|         7|
|  2|Logistics,regress...|[logistics, regre...|         5|
+---+--------------------+--------------------+----------+



## Stop Words Remover

In [15]:
from pyspark.ml.feature import StopWordsRemover

In [16]:
# Already-tokenized sentences: "raw" holds the list of words for each id.
sentenceData = spark.createDataFrame(
    data=[
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"]),
    ],
    schema=["id", "raw"],
)

In [17]:
# Stop-word filter: reads the word lists in "raw", writes the result to "filtered".
remover = StopWordsRemover(
    inputCol="raw",
    outputCol="filtered",
)

In [18]:
# Apply the filter and print the full, untruncated rows.
(
    remover
    .transform(sentenceData)
    .show(truncate=False)
)

+---+----------------------------+--------------------+
|id |raw                         |filtered            |
+---+----------------------------+--------------------+
|0  |[I, saw, the, red, balloon] |[saw, red, balloon] |
|1  |[Mary, had, a, little, lamb]|[Mary, little, lamb]|
+---+----------------------------+--------------------+



## n-gram

In [19]:
from pyspark.ml.feature import NGram

In [20]:
# Pre-tokenized sentences used as input for the n-gram transformer.
wordDataFrame = spark.createDataFrame(
    data=[
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"]),
    ],
    schema=["id", "words"],
)

In [21]:
# n=2: build two-word n-grams from "words" and store them in "ngrams".
ngram = NGram(
    inputCol="words",
    outputCol="ngrams",
    n=2,
)

In [22]:
# Generate the n-grams, then print only that column without truncation.
ngramDataFrame = ngram.transform(dataset=wordDataFrame)
ngramDataFrame.select("ngrams").show(truncate=False)

+------------------------------------------------------------------+
|ngrams                                                            |
+------------------------------------------------------------------+
|[Hi I, I heard, heard about, about Spark]                         |
|[I wish, wish Java, Java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+



## TF-IDF

In [23]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [24]:
# Labelled raw sentences for the TF-IDF example.
# NOTE: this rebinds `sentenceData`, which the stop-words section used for a
# different DataFrame (columns "id"/"raw").
sentenceData = spark.createDataFrame(
    data=[
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat"),
    ],
    schema=["label", "sentence"],
)

In [25]:
# Plain tokenizer for the TF-IDF sentences ("sentence" -> "words").
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)

In [26]:
# Tokenize the labelled sentences.
wordsData = tokenizer.transform(dataset=sentenceData)

In [28]:
# Print every row with full column contents (no truncation).
wordsData.show(truncate=False)

+-----+-----------------------------------+------------------------------------------+
|label|sentence                           |words                                     |
+-----+-----------------------------------+------------------------------------------+
|0.0  |Hi I heard about Spark             |[hi, i, heard, about, spark]              |
|0.0  |I wish Java could use case classes |[i, wish, java, could, use, case, classes]|
|1.0  |Logistic regression models are neat|[logistic, regression, models, are, neat] |
+-----+-----------------------------------+------------------------------------------+



### TF

In [30]:
# Term-frequency stage: reads the token lists in "words", writes vectors to "rawFeatures".
hashing_tf = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
)

In [32]:
# Add the "rawFeatures" column to the tokenized sentences.
featurized_dt = hashing_tf.transform(dataset=wordsData)

In [33]:
# Print the rows, now including the "rawFeatures" column added by hashing_tf.
featurized_dt.show()

+-----+--------------------+--------------------+--------------------+
|label|            sentence|               words|         rawFeatures|
+-----+--------------------+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|(262144,[24417,49...|
|  0.0|I wish Java could...|[i, wish, java, c...|(262144,[20719,24...|
|  1.0|Logistic regressi...|[logistic, regres...|(262144,[13671,91...|
+-----+--------------------+--------------------+--------------------+



### IDF

In [36]:
# IDF estimator: reads "rawFeatures" and writes the rescaled vectors to "features".
idf_obj = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

In [37]:
# Fit the IDF estimator on the term-frequency vectors to obtain a model.
idf_model = idf_obj.fit(dataset=featurized_dt)

In [38]:
# Apply the fitted model to the same data, adding the "features" column.
rescaled_dt = idf_model.transform(dataset=featurized_dt)

In [39]:
# Print just the label and the final feature vector (truncated view).
(
    rescaled_dt
    .select("label", "features")
    .show()
)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(262144,[24417,49...|
|  0.0|(262144,[20719,24...|
|  1.0|(262144,[13671,91...|
+-----+--------------------+



In [41]:
# Same two columns, printed in full so the vector contents are visible.
(
    rescaled_dt
    .select("label", "features")
    .show(truncate=False)
)

+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                        |
+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(262144,[24417,49304,73197,91137,234657],[0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453])                                                     |
|0.0  |(262144,[20719,24417,55551,116873,147765,162369,192310],[0.6931471805599453,0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.693147180559945

## CountVectorizer

In [42]:
# Input data: each row is a bag of words with an ID.
df = spark.createDataFrame(
    data=[
        (0, ["a", "b", "c"]),
        (1, ["a", "b", "b", "c", "a"]),
    ],
    schema=["id", "words"],
)

In [43]:
# Print the bag-of-words input rows.
df.show()

+---+---------------+
| id|          words|
+---+---------------+
|  0|      [a, b, c]|
|  1|[a, b, b, c, a]|
+---+---------------+



In [44]:
from pyspark.ml.feature import CountVectorizer

# Configure the CountVectorizer estimator ("words" -> "features");
# the CountVectorizerModel itself is produced by the fit call in a later cell.
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=3,
    minDF=2.0,
)

In [45]:
# Fit the vectorizer on the corpus to obtain the model.
model = cv.fit(dataset=df)

In [46]:
# Vectorize the corpus with the fitted model and print the full rows.
result = model.transform(dataset=df)
result.show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+

