In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook is not tied to one machine's directory layout; fall back to the
# original local path when the variable is unset or empty.
SPARK_HOME = os.environ.get('SPARK_HOME') or '/home/mint/spark-2.1.0-bin-hadoop2.7'
findspark.init(SPARK_HOME)

# pyspark is imported only after findspark.init() has run.
from pyspark.sql import SparkSession

# Reuse an existing session if one is already running, otherwise start one.
spark = SparkSession.builder.appName('nlp').getOrCreate()

# Three sample sentences; the last one is comma-separated with no spaces.
data = spark.createDataFrame([
    (0, 'Hi I heard about Spark'),
    (1, 'I wish java could use case classes'),
    (2, 'Logistic,Regression,models,are,neat')
], ['id', 'sentence'])
data.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish java could...|
|  2|Logistic,Regressi...|
+---+--------------------+



In [4]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
# Build the plain tokenizer first, then apply it to the sample sentences.
# The comma-separated sentence comes out as a single token with this tokenizer.
whitespace_tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
tokenized = whitespace_tokenizer.transform(data)
tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish java could...|[i, wish, java, c...|
|  2|Logistic,Regressi...|[logistic,regress...|
+---+--------------------+--------------------+



In [6]:
# Tokenize with a regex delimiter: any non-word character separates tokens,
# so the comma-separated sentence is split into individual words as well.
non_word_tokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)
regex_tokenized = non_word_tokenizer.transform(data)
regex_tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish java could...|[i, wish, java, c...|
|  2|Logistic,Regressi...|[logistic, regres...|
+---+--------------------+--------------------+



In [8]:
def _token_count(words):
    """Return how many tokens a row's `words` list holds."""
    return len(words)


# Wrap the counter as a Spark UDF that produces an integer column.
count_tokens = udf(_token_count, IntegerType())

# Add a per-row token count to the plain-tokenizer output.
tokenized_with_counts = tokenized.withColumn('tokens', count_tokens(col('words')))
tokenized_with_counts.show()

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,Regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



In [9]:
# Same token count, applied to the regex-tokenizer output for comparison.
regex_token_counts = regex_tokenized.withColumn('tokens', count_tokens(col('words')))
regex_token_counts.show()

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,Regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



In [10]:
### Remove the stop words
from pyspark.ml.feature import StopWordsRemover

# Two already-tokenized rows to run through the stop-word filter.
stop_word_rows = [
    (0, ["I", "saw", "the", "green", "horse"]),
    (1, ["Mary", "had", "a", "little", "lamb"]),
]
data2 = spark.createDataFrame(stop_word_rows, ['id', 'words'])

# Build the remover, then write the filtered tokens to a new 'cleaned' column.
stop_word_remover = StopWordsRemover(inputCol='words', outputCol='cleaned')
removed = stop_word_remover.transform(data2)
removed.show()

+---+--------------------+--------------------+
| id|               words|             cleaned|
+---+--------------------+--------------------+
|  0|[I, saw, the, gre...| [saw, green, horse]|
|  1|[Mary, had, a, li...|[Mary, little, lamb]|
+---+--------------------+--------------------+



In [14]:
### n-grams
from pyspark.ml.feature import NGram

# Pre-tokenized sentences used as n-gram input.
ngram_rows = [
    (0, ['Hi', 'I', 'heard', 'about', 'Spark']),
    (1, ['I', 'wish', 'java', 'could', 'use', 'case', 'classes']),
    (2, ['Logistic', 'Regression', 'models', 'are', 'neat']),
]
data3 = spark.createDataFrame(ngram_rows, ['id', 'words'])

# n=2 yields bigrams: each pair of adjacent tokens joined into one phrase.
bigram_builder = NGram(n=2, inputCol='words', outputCol='phrase')
ngram_data = bigram_builder.transform(data3)

# Show only the generated phrases, untruncated.
ngram_data.select('phrase').show(truncate=False)

+------------------------------------------------------------------+
|phrase                                                            |
+------------------------------------------------------------------+
|[Hi I, I heard, heard about, about Spark]                         |
|[I wish, wish java, java could, could use, use case, case classes]|
|[Logistic Regression, Regression models, models are, are neat]    |
+------------------------------------------------------------------+

