In [0]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# Obtain the session for this notebook under the application name 'nlp'.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

# Three sample sentences keyed by id: the first two are space-separated,
# the third is comma-separated with no spaces at all.
sen_df = spark.createDataFrame(
    [
        (0, 'hi i heard about spark'),
        (1, 'i wish java could use case classes'),
        (2, 'logistic,regressions,models,are,neat'),
    ],
    schema=['id', 'sentence'],
)
sen_df.show()


+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|hi i heard about ...|
|  1|i wish java could...|
|  2|logistic,regressi...|
+---+--------------------+



In [0]:
# Plain tokenizer: in this notebook it leaves the comma-separated sentence
# as a single token (see the token counts printed further down).
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

# Regex tokenizer splitting on the pattern \W (a non-word character), so the
# comma-separated sentence is broken into its individual words.
regex_tokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)

# Column-level helper returning the number of elements in a token array.
count_tokens = udf(lambda token_list: len(token_list), returnType=IntegerType())


In [0]:
# Apply the plain tokenizer, then display the result with a per-row token
# count appended in a 'tokens' column.
tokenized = tokenizer.transform(sen_df)
(
    tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)


+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|hi i heard about ...|[hi, i, heard, ab...|     5|
|  1|i wish java could...|[i, wish, java, c...|     7|
|  2|logistic,regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



In [0]:
# Same inspection as for the plain tokenizer, but using the regex tokenizer
# so the two token counts can be compared side by side.
rg_tokenized = regex_tokenizer.transform(sen_df)
(
    rg_tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)


+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|hi i heard about ...|[hi, i, heard, ab...|     5|
|  1|i wish java could...|[i, wish, java, c...|     7|
|  2|logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



In [0]:
from pyspark.ml.feature import StopWordsRemover
# Pre-tokenized sample rows (id, tokens) used as input for stop-word removal.
sentenceDataFrame = spark.createDataFrame(
    [
        (0, ['i', 'saw', 'the', 'green', 'house']),
        (1, ['marrie', 'had', 'a', 'little', 'lamb']),
    ],
    schema=['id', 'tokens'],
)


In [0]:
# Reads the 'tokens' array column and writes the surviving words to
# 'filtered'. No stop-word list is passed, so the transformer's default applies.
remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered',
)

In [0]:
# Display the original tokens next to the stop-word-filtered tokens.
(
    remover
    .transform(sentenceDataFrame)
    .show()
)

+---+--------------------+--------------------+
| id|              tokens|            filtered|
+---+--------------------+--------------------+
|  0|[i, saw, the, gre...| [saw, green, house]|
|  1|[marrie, had, a, ...|[marrie, little, ...|
+---+--------------------+--------------------+



In [0]:
from pyspark.ml.feature import NGram

In [0]:
# Pre-tokenized sample rows (id, words); the array column is named 'words'
# to match the input column configured on the n-gram transformer.
wordDataFrame = spark.createDataFrame(
    [
        (0, ['i', 'saw', 'the', 'green', 'house']),
        (1, ['marrie', 'had', 'a', 'little', 'lamb']),
        (2, ['she', 'have', 'to', 'go', 'home']),
    ],
    schema=['id', 'words'],
)


In [0]:
# n=2 -> bigrams: each output element joins two consecutive input words
# from the 'words' column; results are written to 'grams'.
ngram = NGram(
    n=2,
    inputCol='words',
    outputCol='grams',
)

In [0]:
# NGram reads the 'words' column, which only wordDataFrame provides;
# sentenceDataFrame's array column is named 'tokens', so transforming it
# fails on a fresh kernel with a missing-column error.
ngram.transform(wordDataFrame).show()

+---+--------------------+--------------------+
| id|               words|               grams|
+---+--------------------+--------------------+
|  0|[i, saw, the, gre...|[i saw, saw the, ...|
|  1|[marrie, had, a, ...|[marrie had, had ...|
|  2|[she, have, to, g...|[she have, have t...|
+---+--------------------+--------------------+



In [0]:
# Preview the words alongside their generated bigrams (cells are truncated
# in this default view).
(
    ngram
    .transform(wordDataFrame)
    .show()
)

+---+--------------------+--------------------+
| id|               words|               grams|
+---+--------------------+--------------------+
|  0|[i, saw, the, gre...|[i saw, saw the, ...|
|  1|[marrie, had, a, ...|[marrie had, had ...|
|  2|[she, have, to, g...|[she have, have t...|
+---+--------------------+--------------------+



In [0]:
# Show only the generated bigrams, untruncated, so every pair is visible.
(
    ngram
    .transform(wordDataFrame)
    .select('grams')
    .show(truncate=False)
)

+------------------------------------------+
|grams                                     |
+------------------------------------------+
|[i saw, saw the, the green, green house]  |
|[marrie had, had a, a little, little lamb]|
|[she have, have to, to go, go home]       |
+------------------------------------------+

