In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session for this notebook via getOrCreate(), with the
# application name set to "Nlp".
spark = (
    SparkSession.builder
    .appName("Nlp")
    .getOrCreate()
)

**Tokenizer**

In [5]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, size, udf
from pyspark.sql.types import IntegerType


In [31]:
# Sample sentences used throughout the tokenizer demo.
# Rows 1-3 separate words with spaces; row 4 separates them with commas only,
# which is what distinguishes the two tokenizers below.
sent_df = spark.createDataFrame(
    [
        (1, "Hi I heard about Spark"),
        (2, "I wish Java could use case classes"),
        (3, "Logistic regression models are neat"),
        (4, "hey,how,are,you,doing"),
    ],
    schema=["id", "sentence"],
)

In [24]:
# Preview the input sentences (show() truncates long strings with "...").
sent_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  1|Hi I heard about ...|
|  2|I wish Java could...|
|  3|Logistic regressi...|
|  4|hey,how,are,you,d...|
+---+--------------------+



In [33]:
# Plain tokenizer: reads "sentence" and writes the token array to "words".
# In this notebook it lowercases the text and leaves the comma-separated
# sentence as a single token.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

# Regex tokenizer: same input/output columns, but uses the regex "\W"
# (any non-word character) as the delimiter, so commas also separate tokens.
regextokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")

# Number of tokens in an array column. Spark's built-in `size` replaces the
# former Python lambda UDF: it yields the same integer count but is evaluated
# natively, without serialising each row to a Python worker.
# Usage is unchanged: countTokens(col("words")).
countTokens = size

In [34]:
# Apply the plain Tokenizer; the result keeps the input columns and adds "words".
tokenized = tokenizer.transform(sent_df)

In [35]:
# Inspect the tokenized frame: id, sentence and the new "words" column.
tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  1|Hi I heard about ...|[hi, i, heard, ab...|
|  2|I wish Java could...|[i, wish, java, c...|
|  3|Logistic regressi...|[logistic, regres...|
|  4|hey,how,are,you,d...|[hey,how,are,you,...|
+---+--------------------+--------------------+



In [36]:
# Show each sentence with its tokens from the plain Tokenizer and a "tokens"
# column holding the number of tokens per row.
(
    tokenized
    .select("sentence", "words")
    .withColumn("tokens", countTokens(col("words")))
    .show()
)

+--------------------+--------------------+------+
|            sentence|               words|tokens|
+--------------------+--------------------+------+
|Hi I heard about ...|[hi, i, heard, ab...|     5|
|I wish Java could...|[i, wish, java, c...|     7|
|Logistic regressi...|[logistic, regres...|     5|
|hey,how,are,you,d...|[hey,how,are,you,...|     1|
+--------------------+--------------------+------+



In [37]:
# Tokenize the same sentences with the RegexTokenizer.
regexTokenized = regextokenizer.transform(sent_df)

# Show sentence, tokens and per-row token count for the regex-based result.
(
    regexTokenized
    .select("sentence", "words")
    .withColumn("tokens", countTokens(col("words")))
    .show()
)

+--------------------+--------------------+------+
|            sentence|               words|tokens|
+--------------------+--------------------+------+
|Hi I heard about ...|[hi, i, heard, ab...|     5|
|I wish Java could...|[i, wish, java, c...|     7|
|Logistic regressi...|[logistic, regres...|     5|
|hey,how,are,you,d...|[hey, how, are, y...|     5|
+--------------------+--------------------+------+

