In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable
# so the notebook is not tied to one machine's directory layout; fall back to
# the original local install path when the variable is unset or empty.
SPARK_HOME = os.environ.get('SPARK_HOME') or '/home/mint/spark-2.1.0-bin-hadoop2.7'
findspark.init(SPARK_HOME)

# Imported after findspark.init(), which is what makes pyspark importable.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('nlp').getOrCreate()

# Small labelled corpus used by the feature-extraction cells below:
# each row is (label, sentence).
data = spark.createDataFrame([
    (0.0, 'Hi I heard about Spark'),
    (0.0, 'I wish java could use case classes'),
    (1.0, 'Logistic Regression models are neat')
], ['label', 'sentence'])
data.show()

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|  0.0|Hi I heard about ...|
|  0.0|I wish java could...|
|  1.0|Logistic Regressi...|
+-----+--------------------+



In [2]:
from pyspark.ml.feature import Tokenizer, IDF, HashingTF
# Tokenize the 'sentence' column into a new 'words' column and preview it.
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
tokenized = tokenizer.transform(data)
tokenized.show()

+-----+--------------------+--------------------+
|label|            sentence|               words|
+-----+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|
|  0.0|I wish java could...|[i, wish, java, c...|
|  1.0|Logistic Regressi...|[logistic, regres...|
+-----+--------------------+--------------------+



In [7]:
# Step 1: hash the token lists in 'words' into the 'rawFeatures' column.
hashing_tf = HashingTF(inputCol='words', outputCol='rawFeatures')
tf_data = hashing_tf.transform(tokenized)

# Step 2: fit an IDF model on those raw features, then apply it to the same
# frame to produce the 'features' column.
idf_estimator = IDF(inputCol='rawFeatures', outputCol='features')
idf_model = idf_estimator.fit(tf_data)
idf_data = idf_model.transform(tf_data)

# Show full column contents instead of the default truncated view.
idf_data.show(truncate=False)

+-----+-----------------------------------+------------------------------------------+--------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|sentence                           |words                                     |rawFeatures                                                                           |features                                                                                                                                                                                        |
+-----+-----------------------------------+------------------------------------------+--------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------

In [8]:
# Two pre-tokenized documents, keyed by id, for the CountVectorizer example.
token_rows = [
    (0, ['a', 'b', 'c']),
    (1, ['a', 'b', 'b', 'c', 'a']),
]
data2 = spark.createDataFrame(token_rows, ['id', 'words'])
data2.show()

+---+---------------+
| id|          words|
+---+---------------+
|  0|      [a, b, c]|
|  1|[a, b, b, c, a]|
+---+---------------+



In [11]:
from pyspark.ml.feature import CountVectorizer
# Configure the vectorizer: vocabulary capped at 3 terms (vocabSize) and a
# minimum document frequency of 2.0 (minDF) for a term to be kept.
count_vectorizer = CountVectorizer(
    inputCol='words',
    outputCol='features',
    vocabSize=3,
    minDF=2.0,
)

# Learn the vocabulary from data2, then encode the same documents with it.
cv_model = count_vectorizer.fit(data2)
cv_data = cv_model.transform(data2)
cv_data.show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+

