In [2]:
# Evaluate `spark` so the notebook displays it. The name is not defined in any
# visible cell — presumably the SparkSession injected by the kernel (TODO confirm).
spark

In [6]:
import pyspark.ml.feature as ft
from pyspark.ml import Pipeline

In [1]:
# Build a two-row, single-column DataFrame of sample documents.
# Each inner list is one row; the triple-quoted literals keep their embedded
# newlines, so continuation lines must stay flush-left inside the strings.
text_data = spark.createDataFrame(
    [
        ['''Machine learning can be applied to a wide variety
of data types, such as vectors, text, images, and structured data. 
This API adopts the DataFrame from Spark SQL in order to support a variety of data types.'''],
        ['''Columns in a DataFrame are named. The code examples
below use names such as "text," "features," and "label."'''],
    ],
    schema=['input'],
)

In [4]:
# Print the schema tree of the sample frame; it should list the single
# string column `input`.
text_data.printSchema()

root
 |-- input: string (nullable = true)



In [5]:
# Tokenize the raw `input` text into the array column `input_arr`.
# The pattern matches runs of whitespace or a single comma, period, or double
# quote; with RegexTokenizer's default `gaps=True` the pattern is the delimiter
# the text is split on (see the PySpark RegexTokenizer docs).
# A raw string is used so `\s` reaches the regex engine untouched: in a normal
# literal `\s` is an invalid Python escape (DeprecationWarning since 3.6,
# SyntaxWarning from 3.12). The resulting pattern value is unchanged.
tokenizer = ft.RegexTokenizer(
    inputCol='input',
    outputCol='input_arr',
    pattern=r'\s+|[,."]'
)

In [7]:
# Stop-word filter stage: writes its result to `input_stop` and reads whatever
# column the tokenizer emits, so renaming the tokenizer output needs no change here.
stopwords = ft.StopWordsRemover(
    outputCol='input_stop',
    inputCol=tokenizer.getOutputCol(),
)

In [8]:
# Bigram stage (n=2): consumes the stop-word-filtered tokens and writes the
# resulting n-grams to the `nGrams` column.
ngram = ft.NGram(
    inputCol=stopwords.getOutputCol(),
    outputCol="nGrams",
    n=2,
)

In [9]:
# Chain the three stages; order matters because each stage reads the column
# produced by the one before it.
pipeline = Pipeline(
    stages=[
        tokenizer,
        stopwords,
        ngram,
    ]
)

In [10]:
# Fit the pipeline on the sample frame, then apply the fitted model to that
# same frame to obtain the tokenized / filtered / n-gram columns.
pipeline_model = pipeline.fit(text_data)
data_ngram = pipeline_model.transform(text_data)

In [12]:
# Print the schema of the transformed frame: the original `input` string column
# plus the array-of-string columns `input_arr`, `input_stop`, and `nGrams`.
data_ngram.printSchema()

root
 |-- input: string (nullable = true)
 |-- input_arr: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- input_stop: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- nGrams: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [16]:
# Display only the first row of the three derived columns, with truncation
# disabled so the full token and n-gram arrays are visible.
(
    data_ngram
    .select("input_arr", "input_stop", "nGrams")
    .show(n=1, truncate=False)
)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|input_arr                                                                                                                                                                                                                                     |input_stop                               