https://stackoverflow.com/questions/38839924/how-to-combine-n-grams-into-one-vocabulary-in-spark

In [None]:
from pyspark.ml.feature import NGram, CountVectorizer, VectorAssembler
from pyspark.ml import Pipeline


def build_ngrams(inputCol="tokens", n=3):
    """Build a pipeline that combines 1..n-gram counts into one feature vector.

    For every order ``i`` from 1 to ``n`` the pipeline contains an ``NGram``
    stage that reads ``inputCol`` and writes ``"<i>_grams"``, followed by a
    ``CountVectorizer`` stage that reads ``"<i>_grams"`` and writes
    ``"<i>_counts"``. A final ``VectorAssembler`` merges all ``"<i>_counts"``
    columns into a single ``"features"`` column.

    Args:
        inputCol: Name of the column holding the token sequences that the
            ``NGram`` stages read (defaults to ``"tokens"``).
        n: Highest n-gram order to generate (defaults to 3).

    Returns:
        An unfitted ``Pipeline`` whose stages are ordered as: all n-gram
        stages, all count-vectorizer stages, then the assembler.
    """
    orders = range(1, n + 1)

    # Read from the caller-supplied column. This was previously hard-coded to
    # "tokens", so any other ``inputCol`` value was silently ignored.
    ngrams = [
        NGram(n=i, inputCol=inputCol, outputCol="{0}_grams".format(i))
        for i in orders
    ]

    # One vocabulary / count vector per n-gram order.
    vectorizers = [
        CountVectorizer(inputCol="{0}_grams".format(i),
                        outputCol="{0}_counts".format(i))
        for i in orders
    ]

    # Concatenate the per-order count vectors into one feature column.
    assembler = [VectorAssembler(
        inputCols=["{0}_counts".format(i) for i in orders],
        outputCol="features"
    )]

    return Pipeline(stages=ngrams + vectorizers + assembler)

In [None]:
from pyspark.sql import SparkSession


# Local Spark session used for the toy n-gram example.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Test NGram TFIDF")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Two small, already-tokenized documents keyed by id.
df = spark.createDataFrame(
    [
        (1, ["a", "b", "c", "d"]),
        (2, ["d", "e", "d"]),
    ],
    ("id", "tokens"),
)

# Fit the pipeline with its default arguments on the toy rows, then apply it
# to the same rows.
result = build_ngrams().fit(df).transform(df)

In [None]:
# Display the DataFrame returned by the fitted n-gram pipeline.
result.show()

In [None]:
# Show only the 1-gram and 2-gram columns alongside the 1-gram count vectors.
result.select("1_grams", "2_grams", "1_counts").show()

# TF-IDF on our preprocessed files

Pre-processing pipeline (applied before the features are passed to LogisticRegression):
* Tokenize
* TF (count vectorize)
* IDF
* Assemble





In [6]:
import pyspark
from pyspark import SparkContext, SparkFiles
from pyspark.sql import SparkSession, DataFrameReader, SQLContext


# Location of the preprocessed Amazon Wireless reviews CSV.
DATA_FILE = "/home/jupyter/dataset/amazon_reviews/amazon_reviews_us_Wireless_v1_00-preprocessed.csv"


# Local session on all available cores; spark.logConf is switched on.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Test NGram TFIDF (local)")
    .config("spark.logConf", True)
    .getOrCreate()
)


# Read the CSV straight from its path. SparkFiles.get() is meant for files
# distributed with SparkContext.addFile(); it only happened to work here
# because joining an absolute path onto the SparkFiles root directory returns
# the absolute path unchanged.
df = spark.read.csv(DATA_FILE, header=True, inferSchema=True)

In [9]:
# Inspect the column names and the types inferred while reading the CSV.
df.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: integer (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- helpful_votes: string (nullable = true)
 |-- total_votes: string (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: string (nullable = true)



In [16]:
from pyspark.ml.feature import NGram, CountVectorizer, VectorAssembler, Tokenizer
from pyspark.ml import Pipeline


def build_ngrams(inputCol, n=3):
    """Assemble a tokenizer-plus-n-gram pipeline for a text column.

    The pipeline starts with a ``Tokenizer`` that reads ``inputCol`` and
    writes ``"words"``. It is followed by one ``NGram`` stage per order from
    1 to ``n``; each reads ``"words"`` and writes ``"<order>_grams"``.
    This version stops after the n-gram stages: it contains no
    ``CountVectorizer`` or ``VectorAssembler`` stage.

    Args:
        inputCol: Name of the column handed to the ``Tokenizer``.
        n: Highest n-gram order to generate (defaults to 3).

    Returns:
        An unfitted ``Pipeline`` made of the tokenizer followed by the
        n-gram stages in ascending order.
    """
    print("entered build")

    stages = [Tokenizer(inputCol=inputCol, outputCol="words")]
    for order in range(1, n + 1):
        stages.append(
            NGram(n=order, inputCol="words", outputCol="{0}_grams".format(order))
        )

    return Pipeline(stages=stages)


# Fit the tokenizer + n-gram pipeline on the reviews and add its output
# columns to the DataFrame.
# NOTE(review): `df` is rebound to the transformed frame, so re-running this
# cell feeds already-transformed data back into the pipeline; this presumably
# fails because the output columns already exist. Confirm before re-running.
df = (
    build_ngrams(inputCol="review_body")
    .fit(df)
    .transform(df)
)

entered build


In [18]:
# Check that the tokenizer and n-gram output columns now appear in the schema.
df.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: integer (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- helpful_votes: string (nullable = true)
 |-- total_votes: string (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 1_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 2_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 3_grams: array (nullable = true)
 |    |-- element: string (containsNull 

In [None]:
import nltk