# PySpark Logistic Regression

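Our full dataset file has around 9 million samples. This notebook therefore works on a small 1k-sample test file (see `DATA_FILE` below) to try out the n-gram feature pipeline in PySpark. *(TODO: the original note about trying to run `feature_generator` on the full dataset was left unfinished — describe what happened.)*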

In [1]:
import pyspark
from pyspark import SparkContext, SparkFiles
from pyspark.sql import SparkSession, DataFrameReader, SQLContext


# Path to the preprocessed reviews CSV (the "test1k" sample, per the file name).
# NOTE(review): this is an absolute, machine-specific path — adjust it when
# running the notebook anywhere else.
DATA_FILE = "/home/jupyter/dataset/amazon_reviews/amazon_reviews_us_Wireless_v1_00-test1k-preprocessed.csv"


# Local-mode Spark session using all available cores ("local[*]").
# spark.logConf=True asks Spark to log its effective configuration at startup.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Test NGram TFIDF (local)")
    .config("spark.logConf", True)
    .getOrCreate()
)


# Read the CSV straight from its path. SparkFiles.get() is intended for files
# distributed with SparkContext.addFile(); it resolves its argument against
# Spark's temporary files directory, which only returned the right location
# before because DATA_FILE is absolute. Passing the path directly is equivalent
# here and keeps working for relative paths and URIs.
df = spark.read.csv(
    DATA_FILE,
    header=True,       # first row holds the column names
    inferSchema=True,  # let Spark infer column types from the data
)


In [2]:
# Show the column names and types Spark inferred from the CSV (inferSchema=True).
df.printSchema()

root
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: timestamp (nullable = true)



In [3]:
from pyspark.ml.feature import NGram, CountVectorizer, VectorAssembler, Tokenizer
from pyspark.ml import Pipeline


def build_ngrams(inputCol, n=3):
    """Assemble a pipeline that tokenizes ``inputCol`` and derives 1..n-grams.

    Stages, in order: a Tokenizer writing to the "words" column, followed by
    one NGram stage per size k in 1..n, each reading "words" and writing to
    "<k>_grams" (e.g. "1_grams", "2_grams", "3_grams" for the default n).

    Args:
        inputCol: name of the column handed to the Tokenizer.
        n: largest n-gram size to generate (default 3).

    Returns:
        An unfitted Pipeline made of the tokenizer and the NGram stages.
    """
    print("entered build")

    stages = [Tokenizer(inputCol=inputCol, outputCol="words")]
    for size in range(1, n + 1):
        stages.append(
            NGram(n=size, inputCol="words", outputCol="{0}_grams".format(size))
        )

    # TODO: vectorization is not part of this pipeline yet. The disabled draft
    # added one CountVectorizer per "<k>_grams" column (output "<k>_counts")
    # and a VectorAssembler combining those counts into a "features" column.
    return Pipeline(stages=stages)


# Build the tokenizer + n-gram pipeline on review_body, fit it, and apply it.
# NOTE(review): `df` is overwritten with the transformed frame, so re-running
# this cell without re-running the load cell feeds already-transformed data
# back into the pipeline (likely to fail on the existing "words" column) —
# confirm, and consider a separate name for the result.
df = (
    build_ngrams(inputCol="review_body")
    .fit(df)
    .transform(df)
)
df.printSchema()
df.show(n=5, truncate=False)

entered build
root
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: timestamp (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 1_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 2_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 3_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)

+-----------+-------------+-----------+----------------------------------+------------------------------------------------------------------------------------------------------------------------------------+-------------------+------------------------------------------------------------------------------------------------------------------------------------------