In [1]:
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import udf, concat, col, lit
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import CountVectorizer
from pyspark.sql.types import ArrayType, FloatType, StringType, IntegerType
from pyspark import SparkConf, SparkContext

import sys
from collections import defaultdict
import numpy as np
import re
import time
import pickle

In [2]:
# Build (or reuse) the session first and take the context from it.
# Calling SparkContext() directly raises "Cannot run multiple SparkContexts
# at once" when this cell is re-run in a live kernel; getOrCreate() makes the
# cell idempotent while still exposing `sc` for the cells below.
spark = SparkSession.builder.appName("SimpleApp").getOrCreate()
sc = spark.sparkContext
# Dump the effective Spark configuration between two banner lines.
print('*'*60, '\n', sc.getConf().getAll(), '\n', '*'*60, '\n')

************************************************************ 
 [('spark.app.id', 'local-1563996930569'), ('spark.app.name', 'SimpleApp'), ('spark.rdd.compress', 'True'), ('spark.driver.port', '39331'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.executor.id', 'driver'), ('spark.submit.deployMode', 'client'), ('spark.driver.host', 'd6d187d3a6f1'), ('spark.ui.showConsoleProgress', 'true')] 
 ************************************************************ 



In [3]:
# Characters removed from every token before vectorising.
strip_chars = ".?,!;:\"'()#&"
# re.escape keeps the character class valid even if a regex metacharacter
# (']', '^', '-', '\') is later added to strip_chars; for the current set the
# compiled pattern matches exactly the same characters as before.
# The compiled pattern is wrapped in a broadcast variable; read it via rgx.value.
rgx = sc.broadcast(re.compile('[%s]' % re.escape(strip_chars)))

In [4]:
def process_str(row):
    """Lower-case a review string and split it into punctuation-free tokens.

    Parameters
    ----------
    row : str or None
        Raw text of a single record. Anything that is not a string
        (typically ``None`` for a NULL column value) is treated as
        "no text".

    Returns
    -------
    list of str
        The whitespace-separated words of ``row.lower()`` with every
        character matched by the broadcast pattern ``rgx`` removed.
        Words that are empty after stripping (e.g. ``"!!!"``) are dropped,
        and missing input yields ``[]``, so no empty-string token is
        emitted.
    """
    # Explicit guard instead of a catch-all `except Exception`: a NULL
    # value is the expected failure here, and anything else should surface
    # rather than be printed on an executor and silently replaced.
    if not isinstance(row, str):
        return []

    strip = rgx.value.sub  # bound once; rgx is the broadcast compiled regex
    stripped_words = (strip('', word) for word in row.lower().split())
    return [word for word in stripped_words if word]
# Column-level wrapper around process_str; the result column is array<string>.
process = udf(f=process_str, returnType=ArrayType(StringType()))

In [5]:
def good_bad_filter(x):
    """Binarise a star rating.

    Parameters
    ----------
    x : int or None
        Star rating of a review.

    Returns
    -------
    int or None
        ``1`` when the rating is 4 or higher, ``0`` when it is lower, and
        ``None`` when the rating itself is missing.
    """
    # `None >= 4` raises TypeError in Python 3; pass missing ratings
    # through as a NULL label instead.
    if x is None:
        return None
    return 1 if x >= 4 else 0

# Column-level wrapper around good_bad_filter; the result column is an integer label.
good_bad = udf(f=good_bad_filter, returnType=IntegerType())

In [12]:
# Load the Amazon video-game reviews TSV into a Spark DataFrame.
#Directory: s3://amazon-reviews-pds/tsv/
#data = sc.textFile('s3://amazon-reviews-pds/amazon_reviews_us_Video_Games_v1_00.tsv.gz')
repartition_num = 10  # only referenced by the disabled repartition below
data = sc.textFile('data/amazon_reviews_us_Video_Games_v1_00.tsv')
# quote='' switches CSV quoting off. With the default quote character ("),
# a field that merely starts with a double quote swallows the following
# tabs and shifts/nulls the remaining columns. The dataset's published
# format description says the TSV has no quote or escape characters.
# NOTE(review): confirm against the local copy of the file.
full_df = spark.read.csv(data, sep="\t", header=True, inferSchema=True, quote='')
#full_df = full_df.repartition(repartition_num)

In [13]:
# Small labelled sample for testing: keep the text columns and the rating,
# discard 3-star reviews, replace the rating with the good_bad label and
# cap the result at 100 rows.
subset_df = (
    full_df
    .select('review_headline', 'review_body', 'star_rating')
    .where(col('star_rating') != 3)
    .withColumn('star_rating', good_bad(col('star_rating')))
    .limit(100)
)

In [14]:
# Merge headline and body into a single `text` column.
# concat() returns NULL as soon as either input is NULL, so a review with a
# missing headline used to lose its body as well. concat_ws() skips NULL
# inputs (and yields '' when both are NULL). It is called through
# selectExpr so no additional import is required.
two_col_df = subset_df.selectExpr(
    "concat_ws(' ', review_headline, review_body) AS text",
    "star_rating",
)

In [15]:
# Tokenise `text` with the process UDF and keep only tokens + label.
text_list_df = two_col_df.select(
    process(col('text')).alias('text_list'),
    col('star_rating'),
)

In [16]:
# Bag-of-words encoding of the token lists.
cv = CountVectorizer().setInputCol("text_list").setOutputCol("count_vec")
# The fitted model carries the vocabulary, which still needs to be saved.
cv_fit = cv.fit(text_list_df)
cv_transform = cv_fit.transform(text_list_df)
# Keep only the count vectors and the label for export.
output_df = cv_transform.select('count_vec', 'star_rating')

In [18]:
# Vocabulary of the fitted CountVectorizer as a one-column string DataFrame.
vocab = spark.createDataFrame(cv_fit.vocabulary, StringType())

# Vocabulary goes out as JSON from a single partition (coalesce(1)).
vocab_writer = vocab.coalesce(1).write.format('json')
vocab_writer.save('s3://dsi-amazon-neural/vocab')

# Final DataFrame goes out as a series of JSON files on S3.
# NOTE(review): no save mode is set, so Spark's default applies and a re-run
# against an existing path is expected to fail. The target is also the bucket
# root, which already holds the vocab prefix written above - confirm this
# location is intended.
output_writer = output_df.write.format('json')
output_writer.save('s3://dsi-amazon-neural/')

pyspark.sql.dataframe.DataFrame