In [127]:
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import udf, concat, col, lit
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml import Pipeline
from pyspark.sql.types import ArrayType, FloatType, StringType, IntegerType
from pyspark import SparkConf, SparkContext


import sys
from collections import defaultdict
import numpy as np
import re
import time

In [71]:
strip_chars = ".?,!;:\"'()#&" 
rgx = sc.broadcast(re.compile('[%s]' % strip_chars))

In [72]:
def process_str(row):

    body_list = []
    try:
        for word in row.lower().split(): 
            word = rgx.value.sub('', word)  
            body_list.append(word)
        return body_list
    except Exception as e:
        print(e)
        return ['']
process = udf(process_str, ArrayType(StringType()))

In [128]:
def good_bad_filter(x):
    if x >=4: return 1
    else: return 0

good_bad = udf(good_bad_filter, IntegerType())

In [3]:

sc = SparkContext()
spark = SparkSession.builder.appName("SimpleApp").getOrCreate()
print('*'*60, '\n', sc.getConf().getAll(), '\n', '*'*60, '\n')


************************************************************ 
 [('spark.app.id', 'local-1563985731720'), ('spark.app.name', 'SimpleApp'), ('spark.rdd.compress', 'True'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.executor.id', 'driver'), ('spark.submit.deployMode', 'client'), ('spark.driver.host', 'd6d187d3a6f1'), ('spark.ui.showConsoleProgress', 'true'), ('spark.driver.port', '38697')] 
 ************************************************************ 



In [22]:
#Imports data into spark dataframe
#data = sc.textFile('s3://amazon-reviews-pds/amazon_reviews_us_Video_Games_v1_00.tsv.gz')
repartition_num = 10
data = sc.textFile('data/gaming_reviews.tsv')
full_df = spark.read.csv(data, sep="\t", header=True, inferSchema=True)
#full_df = full_df.repartition(repartition_num)

In [133]:
#Subset selection for testing purposes
subset_df = full_df.select('review_headline', 'review_body', 'star_rating')\
    .filter(full_df.star_rating != 3)\
    .withColumn('star_rating', good_bad('star_rating'))

In [49]:
two_col_df = subset_df.select(concat(col('review_headline'), lit(' '), col('review_body')).alias('text'), subset_df.star_rating)

In [51]:
two_col_df.take(1)

[Row(text='an amazing joystick. I especially love that you can twist ... Used this for Elite Dangerous on my mac, an amazing joystick. I especially love that you can twist the stick for different movement bindings as well as move it in the normal way.', star_rating=5)]

In [42]:
two_col_df.columns

['concat(review_headline,  , review_body)']

In [74]:
text_list_df = two_col_df.withColumn('text_list', process(two_col_df['text']))\
        .select('text_list', 'star_rating')

In [78]:
text_list_df.columns

['text_list', 'star_rating']

In [138]:
cv = CountVectorizer(inputCol="text_list", outputCol="count_vec")
cv_fit = cv.fit(text_list_df) #need to save vocabulary from this
cv_transform = cv_fit.transform(text_list_df)
output_df = cv_transform.select(cv_transform.count_vec, cv_transform.star_rating)

AttributeError: 'CountVectorizerModel' object has no attribute 'count_vec'

In [140]:
cv_fit.vocabulary

['the',
 'it',
 'i',
 'and',
 'to',
 'a',
 'stars',
 'game',
 'this',
 'five',
 'for',
 'you',
 'is',
 'of',
 'great',
 'that',
 'not',
 'but',
 'have',
 'one',
 'in',
 'get',
 'my',
 'its',
 'good',
 'on',
 'are',
 'was',
 'as',
 'with',
 'be',
 'so',
 'like',
 'if',
 'love',
 'amazing',
 'ps4',
 'will',
 'your',
 'games',
 '',
 'just',
 'awesome',
 'play',
 'do',
 'buy',
 'we',
 'out',
 'four',
 'tried',
 'pretty',
 'time',
 'star',
 'fun',
 'when',
 'an',
 'well',
 'work',
 'works',
 'way',
 'had',
 'can',
 'they',
 'got',
 'nice',
 'two',
 'make',
 'these',
 'usb',
 'now',
 'very',
 'better',
 'would',
 'more',
 'ports',
 'played',
 'really',
 'doesnt',
 'them',
 'dont',
 'all',
 'controller',
 'loved',
 'am',
 'much',
 'worth',
 'version',
 'were',
 'even',
 'did',
 'thing',
 'stand',
 'up',
 'then',
 'kids',
 'such',
 'little',
 'fit',
 'perfect',
 'extra',
 'before',
 'still',
 'gaming',
 'know',
 'over',
 'hard',
 'story',
 'or',
 'super',
 'gift',
 'does',
 'disc',
 'working',