In [1]:
# With Docker installed, run the command below to download and start a Spark (PySpark + JupyterLab) container:
# docker run --name sparkbook -p 8881:8888 -v "$PWD":/home/jovyan/work jupyter/pyspark-notebook start.sh jupyter lab --LabApp.token=''
# Next, open a web browser and navigate to localhost:8881 to reach JupyterLab inside the container. This notebook can then be run from there.
# Note: --LabApp.token='' disables token authentication, so only use this command on a trusted local machine.

In [2]:
import sys
!{sys.executable} -m pip install nltk
# Installs nltk with the same interpreter that runs this kernel (sys.executable).
# Required when running from a newly created Spark Docker container.



In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
# Downloads the NLTK 'punkt' and 'stopwords' data packages. They are required for the
# NLTK functionality used in this notebook (e.g. stopwords.words('english')).

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
import string
import sys
from collections import defaultdict

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from pyspark import SparkConf, SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType, StringType
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF

In [5]:
# Creates the Spark context and session. Sets the executor and driver memory through SparkConf.
conf = SparkConf().setAll([('spark.executor.memory', '15g'), ('spark.driver.memory', '30g')])
# getOrCreate keeps this cell re-runnable: constructing a second SparkContext in the
# same kernel raises an error, whereas getOrCreate returns the one that already exists.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.appName("SimpleApp").getOrCreate()
# Show the effective configuration of the context that was just created.
print(sc.getConf().getAll())

[('spark.driver.memory', '30g'), ('spark.app.name', 'SimpleApp'), ('spark.rdd.compress', 'True'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.executor.id', 'driver'), ('spark.submit.deployMode', 'client'), ('spark.driver.host', 'a3343a2ac2ad'), ('spark.executor.memory', '15g'), ('spark.ui.showConsoleProgress', 'true'), ('spark.driver.port', '36073'), ('spark.app.id', 'local-1562017934017')]


In [14]:
# Prints the current configuration of the Spark context as a list of (key, value) pairs.
print(sc.getConf().getAll())

[('spark.driver.memory', '30g'), ('spark.app.name', 'SimpleApp'), ('spark.rdd.compress', 'True'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.executor.id', 'driver'), ('spark.submit.deployMode', 'client'), ('spark.driver.host', 'a3343a2ac2ad'), ('spark.executor.memory', '15g'), ('spark.ui.showConsoleProgress', 'true'), ('spark.driver.port', '36073'), ('spark.app.id', 'local-1562017934017')]


In [35]:
# Creates a Spark DataFrame from the tab-separated Amazon reviews file, reading the first
# line as the header and inferring column types. Chaining .limit(x) onto the result keeps
# only the first x rows.
df = spark.read.csv("data/gaming_reviews.tsv", sep="\t", header=True, inferSchema=True)

In [36]:
# Number of rows in the reviews DataFrame.
df.count()

1785997

In [16]:
# English stop words to filter out of the review text. The empty string is added so that
# blank tokens (e.g. produced by splitting on consecutive spaces) are dropped by the same
# membership check.
stop_words = set(stopwords.words('english'))
stop_words.add('')

In [17]:
# Removes punctuation, adds a 'body_list' column holding each review's list of words, and
# selects only the star_rating and body_list columns.
# stop_words (the NLTK English stop words plus '') is built in the previous cell and is
# reused here rather than being rebuilt a second time.

def process_str(row):
    """Turn a review string into a list of lowercase words without punctuation or stop words.

    Input: String (or None for a missing review body).
    Output: List of lowercase strings with punctuation removed and every entry of
    stop_words filtered out. Returns an empty list when the input is None.

    Punctuation is removed with str.translate using a deletion table built by
    str.maketrans from string.punctuation; the text is then split on single spaces.
    """
    # A null review_body reaches the function as None; without this guard the call to
    # .translate below raises AttributeError.
    if row is None:
        return []
    no_punctuation = row.translate(str.maketrans('', '', string.punctuation))
    # Lowercase each token once, then drop stop words (including the empty tokens that
    # splitting on ' ' produces for consecutive spaces, since '' is in stop_words).
    # NOTE(review): punctuation is stripped before the stop-word check, so a contraction
    # such as "isn't" is compared as "isnt" - confirm whether those should be filtered too.
    lowered = (token.lower() for token in no_punctuation.split(' '))
    return [token for token in lowered if token not in stop_words]

# Wrap process_str as a Spark UDF that returns an array of strings.
process = udf(process_str, ArrayType(StringType()))

# Tokenize each review body once (the original applied the identical withColumn twice),
# then keep only the label column and the token list.
df_new = (
    df.withColumn('body_list', process(df['review_body']))
      .select('star_rating', 'body_list')
)
df_new.head()

Row(star_rating=5, body_list=['used', 'elite', 'dangerous', 'mac', 'amazing', 'joystick', 'especially', 'love', 'twist', 'stick', 'different', 'movement', 'bindings', 'well', 'move', 'normal', 'way'])

In [18]:
# Estimators for the three stages: term counts -> TF-IDF weighting -> random forest
# trained to predict the star rating from the TF-IDF features.
cv = CountVectorizer(
    inputCol="body_list",
    outputCol="features",
)
idf = IDF(
    inputCol='features',
    outputCol='idf_features',
)
rf = RandomForestClassifier(
    labelCol='star_rating',
    featuresCol='idf_features',
    seed=100,
)

In [29]:
# Stage one of the pipeline: fit the CountVectorizer on the tokenized reviews (the fitted
# model's vocabulary is read by later cells via stage1_fit.vocabulary), then convert each
# 'body_list' into a count vector stored in the 'features' column.
stage1_fit = cv.fit(df_new)
stage1_transform = stage1_fit.transform(df_new)

In [30]:
# Stage two of the pipeline: fit the IDF estimator on the count vectors from stage one and
# add the TF-IDF weighted 'idf_features' column.
stage2_fit = idf.fit(stage1_transform)
stage2_transform = stage2_fit.transform(stage1_transform)

In [31]:
# Stage three of the pipeline: train the random forest on 'idf_features' with 'star_rating'
# as the label, then apply the fitted model to the same DataFrame it was trained on.
# NOTE(review): there is no train/test split here, so stage3_transform only contains
# in-sample predictions.
stage3_fit = rf.fit(stage2_transform)
stage3_transform = stage3_fit.transform(stage2_transform)

In [27]:
# Function that prints the top n features
def print_top_features(model, n=10, vocabulary=None):
    """
    Print the vocabulary words with the highest feature importance.

    Input:
        model: fitted model exposing featureImportances.toArray()
            (e.g. the fitted RandomForest model).
        n: number of top feature words to print. Values larger than the number of
            features print every feature; negative values print nothing.
        vocabulary: optional sequence mapping feature index -> word. Defaults to
            stage1_fit.vocabulary (the fitted CountVectorizer model of this notebook).
    Output: None. Prints the top n words to the console, most important first.
    """
    if vocabulary is None:
        # Fall back to the notebook-level CountVectorizer model, as the original did.
        vocabulary = stage1_fit.vocabulary
    importances = model.featureImportances.toArray()
    # argsort is ascending, so reverse it to get the most important features first.
    # The original looped over a hard-coded range(20) and silently ignored n.
    top_indices = np.argsort(importances)[::-1][:max(n, 0)]
    for feature_index in top_indices:
        print(vocabulary[feature_index])
# Ask for 20 words explicitly so the call states the size of the 20-word listing it produces.
print_top_features(stage3_fit, n=20)

work
disappointed
next
2
terrible
343
way
working
feels
creation
used
ok
fine
triangle
probably
desired
process
isnt
great
br


In [87]:
# Keep only the label and the TF-IDF feature column for the topic-modelling cells below.
star_idf = stage3_transform.select('star_rating', 'idf_features')
# Vocabulary as a NumPy array so it can be indexed with an array of term indices.
vocabulary = np.array(stage1_fit.vocabulary)

In [43]:
# Five-topic NMF model. random_state is fixed (same value as the random forest seed) so
# that the extracted topics are reproducible across kernel restarts.
nmf_model = NMF(n_components=5, random_state=100)

In [68]:
# Collect the TF-IDF vectors to the driver and assemble them into a SciPy CSR matrix.
# The original called .toArray() on every row, materialising a dense
# (rows x vocabulary size) array; keeping the data sparse stores only non-zero entries.
idf_rows = star_idf.select('idf_features').collect()
index_chunks, value_chunks, row_lengths = [], [], []
for row in idf_rows:
    vec = row[0]
    if hasattr(vec, 'indices'):
        # Sparse vector: reuse its stored non-zero positions and values directly.
        row_indices, row_values = vec.indices, vec.values
    else:
        # Dense vector: keep only the non-zero entries.
        dense = vec.toArray()
        row_indices = np.nonzero(dense)[0]
        row_values = dense[row_indices]
    index_chunks.append(row_indices)
    value_chunks.append(row_values)
    row_lengths.append(len(row_indices))

# indptr[i]:indptr[i + 1] delimits the entries of row i in the concatenated arrays.
indptr = np.concatenate(([0], np.cumsum(row_lengths)))
X = csr_matrix(
    (np.concatenate(value_chunks), np.concatenate(index_chunks), indptr),
    shape=(len(idf_rows), len(idf_rows[0][0])),
)

# fit_transform factorises once; calling fit and then transform repeats the work.
# The document-topic weights can differ slightly from a separate transform() call.
nmf_transform = nmf_model.fit_transform(X)
# fit() returns the estimator itself, so nmf_fit stays an alias of the fitted model.
nmf_fit = nmf_model

In [92]:
# Print the ten highest-weighted vocabulary terms of every NMF topic. Iterating over
# components_ keeps the loop in step with n_components instead of a hard-coded 5.
for topic_weights in nmf_fit.components_:
    # argsort is ascending: reverse it, then keep the first ten term indices.
    top_term_indices = np.argsort(topic_weights)[::-1][:10]
    print(vocabulary[top_term_indices])

['sonic' '62' 'level' 'youll' 'levels' 'cooper' 'sly' 'boom' 'theyre'
 'gorgeous']
['dishonored' 'cause' 'thief' 'fps' 'gun' 'get' 'ive' 'titlebr' 'second'
 'fighting']
['disney' 'toybox' 'coaster' 'path' 'new' 'wars' 'theres' '30' 'infinity'
 'star']
['batmobile' 'arkham' 'combat' 'city' 'commentary' 'it’s' 'game' 'br'
 'previous' 'batman’s']
['play' 'br' 'player' 'jar' 'game' 'split' '20' 'screen' 'issues' 'disney']


In [84]:
# Scratch check: shows the type of the top-10 term-index selection used when printing topics.
# NOTE(review): looks like leftover debugging - consider removing it from the final notebook.
type(np.flip(np.argsort(nmf_fit.components_[0, :]))[0:10])

numpy.ndarray