In [1]:
# Notebook bootstrap: choose the Python interpreter for PySpark, then obtain a session.

import os

# Set before the session is requested below so the setting is already in the
# environment when Spark starts.
os.environ.update(PYSPARK_PYTHON='/usr/bin/python3')

import pyspark
from pyspark.sql import SparkSession

# getOrCreate() hands back an already-running session if there is one,
# otherwise it builds a new one with the master/app name given here.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Python Spark SQL basic example")
    .getOrCreate()
)

In [2]:
# Load the review file (JSON, gzip-compressed per its extension) into a DataFrame.
df = spark.read.format("json").load("reviews_Musical_Instruments_5.json.gz")

In [3]:
# Sanity check: display the number of rows that were loaded, so a bad or
# truncated read is noticed before any features are built on top of it.
df.count()

10261

In [64]:
#Set up features -- those from Lab 3, plus a few more

from pyspark.sql.functions import udf

def _countWords(string):
    if string is None:
        return 0
    return len(string.split())

# Register the word counter as a Spark UDF with an explicit integer result
# type. Without a declared returnType, udf() produces a string column, which
# leaves the numeric feature stored as text.
from pyspark.sql.types import IntegerType
countWords = udf(_countWords, IntegerType())

def _avgWordLength(string):
    if string is None:
        return 0
    words = string.split()
    if len(words) == 0:
        return 0
    return sum(len(word) for word in words) / len(words)
            
from pyspark.sql.types import DoubleType, IntegerType

# Every UDF below declares its return type. udf() defaults to a string result,
# which would store these numeric features as text. The DoubleType UDFs coerce
# their result with float() so that an integer zero is never handed back where
# a double is declared.

# Mean word length of the text (0.0 when there are no words).
avgWordLength = udf(lambda text: float(_avgWordLength(text)), DoubleType())

# Fraction of characters that are upper-case (0.0 for missing/empty text).
pctUpper = udf(
    lambda text: 0.0 if not text
    else sum(1 for ch in text if ch.isupper()) / float(len(text)),
    DoubleType())

# Character length of a string (0 for a missing value).
strLen = udf(lambda text: 0 if text is None else len(text), IntegerType())

# Binary label: 0 when the second entry of `helpful` is 0, else 1.
# NOTE(review): `helpful` is presumably [helpful votes, total votes] -- confirm
# against the dataset description. A missing `helpful` value is treated as 0
# instead of raising, matching the None handling of the other UDFs.
wasReviewed = udf(lambda votes: 0 if votes is None or votes[1] == 0 else 1,
                  IntegerType())

df = df.withColumn('reviewLen', countWords(df.reviewText))
df = df.withColumn('reviewWordAvg', avgWordLength(df.reviewText))
df = df.withColumn('pctUpper', pctUpper(df.reviewText))
df = df.withColumn('nameLen', strLen(df.reviewerName))
df = df.withColumn('wasReviewed', wasReviewed(df.helpful))


In [65]:
#See if we can use ML to predict whether anyone will react to the review

#Get our dataset into the proper format
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

def _rowToLabeledPoint(row):
    """Turn one DataFrame row into an MLlib LabeledPoint.

    The label is the row's 'wasReviewed' value; the feature vector holds, in
    order: reviewLen, reviewWordAvg, pctUpper, nameLen, overall.
    """
    features = Vectors.dense(row['reviewLen'],
                             row['reviewWordAvg'],
                             row['pctUpper'],
                             row['nameLen'],
                             row['overall'])
    return LabeledPoint(row['wasReviewed'], features)

dataset = df.rdd.map(_rowToLabeledPoint)

# Peek at a couple of converted points to confirm the label/feature layout.
print("Data Point Examples:")
print(dataset.take(2))

Data Point Examples:
[LabeledPoint(0.0, [51.0,4.2745098039215685,0.0037313432835820895,48.0,5.0]), LabeledPoint(1.0, [104.0,4.240384615384615,0.016544117647058824,4.0,5.0])]


In [66]:
def _showCount(caption, rdd):
    """Print `caption` immediately followed by the element count of `rdd`."""
    print(caption + str(rdd.count()))

_showCount("Dataset Size: ", dataset)

# Seeded random split into a training portion and a held-out test portion.
TRAINING_DATA_RATIO = 0.67
RANDOM_SEED = 0xdeadface
splits = [TRAINING_DATA_RATIO, 1.0 - TRAINING_DATA_RATIO]
training_data, test_data = dataset.randomSplit(weights=splits, seed=RANDOM_SEED)

_showCount("Training Data Size: ", training_data)
_showCount("Test Data Size: ", test_data)

Dataset Size: 10261
Training Data Size: 6915
Test Data Size: 3346


In [67]:
#Train Model

from pyspark.mllib.tree import RandomForest
# Import only the clock function that is used; a wildcard import would dump
# every public name of the time module into the notebook namespace.
from time import time

# Random forest hyperparameters.
RF_NUM_TREES = 3
RF_MAX_DEPTH = 4
RF_MAX_BINS = 8

start_time = time()

# Train a two-class forest on the training split. The empty
# categoricalFeaturesInfo map declares no feature as categorical.
model = RandomForest.trainClassifier(
    training_data,
    numClasses=2,
    categoricalFeaturesInfo={},
    numTrees=RF_NUM_TREES,
    featureSubsetStrategy="auto",
    impurity="gini",
    maxDepth=RF_MAX_DEPTH,
    maxBins=RF_MAX_BINS,
    seed=RANDOM_SEED)

end_time = time()
elapsed_time = end_time - start_time
print("Time to train model: %.3f seconds" % elapsed_time)


Time to train model: 1.027 seconds


In [68]:
# Baseline: error rate of always guessing the more common class.
# NOTE(review): the class counts come from the full dataset, while the model
# error further down is measured on the test split only -- confirm this
# comparison is the intended one.
positives = dataset.filter(lambda point: point.label == 1).count()
negatives = dataset.count() - positives
print("Postive Examples: ", positives, " Negative Examples: ", negatives, sep="")
print("Most Common Class Baseline Error: ", min(positives, negatives) / dataset.count(), sep="")


def _isMisclassified(pair):
    """True when the (label, prediction) pair disagrees."""
    actual, predicted = pair
    return actual != predicted


# Score the held-out points: predict from the features alone, then pair each
# true label with the prediction made for it.
predictions = model.predict(test_data.map(lambda x: x.features))
labels_and_predictions = test_data.map(lambda lp: lp.label).zip(predictions)
test_err = labels_and_predictions.filter(_isMisclassified).count() / float(test_data.count())
print('Test Error: ', test_err, sep="")
print()
print()

# Dump the structure of the learned trees.
print('Learned classification forest model:')
print(model.toDebugString())

Postive Examples: 3465 Negative Examples: 6796
Most Common Class Baseline Error: 0.3376863853425592
Test Error: 0.28302450687387926

Learned classification forest model:
TreeEnsembleModel classifier with 3 trees

  Tree 0:
    If (feature 0 <= 103.0)
     If (feature 0 <= 71.0)
      If (feature 4 <= 2.0)
       If (feature 4 <= 1.0)
        Predict: 1.0
       Else (feature 4 > 1.0)
        Predict: 0.0
      Else (feature 4 > 2.0)
       If (feature 4 <= 3.0)
        Predict: 0.0
       Else (feature 4 > 3.0)
        Predict: 0.0
     Else (feature 0 > 71.0)
      If (feature 2 <= 0.031598513011152414)
       If (feature 1 <= 4.579710144927536)
        Predict: 0.0
       Else (feature 1 > 4.579710144927536)
        Predict: 0.0
      Else (feature 2 > 0.031598513011152414)
       If (feature 2 <= 0.0398406374501992)
        Predict: 1.0
       Else (feature 2 > 0.0398406374501992)
        Predict: 0.0
    Else (feature 0 > 103.0)
     If (feature 0 <= 168.0)
      If (feature 1 <= 4

In [None]:
#####################################################
# Challenge! 
#
# Can you improve the performance of the classifier?
# Perhaps consider using some of the features you 
# came up with in the last lab exercise. Alternatively,
# you could make up some new features that you feel are
# particularly suited to this problem. Also, you could
# try adjusting the parameters of the learning algorithm,
# or even applying a different algorithm, such as SVMs.
#
#####################################################