In [None]:
import pyspark
from pyspark import SparkContext
from pyspark import SparkFiles
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.ml.linalg import DenseVector
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
from time import *

In [None]:
# S3 locations of the lab data. The X_* files hold the feature rows and the
# y_* files the labels; each comes as a train part and a test part.
url_x_train, url_x_test = "s3://lab10data/X_train.txt", "s3://lab10data/X_test.txt"
url_y_train, url_y_test = "s3://lab10data/y_train.txt", "s3://lab10data/y_test.txt"

In [None]:
# Load and parse the data
def parsePoint(line):
    """Parse one text line of whitespace-separated numbers into a list of floats.

    The line is split on arbitrary whitespace rather than on a single space,
    so leading/trailing blanks, tabs, repeated separators and a trailing
    newline do not produce empty tokens that ``float`` would reject.

    Args:
        line: one row of a feature file, e.g. ``"0.1 -0.5 2"``.

    Returns:
        A list of floats, one per token; an empty list for a blank line.

    Raises:
        ValueError: if any token is not a valid float literal.
    """
    return [float(token) for token in line.split()]

def parsePoint_2(line):
    """Convert a one-label text line into an integer class id shifted down by one.

    Args:
        line: text holding a single integer label, e.g. ``"3"``.

    Returns:
        ``int(line) - 1`` (so a stored ``1`` becomes class ``0``).

    Raises:
        ValueError: if ``line`` is not an integer literal.
    """
    return int(line) - 1

# Every Spark call below goes through a SparkContext named `sc`, which this
# notebook never creates itself. getOrCreate() reuses the context when the
# environment already provides one and creates one otherwise, so the notebook
# also runs from a fresh kernel.
sc = SparkContext.getOrCreate()

# Get Training Data: one RDD of float feature rows, one RDD of integer labels
x_train_str = sc.textFile(url_x_train)
train_x = x_train_str.map(parsePoint)
train_y_str = sc.textFile(url_y_train)
train_y = train_y_str.map(parsePoint_2)
# Get Testing Data: parsed exactly like the training files
test_x_str = sc.textFile(url_x_test)
test_x = test_x_str.map(parsePoint)
test_y_str = sc.textFile(url_y_test)
test_y = test_y_str.map(parsePoint_2)

In [None]:
# Peek at three parsed test rows to sanity-check the parsing step.
test_x.take(3)

In [None]:
# Pair feature rows with their labels as MLlib LabeledPoint records
def formatData(x, y):
    """Build a list of ``LabeledPoint(label, features)`` from two parallel sequences.

    Args:
        x: sequence of feature rows.
        y: sequence of labels, aligned by position with ``x``.

    Returns:
        A list of LabeledPoint objects in input order.

    Raises:
        AssertionError: if ``x`` and ``y`` differ in length.
    """
    assert len(x) == len(y)
    return [LabeledPoint(label, features) for features, label in zip(x, y)]

# Bring features and labels back to the driver, pair them by position with
# formatData, then redistribute the resulting LabeledPoint lists as RDDs.
# NOTE(review): collect() materialises both full datasets in driver memory —
# presumably fine for a small lab dataset; confirm the expected data size
# before reusing this on larger inputs.
parsedData_train = formatData(train_x.collect(),train_y.collect())
train_data = sc.parallelize(parsedData_train)
parsedData_test = formatData(test_x.collect(),test_y.collect())
test_data = sc.parallelize(parsedData_test)

In [None]:
# Train a random-forest classifier (12 trees, depth <= 4, Gini impurity) on the
# LabeledPoint training RDD.
#   numClasses=12              -> NOTE(review): presumably the label files hold
#                                 1..12, shifted to 0..11 by parsePoint_2 — confirm.
#   categoricalFeaturesInfo={} -> no feature is declared categorical.
#   seed=42                    -> fixed seed so repeated runs build the same forest.
model = RandomForest.trainClassifier(train_data, numClasses=12, categoricalFeaturesInfo={},
                                     numTrees=12, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32, seed=42)

In [None]:
# Show the trained model's printed summary, then its tree count and total node count.
print(model)
print(model.numTrees())
print(model.totalNumNodes())

In [None]:
# Predict a label for every test and training feature vector. Only the
# predictions are produced here; accuracy and other metrics are computed in
# later cells.
predictions_test = model.predict(test_data.map(lambda x: x.features))
predictions_train = model.predict(train_data.map(lambda x: x.features))

In [None]:
# accuracy
def accuracy(predcitions,y,length):
    """Return the percentage of positions where the prediction equals the label.

    Args:
        predcitions: iterable of predicted labels. The misspelt parameter name
            is kept so existing keyword callers keep working.
        y: iterable of true labels, aligned by position with the predictions.
        length: number of samples, used as the denominator.

    Returns:
        The share of matching positions as a float percentage, or 0.0 when
        ``length`` is zero (instead of raising ZeroDivisionError).
    """
    if not length:
        return 0.0
    # An explicit counter is used instead of the built-in sum(), which the
    # notebook's wildcard import from pyspark.sql.functions rebinds.
    matches = 0
    for predicted, actual in zip(predcitions, y):
        if predicted == actual:
            matches += 1
    # float() forces true division, so Python 2 does not truncate the ratio to 0.
    return float(matches) / length * 100

# Collect predictions and labels to the driver and compare them position by
# position; count() on the prediction RDD supplies the denominator.
# NOTE(review): this assumes the collected predictions come back in the same
# order as the label RDDs — confirm.
train_accuracy = accuracy(predictions_train.collect(),train_y.collect(),predictions_train.count())
test_accuracy = accuracy(predictions_test.collect(),test_y.collect(),predictions_test.count())

In [None]:
# Report both accuracies; accuracy() returns them as percentages.
print('train_accuracy: {}'.format(train_accuracy))
print('test_accuracy: {}'.format(test_accuracy))

In [None]:
# Pair each test label with the model's prediction for the same row, giving an
# RDD of (label, prediction) tuples — label first, prediction second.
labelsAndPredictions = test_data.map(lambda lp: lp.label).zip(predictions_test)

In [None]:
metrics = MulticlassMetrics(labelsAndPredictions)

In [None]:
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)