In [1]:
# Point findspark at the local Spark install before pyspark is imported.
import findspark

spark_home = '/home/ubuntu/spark-2.2.0-bin-hadoop2.7'
findspark.init(spark_home)

from pyspark.sql import SparkSession

# Build the session used by every later cell; the application name is 'tree'.
session_builder = SparkSession.builder.appName('tree')
spark = session_builder.getOrCreate()

In [2]:
# Load the college CSV: the first row is the header and column types are inferred.
folder = '/home/ubuntu/data/raw'
file = '/College.csv'
csv_path = folder + file
read_options = dict(header=True, inferSchema=True)
data = spark.read.csv(csv_path, **read_options)

# Show the inferred schema.
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [3]:
# Fetch the first row; the result is not displayed because only the
# last expression of a cell is rendered as its output.
data.head(1)
# List the column names of the loaded DataFrame.
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

# Format the Data

In [4]:
# Assemble the numeric columns into a single 'features' vector column.
from pyspark.ml.feature import VectorAssembler

feature_columns = [
    'Apps', 'Accept', 'Enroll',
    'Top10perc', 'Top25perc',
    'F_Undergrad', 'P_Undergrad',
    'Outstate', 'Room_Board', 'Books', 'Personal',
    'PhD', 'Terminal',
    'S_F_Ratio', 'perc_alumni',
    'Expend', 'Grad_Rate',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
output = assembler.transform(data)

In [5]:
# Encode the string column 'Private' as the numeric column 'PrivateIndex'.
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol='Private', outputCol='PrivateIndex')
# Fit the indexer on the assembled data and apply it to the same data in one chain.
output_fixed = indexer.fit(output).transform(output)

# Confirm that 'features' and 'PrivateIndex' are now part of the schema.
output_fixed.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = true)



In [6]:
# Keep only the two columns the classifiers use: the feature vector and the label.
model_columns = ['features', 'PrivateIndex']
final_data = output_fixed.select(model_columns)

# Split Data

In [7]:
# 70/30 train/test split. The seed is fixed so that re-running the notebook
# yields the same split, which keeps the metrics reported below repeatable.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

# Modeling

In [8]:
# Import and instantiate the three tree-based classifiers.
from pyspark.ml.classification import (
    DecisionTreeClassifier,
    RandomForestClassifier,
    GBTClassifier,
)
from pyspark.ml import Pipeline

# All three models are trained against the same label column.
label_column = 'PrivateIndex'
dt_clr = DecisionTreeClassifier(labelCol=label_column)
rf_clf = RandomForestClassifier(labelCol=label_column, numTrees=100)
gbt_clf = GBTClassifier(labelCol=label_column)

In [9]:
# Fit each classifier on the training split (decision tree, random forest, GBT -- in that order).
dt_fitted, rf_fitted, gbt_fitted = [
    classifier.fit(train_data)
    for classifier in (dt_clr, rf_clf, gbt_clf)
]

# Model Evaluation

In [10]:
# Generate predictions on the test split with each fitted model.
dt_preds, rf_preds, gbt_preds = [
    fitted_model.transform(test_data)
    for fitted_model in (dt_fitted, rf_fitted, gbt_fitted)
]

In [11]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy evaluator that scores predictions against the 'PrivateIndex' label.
acc_evaluator = MulticlassClassificationEvaluator(
    labelCol='PrivateIndex',
    metricName='accuracy',
)

In [12]:
# Accuracy of the decision tree, random forest and GBT models, in that order.
accuracies = [
    acc_evaluator.evaluate(predictions)
    for predictions in (dt_preds, rf_preds, gbt_preds)
]
print(*accuracies)


0.9210526315789473 0.9517543859649122 0.9254385964912281


In [13]:
# Inspect the columns present after the fitted GBT model transformed the test data.
gbt_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [18]:
# BinaryClassificationEvaluator only supports 'areaUnderROC' and 'areaUnderPR';
# 'accuracy' is not a valid metric for it, so this evaluator reports ROC AUC.
# The variable name is kept so existing references continue to work.
# It scores the 'rawPrediction' vector rather than the hard 0/1 'prediction'
# column, so the ROC curve is built from more than a single threshold.
acc_evaluator2 = BinaryClassificationEvaluator(metricName='areaUnderROC',
                                               labelCol='PrivateIndex',
                                               rawPredictionCol='rawPrediction')

In [20]:
# Accuracy of the GBT predictions, computed with the multiclass accuracy evaluator.
gbt_accuracy = acc_evaluator.evaluate(gbt_preds)
print(gbt_accuracy)

0.9254385964912281


In [16]:
# F1 score of the decision tree, random forest and GBT models, in that order.
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol='PrivateIndex',
    metricName='f1',
)
f1_scores = [
    f1_evaluator.evaluate(predictions)
    for predictions in (dt_preds, rf_preds, gbt_preds)
]
print(*f1_scores)


0.9198559944361486 0.950888629322519 0.9241006089529837
