In [3]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName('tree')
    .getOrCreate()
)
# Read the CSV, taking column names from its first line and letting Spark infer column types.
df = spark.read.csv('College.csv', header=True, inferSchema=True)
df.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [5]:
import pandas as pd

# Pull the first five rows into pandas and flip them so each column reads as a row.
pd.DataFrame(data=df.take(5), columns=df.columns).T

Unnamed: 0,0,1,2,3,4
School,Abilene Christian University,Adelphi University,Adrian College,Agnes Scott College,Alaska Pacific University
Private,Yes,Yes,Yes,Yes,Yes
Apps,1660,2186,1428,417,193
Accept,1232,1924,1097,349,146
Enroll,721,512,336,137,55
Top10perc,23,16,22,60,16
Top25perc,52,29,50,89,44
F_Undergrad,2885,2683,1036,510,249
P_Undergrad,537,1227,99,63,869
Outstate,7440,12280,11250,12960,7560


In [6]:
# List the column names so the feature columns can be picked out for the assembler.
df.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [8]:
from pyspark.ml.feature import VectorAssembler

# Pack every numeric column into a single vector column named 'features'.
# The two string columns, 'School' and 'Private', are deliberately left out.
assembler = VectorAssembler(
    inputCols=[
        'Apps', 'Accept', 'Enroll',
        'Top10perc', 'Top25perc',
        'F_Undergrad', 'P_Undergrad',
        'Outstate', 'Room_Board', 'Books', 'Personal',
        'PhD', 'Terminal', 'S_F_Ratio',
        'perc_alumni', 'Expend', 'Grad_Rate',
    ],
    outputCol='features',
)

In [9]:
# Apply the assembler: `output` is `df` plus the assembled 'features' vector column.
output = assembler.transform(df)

In [11]:
from pyspark.ml.feature import StringIndexer

# Turn the string column 'Private' into a numeric label column, 'PrivateIndex'.
# NOTE(review): the sample rows shown in this notebook pair Private == 'Yes'
# with PrivateIndex 0.0, so 1.0 presumably stands for 'No' -- confirm the
# mapping before interpreting any per-class results.
indexer = StringIndexer(
    inputCol='Private',
    outputCol='PrivateIndex',
)
outputFixed = (
    indexer
    .fit(output)
    .transform(output)
)
outputFixed.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)



In [12]:
# Keep only the two columns the classifiers use: the feature vector and the numeric label.
final_df = outputFixed.select(['features', 'PrivateIndex'])
final_df.show(n=3)

+--------------------+------------+
|            features|PrivateIndex|
+--------------------+------------+
|[1660.0,1232.0,72...|         0.0|
|[2186.0,1924.0,51...|         0.0|
|[1428.0,1097.0,33...|         0.0|
+--------------------+------------+
only showing top 3 rows



In [13]:
# Hold out roughly 30% of the rows for evaluation and train on the rest.
# The split is random, so a fixed seed is passed: without it every run of the
# notebook trains and evaluates on different rows and the reported metrics
# cannot be reproduced.
train, test = final_df.randomSplit([0.7, 0.3], seed=42)

In [15]:
from pyspark.ml.classification import (DecisionTreeClassifier, RandomForestClassifier, 
                                      GBTClassifier)
from pyspark.ml import Pipeline

# Three tree-based classifiers, all reading the 'features' vector and
# predicting the 'PrivateIndex' label.
dt = DecisionTreeClassifier(labelCol='PrivateIndex', featuresCol='features')
# The ensemble models take a random seed for training; pin it so that
# re-running the notebook on the same training data builds the same models.
rf = RandomForestClassifier(labelCol='PrivateIndex', featuresCol='features', seed=42)
gb = GBTClassifier(labelCol='PrivateIndex', featuresCol='features', seed=42)

In [18]:
# Fit each classifier on the training split (decision tree, random forest, GBT, in that order).
dt_model, rf_model, gb_model = [
    estimator.fit(train) for estimator in (dt, rf, gb)
]

In [19]:
# Score the held-out rows with every fitted model.
dt_predictions, rf_predictions, gb_predictions = [
    fitted.transform(test) for fitted in (dt_model, rf_model, gb_model)
]

In [20]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve is this evaluator's default metric; it is spelled
# out here so readers know what the printed number means.
binary_evaluator = BinaryClassificationEvaluator(
    labelCol='PrivateIndex',
    metricName='areaUnderROC',
)

print('Decision Tree:', binary_evaluator.evaluate(dt_predictions))

Decision Tree: 0.9042764345001805


In [21]:
# Same binary evaluator, applied to the random forest's predictions.
print('Random Forest:', binary_evaluator.evaluate(rf_predictions))

Random Forest: 0.9822717430530494


In [23]:
# Same binary evaluator, applied to the gradient-boosted trees' predictions.
print('Gradient-boosted Trees:', binary_evaluator.evaluate(gb_predictions))

Gradient-boosted Trees: 0.9343648502345726


In [30]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Second evaluator, configured to report plain accuracy against the 'PrivateIndex' label.
multi_evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='PrivateIndex',
)
print('Decision Tree Accu:', multi_evaluator.evaluate(dt_predictions))

Decision Tree Accu: 0.9047619047619048


In [31]:
# Sanity check: confirm which metric the multiclass evaluator is configured to report.
multi_evaluator.getMetricName()

'accuracy'

In [32]:
# Accuracy of the random forest on the held-out rows.
print('Random Forest Accu:', multi_evaluator.evaluate(rf_predictions))

Random Forest Accu: 0.935064935064935


In [33]:
# Accuracy of the gradient-boosted trees on the held-out rows.
print('Gradient-boosted Trees Accu:', multi_evaluator.evaluate(gb_predictions))

Gradient-boosted Trees Accu: 0.9090909090909091
