In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Trees").getOrCreate()

In [2]:
data = spark.read.csv("College.csv", inferSchema=True, header=True)

In [3]:
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [4]:
data.head()

Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)

In [5]:
data.head(5)

[Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60),
 Row(School='Adelphi University', Private='Yes', Apps=2186, Accept=1924, Enroll=512, Top10perc=16, Top25perc=29, F_Undergrad=2683, P_Undergrad=1227, Outstate=12280, Room_Board=6450, Books=750, Personal=1500, PhD=29, Terminal=30, S_F_Ratio=12.2, perc_alumni=16, Expend=10527, Grad_Rate=56),
 Row(School='Adrian College', Private='Yes', Apps=1428, Accept=1097, Enroll=336, Top10perc=22, Top25perc=50, F_Undergrad=1036, P_Undergrad=99, Outstate=11250, Room_Board=3750, Books=400, Personal=1165, PhD=53, Terminal=66, S_F_Ratio=12.9, perc_alumni=30, Expend=8735, Grad_Rate=54),
 Row(School='Agnes Scott College', Private='Yes', Apps=417, Accept=349, Enroll=137, Top10perc=60, Top25perc=89, F_Undergrad=510, P

In [12]:
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [7]:
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [10]:
assembler = VectorAssembler(inputCols=['Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate'], outputCol="features")

In [11]:
output = assembler.transform(data)

In [13]:
private_indexer = StringIndexer(inputCol="Private", outputCol="Private_index")

In [14]:
final_data = private_indexer.fit(output).transform(output)

In [15]:
final_data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- Private_index: double (nullable = false)



In [16]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier

In [17]:
train_data, test_data = final_data.randomSplit([0.7, 0.3])

In [18]:
train_data.count()

562

In [19]:
test_data.count()

215

In [68]:
dct = DecisionTreeClassifier(labelCol = "Private_index")
rf = RandomForestClassifier(labelCol = "Private_index", numTrees=150)
gbt = GBTClassifier(labelCol = "Private_index")

In [69]:
dct_model = dct.fit(train_data)
rf_model = rf.fit(train_data)
gbt_model = gbt.fit(train_data)

In [70]:
dct_results = dct_model.transform(test_data)
rf_results = rf_model.transform(test_data)
gbt_results = gbt_model.transform(test_data)

In [71]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [72]:
my_eval = MulticlassClassificationEvaluator(labelCol="Private_index", metricName="accuracy")

In [73]:
print("Accuracy for each model:\n")
print("DCT : {} , RF : {} , GBT : {}".format(my_eval.evaluate(dct_results), 
                                             my_eval.evaluate(rf_results), 
                                             my_eval.evaluate(gbt_results)))

Accuracy for each model:

DCT : 0.8883720930232558 , RF : 0.9534883720930233 , GBT : 0.9023255813953488


In [74]:
print(rf_results.columns)
print(rf_model.featureImportances)

['School', 'Private', 'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F_Undergrad', 'P_Undergrad', 'Outstate', 'Room_Board', 'Books', 'Personal', 'PhD', 'Terminal', 'S_F_Ratio', 'perc_alumni', 'Expend', 'Grad_Rate', 'features', 'Private_index', 'rawPrediction', 'probability', 'prediction']
(17,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16],[0.03419743543212926,0.051300266878937816,0.08162656798756215,0.015321536982385054,0.010025974926363963,0.22501130439241854,0.08897466600675795,0.2531714344842951,0.05385170224965905,0.004347883808216207,0.007882273070745412,0.01622524123743248,0.013229420695686769,0.05143211900229788,0.02649886918807719,0.04818881529612339,0.018714488360912003])


In [75]:
evaluator = BinaryClassificationEvaluator(labelCol="Private_index")

In [76]:
print("RF : {}".format(evaluator.evaluate(rf_results)))

RF : 0.9743935309973046


In [77]:
evaluator_2 = BinaryClassificationEvaluator(labelCol="Private_index", rawPredictionCol="predictions")

In [78]:
print("DCT : {} , GBT : {}".format(evaluator.evaluate(dct_results), evaluator.evaluate(gbt_results)))

DCT : 0.9212151841868823 , GBT : 0.9204290206648695
