## Tree Method 
* single decision tree
* random forest
* gradient boosting tree


### We will be using a college dataset to try to classify colleges as Private or Public based off these features:

* Private A factor with levels No and Yes indicating private or public university
* Apps Number of applications received
* Accept Number of applications accepted
* Enroll Number of new students enrolled
* Top10perc Pct. new students from top 10% of H.S. class
* Top25perc Pct. new students from top 25% of H.S. class
* F.Undergrad Number of fulltime undergraduates
* P.Undergrad Number of parttime undergraduates
* Outstate Out-of-state tuition
* Room.Board Room and board costs
* Books Estimated book costs
* Personal Estimated personal spending
* PhD Pct. of faculty with Ph.D.’s
* Terminal Pct. of faculty with terminal degree
* S.F.Ratio Student/faculty ratio
* perc.alumni Pct. alumni who donate
* Expend Instructional expenditure per student
* Grad.Rate Graduation rate

In [0]:
from pyspark.sql import SparkSession

In [0]:
spark = SparkSession.builder.appName('collage').getOrCreate()

In [0]:
data = spark.read.csv('dbfs:/FileStore/College.csv', inferSchema=True, header=True)

In [0]:
data.limit(5).display()

School,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F_Undergrad,P_Undergrad,Outstate,Room_Board,Books,Personal,PhD,Terminal,S_F_Ratio,perc_alumni,Expend,Grad_Rate
Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [0]:
print((data.count(), len(data.columns)))

(777, 19)


In [0]:
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [0]:
from pyspark.sql.functions import col, sum as _sum

In [0]:
# check missing values
missing_val = data.select([_sum(col(c).isNull().cast('int')).alias(c) for c in data.columns])
missing_val.show()

+------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|     0|      0|   0|     0|     0|        0|        0|          0|          0|       0|         0|    0|       0|  0|       0|        0|          0|     0|        0|
+------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+



In [0]:
# check duplicates
duplicates = data.exceptAll(data.dropDuplicates())
duplicates.show()

+------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
+------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+



In [0]:
data.head()

Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)

## Format for MLlib

In [0]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [0]:
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [0]:
assembler = VectorAssembler(inputCols = ['Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate'],  outputCol='features')

In [0]:
output = assembler.transform(data)

In [0]:
data.groupBy('private').count().show()

+-------+-----+
|private|count|
+-------+-----+
|     No|  212|
|    Yes|  565|
+-------+-----+



In [0]:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol= 'Private', outputCol='PrivateIndex')
outputIndex = indexer.fit(output).transform(output)

In [0]:
from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol='features', outputCol='scalerfeatures', withStd=True, withMean=False)
outputScaler = scaler.fit(outputIndex).transform(outputIndex)

In [0]:
outputScaler.limit(5).display()

School,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F_Undergrad,P_Undergrad,Outstate,Room_Board,Books,Personal,PhD,Terminal,S_F_Ratio,perc_alumni,Expend,Grad_Rate,features,PrivateIndex,scalerfeatures
Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60,"Map(vectorType -> dense, length -> 17, values -> List(1660.0, 1232.0, 721.0, 23.0, 52.0, 2885.0, 537.0, 7440.0, 3300.0, 450.0, 2200.0, 70.0, 78.0, 18.1, 12.0, 7041.0, 60.0))",0.0,"Map(vectorType -> dense, length -> 17, values -> List(0.42891823763594444, 0.5026286066580069, 0.7759561724207533, 1.3038279424073527, 2.6256290811760095, 0.5947937878021864, 0.3527251396145811, 1.8493585669814325, 3.009036915848813, 2.725532348715692, 3.2492877794997543, 4.287073544918401, 5.29806415542497, 4.57261332483753, 0.9683822009485828, 1.3483937637407082, 3.4928986668901585))"
Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56,"Map(vectorType -> dense, length -> 17, values -> List(2186.0, 1924.0, 512.0, 16.0, 29.0, 2683.0, 1227.0, 12280.0, 6450.0, 750.0, 1500.0, 29.0, 30.0, 12.2, 16.0, 10527.0, 56.0))",0.0,"Map(vectorType -> dense, length -> 17, values -> List(0.5648284743808281, 0.7849492201379912, 0.5510257424125183, 0.9070107425442453, 1.4642931414250822, 0.553147914271496, 0.8059473860467244, 3.0524359143188162, 5.881299426431771, 4.542553914526153, 2.2154234860225595, 1.776073325751909, 2.0377169828557578, 3.0820929592827544, 1.2911762679314438, 2.0159836885241353, 3.260038755764148))"
Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54,"Map(vectorType -> dense, length -> 17, values -> List(1428.0, 1097.0, 336.0, 22.0, 50.0, 1036.0, 99.0, 11250.0, 3750.0, 400.0, 1165.0, 53.0, 66.0, 12.9, 30.0, 8735.0, 54.0))",0.0,"Map(vectorType -> dense, length -> 17, values -> List(0.36897303815911364, 0.4475516083635012, 0.36161064345821514, 1.2471397709983374, 2.5246433472846244, 0.2135897276128475, 0.06502753970548143, 2.7964091234598274, 3.4193601316463784, 2.4226954210806153, 1.720645574144188, 3.245927112581075, 4.482977362282667, 3.2589343585858637, 2.420955502371457, 1.67280493200896, 3.1436088002011426))"
Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59,"Map(vectorType -> dense, length -> 17, values -> List(417.0, 349.0, 137.0, 60.0, 89.0, 510.0, 63.0, 12960.0, 5450.0, 450.0, 875.0, 92.0, 97.0, 7.7, 37.0, 19016.0, 59.0))",0.0,"Map(vectorType -> dense, length -> 17, values -> List(0.10774632836999327, 0.14238424003542566, 0.14744243498147463, 3.40129028454092, 4.493865158166631, 0.10514552228045582, 0.04138116163076091, 3.221463310225721, 4.969470057992736, 2.725532348715692, 1.2923303668464932, 5.63443951617847, 6.588618244566949, 1.9452553923341978, 2.9858451195914637, 3.6416781439132664, 3.434683689108656))"
Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15,"Map(vectorType -> dense, length -> 17, values -> List(193.0, 146.0, 55.0, 16.0, 44.0, 249.0, 869.0, 7560.0, 4120.0, 800.0, 1500.0, 76.0, 72.0, 11.9, 2.0, 10922.0, 15.0))",0.0,"Map(vectorType -> dense, length -> 17, values -> List(0.049868204737191134, 0.059564753711094966, 0.05919221842321974, 0.9070107425442453, 2.2216861456104695, 0.051335754995751964, 0.570797292970337, 1.879186930965004, 3.756736997968821, 4.845390842161231, 2.2154234860225595, 4.654536991625693, 4.890520758853818, 3.006303788152851, 0.16139703349143047, 2.0916285595193886, 0.8732246667225396))"


In [0]:
final_data = outputScaler.select('scalerfeatures', 'PrivateIndex')

## Train Test Split

In [0]:
train, test = final_data.randomSplit([0.7, 0.3])

## Model Classifications

In [0]:
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier

In [0]:
# create the model
rfc = RandomForestClassifier(featuresCol='scalerfeatures', labelCol='PrivateIndex')
gbt = GBTClassifier(featuresCol='scalerfeatures', labelCol='PrivateIndex')
dct = DecisionTreeClassifier(featuresCol='scalerfeatures', labelCol='PrivateIndex')

In [0]:
# train the model 
rfcModel = rfc.fit(train)
gbtModel = gbt.fit(train)
dctModel = dct.fit(train)

In [0]:
# prediction
rfcpredict = rfcModel.transform(test)
gbtpredict = gbtModel.transform(test)
dctpredict = dctModel.transform(test)

## Evaluation Classification

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
eval = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='PrivateIndex', metricName='accuracy')


In [0]:
acc_rfc = eval.evaluate(rfcpredict)
acc_gbt = eval.evaluate(gbtpredict)
acc_dct = eval.evaluate(dctpredict)

In [0]:
print('Here are the result')
print('-' *100 )
print('Random Forest had a accuracy result : {:.2f}%'.format(acc_rfc * 100))
print('-' *100 )
print('Gradient Boostin Trees had a accuracy result : {:.2f}%'.format(acc_gbt * 100) )
print('-' *100 )
print('Deciont tree had a accuracy result : {:.2f}%'.format(acc_rfc * 100) )

Here are the result
----------------------------------------------------------------------------------------------------
Random Forest had a accuracy result : 94.64%
----------------------------------------------------------------------------------------------------
Gradient Boostin Trees had a accuracy result : 92.41%
----------------------------------------------------------------------------------------------------
Deciont tree had a accuracy result : 94.64%
