In [1]:
def shape(data):
    num_rows = data.count()
    num_columns = len(data.columns)
    print(f"Shape: ({num_rows}, {num_columns})")

In [2]:
import warnings
warnings.filterwarnings("ignore") # Ignores all warnings

In [3]:
import findspark
findspark.init()

import pyspark

In [4]:
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder.appName('treecode').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/09 20:53:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
df = spark.read.csv('college.csv', inferSchema=True, header=True)

In [7]:
df.printSchema()

root
 |-- private: string (nullable = true)
 |-- apps: integer (nullable = true)
 |-- accept: integer (nullable = true)
 |-- enroll: integer (nullable = true)
 |-- top10perc: integer (nullable = true)
 |-- top25perc: integer (nullable = true)
 |-- f_undergrad: integer (nullable = true)
 |-- p_undergrad: integer (nullable = true)
 |-- outstate: integer (nullable = true)
 |-- room_board: integer (nullable = true)
 |-- books: integer (nullable = true)
 |-- personal: integer (nullable = true)
 |-- phd: integer (nullable = true)
 |-- terminal: integer (nullable = true)
 |-- s_f_ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- expend: integer (nullable = true)
 |-- grad_rate: integer (nullable = true)



In [8]:
shape(df)

Shape: (777, 18)


In [9]:
df.show(3)

+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|private|apps|accept|enroll|top10perc|top25perc|f_undergrad|p_undergrad|outstate|room_board|books|personal|phd|terminal|s_f_ratio|perc_alumni|expend|grad_rate|
+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|
|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|
|    Yes|1428|  1097|   336|       22|       50|       1036|         99|   11250|      3750|  400|    1165| 53|      66|     12.9|         30|  8735|       54|
+-------+----+------+------+---------+--

In [10]:
df.columns

['private',
 'apps',
 'accept',
 'enroll',
 'top10perc',
 'top25perc',
 'f_undergrad',
 'p_undergrad',
 'outstate',
 'room_board',
 'books',
 'personal',
 'phd',
 'terminal',
 's_f_ratio',
 'perc_alumni',
 'expend',
 'grad_rate']

In [11]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.linalg import Vectors

In [12]:
assembler = VectorAssembler(
    inputCols=['apps',
               'accept',
               'enroll',
               'top10perc',
               'top25perc',
               'f_undergrad',
               'p_undergrad',
               'outstate',
               'room_board',
               'books',
               'personal',
               'phd',
               'terminal',
               's_f_ratio',
               'perc_alumni',
               'expend',
               'grad_rate'],
    outputCol="features")

In [13]:
output = assembler.transform(df)

### Deal with Private column "yes" or "no"

In [14]:
indexer = StringIndexer(inputCol="private", outputCol="privateIndex")
output_fixed = indexer.fit(output).transform(output)

In [15]:
final_df = output_fixed.select("features", "privateIndex")

In [16]:
train_df, valid_df = final_df.randomSplit([.7, .3])

In [17]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier

In [18]:
dtc = DecisionTreeClassifier(labelCol='privateIndex', featuresCol='features')
rfc = RandomForestClassifier(labelCol='privateIndex', featuresCol='features')
gbt = GBTClassifier(labelCol='privateIndex', featuresCol='features')

In [19]:
dtc_model = dtc.fit(train_df)
rfc_model = rfc.fit(train_df)
gbt_model = gbt.fit(train_df)

In [20]:
dtc_predictions = dtc_model.transform(valid_df)
rfc_predictions = rfc_model.transform(valid_df)
gbt_predictions = gbt_model.transform(valid_df)

In [21]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [22]:
acc_evaluator = MulticlassClassificationEvaluator(labelCol='privateIndex',
                                                  predictionCol='prediction',
                                                  metricName='accuracy')

In [23]:
dtc_acc = acc_evaluator.evaluate(dtc_predictions)
rfc_acc = acc_evaluator.evaluate(rfc_predictions)
gbt_acc = acc_evaluator.evaluate(gbt_predictions)

25/07/09 20:53:23 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/07/09 20:53:23 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [24]:
print("Results: ")
print('-'*80)
print(' A single decision tree - accuracy: {0:2.2f}%'.format(dtc_acc*100))
print('-'*80)
print(' A random forest assemble - accuracy: {0:2.2f}%'.format(rfc_acc*100))
print('-'*80)
print(' An ensemble using GBT - accuracy: {0:2.2f}%'.format(gbt_acc*100))

Results: 
--------------------------------------------------------------------------------
 A single decision tree - accuracy: 86.31%
--------------------------------------------------------------------------------
 A random forest assemble - accuracy: 90.04%
--------------------------------------------------------------------------------
 An ensemble using GBT - accuracy: 86.31%
