In [0]:
from pyspark.sql import SparkSession
# Obtain the Spark session used by the first (LIBSVM sample data) experiment.
spark = (
    SparkSession.builder
    .appName('decisiontree')
    .getOrCreate()
)

In [0]:
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.classification import RandomForestClassifier,GBTClassifier,DecisionTreeClassifier

In [0]:
# Load the sample dataset stored in LIBSVM text format.
df = (
    spark.read
    .format('libsvm')
    .load('/FileStore/tables/sample_libsvm_data.txt')
)

In [0]:
# Preview the loaded rows to sanity-check the read.
df.show()

In [0]:
# Hold out roughly 30% of the rows for evaluation. A fixed seed makes the
# split repeatable so that metrics can be compared across re-runs
# (assuming the input is read with the same partitioning each time).
train_df,test_df = df.randomSplit([0.7,0.3], seed=42)

In [0]:
# Three tree-based classifiers to compare. Only the random forest overrides
# a setting (100 trees); the others use library defaults.
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier(
    numTrees=100,
)
gbt = GBTClassifier()

In [0]:
# Fit each estimator on the training split, in the same order as before
# (decision tree, random forest, gradient-boosted trees).
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train_df) for estimator in (dtc, rfc, gbt)
)

In [0]:
# Score the held-out split with every fitted model.
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_df)
    for fitted in (dtc_model, rfc_model, gbt_model)
)

In [0]:
# Inspect the columns the decision tree model appended to the test rows.
dtc_preds.show()

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# Evaluator configured to report plain classification accuracy.
acc_eval = MulticlassClassificationEvaluator(
    metricName='accuracy',
)

In [0]:
# Accuracy of the decision tree on the held-out split; the bare expression
# on the last line is what the notebook displays as the cell result.
print('DTC Accuracy:')
acc_eval.evaluate(dtc_preds)

In [0]:
# Accuracy of the random forest on the held-out split; the bare expression
# on the last line is what the notebook displays as the cell result.
print('RFC Accuracy:')
acc_eval.evaluate(rfc_preds)

In [0]:
# Accuracy of the gradient-boosted trees on the held-out split; the bare
# expression on the last line is what the notebook displays as the cell result.
print('GBT Accuracy:')
acc_eval.evaluate(gbt_preds)

In [0]:
# Per-feature importance scores reported by the fitted decision tree.
dtc_model.featureImportances

In [0]:
# Per-feature importance scores reported by the fitted random forest.
rfc_model.featureImportances

In [0]:
# Session for the second (College.csv) experiment.
# NOTE(review): a session was already created earlier in this notebook, so
# getOrCreate() presumably returns that one and the new app name may not
# take effect - confirm if the name matters.
spark = (
    SparkSession.builder
    .appName('collegedecisiontree')
    .getOrCreate()
)

In [0]:
# Load the college dataset; the first line is a header row and column types
# are inferred from the data. This rebinds `df` from the earlier experiment.
df = spark.read.csv(
    '/FileStore/tables/College.csv',
    header=True,
    inferSchema=True,
)

In [0]:
# Check which column types schema inference produced.
df.printSchema()

In [0]:
# Look at a single row to see representative values.
df.head(1)

In [0]:
from pyspark.ml.feature import VectorAssembler

In [0]:
# List the column names so the feature columns can be picked for assembly.
df.columns

In [0]:
# Pack the 17 selected input columns into a single 'features' vector column.
# The order of the names fixes the index of each feature in the vector.
assembler = VectorAssembler(
    inputCols=[
        'Apps', 'Accept', 'Enroll',
        'Top10perc', 'Top25perc',
        'F_Undergrad', 'P_Undergrad',
        'Outstate', 'Room_Board', 'Books', 'Personal',
        'PhD', 'Terminal',
        'S_F_Ratio', 'perc_alumni',
        'Expend', 'Grad_Rate',
    ],
    outputCol='features',
)

In [0]:
# Append the assembled 'features' vector column to the college data.
output = assembler.transform(df)

In [0]:
from pyspark.ml.feature import StringIndexer

In [0]:
# Encode the string column 'Private' as a numeric index column that the
# classifiers below use as their label.
indexer = StringIndexer(
    inputCol='Private',
    outputCol='Private_index',
)

In [0]:
# Fit the indexer on the assembled data, then apply it to the same data to
# add the 'Private_index' column.
output_fixed = (
    indexer
    .fit(output)
    .transform(output)
)

In [0]:
# Confirm that the 'features' and 'Private_index' columns are now present.
output_fixed.printSchema()

In [0]:
# Keep only the two columns the classifiers need: feature vector and label.
final_df = output_fixed.select(
    'features',
    'Private_index',
)

In [0]:
# Hold out roughly 30% of the college rows for evaluation. A fixed seed makes
# the split repeatable so that metrics can be compared across re-runs
# (assuming the input is read with the same partitioning each time).
train_df,test_df = final_df.randomSplit([0.7,0.3], seed=42)

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier

In [0]:
from pyspark.ml import Pipeline

In [0]:
# Same three classifier families as before, now pointed at the college
# label column ('Private_index') and the assembled 'features' vector.
dtc = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='Private_index',
)
rfc = RandomForestClassifier(
    featuresCol='features',
    labelCol='Private_index',
)
gbt = GBTClassifier(
    featuresCol='features',
    labelCol='Private_index',
)

In [0]:
# Fit each estimator on the college training split, in the same order as
# before (decision tree, random forest, gradient-boosted trees).
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train_df) for estimator in (dtc, rfc, gbt)
)

In [0]:
# Score the held-out college rows with every fitted model.
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_df)
    for fitted in (dtc_model, rfc_model, gbt_model)
)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
# Binary evaluator scored against 'Private_index'. No metricName is given,
# so the library default metric applies.
binary_eval = BinaryClassificationEvaluator(
    labelCol='Private_index',
)

In [0]:
# binary_eval is built without a metricName, so it reports the evaluator's
# default metric (areaUnderROC), not accuracy. Label the number accordingly.
print('DTC AUC (areaUnderROC)')
print(binary_eval.evaluate(dtc_preds))

In [0]:
# binary_eval is built without a metricName, so it reports the evaluator's
# default metric (areaUnderROC), not accuracy. Label the number accordingly.
print('RFC AUC (areaUnderROC)')
print(binary_eval.evaluate(rfc_preds))

In [0]:
# Show which output columns the gradient-boosted model produced.
gbt_preds.printSchema()

In [0]:
# Show the random forest output columns, for comparison with the GBT schema.
rfc_preds.printSchema()

In [0]:
# Evaluator variant for the GBT output: it scores the 'prediction' column
# in place of the default raw-prediction column.
binary_eval_gbt = BinaryClassificationEvaluator(
    rawPredictionCol='prediction',
    labelCol='Private_index',
)

In [0]:
# Neither evaluator sets a metricName, so both numbers are the default
# metric (areaUnderROC), not accuracy. They differ only in which column is
# scored, so each label names the column instead of calling one "correct".
print('GBT AUC (from prediction column)')
print(binary_eval_gbt.evaluate(gbt_preds))

print('GBT AUC (from rawPrediction column)')
print(binary_eval.evaluate(gbt_preds))

In [0]:
# Retrain the random forest with 150 trees and re-score the test split.
# This rebinds rfc, rfc_model and rfc_preds from the earlier cells.
rfc = RandomForestClassifier(
    featuresCol='features',
    labelCol='Private_index',
    numTrees=150,
)
rfc_model = rfc.fit(train_df)
rfc_preds = rfc_model.transform(test_df)


In [0]:
# binary_eval is built without a metricName, so it reports the evaluator's
# default metric (areaUnderROC), not accuracy. Label the number accordingly.
print('RFC AUC (areaUnderROC)')
print(binary_eval.evaluate(rfc_preds))

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# Accuracy evaluator for the college label column; rebinds the earlier
# acc_eval, which was built without an explicit label column.
acc_eval = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='Private_index',
)

In [0]:
# Accuracy of the most recently trained random forest on the test split.
rfc_acc = acc_eval.evaluate(rfc_preds)

In [0]:
# Display the accuracy value as the cell result.
rfc_acc