In [0]:
from pyspark.sql import SparkSession
# Obtain the Spark session used by the rest of the notebook
# (getOrCreate returns an already-active session if one exists).
spark = (
    SparkSession.builder
    .appName('Project')
    .getOrCreate()
)

In [0]:
# Load the dog-food CSV. The first row is treated as the header and column
# types are inferred from the data rather than read as strings.
df = spark.read.csv(
    '/FileStore/tables/dog_food.csv',
    header=True,
    inferSchema=True,
)

In [0]:
# Show the column names and the types Spark inferred for them.
df.printSchema()

In [0]:
# Peek at the first record: print each of its field values on its own line.
first_row = df.head(1)[0]
for value in first_row:
    print(value)

In [0]:
# Display Spark's default summary statistics for the DataFrame.
df.summary().show()

In [0]:
from pyspark.sql import functions as F

from pyspark.sql.functions import isnan, when, count, col

# Count missing values per column.
# NaN and null are different things in Spark: isnan() is False for a null, so
# checking isnan() alone never reports missing CSV fields (which load as null).
# Both conditions are tested here; when(...) yields the literal 1 for a missing
# value and null otherwise, and count() only counts the non-null results.
df.select([
    count(when(isnan(c) | col(c).isNull(), 1)).alias(c)
    for c in df.columns
]).show()

In [0]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer

In [0]:
# Pack the four input columns into a single vector column named 'features',
# which is the column the classifiers below are configured to read.
assembler = VectorAssembler(
    inputCols=['A', 'B', 'C', 'D'],
    outputCol='features',
)

In [0]:
# Run the assembler over df; `output` is df plus the assembled 'features' column.
output = assembler.transform(df)

In [0]:
# Keep only what the models need: the feature vector and the 'Spoiled' label.
final_df = output.select(['features', 'Spoiled'])

In [0]:
# 70/30 train/test split.
# A fixed seed keeps the split (and therefore every metric computed below)
# repeatable when the notebook is re-run on the same data; without it each
# run samples a different split.
train_df, test_df = final_df.randomSplit([0.7, 0.3], seed=42)

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier

In [0]:
# Three tree-based classifiers, all predicting 'Spoiled' from 'features' with
# otherwise default hyper-parameters.
# Each learner is given an explicit seed so that its random choices do not
# change from one run of the notebook to the next.
dtc = DecisionTreeClassifier(labelCol='Spoiled', featuresCol='features', seed=42)
rfc = RandomForestClassifier(labelCol='Spoiled', featuresCol='features', seed=42)
gbt = GBTClassifier(labelCol='Spoiled', featuresCol='features', seed=42)

In [0]:
# Fit every estimator on the training split, in the order
# decision tree -> random forest -> gradient-boosted trees.
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train_df) for estimator in (dtc, rfc, gbt)
)

In [0]:
# Score the held-out test split with each fitted model.
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_df)
    for fitted in (dtc_model, rfc_model, gbt_model)
)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
# Evaluator for the binary 'Spoiled' label. The metric is spelled out for the
# reader: 'areaUnderROC' is the evaluator's default, so this is the same
# configuration as passing labelCol alone.
binary_eval = BinaryClassificationEvaluator(
    labelCol='Spoiled',
    metricName='areaUnderROC',
)

In [0]:
# binary_eval is a BinaryClassificationEvaluator with no accuracy metric
# selected, so the number it reports is area under the ROC curve (its default)
# and not accuracy. The heading is labelled accordingly.
print('DTC AUC (area under ROC)')
print(binary_eval.evaluate(dtc_preds))

In [0]:
# binary_eval is a BinaryClassificationEvaluator with no accuracy metric
# selected, so the number it reports is area under the ROC curve (its default)
# and not accuracy. The heading is labelled accordingly.
print('RFC AUC (area under ROC)')
print(binary_eval.evaluate(rfc_preds))

In [0]:
# Evaluator for the GBT predictions.
# Computing ROC AUC from the hard 0/1 'prediction' column collapses the curve
# to a single threshold, so the result is not comparable to the other models,
# which are scored on their continuous 'rawPrediction' output. Use
# 'rawPrediction' whenever the GBT output provides it and fall back to
# 'prediction' only when that column is absent (older Spark releases).
gbt_score_col = 'rawPrediction' if 'rawPrediction' in gbt_preds.columns else 'prediction'
binary_eval_gbt = BinaryClassificationEvaluator(
    labelCol='Spoiled',
    rawPredictionCol=gbt_score_col,
)

In [0]:
# binary_eval_gbt is a BinaryClassificationEvaluator with no accuracy metric
# selected, so the number it reports is area under the ROC curve (its default)
# and not accuracy. The heading is labelled accordingly.
print('GBT AUC (area under ROC)')
print(binary_eval_gbt.evaluate(gbt_preds))

In [0]:
# Feature importances of the fitted decision tree.
# NOTE(review): vector positions presumably follow the assembler's inputCols
# order (A, B, C, D) - confirm.
dtc_model.featureImportances

In [0]:
# Feature importances of the fitted random forest.
# NOTE(review): vector positions presumably follow the assembler's inputCols
# order (A, B, C, D) - confirm.
rfc_model.featureImportances

In [0]:
# Feature importances of the fitted gradient-boosted trees model.
# NOTE(review): vector positions presumably follow the assembler's inputCols
# order (A, B, C, D) - confirm.
gbt_model.featureImportances

In [0]:
# Retrain the random forest with 150 trees and re-score the test split.
# This deliberately rebinds rfc / rfc_model / rfc_preds, so the cells below
# report on the 150-tree forest, not the earlier default one.
# The seed keeps the forest's random sampling repeatable between runs.
rfc = RandomForestClassifier(
    numTrees=150,
    labelCol='Spoiled',
    featuresCol='features',
    seed=42,
)
rfc_model = rfc.fit(train_df)
rfc_preds = rfc_model.transform(test_df)


In [0]:
# binary_eval is a BinaryClassificationEvaluator with no accuracy metric
# selected, so the number it reports is area under the ROC curve (its default)
# and not accuracy. True accuracy needs a MulticlassClassificationEvaluator
# with metricName='accuracy'.
print('RFC AUC (area under ROC)')
print(binary_eval.evaluate(rfc_preds))

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Plain classification accuracy of the random-forest predictions on the test
# split, measured against the 'Spoiled' label.
acc_eval = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='Spoiled',
)
rfc_acc = acc_eval.evaluate(rfc_preds)

# Bare last expression so the notebook displays the value.
rfc_acc