In [None]:
#This company has four preservatives which they add to their products, A,B,C, and D. 
#Find out which one is spoiling the food
#1. IMport Data
#2. Transform any features to numeric if need be
#3. Transform data to Spark Required
#4. Create Random Forest Model (Instantiate, and train)
    #label column is whether or not it has gone bad
#5. Determine feature importance

In [3]:
import findspark
findspark.init('/home/ubuntu/spark-2.2.0-bin-hadoop2.7')
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('dtproj').getOrCreate()

# 1. Import Data

In [4]:
folder = '/home/ubuntu/data/raw'
file = '/dog_food.csv'
data = spark.read.csv(folder+file, header=True, inferSchema=True)

In [6]:
data.describe().show()
data.printSchema()

+-------+------------------+------------------+------------------+------------------+-------------------+
|summary|                 A|                 B|                 C|                 D|            Spoiled|
+-------+------------------+------------------+------------------+------------------+-------------------+
|  count|               490|               490|               490|               490|                490|
|   mean|  5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|
| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|
|    min|                 1|                 1|               5.0|                 1|                0.0|
|    max|                10|                10|              14.0|                10|                1.0|
+-------+------------------+------------------+------------------+------------------+-------------------+

root
 |-- A: integer (nullable = true)
 |-- B

In [8]:
import pandas as pd
import numpy as np
#Feature Correlations

from pyspark.sql.functions import corr, covar_pop
important_columns = ['A','B', 'C',
                  'D', 'Spoiled']


corr_df = pd.DataFrame(columns=important_columns, index=important_columns)
for feature in important_columns:
    for feature2 in important_columns:
        #new_data.select(corr(feature,feature2)).show()
        corr_df[feature2][feature] = data.select(corr(feature,feature2)).head()[0]
corr_df.head()


Unnamed: 0,A,B,C,D,Spoiled
A,1.0,0.00726671,0.0575878,-0.045834,0.0599725
B,0.00726671,1.0,-0.117222,-0.0364361,-0.0864745
C,0.0575878,-0.117222,1.0,-0.0344772,0.85862
D,-0.045834,-0.0364361,-0.0344772,1.0,-0.0160666
Spoiled,0.0599725,-0.0864745,0.85862,-0.0160666,1.0


# Transform to Spark Format

In [16]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['A','B','C','D'], outputCol='features')
output = assembler.transform(data)
data_final = output.select('features','Spoiled')

In [17]:
data_final.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Spoiled: double (nullable = true)



# Modeling

In [23]:
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier, RandomForestClassifier
dt_clf = DecisionTreeClassifier(labelCol='Spoiled')
gbt_clf = GBTClassifier(labelCol='Spoiled')
rf_clf = RandomForestClassifier(labelCol='Spoiled')

In [19]:
train_data, test_data = data_final.randomSplit([0.7,0.3])

In [25]:
rf_fitted = rf_clf.fit(data_final)

In [26]:
rf_preds = rf_fitted.transform(test_data)

# Model Evaluation

In [29]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
acc_evaluator = MulticlassClassificationEvaluator(labelCol='Spoiled', metricName='accuracy')
acc_evaluator.evaluate(rf_preds)

0.9779411764705882

In [30]:
rf_fitted.featureImportances
#Feature 3 (position 2) has a 90% explanation of the variance

SparseVector(4, {0: 0.032, 1: 0.0338, 2: 0.9066, 3: 0.0277})