## Import

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("TreeConsulting").getOrCreate()

## Load data

- Predict why some batches of dog food are spoiled faster than it should be
- WHich of the chemical has the strongest effect?

In [3]:
import os
os.listdir()

['.ipynb_checkpoints',
 'College.csv',
 'dog_food.csv',
 'My_code_along.ipynb',
 'My_consulting_proj.ipynb',
 'My_doc_example.ipynb',
 'sample_libsvm_data.txt',
 'Tree Methods Code Along.ipynb',
 'Tree_Methods_Consulting_Project.ipynb',
 'Tree_Methods_Consulting_Project_SOLUTION.ipynb',
 'Tree_Methods_Doc_Example.ipynb']

In [4]:
df = spark.read.csv("dog_food.csv", inferSchema=True, header=True)

In [5]:
df.show(5)

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
+---+---+----+---+-------+
only showing top 5 rows



In [6]:
print(df.count(), len(df.columns))

490 5


In [7]:
df.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [10]:
df.groupby("Spoiled").count().show()

+-------+-----+
|Spoiled|count|
+-------+-----+
|    0.0|  350|
|    1.0|  140|
+-------+-----+



## Data preparation

In [8]:
from pyspark.ml.feature import VectorAssembler

In [11]:
VecAss = VectorAssembler(inputCols=["A", "B", "C", "D"], outputCol="features")

In [12]:
output = VecAss.transform(df)

In [14]:
output.show(5)

+---+---+----+---+-------+------------------+
|  A|  B|   C|  D|Spoiled|          features|
+---+---+----+---+-------+------------------+
|  4|  2|12.0|  3|    1.0|[4.0,2.0,12.0,3.0]|
|  5|  6|12.0|  7|    1.0|[5.0,6.0,12.0,7.0]|
|  6|  2|13.0|  6|    1.0|[6.0,2.0,13.0,6.0]|
|  4|  2|12.0|  1|    1.0|[4.0,2.0,12.0,1.0]|
|  4|  2|12.0|  3|    1.0|[4.0,2.0,12.0,3.0]|
+---+---+----+---+-------+------------------+
only showing top 5 rows



In [16]:
data = output.select("features", df["Spoiled"].alias("label"))

In [18]:
data.show(5)

+------------------+-----+
|          features|label|
+------------------+-----+
|[4.0,2.0,12.0,3.0]|  1.0|
|[5.0,6.0,12.0,7.0]|  1.0|
|[6.0,2.0,13.0,6.0]|  1.0|
|[4.0,2.0,12.0,1.0]|  1.0|
|[4.0,2.0,12.0,3.0]|  1.0|
+------------------+-----+
only showing top 5 rows



## Training

In [19]:
from pyspark.ml.classification import RandomForestClassifier

In [20]:
rfc = RandomForestClassifier(numTrees=4)

In [21]:
model = rfc.fit(data)

## Feature importances

In [23]:
print(model.featureImportances)

(4,[0,1,2,3],[0.012915068637268403,0.013053030945126071,0.9472315581224422,0.02680034229516335])
