In [6]:
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
# Read the mushroom CSV into a Spark DataFrame, taking column names from the
# first row. No schema is supplied or inferred here, so the values come back
# as strings. `spark` is not created in this notebook; presumably the kernel
# provides the SparkSession.
df_sp = spark.read.csv(path='mushrooms.csv', header=True)

In [4]:
# Peek at the first row to see the raw single-letter codes held in each column.
df_sp.head()

Row(class=u'p', cap-shape=u'x', cap-surface=u's', cap-color=u'n', bruises=u't', odor=u'p', gill-attachment=u'f', gill-spacing=u'c', gill-size=u'n', gill-color=u'k', stalk-shape=u'e', stalk-root=u'e', stalk-surface-above-ring=u's', stalk-surface-below-ring=u's', stalk-color-above-ring=u'w', stalk-color-below-ring=u'w', veil-type=u'p', veil-color=u'w', ring-number=u'o', ring-type=u'p', spore-print-color=u'k', population=u's', habitat=u'u')

In [5]:
df_sp.columns # every column holds string codes; each must be converted to a numeric form before modelling

['class',
 'cap-shape',
 'cap-surface',
 'cap-color',
 'bruises',
 'odor',
 'gill-attachment',
 'gill-spacing',
 'gill-size',
 'gill-color',
 'stalk-shape',
 'stalk-root',
 'stalk-surface-above-ring',
 'stalk-surface-below-ring',
 'stalk-color-above-ring',
 'stalk-color-below-ring',
 'veil-type',
 'veil-color',
 'ring-number',
 'ring-type',
 'spore-print-color',
 'population',
 'habitat']

In [32]:
# Class-balance check: number of rows for each value of the target column.
num_class = df_sp.groupBy('class').count()
num_class.show()

+-----+-----+
|class|count|
+-----+-----+
|    e| 4208|
|    p| 3916|
+-----+-----+



In [8]:
# Encode the target column 'class' as a numeric 'label' column.
# In the output below 'e' maps to 0.0 and 'p' maps to 1.0.
# NOTE: this rebinds df_sp to the transformed frame, so the cell is meant to be
# run once; re-running it would try to add 'label' a second time.
stringIndexer = StringIndexer(inputCol='class', outputCol='label')
label_indexer_model = stringIndexer.fit(df_sp)
df_sp = label_indexer_model.transform(df_sp)

# Sanity-check the mapping on the first few rows.
df_sp.select(['class', 'label']).show(n=3)

+-----+-----+
|class|label|
+-----+-----+
|    p|  1.0|
|    e|  0.0|
|    e|  0.0|
+-----+-----+
only showing top 3 rows



In [33]:
# Same class/label check as before, on a larger sample of rows.
df_sp.select('class', 'label').show(10)

+-----+-----+
|class|label|
+-----+-----+
|    p|  1.0|
|    e|  0.0|
|    e|  0.0|
|    p|  1.0|
|    e|  0.0|
|    e|  0.0|
|    e|  0.0|
|    e|  0.0|
|    p|  1.0|
|    e|  0.0|
+-----+-----+
only showing top 10 rows



In [9]:
# Trial run on a single feature: map each 'cap-shape' code to a numeric index.
# Rebinds df_sp to the frame that carries the new 'indexed_cap-shape' column.
stringIndexer = StringIndexer(inputCol='cap-shape', outputCol='indexed_cap-shape')
cap_shape_indexer_model = stringIndexer.fit(df_sp)
df_sp = cap_shape_indexer_model.transform(df_sp)

In [11]:
# One-hot encode the numeric 'cap-shape' index into a sparse vector column.
encoder = OneHotEncoder(inputCol='indexed_cap-shape', outputCol='feature_cap-shape')
df_sp = encoder.transform(df_sp)

# Show the raw code, its index and the encoded vector side by side.
df_sp.select(['cap-shape', 'indexed_cap-shape', 'feature_cap-shape']).show(n=3)

+---------+-----------------+-----------------+
|cap-shape|indexed_cap-shape|feature_cap-shape|
+---------+-----------------+-----------------+
|        x|              0.0|    (5,[0],[1.0])|
|        x|              0.0|    (5,[0],[1.0])|
|        b|              3.0|    (5,[3],[1.0])|
+---------+-----------------+-----------------+
only showing top 3 rows



In [89]:
# Same raw/index/vector comparison for 'cap-shape', on a few more rows.
df_sp.select('cap-shape', 'indexed_cap-shape', 'feature_cap-shape').show(5)

+---------+-----------------+-----------------+
|cap-shape|indexed_cap-shape|feature_cap-shape|
+---------+-----------------+-----------------+
|        x|              0.0|    (5,[0],[1.0])|
|        x|              0.0|    (5,[0],[1.0])|
|        b|              3.0|    (5,[3],[1.0])|
|        x|              0.0|    (5,[0],[1.0])|
|        x|              0.0|    (5,[0],[1.0])|
+---------+-----------------+-----------------+
only showing top 5 rows



In [93]:
# List the distinct 'cap-shape' codes present in the data.
df_sp.select('cap-shape').distinct().show()

+---------+
|cap-shape|
+---------+
|        x|
|        f|
|        k|
|        c|
|        b|
|        s|
+---------+



In [15]:
# Index and one-hot encode every remaining categorical feature column.
#
# Candidate columns are chosen by name instead of by position. The earlier
# slice `columns[2:-3]` only selected the right columns while exactly three
# derived columns ('label', 'indexed_cap-shape', 'feature_cap-shape') sat at
# the end of the frame. Here we skip the target columns, the assembled
# 'features' vector, any derived 'indexed_'/'feature_' column, and any column
# that already has a 'feature_' counterpart (e.g. 'cap-shape'). That last rule
# also makes the cell safe to re-run after it has completed.
existing_columns = set(df_sp.columns)
categorical_columns = [
    column for column in df_sp.columns
    if column not in ('class', 'label', 'features')
    and not column.startswith(('indexed_', 'feature_'))
    and 'feature_' + column not in existing_columns
]

for header in categorical_columns:
    # Count distinct values once; every count launches a Spark job.
    n_distinct = df_sp.select(header).distinct().count()
    # Formatted print works on both Python 2 and Python 3 with identical output.
    print('%s %d' % (header, n_distinct))  # number of distinct values
    # A column with a single value (e.g. 'veil-type') carries no information.
    if n_distinct < 2:
        continue

    indexed_header_name = 'indexed_' + header
    stringIndexer = StringIndexer(inputCol=header, outputCol=indexed_header_name)
    df_sp = stringIndexer.fit(df_sp).transform(df_sp)

    encoder = OneHotEncoder(inputCol=indexed_header_name, outputCol='feature_' + header)
    df_sp = encoder.transform(df_sp)

cap-surface 4
cap-color 10
bruises 2
odor 9
gill-attachment 2
gill-spacing 2
gill-size 2
gill-color 12
stalk-shape 2
stalk-root 5
stalk-surface-above-ring 4
stalk-surface-below-ring 4
stalk-color-above-ring 9
stalk-color-below-ring 9
veil-type 1
veil-color 4
ring-number 3
ring-type 5
spore-print-color 9
population 6
habitat 7


In [16]:
# Confirm that the 'indexed_*' and 'feature_*' columns were added by the encoding loop.
df_sp.columns

['class',
 'cap-shape',
 'cap-surface',
 'cap-color',
 'bruises',
 'odor',
 'gill-attachment',
 'gill-spacing',
 'gill-size',
 'gill-color',
 'stalk-shape',
 'stalk-root',
 'stalk-surface-above-ring',
 'stalk-surface-below-ring',
 'stalk-color-above-ring',
 'stalk-color-below-ring',
 'veil-type',
 'veil-color',
 'ring-number',
 'ring-type',
 'spore-print-color',
 'population',
 'habitat',
 'label',
 'indexed_cap-shape',
 'feature_cap-shape',
 'indexed_cap-surface',
 'feature_cap-surface',
 'indexed_cap-color',
 'feature_cap-color',
 'indexed_bruises',
 'feature_bruises',
 'indexed_odor',
 'feature_odor',
 'indexed_gill-attachment',
 'feature_gill-attachment',
 'indexed_gill-spacing',
 'feature_gill-spacing',
 'indexed_gill-size',
 'feature_gill-size',
 'indexed_gill-color',
 'feature_gill-color',
 'indexed_stalk-shape',
 'feature_stalk-shape',
 'indexed_stalk-root',
 'feature_stalk-root',
 'indexed_stalk-surface-above-ring',
 'feature_stalk-surface-above-ring',
 'indexed_stalk-surface-

In [18]:
# Collect every one-hot encoded column (the ones prefixed 'feature_') and set up
# an assembler that concatenates them into a single 'features' vector column.
lst_features = list(filter(lambda name: name.startswith('feature_'), df_sp.columns))
vecAssembler = VectorAssembler(outputCol='features', inputCols=lst_features)

In [19]:
# Append the assembled 'features' vector column. This rebinds df_sp, so the cell
# is meant to be run once per fresh frame (re-running presumably fails because
# 'features' would already exist; confirm before relying on it).
df_sp = vecAssembler.transform(df_sp)

In [20]:
# Inspect the column list again after assembling the 'features' vector.
df_sp.columns

['class',
 'cap-shape',
 'cap-surface',
 'cap-color',
 'bruises',
 'odor',
 'gill-attachment',
 'gill-spacing',
 'gill-size',
 'gill-color',
 'stalk-shape',
 'stalk-root',
 'stalk-surface-above-ring',
 'stalk-surface-below-ring',
 'stalk-color-above-ring',
 'stalk-color-below-ring',
 'veil-type',
 'veil-color',
 'ring-number',
 'ring-type',
 'spore-print-color',
 'population',
 'habitat',
 'label',
 'indexed_cap-shape',
 'feature_cap-shape',
 'indexed_cap-surface',
 'feature_cap-surface',
 'indexed_cap-color',
 'feature_cap-color',
 'indexed_bruises',
 'feature_bruises',
 'indexed_odor',
 'feature_odor',
 'indexed_gill-attachment',
 'feature_gill-attachment',
 'indexed_gill-spacing',
 'feature_gill-spacing',
 'indexed_gill-size',
 'feature_gill-size',
 'indexed_gill-color',
 'feature_gill-color',
 'indexed_stalk-shape',
 'feature_stalk-shape',
 'indexed_stalk-root',
 'feature_stalk-root',
 'indexed_stalk-surface-above-ring',
 'feature_stalk-surface-above-ring',
 'indexed_stalk-surface-

In [22]:
# Fixed seed so that the train/test split and the forest's random sampling are
# repeatable across kernel restarts.
SEED = 42

# NOTE: despite the name `dt`, this is a random forest. It must be the
# pyspark.ml class: a later cell imports scikit-learn's RandomForestClassifier
# under the same name, so re-running this cell after that import would break.
dt = RandomForestClassifier(maxDepth=5, labelCol='label', seed=SEED)

# 80/20 random split into training and held-out test rows.
df_train, df_test = df_sp.randomSplit([0.8, 0.2], seed=SEED)

In [23]:
# Train the random forest on the training split.
model = dt.fit(df_train)

In [26]:
# Score the held-out rows; only the 'features' and 'label' columns are passed on,
# so the result holds those plus the model's prediction columns.
df_predicted = model.transform(df_test.select('features', 'label'))

In [35]:
# Eyeball a few predictions next to the true labels.
df_predicted.show(10)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(95,[3,7,9,17,18,...|  0.0|[19.0964739688749...|[0.95482369844374...|       0.0|
|(95,[3,7,9,17,18,...|  0.0|[19.0516374686367...|[0.95258187343183...|       0.0|
|(95,[3,7,9,17,18,...|  0.0|[19.0620191596105...|[0.95310095798052...|       0.0|
|(95,[3,7,9,17,18,...|  0.0|[19.0620191596105...|[0.95310095798052...|       0.0|
|(95,[3,7,9,17,18,...|  0.0|[19.1068556598487...|[0.95534278299243...|       0.0|
|(95,[3,7,9,17,18,...|  0.0|[19.1068556598487...|[0.95534278299243...|       0.0|
|(95,[3,7,9,17,18,...|  0.0|[19.0516374686367...|[0.95258187343183...|       0.0|
|(95,[3,7,9,17,18,...|  0.0|[19.0620191596105...|[0.95310095798052...|       0.0|
|(95,[3,7,12,17,18...|  0.0|[19.1068556598487...|[0.95534278299243...|       0.0|
|(95,[3,7,12,17,

In [27]:
# Score the test-set predictions. The metric and column names are spelled out
# because the evaluator's default metric is F1, not accuracy; the values below
# are the same as the defaults, so the reported number is unchanged.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label', predictionCol='prediction', metricName='f1')
# Parenthesised single-argument print behaves the same on Python 2 and Python 3.
print(evaluator.evaluate(df_predicted))

# Distribution of predicted classes, to compare against the class balance.
df_predicted.select('prediction').groupby('prediction').count().show()

0.990282901728
+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|  853|
|       1.0|  794|
+----------+-----+



In [46]:
import pandas as pd
import numpy as np

In [69]:
# Read the same CSV with pandas, for the scikit-learn part of the notebook.
df = pd.read_csv('mushrooms.csv')

In [70]:
# Keep the raw class column ('p'/'e' codes) as the source for the numeric labels.
label = df['class']

In [73]:
# Convert class codes to integers: 'p' -> 1, anything else -> 0.
# `label` is rebound in place, so the membership test also accepts an
# already-converted 1. Re-running the cell therefore leaves the labels intact
# instead of collapsing them all to 0.
label = [1 if x in ('p', 1) else 0 for x in label]

In [75]:
# Collect the whole Spark DataFrame into pandas. This rebinds `df`, replacing
# the frame that was read directly from the CSV.
df = df_sp.toPandas()

In [76]:
# Inspect the converted frame, including the encoded vector columns.
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,feature_ring-number,indexed_ring-type,feature_ring-type,indexed_spore-print-color,feature_spore-print-color,indexed_population,feature_population,indexed_habitat,feature_habitat,features
0,p,x,s,n,t,p,f,c,n,k,...,"(1.0, 0.0)",0.0,"(1.0, 0.0, 0.0, 0.0)",2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)",2.0,"(0.0, 0.0, 1.0, 0.0, 0.0)",4.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, ..."
1,e,x,s,y,t,a,f,c,b,k,...,"(1.0, 0.0)",0.0,"(1.0, 0.0, 0.0, 0.0)",1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)",3.0,"(0.0, 0.0, 0.0, 1.0, 0.0)",1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
2,e,b,s,w,t,l,f,c,b,n,...,"(1.0, 0.0)",0.0,"(1.0, 0.0, 0.0, 0.0)",1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)",3.0,"(0.0, 0.0, 0.0, 1.0, 0.0)",5.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
3,p,x,y,w,t,p,f,c,n,n,...,"(1.0, 0.0)",0.0,"(1.0, 0.0, 0.0, 0.0)",2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)",2.0,"(0.0, 0.0, 1.0, 0.0, 0.0)",4.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
4,e,x,s,g,f,n,f,w,b,k,...,"(1.0, 0.0)",1.0,"(0.0, 1.0, 0.0, 0.0)",1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)",4.0,"(0.0, 0.0, 0.0, 0.0, 1.0)",1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."


In [77]:
# Keep only the one-hot encoded feature columns. An explicit copy is taken so
# that the later column assignment on `df` writes to an independent frame and
# not to a slice of the original (which triggers SettingWithCopyWarning).
df = df[lst_features].copy()

In [78]:
# Attach the 0/1 labels built from the CSV. The list is matched to rows by
# position, which assumes the Spark-derived frame kept the CSV row order
# (TODO confirm).
df['label'] = label

In [83]:
# Separate the feature columns from the target for scikit-learn.
# NOTE(review): each feature column holds a whole encoded vector per cell (see
# the df.head() output); presumably these need expanding into plain numeric
# columns before an estimator can be fitted. Confirm.
X, Y = df[lst_features], df['label']

In [63]:
from sklearn.ensemble import RandomForestClassifier

In [64]:
# NOTE: RandomForestClassifier here is scikit-learn's class; the import in the
# preceding cell rebinds the name that was earlier imported from pyspark.ml.
# random_state pins the forest's random sampling so results are repeatable.
clf = RandomForestClassifier(max_depth=5, random_state=42)

In [79]:
from sklearn.model_selection import KFold

In [80]:
# 5-fold cross-validation. Rows are shuffled before splitting so that each fold
# is not just a contiguous slice of the file, whose row order may be structured.
# random_state keeps the fold assignment repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)