In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [42]:
# Load the original mushroom dataset from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer='mushroom_original.csv')
df.head(n=5)

Unnamed: 0,Class,Cap Shape,Cap Surface,Cap Colour,Bruises,Odour,Gill Attachment,Gill Spacing,Gill Size,Gill Colour,...,Stalk Surface Above Ring,Stalk Surface Below Ring,Stalk Colour Above Ring,Stalk Colour Below Ring,Veil Colour,Ring Number,Ring Type,Spore Print Colour,Population,Habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,s,w,w,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,s,w,w,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,s,w,w,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,s,w,w,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,s,w,w,w,o,e,n,a,g


In [43]:
# Cleaning dataset for CBA classifier
#
# After this cell:
#   * rows containing NaN or the "?" missing-value marker are gone,
#   * "Class" is encoded as "1" (was 'e') / "0" (was 'p'),
#   * "Class" is the first column,
#   * every value in the frame is a string.
df = (
    # dropna() returns a new frame; the result must be kept, otherwise the
    # call has no effect on `df`.
    df.dropna()
    # Drop every row that holds the "?" marker in any column, in one pass.
    .loc[lambda frame: ~frame.eq("?").any(axis=1)]
    # Encode the target straight to the string labels the frame ends up
    # with; values other than 'e'/'p' are left untouched.
    .assign(Class=lambda frame: frame["Class"].replace({"e": "1", "p": "0"}))
    # astype(str) stringifies every cell without relying on the deprecated
    # DataFrame.applymap.
    .astype(str)
)

# Move the target to the front, keeping the other columns in their order.
df = df[["Class"] + [name for name in df.columns if name != "Class"]]

df.head()

Unnamed: 0,Class,Cap Shape,Cap Surface,Cap Colour,Bruises,Odour,Gill Attachment,Gill Spacing,Gill Size,Gill Colour,...,Stalk Surface Above Ring,Stalk Surface Below Ring,Stalk Colour Above Ring,Stalk Colour Below Ring,Veil Colour,Ring Number,Ring Type,Spore Print Colour,Population,Habitat
0,0,x,s,n,t,p,f,c,n,k,...,s,s,w,w,w,o,p,k,s,u
1,1,x,s,y,t,a,f,c,b,k,...,s,s,w,w,w,o,p,n,n,g
2,1,b,s,w,t,l,f,c,b,n,...,s,s,w,w,w,o,p,n,n,m
3,0,x,y,w,t,p,f,c,n,n,...,s,s,w,w,w,o,p,k,s,u
4,1,x,s,g,f,n,f,w,b,k,...,s,s,w,w,w,o,e,n,a,g


In [38]:
# Show the dtype of each column in `df`.
df.dtypes

Class                       object
Cap Shape                   object
Cap Surface                 object
Cap Colour                  object
Bruises                     object
Odour                       object
Gill Attachment             object
Gill Spacing                object
Gill Size                   object
Gill Colour                 object
Stalk Shape                 object
Stalk Root                  object
Stalk Surface Above Ring    object
Stalk Surface Below Ring    object
Stalk Colour Above Ring     object
Stalk Colour Below Ring     object
Veil Colour                 object
Ring Number                 object
Ring Type                   object
Spore Print Colour          object
Population                  object
Habitat                     object
dtype: object

In [39]:
# Cleaning data for other classifier methods
#
# Steps:
#   1. one-hot encode every attribute except "Class" into float indicator
#      columns named "<attribute>_<value>",
#   2. cast "Class" to int,
#   3. drop every column whose absolute correlation with "Class" is below
#      MIN_ABS_CORRELATION.
#
# NOTE: this cell rebinds `df`; re-run the cleaning cell before re-running
# this one, otherwise the already-encoded columns get encoded again.

# Columns correlating with the target more weakly than this are dropped.
MIN_ABS_CORRELATION = 0.3

# Not used in this cell; kept in case a later cell reads it.
numofcols = len(df.columns)

feature_cols = [name for name in df.columns if name != "Class"]

# A single get_dummies call replaces the per-column LabelEncoder ->
# OneHotEncoder -> concat loop. Untouched columns ("Class") stay in front,
# indicators follow in attribute order with their values sorted, and
# dtype=float yields 0.0 / 1.0 columns.
df = pd.get_dummies(
    df.reset_index(drop=True),
    columns=feature_cols,
    dtype=float,
)

df['Class'] = df['Class'].astype(str).astype(int)

cor_target = df.corr()['Class'].abs()
print(len(cor_target))

# Collect the *positions* of the weakly correlated columns. Iterating over
# the raw values avoids `cor_target[i]`, which uses an integer key on a
# label-indexed Series (deprecated positional fallback in pandas).
# A NaN correlation compares False and is therefore kept.
att_drop = [
    position
    for position, strength in enumerate(cor_target.to_numpy())
    if strength < MIN_ABS_CORRELATION
]

df = df.drop(columns=df.columns[att_drop])

df.head()

98


Unnamed: 0,Class,Bruises_f,Bruises_t,Odour_f,Odour_n,Stalk Shape_e,Stalk Shape_t,Stalk Root_b,Stalk Surface Above Ring_k,Stalk Surface Above Ring_s,...,Stalk Colour Above Ring_b,Stalk Colour Above Ring_n,Stalk Colour Below Ring_b,Stalk Colour Below Ring_n,Ring Type_e,Ring Type_l,Ring Type_p,Spore Print Colour_h,Spore Print Colour_k,Spore Print Colour_n
0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [40]:
# Show the dtype of each column in `df`.
df.dtypes

Class                           int32
Bruises_f                     float64
Bruises_t                     float64
Odour_f                       float64
Odour_n                       float64
Stalk Shape_e                 float64
Stalk Shape_t                 float64
Stalk Root_b                  float64
Stalk Surface Above Ring_k    float64
Stalk Surface Above Ring_s    float64
Stalk Surface Below Ring_k    float64
Stalk Surface Below Ring_s    float64
Stalk Colour Above Ring_b     float64
Stalk Colour Above Ring_n     float64
Stalk Colour Below Ring_b     float64
Stalk Colour Below Ring_n     float64
Ring Type_e                   float64
Ring Type_l                   float64
Ring Type_p                   float64
Spore Print Colour_h          float64
Spore Print Colour_k          float64
Spore Print Colour_n          float64
dtype: object