# Binary Prediction of Poisonous Mushrooms

In [1]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = '.'

In [2]:
!kaggle competitions download -c playground-series-s4e8

playground-series-s4e8.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
from zipfile import ZipFile
with ZipFile('playground-series-s4e8.zip') as f:
    f.extractall('poisonous_mushroom_data')

In [4]:
data_dir = 'poisonous_mushroom_data/'

In [5]:
import pandas as pd

In [6]:
raw_df = pd.read_csv(data_dir + 'train.csv')
test_df = pd.read_csv(data_dir + 'test.csv')
sub_df = pd.read_csv(data_dir + 'sample_submission.csv')

In [7]:
raw_df

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.80,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,3116940,e,9.29,f,,n,t,,,w,...,b,,w,u,w,t,g,,d,u
3116941,3116941,e,10.88,s,,w,t,d,c,p,...,,,w,,,f,f,,d,u
3116942,3116942,p,7.82,x,e,e,f,a,,w,...,,,y,,w,t,z,,d,a
3116943,3116943,e,9.45,p,i,n,t,e,,p,...,,y,w,,,t,p,,d,u


In [8]:
raw_df.dtypes

id                        int64
class                    object
cap-diameter            float64
cap-shape                object
cap-surface              object
cap-color                object
does-bruise-or-bleed     object
gill-attachment          object
gill-spacing             object
gill-color               object
stem-height             float64
stem-width              float64
stem-root                object
stem-surface             object
stem-color               object
veil-type                object
veil-color               object
has-ring                 object
ring-type                object
spore-print-color        object
habitat                  object
season                   object
dtype: object

In [9]:
raw_df.isna().sum()

id                            0
class                         0
cap-diameter                  4
cap-shape                    40
cap-surface              671023
cap-color                    12
does-bruise-or-bleed          8
gill-attachment          523936
gill-spacing            1258435
gill-color                   57
stem-height                   0
stem-width                    0
stem-root               2757023
stem-surface            1980861
stem-color                   38
veil-type               2957493
veil-color              2740947
has-ring                     24
ring-type                128880
spore-print-color       2849682
habitat                      45
season                        0
dtype: int64

In [10]:
numeric_cols = ['cap-diameter', 'stem-height', 'stem-width']
categorical_cols = raw_df.select_dtypes(include='object').columns.tolist()

In [11]:
categorical_cols.remove('class')

In [12]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [13]:
categorical_cols

['cap-shape',
 'cap-surface',
 'cap-color',
 'does-bruise-or-bleed',
 'gill-attachment',
 'gill-spacing',
 'gill-color',
 'stem-root',
 'stem-surface',
 'stem-color',
 'veil-type',
 'veil-color',
 'has-ring',
 'ring-type',
 'spore-print-color',
 'habitat',
 'season']

In [14]:
raw_df['class'].value_counts()

class
p    1705396
e    1411549
Name: count, dtype: int64

## Exploring and preprocessing

### Cap

In [15]:
raw_df['cap-shape'].value_counts().head(15)

cap-shape
x    1436026
f     676238
s     365146
b     318646
o     108835
p     106967
c     104520
d         65
e         60
n         41
t         36
w         36
g         34
y         33
r         32
Name: count, dtype: int64

In [16]:
cap_shape_lst = ['x','f','s','b','o','p','c']
raw_df['cap-shape'] = raw_df['cap-shape'].apply(lambda x: x if x in cap_shape_lst else 'x')

In [17]:
total = raw_df['cap-shape'].value_counts()
e = raw_df[raw_df['class']=='e']['cap-shape'].value_counts()

In [18]:
e/total * 100

cap-shape
b    22.696346
c    45.722350
f    48.805598
o    32.518951
p    54.059663
s    45.054307
x    48.981514
Name: count, dtype: float64

In [19]:
raw_df['cap-shape_b'] = raw_df['cap-shape'].apply(lambda x: 1 if x == 'b' else 0)
raw_df['cap-shape_o'] = raw_df['cap-shape'].apply(lambda x: 1 if x == 'o' else 0)

In [20]:
test_df['cap-shape'] = test_df['cap-shape'].apply(lambda x: x if x in cap_shape_lst else 'x')

In [21]:
test_df['cap-shape_b'] = test_df['cap-shape'].apply(lambda x: 1 if x == 'b' else 0)
test_df['cap-shape_o'] = test_df['cap-shape'].apply(lambda x: 1 if x == 'o' else 0)

In [22]:
categorical_cols.remove('cap-shape')
categorical_cols += ['cap-shape_b', 'cap-shape_o']

In [23]:
cap_surface_lst = raw_df['cap-surface'].value_counts().head(10).index

In [24]:
raw_df['cap-surface'] = raw_df['cap-surface'].apply(lambda x: x if x in cap_surface_lst else 'unk')

In [25]:
total = raw_df['cap-surface'].value_counts()
e = raw_df[raw_df['class']=='e']['cap-surface'].value_counts()

In [26]:
e/total*100

cap-surface
d      41.211708
e      52.156843
g      40.281501
h      52.987063
i      17.808533
k       9.844423
s      63.065693
t      35.760465
unk    49.140665
w      34.596686
y      53.801712
Name: count, dtype: float64

In [27]:
raw_df['cap-surface_i'] = raw_df['cap-surface'].apply(lambda x: 1 if x=='i' else 0)
raw_df['cap-surface_k'] = raw_df['cap-surface'].apply(lambda x: 1 if x=='k' else 0)
raw_df['cap-surface_t'] = raw_df['cap-surface'].apply(lambda x: 1 if x=='t' else 0)
raw_df['cap-surface_w'] = raw_df['cap-surface'].apply(lambda x: 1 if x=='w' else 0)
raw_df['cap-surface_s'] = raw_df['cap-surface'].apply(lambda x: 1 if x=='s' else 0)

In [28]:
test_df['cap-surface'] = test_df['cap-surface'].apply(lambda x: x if x in cap_surface_lst else 'unk')
test_df['cap-surface_i'] = test_df['cap-surface'].apply(lambda x: 1 if x=='i' else 0)
test_df['cap-surface_k'] = test_df['cap-surface'].apply(lambda x: 1 if x=='k' else 0)
test_df['cap-surface_t'] = test_df['cap-surface'].apply(lambda x: 1 if x=='t' else 0)
test_df['cap-surface_w'] = test_df['cap-surface'].apply(lambda x: 1 if x=='w' else 0)
test_df['cap-surface_s'] = test_df['cap-surface'].apply(lambda x: 1 if x=='s' else 0)

In [29]:
categorical_cols.remove('cap-surface')
categorical_cols += ['cap-surface_i','cap-surface_k','cap-surface_t','cap-surface_w','cap-surface_s']

In [30]:
cap_color_lst = raw_df['cap-color'].value_counts().head(12).index

In [31]:
raw_df['cap-color'] = raw_df['cap-color'].apply(lambda x: x if x in cap_color_lst else 'n')

In [32]:
total = raw_df['cap-color'].value_counts()
e = raw_df[raw_df['class']=='e']['cap-color'].value_counts()

In [33]:
e/total*100

cap-color
b    87.395821
e    20.675148
g    58.808016
k    40.151616
l    51.071844
n    50.907450
o    28.273888
p    31.006773
r     9.306457
u    46.068168
w    50.294116
y    37.705075
Name: count, dtype: float64

In [34]:
raw_df['cap-color_b'] = raw_df['cap-color'].apply(lambda x: 1 if x=='b' else 0)
raw_df['cap-color_e'] = raw_df['cap-color'].apply(lambda x: 1 if x=='e' else 0)
raw_df['cap-color_o'] = raw_df['cap-color'].apply(lambda x: 1 if x=='o' else 0)
raw_df['cap-color_p'] = raw_df['cap-color'].apply(lambda x: 1 if x=='p' else 0)
raw_df['cap-color_r'] = raw_df['cap-color'].apply(lambda x: 1 if x=='r' else 0)

In [35]:
test_df['cap-color'] = test_df['cap-color'].apply(lambda x: x if x in cap_color_lst else 'n')
test_df['cap-color_b'] = test_df['cap-color'].apply(lambda x: 1 if x=='b' else 0)
test_df['cap-color_e'] = test_df['cap-color'].apply(lambda x: 1 if x=='e' else 0)
test_df['cap-color_o'] = test_df['cap-color'].apply(lambda x: 1 if x=='o' else 0)
test_df['cap-color_p'] = test_df['cap-color'].apply(lambda x: 1 if x=='p' else 0)
test_df['cap-color_r'] = test_df['cap-color'].apply(lambda x: 1 if x=='r' else 0)

In [36]:
categorical_cols.remove('cap-color')
categorical_cols += ['cap-color_b','cap-color_e','cap-color_o','cap-color_p','cap-color_r']

### Bruise

In [37]:
raw_df['does-bruise-or-bleed'].value_counts(dropna=False)

does-bruise-or-bleed
f           2569743
t            547085
w                14
c                11
h                 9
NaN               8
y                 7
a                 7
b                 7
x                 7
s                 6
k                 6
p                 4
e                 4
l                 4
d                 4
g                 3
o                 3
z                 3
n                 2
i                 2
has-ring          1
3.43              1
r                 1
4.42              1
2.9               1
u                 1
Name: count, dtype: int64

In [38]:
bruise_lst = ['f','t']
raw_df['does-bruise-or-bleed'] = raw_df['does-bruise-or-bleed'].apply(lambda x: x if x in bruise_lst else 'f')

In [39]:
total = raw_df['does-bruise-or-bleed'].value_counts()
e = raw_df[raw_df['class']=='e']['does-bruise-or-bleed'].value_counts()

In [40]:
e/total*100

does-bruise-or-bleed
f    44.423821
t    49.337671
Name: count, dtype: float64

In [41]:
categorical_cols.remove('does-bruise-or-bleed')

### Gill

In [42]:
raw_df['gill-attachment'].value_counts(dropna=False).head(8)

gill-attachment
a      646034
d      589236
NaN    523936
x      360878
e      301858
s      295439
p      279110
f      119953
Name: count, dtype: int64

In [43]:
gill_attachment_lst = raw_df['gill-attachment'].value_counts().head(7).index
raw_df['gill-attachment'] = raw_df['gill-attachment'].apply(lambda x: x if x in gill_attachment_lst else 'unk')

In [44]:
total = raw_df['gill-attachment'].value_counts()
e = raw_df[raw_df['class']=='e']['gill-attachment'].value_counts()

In [45]:
e/total*100

gill-attachment
a      35.365631
d      42.862283
e      65.103459
f      48.954174
p      70.926516
s      41.084285
unk    36.108818
x      46.154933
Name: count, dtype: float64

In [46]:
raw_df['gill-attachment_a'] = raw_df['gill-attachment'].apply(lambda x: 1 if x=='a' else 0)
raw_df['gill-attachment_e'] = raw_df['gill-attachment'].apply(lambda x: 1 if x=='e' else 0)
raw_df['gill-attachment_p'] = raw_df['gill-attachment'].apply(lambda x: 1 if x=='p' else 0)

In [47]:
test_df['gill-attachment'] = test_df['gill-attachment'].apply(lambda x: x if x in gill_attachment_lst else 'unk')

In [48]:
test_df['gill-attachment_a'] = test_df['gill-attachment'].apply(lambda x: 1 if x=='a' else 0)
test_df['gill-attachment_e'] = test_df['gill-attachment'].apply(lambda x: 1 if x=='e' else 0)
test_df['gill-attachment_p'] = test_df['gill-attachment'].apply(lambda x: 1 if x=='p' else 0)

In [49]:
categorical_cols.remove('gill-attachment')
categorical_cols += ['gill-attachment_a','gill-attachment_e','gill-attachment_p']

In [50]:
raw_df['gill-spacing'].value_counts(dropna=False)

gill-spacing
c              1331054
NaN            1258435
d               407932
f               119380
e                   24
a                   17
s                   16
b                   12
x                    8
t                    8
p                    7
k                    4
g                    4
l                    3
h                    3
r                    2
6.67                 2
0                    2
y                    2
3.81                 1
1.6                  1
n                    1
1.88                 1
does f               1
4.09                 1
1.36                 1
3.24                 1
5.55                 1
5.42                 1
5.7                  1
3.62                 1
6.4                  1
3.57                 1
12.27                1
1                    1
cap-surface          1
w                    1
24.38                1
i                    1
0.73                 1
4.04                 1
5.22                 1
3.92                 

In [51]:
gill_spacing_lst = ['c','d','f']
raw_df['gill-spacing'] = raw_df['gill-spacing'].apply(lambda x: x if x in gill_spacing_lst else 'unk')

In [52]:
total = raw_df['gill-spacing'].value_counts()
e = raw_df[raw_df['class']=='e']['gill-spacing'].value_counts()

In [53]:
e/total*100

gill-spacing
c      42.755440
unk    42.916019
d      59.775404
f      48.983079
Name: count, dtype: float64

In [54]:
categorical_cols.remove('gill-spacing')

In [55]:
raw_df['gill-color'].value_counts(dropna=False).head(8)

gill-color
w    931538
n    543386
y    469464
p    343626
g    212164
o    157119
k    127970
f    119694
Name: count, dtype: int64

In [56]:
gill_color_lst = raw_df['gill-color'].value_counts(dropna=False).head(8).index

In [57]:
raw_df['gill-color'] = raw_df['gill-color'].apply(lambda x: x if x in gill_color_lst else 'unk')

In [58]:
total = raw_df['gill-color'].value_counts()
e = raw_df[raw_df['class']=='e']['gill-color'].value_counts()

In [59]:
e/total*100

gill-color
f      49.012482
g      51.900417
k      42.262249
n      28.512144
o      48.262145
p      40.979146
unk    41.068194
w      57.156015
y      42.095667
Name: count, dtype: float64

In [60]:
raw_df['gill-color_n'] = raw_df['gill-color'].apply(lambda x: 1 if x=='n' else 0)

In [61]:
test_df['gill-color'] = test_df['gill-color'].apply(lambda x: x if x in gill_color_lst else 'unk')
test_df['gill-color_n'] = test_df['gill-color'].apply(lambda x: 1 if x=='n' else 0)

In [62]:
categorical_cols.remove('gill-color')
categorical_cols += ['gill-color_n']

### Stem

In [63]:
raw_df['stem-root'].value_counts(dropna=False).head(15)

stem-root
NaN    2757023
b       165801
s       116946
r        47803
c        28592
f          597
d           24
y           14
w           12
p           12
g           12
k           11
l           10
n           10
t           10
Name: count, dtype: int64

In [64]:
stem_root_lst = ['b','s','r','c']
raw_df['stem-root'] = raw_df['stem-root'].apply(lambda x: x if x in stem_root_lst else 'unk')

In [65]:
total = raw_df['stem-root'].value_counts()
e = raw_df[raw_df['class']=='e']['stem-root'].value_counts()

In [66]:
e/total*100

stem-root
unk    45.996904
b      64.812637
s      30.134421
r       0.476958
c       0.405708
Name: count, dtype: float64

In [67]:
raw_df['stem-root_s'] = raw_df['stem-root'].apply(lambda x: 1 if x=='s' else 0)
raw_df['stem-root_r'] = raw_df['stem-root'].apply(lambda x: 1 if x=='r' else 0)
raw_df['stem-root_c'] = raw_df['stem-root'].apply(lambda x: 1 if x=='c' else 0)

In [68]:
test_df['stem-root'] = test_df['stem-root'].apply(lambda x: x if x in stem_root_lst else 'unk')
test_df['stem-root_s'] = test_df['stem-root'].apply(lambda x: 1 if x=='s' else 0)
test_df['stem-root_r'] = test_df['stem-root'].apply(lambda x: 1 if x=='r' else 0)
test_df['stem-root_c'] = test_df['stem-root'].apply(lambda x: 1 if x=='c' else 0)

In [69]:
categorical_cols.remove('stem-root')
categorical_cols += ['stem-root_s','stem-root_r','stem-root_c']

In [70]:
raw_df['stem-surface'].value_counts(dropna=False).head(10)

stem-surface
NaN    1980861
s       327610
y       255500
i       224346
t       147974
g        78080
k        73383
h        28283
f          512
w           49
Name: count, dtype: int64

In [71]:
stem_surface_lst = raw_df['stem-surface'].value_counts().head(7).index
raw_df['stem-surface'] = raw_df['stem-surface'].apply(lambda x: x if x in stem_surface_lst else 'unk')

In [72]:
total = raw_df['stem-surface'].value_counts()
e = raw_df[raw_df['class']=='e']['stem-surface'].value_counts()

In [73]:
e/total*100

stem-surface
g       0.389344
h       0.364176
i      36.145062
k      30.204543
s      60.449315
t      45.874275
unk    48.719654
y      29.924070
Name: count, dtype: float64

In [74]:
raw_df['stem-surface_g'] = raw_df['stem-surface'].apply(lambda x: 1 if x=='g' else 0)
raw_df['stem-surface_h'] = raw_df['stem-surface'].apply(lambda x: 1 if x=='h' else 0)
raw_df['stem-surface_k'] = raw_df['stem-surface'].apply(lambda x: 1 if x=='k' else 0)
raw_df['stem-surface_y'] = raw_df['stem-surface'].apply(lambda x: 1 if x=='y' else 0)

In [75]:
test_df['stem-surface'] = test_df['stem-surface'].apply(lambda x: x if x in stem_surface_lst else 'unk')
test_df['stem-surface_g'] = test_df['stem-surface'].apply(lambda x: 1 if x=='g' else 0)
test_df['stem-surface_h'] = test_df['stem-surface'].apply(lambda x: 1 if x=='h' else 0)
test_df['stem-surface_k'] = test_df['stem-surface'].apply(lambda x: 1 if x=='k' else 0)
test_df['stem-surface_y'] = test_df['stem-surface'].apply(lambda x: 1 if x=='y' else 0)

In [76]:
categorical_cols.remove('stem-surface')
categorical_cols += ['stem-surface_g','stem-surface_h','stem-surface_k','stem-surface_y']

In [77]:
raw_df['stem-color'].value_counts().head(6)

stem-color
w    1196637
n    1003464
y     373971
g     132019
o     111541
e     103373
Name: count, dtype: int64

In [78]:
stem_color_lst = raw_df['stem-color'].value_counts().head(6).index
raw_df['stem-color'] = raw_df['stem-color'].apply(lambda x: x if x in stem_color_lst else 'unk')

In [79]:
total = raw_df['stem-color'].value_counts()
e = raw_df[raw_df['class']=='e']['stem-color'].value_counts()

In [80]:
e/total*100

stem-color
e      29.870469
g      60.857907
n      38.907225
o      40.334048
unk    27.362458
w      58.228602
y      30.622428
Name: count, dtype: float64

In [81]:
raw_df['stem-color_e'] = raw_df['stem-color'].apply(lambda x: 1 if x=='e' else 0)
raw_df['stem-color_y'] = raw_df['stem-color'].apply(lambda x: 1 if x=='y' else 0)
raw_df['stem-color_unk'] = raw_df['stem-color'].apply(lambda x: 1 if x=='unk' else 0)

In [82]:
test_df['stem-color'] = test_df['stem-color'].apply(lambda x: x if x in stem_color_lst else 'unk')
test_df['stem-color_e'] = test_df['stem-color'].apply(lambda x: 1 if x=='e' else 0)
test_df['stem-color_y'] = test_df['stem-color'].apply(lambda x: 1 if x=='y' else 0)
test_df['stem-color_unk'] = test_df['stem-color'].apply(lambda x: 1 if x=='unk' else 0)

In [83]:
categorical_cols.remove('stem-color')
categorical_cols += ['stem-color_e','stem-color_y','stem-color_unk']

### Veil

In [84]:
raw_df['veil-type'].value_counts(dropna=False).head(10)

veil-type
NaN    2957493
u       159373
w           11
a            9
e            8
f            8
b            5
c            5
g            4
y            4
Name: count, dtype: int64

In [85]:
raw_df['veil-type'] = raw_df['veil-type'].apply(lambda x: x if x=='u' else 'unk')

In [86]:
total = raw_df['veil-type'].value_counts()
e = raw_df[raw_df['class']=='e']['veil-type'].value_counts()

In [87]:
e/total*100

veil-type
unk    45.870667
u      34.441844
Name: count, dtype: float64

In [88]:
raw_df['veil-type_u'] = raw_df['veil-type'].apply(lambda x: 1 if x=='u' else 0)

In [89]:
test_df['veil-type'] = test_df['veil-type'].apply(lambda x: x if x=='u' else 'unk')
test_df['veil-type_u'] = test_df['veil-type'].apply(lambda x: 1 if x=='u' else 0)

In [90]:
categorical_cols.remove('veil-type')
categorical_cols += ['veil-type_u']

In [91]:
raw_df['veil-color'].value_counts(dropna=False).head(10)

veil-color
NaN    2740947
w       279070
y        30473
n        30039
u        14026
k        13080
e         9169
g           30
p           23
r           14
Name: count, dtype: int64

In [92]:
veil_color_lst = ['w','y','n']
raw_df['veil-color'] = raw_df['veil-color'].apply(lambda x: x if x in veil_color_lst else 'unk')

In [93]:
total = raw_df['veil-color'].value_counts()
e = raw_df[raw_df['class']=='e']['veil-color'].value_counts()

In [94]:
e/total*100

veil-color
unk    44.743809
w      49.602250
y      99.478227
n       0.372849
Name: count, dtype: float64

In [95]:
raw_df['veil-color_y'] = raw_df['veil-color'].apply(lambda x: 1 if x=='y' else 0)
raw_df['veil-color_n'] = raw_df['veil-color'].apply(lambda x: 1 if x=='n' else 0)

In [96]:
test_df['veil-color'] = test_df['veil-color'].apply(lambda x: x if x in veil_color_lst else 'unk')
test_df['veil-color_y'] = test_df['veil-color'].apply(lambda x: 1 if x=='y' else 0)
test_df['veil-color_n'] = test_df['veil-color'].apply(lambda x: 1 if x=='n' else 0)

In [97]:
categorical_cols.remove('veil-color')
categorical_cols += ['veil-color_y','veil-color_n']

### Ring

In [98]:
raw_df['has-ring'].value_counts(dropna=False).head(10)

has-ring
f      2368820
t       747982
NaN         24
r           16
h           13
c           11
l           11
s           11
p           11
g            8
Name: count, dtype: int64

In [99]:
has_ring_lst = ['f','t']
raw_df['has-ring'] = raw_df['has-ring'].apply(lambda x: x if x in has_ring_lst else 'f')

In [100]:
total = raw_df['has-ring'].value_counts()
e = raw_df[raw_df['class']=='e']['has-ring'].value_counts()

In [101]:
e/total*100

has-ring
f    46.674515
t    40.889620
Name: count, dtype: float64

In [102]:
categorical_cols.remove('has-ring')

In [103]:
raw_df['ring-type'].value_counts(dropna=False).head(10)

ring-type
f      2477170
NaN     128880
e       120006
z       113780
l        73443
r        67909
p        67678
g        63687
m         3992
t           98
Name: count, dtype: int64

In [104]:
ring_type_lst = ['f','e','z']
raw_df['ring-type'] = raw_df['ring-type'].apply(lambda x: x if x in ring_type_lst else 'unk')

In [105]:
total = raw_df['ring-type'].value_counts()
e = raw_df[raw_df['class']=='e']['ring-type'].value_counts()

In [106]:
e/total*100

ring-type
f      45.326158
unk    58.580898
e      42.059564
z       0.384075
Name: count, dtype: float64

In [107]:
raw_df['ring-type_z'] = raw_df['ring-type'].apply(lambda x: 1 if x=='z' else 0)

In [108]:
test_df['ring-type'] = test_df['ring-type'].apply(lambda x: x if x in ring_type_lst else 'unk')
test_df['ring-type_z'] = test_df['ring-type'].apply(lambda x: 1 if x=='z' else 0)

In [109]:
categorical_cols.remove('ring-type')
categorical_cols += ['ring-type_z']

### Other

In [110]:
raw_df['spore-print-color'].value_counts(dropna=False).head(10)

spore-print-color
NaN    2849682
k       107310
p        68237
w        50173
n        22646
r         7975
u         7256
g         3492
y           36
s           21
Name: count, dtype: int64

In [111]:
spore_lst = ['k','p','w','n']
raw_df['spore-print-color'] = raw_df['spore-print-color'].apply(lambda x: x if x in spore_lst else 'unk')

In [112]:
total = raw_df['spore-print-color'].value_counts()
e = raw_df[raw_df['class']=='e']['spore-print-color'].value_counts()

In [113]:
e/total*100

spore-print-color
k      17.463424
n       0.273779
p      28.768850
unk    46.906465
w      54.943894
Name: count, dtype: float64

In [114]:
raw_df['spore_k'] = raw_df['spore-print-color'].apply(lambda x: 1 if x=='k' else 0)
raw_df['spore_n'] = raw_df['spore-print-color'].apply(lambda x: 1 if x=='n' else 0)
raw_df['spore_p'] = raw_df['spore-print-color'].apply(lambda x: 1 if x=='p' else 0)

In [115]:
test_df['spore-print-color'] = test_df['spore-print-color'].apply(lambda x: x if x in spore_lst else 'unk')
test_df['spore_k'] = test_df['spore-print-color'].apply(lambda x: 1 if x=='k' else 0)
test_df['spore_n'] = test_df['spore-print-color'].apply(lambda x: 1 if x=='n' else 0)
test_df['spore_p'] = test_df['spore-print-color'].apply(lambda x: 1 if x=='p' else 0)

In [116]:
categorical_cols.remove('spore-print-color')
categorical_cols += ['spore_k','spore_n','spore_p']

In [117]:
raw_df['habitat'].value_counts(dropna=False).head(10)

habitat
d    2177573
g     454908
l     171892
m     150969
h     120137
w      18530
p      17180
u       5264
e         55
s         52
Name: count, dtype: int64

In [118]:
habitat_lst = ['d','g','l','m','h']
raw_df['habitat'] = raw_df['habitat'].apply(lambda x: x if x in habitat_lst else 'd')

In [119]:
total = raw_df['habitat'].value_counts()
e = raw_df[raw_df['class']=='e']['habitat'].value_counts()

In [120]:
e/total*100

habitat
d    47.316023
g    32.539766
l    60.507761
m    46.238632
h    33.085561
Name: count, dtype: float64

In [121]:
raw_df['habitat_g'] = raw_df['habitat'].apply(lambda x: 1 if x=='g' else 0)
raw_df['habitat_h'] = raw_df['habitat'].apply(lambda x: 1 if x=='h' else 0)

In [122]:
test_df['habitat'] = test_df['habitat'].apply(lambda x: x if x in habitat_lst else 'd')
test_df['habitat_g'] = test_df['habitat'].apply(lambda x: 1 if x=='g' else 0)
test_df['habitat_h'] = test_df['habitat'].apply(lambda x: 1 if x=='h' else 0)

In [123]:
categorical_cols.remove('habitat')
categorical_cols += ['habitat_g','habitat_h']

In [124]:
raw_df['season'].value_counts(dropna=False)

season
a    1543321
u    1153588
w     278189
s     141847
Name: count, dtype: int64

In [125]:
total = raw_df['season'].value_counts()
e = raw_df[raw_df['class']=='e']['season'].value_counts()

In [126]:
e/total*100

season
a    42.841379
u    41.753902
w    65.399063
s    61.170134
Name: count, dtype: float64

In [127]:
raw_df['season_w'] = raw_df['season'].apply(lambda x: 1 if x=='w' else 0)
raw_df['season_s'] = raw_df['season'].apply(lambda x: 1 if x=='s' else 0)
raw_df['season_a'] = raw_df['season'].apply(lambda x: 1 if x=='a' else 0)
raw_df['season_u'] = raw_df['season'].apply(lambda x: 1 if x=='u' else 0)

In [128]:
test_df['season_w'] = test_df['season'].apply(lambda x: 1 if x=='w' else 0)
test_df['season_s'] = test_df['season'].apply(lambda x: 1 if x=='s' else 0)
test_df['season_a'] = test_df['season'].apply(lambda x: 1 if x=='a' else 0)
test_df['season_u'] = test_df['season'].apply(lambda x: 1 if x=='u' else 0)

In [129]:
categorical_cols.remove('season')
categorical_cols += ['season_a','season_u','season_w','season_s']

In [130]:
len(categorical_cols)

39

### Numeric cols

In [131]:
raw_df[numeric_cols].isna().sum()

cap-diameter    4
stem-height     0
stem-width      0
dtype: int64

In [132]:
avg_cap_diameter = raw_df['cap-diameter'].mean()
avg_cap_diameter

6.309848357732786

In [133]:
raw_df = raw_df.fillna({'cap-diameter': avg_cap_diameter})

In [134]:
from sklearn.impute import SimpleImputer

In [135]:
imputer = SimpleImputer()
imputer.fit(raw_df[numeric_cols])

In [136]:
raw_df[numeric_cols] = imputer.transform(raw_df[numeric_cols])
test_df[numeric_cols] = imputer.transform(test_df[numeric_cols])

In [137]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(raw_df[numeric_cols])

In [138]:
raw_df[numeric_cols] = scaler.transform(raw_df[numeric_cols])
test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])

In [139]:
raw_df[numeric_cols].describe()

Unnamed: 0,cap-diameter,stem-height,stem-width
count,3116945.0,3116945.0,3116945.0
mean,9.935327e-16,1.106246e-15,1.969617e-15
std,1.0,1.0,1.0
min,-1.348207,-2.351448,-1.37778
25%,-0.6418839,-0.6216615,-0.7638569
50%,-0.1201926,-0.1734726,-0.1857563
75%,0.41438,0.3932456,0.5529279
max,15.96422,30.5108,11.33302


In [140]:
test_df[numeric_cols].describe()

Unnamed: 0,cap-diameter,stem-height,stem-width
count,2077964.0,2077964.0,2077964.0
mean,-0.00078492,-0.0006756784,-0.0006684311
std,1.00591,0.9997121,1.000581
min,-1.354647,-2.351448,-1.37778
25%,-0.6440308,-0.6216615,-0.7638569
50%,-0.1223395,-0.1734726,-0.1869915
75%,0.4122331,0.3932456,0.5516927
max,128.9608,18.869,11.33426


In [141]:
input_cols = numeric_cols + categorical_cols
target_col = 'class'

In [142]:
inputs = raw_df[input_cols]
targets = raw_df[target_col]
test_inputs = test_df[input_cols]

In [143]:
from sklearn.model_selection import train_test_split

In [144]:
train_inputs, val_inputs, train_targets, val_targets = train_test_split(inputs, targets,
                                                                        test_size=0.25, random_state=42)

## Model

### Random Forest

In [145]:
from sklearn.ensemble import RandomForestClassifier

In [146]:
def test_params(train_inputs, train_targets, val_inputs, val_targets, **params):
    model = RandomForestClassifier(**params)
    model.fit(train_inputs, train_targets)
    train_acc = model.score(train_inputs, train_targets)
    val_acc = model.score(val_inputs, val_targets)
    return train_acc, val_acc

In [147]:
sample_df = raw_df[input_cols+[target_col]].sample(800000)

In [148]:
sample_train_inputs, sample_val_inputs, sample_train_targets, sample_val_targets = train_test_split(
    sample_df[input_cols],
    sample_df[target_col],
    test_size=0.25,
    random_state=42
)

In [149]:
for i in [12,20,28,36]:
    train_acc, val_acc = test_params(sample_train_inputs, sample_train_targets, 
                                     sample_val_inputs, sample_val_targets,
                                     n_jobs=-1, random_state=1212,
                                     n_estimators=250, max_depth=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 12: train_acc = 0.923955, val_acc = 0.922525
Test 20: train_acc = 0.955877, val_acc = 0.952280
Test 28: train_acc = 0.975980, val_acc = 0.966545
Test 36: train_acc = 0.990380, val_acc = 0.974160


In [150]:
rf_model = RandomForestClassifier(n_jobs=-1, random_state=1212,
                                  n_estimators=250, max_depth=36)

In [151]:
rf_model

In [153]:
rf_model.fit(sample_train_inputs, sample_train_targets)

In [154]:
rf_model.score(sample_val_inputs, sample_val_targets)

0.97416

In [155]:
rf_model.score(val_inputs, val_targets)

0.9773701710776054

In [156]:
preds = rf_model.predict(test_inputs)

In [157]:
preds

array(['e', 'p', 'p', ..., 'p', 'e', 'e'], dtype=object)

In [158]:
sub_df['class'] = preds

In [159]:
sub_df.to_csv('poisonous_mushroom_data/sub1.csv', index=None)

In [160]:
rf_model.fit(train_inputs, train_targets)

In [161]:
rf_model.score(val_inputs, val_targets)

0.974957554633571

In [162]:
preds = rf_model.predict(test_inputs)

In [163]:
preds

array(['e', 'p', 'p', ..., 'p', 'e', 'e'], dtype=object)

In [164]:
sub_df['class'] = preds
sub_df.to_csv('poisonous_mushroom_data/sub2.csv', index=None)

### XGBoost

In [165]:
from xgboost import XGBClassifier

In [166]:
def test_params(train_inputs, train_targets, val_inputs, val_targets, **params):
    model = XGBClassifier(**params)
    model.fit(train_inputs, train_targets)
    train_acc = model.score(train_inputs, train_targets)
    val_acc = model.score(val_inputs, val_targets)
    return train_acc, val_acc

In [170]:
sample_train_inputs, sample_val_inputs, sample_train_targets, sample_val_targets = train_test_split(
    sample_df[input_cols],
    sample_df[target_col],
    test_size=0.25,
    random_state=42
)

In [172]:
test_params(sample_train_inputs, sample_train_targets, sample_val_inputs, sample_val_targets,
            n_jobs=-1, random_state=42)

(0.96815, 0.966815)

In [173]:
for i in [50,150,250,350]:
    train_acc, val_acc = test_params(sample_train_inputs, sample_train_targets, 
                                     sample_val_inputs, sample_val_targets,
                                     n_jobs=-1, random_state=42,
                                     n_estimators=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 50: train_acc = 0.947175, val_acc = 0.946160
Test 150: train_acc = 0.972990, val_acc = 0.971415
Test 250: train_acc = 0.975892, val_acc = 0.973440
Test 350: train_acc = 0.977363, val_acc = 0.973700


In [176]:
for i in [3,6,9,12]:
    train_acc, val_acc = test_params(sample_train_inputs, sample_train_targets, 
                                     sample_val_inputs, sample_val_targets,
                                     n_jobs=-1, random_state=42,
                                     n_estimators=350, max_depth=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 3: train_acc = 0.961738, val_acc = 0.961420
Test 6: train_acc = 0.977363, val_acc = 0.973700
Test 9: train_acc = 0.984350, val_acc = 0.973885
Test 12: train_acc = 0.990592, val_acc = 0.973785


In [177]:
for i in [0.005,0.01,0.05,0.1,0.2,0.4]:
    train_acc, val_acc = test_params(sample_train_inputs, sample_train_targets, 
                                     sample_val_inputs, sample_val_targets,
                                     n_jobs=-1, random_state=42,
                                     n_estimators=350, max_depth=9,
                                     learning_rate=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 0.005: train_acc = 0.921700, val_acc = 0.921325
Test 0.01: train_acc = 0.938430, val_acc = 0.937955
Test 0.05: train_acc = 0.972087, val_acc = 0.970205
Test 0.1: train_acc = 0.977427, val_acc = 0.974270
Test 0.2: train_acc = 0.981857, val_acc = 0.974455
Test 0.4: train_acc = 0.986358, val_acc = 0.973720


In [181]:
for i in [0.3,0.5,0.7,0.9,0.99]:
    train_acc, val_acc = test_params(sample_train_inputs, sample_train_targets, 
                                     sample_val_inputs, sample_val_targets,
                                     n_jobs=-1, random_state=42,
                                     n_estimators=350, max_depth=9,
                                     learning_rate=0.2, subsample=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 0.3: train_acc = 0.979352, val_acc = 0.973810
Test 0.5: train_acc = 0.980640, val_acc = 0.974135
Test 0.7: train_acc = 0.981810, val_acc = 0.974330
Test 0.9: train_acc = 0.982223, val_acc = 0.974590
Test 0.99: train_acc = 0.982028, val_acc = 0.974485


In [182]:
xgb_model = XGBClassifier(n_jobs=-1, random_state=42,
                          n_estimators=350, max_depth=9,
                          learning_rate=0.2, subsample=0.9)

In [184]:
xgb_model.fit(sample_train_inputs, sample_train_targets)

In [185]:
xgb_model.score(sample_val_inputs, sample_val_targets)

0.97459

In [187]:
raw_df[target_col] = raw_df[target_col].apply(lambda x: 1 if x=='p' else 0)

In [190]:
targets = raw_df[target_col]

In [191]:
train_inputs, val_inputs, train_targets, val_targets = train_test_split(inputs, targets,
                                                                        test_size=0.25, random_state=42)

In [192]:
xgb_model.score(val_inputs, val_targets)

0.9760214158208607

In [193]:
preds = xgb_model.predict(test_inputs)

In [196]:
preds = pd.Series(preds)
preds = preds.apply(lambda x: 'p' if x==1 else 'e')
preds

0          e
1          p
2          p
3          p
4          e
          ..
2077959    p
2077960    p
2077961    p
2077962    e
2077963    e
Length: 2077964, dtype: object

In [197]:
sub_df['class'] = preds
sub_df.to_csv('poisonous_mushroom_data/sub3.csv', index=None)

In [198]:
xgb_model.fit(train_inputs, train_targets)

In [199]:
xgb_model.score(val_inputs, val_targets)

0.9757827207896955

In [200]:
preds = xgb_model.predict(test_inputs)
preds = pd.Series(preds)
preds = preds.apply(lambda x: 'p' if x==1 else 'e')
preds

0          e
1          p
2          p
3          p
4          e
          ..
2077959    p
2077960    p
2077961    p
2077962    e
2077963    e
Length: 2077964, dtype: object

In [201]:
sub_df['class'] = preds
sub_df.to_csv('poisonous_mushroom_data/sub4.csv', index=None)