Notebook này chạy trên local.

# Binary Prediction of Poisonous Mushrooms

[Link_to_competition](https://www.kaggle.com/competitions/playground-series-s4e8)

In [1]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = '.'

In [2]:
!kaggle competitions download -c playground-series-s4e8

playground-series-s4e8.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
from zipfile import ZipFile
with ZipFile('playground-series-s4e8.zip') as f:
    f.extractall('poisonous_mushroom_data')

In [4]:
data_dir = 'poisonous_mushroom_data/'

In [5]:
import pandas as pd

In [6]:
raw_df = pd.read_csv(data_dir + 'train.csv')
test_df = pd.read_csv(data_dir + 'test.csv')
sub_df = pd.read_csv(data_dir + 'sample_submission.csv')

In [7]:
raw_df

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.80,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,3116940,e,9.29,f,,n,t,,,w,...,b,,w,u,w,t,g,,d,u
3116941,3116941,e,10.88,s,,w,t,d,c,p,...,,,w,,,f,f,,d,u
3116942,3116942,p,7.82,x,e,e,f,a,,w,...,,,y,,w,t,z,,d,a
3116943,3116943,e,9.45,p,i,n,t,e,,p,...,,y,w,,,t,p,,d,u


In [8]:
raw_df.dtypes

id                        int64
class                    object
cap-diameter            float64
cap-shape                object
cap-surface              object
cap-color                object
does-bruise-or-bleed     object
gill-attachment          object
gill-spacing             object
gill-color               object
stem-height             float64
stem-width              float64
stem-root                object
stem-surface             object
stem-color               object
veil-type                object
veil-color               object
has-ring                 object
ring-type                object
spore-print-color        object
habitat                  object
season                   object
dtype: object

In [9]:
raw_df.isna().sum()

id                            0
class                         0
cap-diameter                  4
cap-shape                    40
cap-surface              671023
cap-color                    12
does-bruise-or-bleed          8
gill-attachment          523936
gill-spacing            1258435
gill-color                   57
stem-height                   0
stem-width                    0
stem-root               2757023
stem-surface            1980861
stem-color                   38
veil-type               2957493
veil-color              2740947
has-ring                     24
ring-type                128880
spore-print-color       2849682
habitat                      45
season                        0
dtype: int64

In [10]:
numeric_cols = ['cap-diameter', 'stem-height', 'stem-width']
categorical_cols = raw_df.select_dtypes(include='object').columns.tolist()

In [11]:
categorical_cols.remove('class')

In [12]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [13]:
categorical_cols

['cap-shape',
 'cap-surface',
 'cap-color',
 'does-bruise-or-bleed',
 'gill-attachment',
 'gill-spacing',
 'gill-color',
 'stem-root',
 'stem-surface',
 'stem-color',
 'veil-type',
 'veil-color',
 'has-ring',
 'ring-type',
 'spore-print-color',
 'habitat',
 'season']

In [14]:
raw_df['class'].value_counts()

class
p    1705396
e    1411549
Name: count, dtype: int64

In [15]:
def view_categoricals_feature(cat_col: str):
    d = dict()    
    total = raw_df[cat_col].value_counts()
    idx = raw_df['class'].value_counts().index.tolist()
    for t in idx:
        x = raw_df[raw_df['class']==t][cat_col].value_counts()
        d[t] = x/total*100
    
    return pd.DataFrame(data=d, columns=idx)

## Exploring and preprocessing

### Cap

In [16]:
raw_df['cap-shape'].value_counts().head(15)

cap-shape
x    1436026
f     676238
s     365146
b     318646
o     108835
p     106967
c     104520
d         65
e         60
n         41
t         36
w         36
g         34
y         33
r         32
Name: count, dtype: int64

In [17]:
cap_shape_lst = ['x','f','s','b','o','p','c']
raw_df['cap-shape'] = raw_df['cap-shape'].apply(lambda x: x if x in cap_shape_lst else 'x')

In [18]:
view_categoricals_feature('cap-shape')

Unnamed: 0_level_0,p,e
cap-shape,Unnamed: 1_level_1,Unnamed: 2_level_1
b,77.303654,22.696346
c,54.27765,45.72235
f,51.194402,48.805598
o,67.481049,32.518951
p,45.940337,54.059663
s,54.945693,45.054307
x,51.018486,48.981514


In [19]:
test_df['cap-shape'] = test_df['cap-shape'].apply(lambda x: x if x in cap_shape_lst else 'x')

In [20]:
cap_surface_lst = raw_df['cap-surface'].value_counts().head(11).index

In [21]:
raw_df['cap-surface'] = raw_df['cap-surface'].apply(lambda x: x if x in cap_surface_lst else 'rem')

In [22]:
view_categoricals_feature('cap-surface')

Unnamed: 0_level_0,p,e
cap-surface,Unnamed: 1_level_1,Unnamed: 2_level_1
d,58.788292,41.211708
e,47.843157,52.156843
g,59.718499,40.281501
h,47.012937,52.987063
i,82.191467,17.808533
k,90.155577,9.844423
l,42.881027,57.118973
rem,51.39222,48.60778
s,36.934307,63.065693
t,64.239535,35.760465


In [23]:
test_df['cap-surface'] = test_df['cap-surface'].apply(lambda x: x if x in cap_surface_lst else 'rem')

In [24]:
cap_color_lst = raw_df['cap-color'].value_counts().head(12).index
cap_color_lst

Index(['n', 'y', 'w', 'g', 'e', 'o', 'p', 'r', 'u', 'b', 'k', 'l'], dtype='object', name='cap-color')

In [25]:
raw_df['cap-color'] = raw_df['cap-color'].apply(lambda x: x if x in cap_color_lst else 'n')

In [26]:
view_categoricals_feature('cap-color')

Unnamed: 0_level_0,p,e
cap-color,Unnamed: 1_level_1,Unnamed: 2_level_1
b,12.604179,87.395821
e,79.324852,20.675148
g,41.191984,58.808016
k,59.848384,40.151616
l,48.928156,51.071844
n,49.09255,50.90745
o,71.726112,28.273888
p,68.993227,31.006773
r,90.693543,9.306457
u,53.931832,46.068168


In [27]:
test_df['cap-color'] = test_df['cap-color'].apply(lambda x: x if x in cap_color_lst else 'n')

### Bruise

In [28]:
raw_df['does-bruise-or-bleed'].value_counts(dropna=False)

does-bruise-or-bleed
f           2569743
t            547085
w                14
c                11
h                 9
NaN               8
y                 7
a                 7
b                 7
x                 7
s                 6
k                 6
p                 4
e                 4
l                 4
d                 4
g                 3
o                 3
z                 3
n                 2
i                 2
has-ring          1
3.43              1
r                 1
4.42              1
2.9               1
u                 1
Name: count, dtype: int64

In [29]:
bruise_lst = ['f','t']
raw_df['does-bruise-or-bleed'] = raw_df['does-bruise-or-bleed'].apply(lambda x: x if x in bruise_lst else 'f')

In [30]:
view_categoricals_feature('does-bruise-or-bleed')

Unnamed: 0_level_0,p,e
does-bruise-or-bleed,Unnamed: 1_level_1,Unnamed: 2_level_1
f,55.576179,44.423821
t,50.662329,49.337671


In [31]:
test_df['does-bruise-or-bleed'] = test_df['does-bruise-or-bleed'].apply(lambda x: x if x in bruise_lst else 'f')

### Gill

In [32]:
raw_df['gill-attachment'].value_counts(dropna=False).head(10)

gill-attachment
a      646034
d      589236
NaN    523936
x      360878
e      301858
s      295439
p      279110
f      119953
c          74
u          56
Name: count, dtype: int64

In [33]:
gill_attachment_lst = raw_df['gill-attachment'].value_counts().head(7).index
raw_df['gill-attachment'] = raw_df['gill-attachment'].apply(lambda x: x if x in gill_attachment_lst else 'rem')

In [34]:
view_categoricals_feature('gill-attachment')

Unnamed: 0_level_0,p,e
gill-attachment,Unnamed: 1_level_1,Unnamed: 2_level_1
a,64.634369,35.365631
d,57.137717,42.862283
e,34.896541,65.103459
f,51.045826,48.954174
p,29.073484,70.926516
rem,63.891182,36.108818
s,58.915715,41.084285
x,53.845067,46.154933


In [35]:
test_df['gill-attachment'] = test_df['gill-attachment'].apply(lambda x: x if x in gill_attachment_lst else 'rem')

In [36]:
raw_df['gill-spacing'].value_counts(dropna=False).head(10)

gill-spacing
c      1331054
NaN    1258435
d       407932
f       119380
e           24
a           17
s           16
b           12
x            8
t            8
Name: count, dtype: int64

In [37]:
gill_spacing_lst = ['c','d','f']
raw_df['gill-spacing'] = raw_df['gill-spacing'].apply(lambda x: x if x in gill_spacing_lst else 'rem')

In [38]:
view_categoricals_feature('gill-spacing')

Unnamed: 0_level_0,p,e
gill-spacing,Unnamed: 1_level_1,Unnamed: 2_level_1
c,57.24456,42.75544
rem,57.083981,42.916019
d,40.224596,59.775404
f,51.016921,48.983079


In [39]:
test_df['gill-spacing'] = test_df['gill-spacing'].apply(lambda x: x if x in gill_spacing_lst else 'rem')

In [40]:
raw_df['gill-color'].value_counts(dropna=False)

gill-color
w         931538
n         543386
y         469464
p         343626
g         212164
           ...  
does w         1
4              1
18.12          1
0.92           1
8.37           1
Name: count, Length: 64, dtype: int64

In [41]:
gill_color_lst = raw_df['gill-color'].value_counts(dropna=False).head(12).index

In [42]:
raw_df['gill-color'] = raw_df['gill-color'].apply(lambda x: x if x in gill_color_lst else 'w')

In [43]:
view_categoricals_feature('gill-color')

Unnamed: 0_level_0,p,e
gill-color,Unnamed: 1_level_1,Unnamed: 2_level_1
b,28.171101,71.828899
e,69.855657,30.144343
f,50.987518,49.012482
g,48.099583,51.900417
k,57.737751,42.262249
n,71.487856,28.512144
o,51.737855,48.262145
p,59.020854,40.979146
r,71.391264,28.608736
u,60.243618,39.756382


In [44]:
test_df['gill-color'] = test_df['gill-color'].apply(lambda x: x if x in gill_color_lst else 'w')

### Stem

In [45]:
raw_df['stem-root'].value_counts(dropna=False).head(15)

stem-root
NaN    2757023
b       165801
s       116946
r        47803
c        28592
f          597
d           24
y           14
w           12
p           12
g           12
k           11
l           10
n           10
t           10
Name: count, dtype: int64

In [46]:
stem_root_lst = ['b','s','r','c']
raw_df['stem-root'] = raw_df['stem-root'].apply(lambda x: x if x in stem_root_lst else 'rem')

In [47]:
view_categoricals_feature('stem-root')

Unnamed: 0_level_0,p,e
stem-root,Unnamed: 1_level_1,Unnamed: 2_level_1
b,35.187363,64.812637
c,99.594292,0.405708
r,99.523042,0.476958
rem,54.003096,45.996904
s,69.865579,30.134421


In [48]:
test_df['stem-root'] = test_df['stem-root'].apply(lambda x: x if x in stem_root_lst else 'rem')

In [49]:
raw_df['stem-surface'].value_counts(dropna=False).head(10)

stem-surface
NaN    1980861
s       327610
y       255500
i       224346
t       147974
g        78080
k        73383
h        28283
f          512
w           49
Name: count, dtype: int64

In [50]:
stem_surface_lst = raw_df['stem-surface'].value_counts().head(7).index
raw_df['stem-surface'] = raw_df['stem-surface'].apply(lambda x: x if x in stem_surface_lst else 'rem')

In [51]:
view_categoricals_feature('stem-surface')

Unnamed: 0_level_0,p,e
stem-surface,Unnamed: 1_level_1,Unnamed: 2_level_1
g,99.610656,0.389344
h,99.635824,0.364176
i,63.854938,36.145062
k,69.795457,30.204543
rem,51.280346,48.719654
s,39.550685,60.449315
t,54.125725,45.874275
y,70.07593,29.92407


In [52]:
test_df['stem-surface'] = test_df['stem-surface'].apply(lambda x: x if x in stem_surface_lst else 'rem')

In [53]:
raw_df['stem-color'].value_counts().head(11)

stem-color
w    1196637
n    1003464
y     373971
g     132019
o     111541
e     103373
u      67017
p      54690
k      33676
r      22329
l       9994
Name: count, dtype: int64

In [54]:
stem_color_lst = raw_df['stem-color'].value_counts().head(10).index
raw_df['stem-color'] = raw_df['stem-color'].apply(lambda x: x if x in stem_color_lst else 'rem')

In [55]:
view_categoricals_feature('stem-color')

Unnamed: 0_level_0,p,e
stem-color,Unnamed: 1_level_1,Unnamed: 2_level_1
e,70.129531,29.870469
g,39.142093,60.857907
k,78.679178,21.320822
n,61.092775,38.907225
o,59.665952,40.334048
p,88.635948,11.364052
r,85.534507,14.465493
rem,34.781655,65.218345
u,62.545324,37.454676
w,41.771398,58.228602


In [56]:
test_df['stem-color'] = test_df['stem-color'].apply(lambda x: x if x in stem_color_lst else 'rem')

### Veil

In [57]:
raw_df['veil-type'].value_counts(dropna=False).head(10)

veil-type
NaN    2957493
u       159373
w           11
a            9
e            8
f            8
b            5
c            5
g            4
y            4
Name: count, dtype: int64

In [58]:
raw_df['veil-type'] = raw_df['veil-type'].apply(lambda x: x if x=='u' else 'rem')

In [59]:
view_categoricals_feature('veil-type')

Unnamed: 0_level_0,p,e
veil-type,Unnamed: 1_level_1,Unnamed: 2_level_1
rem,54.129333,45.870667
u,65.558156,34.441844


In [60]:
test_df['veil-type'] = test_df['veil-type'].apply(lambda x: x if x=='u' else 'rem')

In [61]:
raw_df['veil-color'].value_counts(dropna=False).head(10)

veil-color
NaN    2740947
w       279070
y        30473
n        30039
u        14026
k        13080
e         9169
g           30
p           23
r           14
Name: count, dtype: int64

In [62]:
veil_color_lst = ['w','y','n','u','k','e']
raw_df['veil-color'] = raw_df['veil-color'].apply(lambda x: x if x in veil_color_lst else 'rem')

In [63]:
view_categoricals_feature('veil-color')

Unnamed: 0_level_0,p,e
veil-color,Unnamed: 1_level_1,Unnamed: 2_level_1
e,99.901843,0.098157
k,99.915902,0.084098
n,99.627151,0.372849
rem,54.667818,45.332182
u,99.408242,0.591758
w,50.39775,49.60225
y,0.521773,99.478227


In [64]:
test_df['veil-color'] = test_df['veil-color'].apply(lambda x: x if x in veil_color_lst else 'rem')

### Ring

In [65]:
raw_df['has-ring'].value_counts(dropna=False).head(10)

has-ring
f      2368820
t       747982
NaN         24
r           16
h           13
c           11
l           11
s           11
p           11
g            8
Name: count, dtype: int64

In [66]:
has_ring_lst = ['f','t']
raw_df['has-ring'] = raw_df['has-ring'].apply(lambda x: x if x in has_ring_lst else 'f')

In [67]:
view_categoricals_feature('has-ring')

Unnamed: 0_level_0,p,e
has-ring,Unnamed: 1_level_1,Unnamed: 2_level_1
f,53.325485,46.674515
t,59.11038,40.88962


In [68]:
test_df['has-ring'] = test_df['has-ring'].apply(lambda x: x if x in has_ring_lst else 'f')

In [69]:
raw_df['ring-type'].value_counts(dropna=False).head(10)

ring-type
f      2477170
NaN     128880
e       120006
z       113780
l        73443
r        67909
p        67678
g        63687
m         3992
t           98
Name: count, dtype: int64

In [70]:
ring_type_lst = ['f','e','z','l','r','p','g']
raw_df['ring-type'] = raw_df['ring-type'].apply(lambda x: x if x in ring_type_lst else 'rem')

In [71]:
view_categoricals_feature('ring-type')

Unnamed: 0_level_0,p,e
ring-type,Unnamed: 1_level_1,Unnamed: 2_level_1
e,57.940436,42.059564
f,54.673842,45.326158
g,42.146749,57.853251
l,38.796073,61.203927
p,51.414049,48.585951
r,39.482248,60.517752
rem,38.428177,61.571823
z,99.615925,0.384075


In [72]:
test_df['ring-type'] = test_df['ring-type'].apply(lambda x: x if x in ring_type_lst else 'rem')

### Other

In [73]:
raw_df['spore-print-color'].value_counts(dropna=False).head(10)

spore-print-color
NaN    2849682
k       107310
p        68237
w        50173
n        22646
r         7975
u         7256
g         3492
y           36
s           21
Name: count, dtype: int64

In [74]:
spore_lst = ['k','p','w','n','r','u','g']
raw_df['spore-print-color'] = raw_df['spore-print-color'].apply(lambda x: x if x in spore_lst else 'rem')

In [75]:
view_categoricals_feature('spore-print-color')

Unnamed: 0_level_0,p,e
spore-print-color,Unnamed: 1_level_1,Unnamed: 2_level_1
g,2.835052,97.164948
k,82.536576,17.463424
n,99.726221,0.273779
p,71.23115,28.76885
r,99.724138,0.275862
rem,52.905971,47.094029
u,99.696803,0.303197
w,45.056106,54.943894


In [76]:
test_df['spore-print-color'] = test_df['spore-print-color'].apply(lambda x: x if x in spore_lst else 'rem')

In [77]:
raw_df['habitat'].value_counts(dropna=False).head(10)

habitat
d    2177573
g     454908
l     171892
m     150969
h     120137
w      18530
p      17180
u       5264
e         55
s         52
Name: count, dtype: int64

In [78]:
habitat_lst = ['d','g','l','m','h']
raw_df['habitat'] = raw_df['habitat'].apply(lambda x: x if x in habitat_lst else 'd')

In [79]:
view_categoricals_feature('habitat')

Unnamed: 0_level_0,p,e
habitat,Unnamed: 1_level_1,Unnamed: 2_level_1
d,52.683977,47.316023
g,67.460234,32.539766
h,66.914439,33.085561
l,39.492239,60.507761
m,53.761368,46.238632


In [80]:
test_df['habitat'] = test_df['habitat'].apply(lambda x: x if x in habitat_lst else 'd')

In [81]:
raw_df['season'].value_counts(dropna=False)

season
a    1543321
u    1153588
w     278189
s     141847
Name: count, dtype: int64

In [82]:
view_categoricals_feature('season')

Unnamed: 0_level_0,p,e
season,Unnamed: 1_level_1,Unnamed: 2_level_1
a,57.158621,42.841379
u,58.246098,41.753902
w,34.600937,65.399063
s,38.829866,61.170134


In [83]:
len(categorical_cols)

17

### Encode

In [84]:
from sklearn.preprocessing import LabelEncoder

In [85]:
for cat_col in categorical_cols:
    encoder = LabelEncoder()
    encoder.fit(raw_df[cat_col])
    raw_df[cat_col] = encoder.transform(raw_df[cat_col])
    test_df[cat_col] = encoder.transform(test_df[cat_col])

### Numeric cols

In [86]:
numeric_cols = numeric_cols + categorical_cols
len(numeric_cols)

20

In [87]:
raw_df[numeric_cols].isna().sum()

cap-diameter            4
stem-height             0
stem-width              0
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-spacing            0
gill-color              0
stem-root               0
stem-surface            0
stem-color              0
veil-type               0
veil-color              0
has-ring                0
ring-type               0
spore-print-color       0
habitat                 0
season                  0
dtype: int64

In [88]:
avg_cap_diameter = raw_df['cap-diameter'].mean()
avg_cap_diameter

6.309848357732786

In [89]:
raw_df = raw_df.fillna({'cap-diameter': avg_cap_diameter})

In [90]:
test_df = test_df.fillna({'cap-diameter': avg_cap_diameter})

In [91]:
avg_stem_height = raw_df['stem-height'].mean()
test_df = test_df.fillna({'stem-height': avg_stem_height})

In [92]:
raw_df[numeric_cols].describe()

Unnamed: 0,cap-diameter,stem-height,stem-width,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
count,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0
mean,6.309848,6.348333,11.15379,4.060597,6.256271,6.032857,0.1755196,3.0768,1.418834,7.348291,2.844256,4.15105,6.171596,0.05113116,3.186041,0.2399728,1.614152,4.823069,0.5822156,1.053464
std,4.657928,2.699755,8.095477,2.190271,3.298277,3.069461,0.380411,2.486081,1.380239,3.100741,0.7375861,1.343073,3.304713,0.2202652,0.684769,0.4270666,1.644103,0.8773727,1.11015,1.104258
min,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.32,4.67,4.97,2.0,3.0,5.0,0.0,1.0,0.0,5.0,3.0,4.0,3.0,0.0,3.0,0.0,1.0,5.0,0.0,0.0
50%,5.75,5.88,9.65,5.0,7.0,5.0,0.0,3.0,1.0,7.0,3.0,4.0,9.0,0.0,3.0,0.0,1.0,5.0,0.0,1.0
75%,8.24,7.41,15.63,6.0,9.0,9.0,0.0,5.0,3.0,10.0,3.0,4.0,9.0,0.0,3.0,0.0,1.0,5.0,1.0,2.0
max,80.67,88.72,102.9,6.0,11.0,11.0,1.0,7.0,3.0,11.0,4.0,7.0,10.0,1.0,6.0,1.0,7.0,7.0,4.0,3.0


In [93]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(raw_df[numeric_cols])

In [94]:
raw_df[numeric_cols] = scaler.transform(raw_df[numeric_cols])
test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])

In [95]:
raw_df[numeric_cols].describe()

Unnamed: 0,cap-diameter,stem-height,stem-width,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
count,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0,3116945.0
mean,0.0778751,0.0715547,0.1083944,0.6767661,0.5687519,0.5484415,0.1755196,0.4395428,0.4729448,0.6680265,0.7110641,0.5930071,0.6171596,0.05113116,0.5310068,0.2399728,0.2305931,0.6890098,0.1455539,0.3511547
std,0.057762,0.03043006,0.07867325,0.3650452,0.2998433,0.2790419,0.380411,0.3551544,0.4600798,0.2818855,0.1843965,0.1918676,0.3304713,0.2202652,0.1141282,0.4270666,0.2348719,0.125339,0.2775374,0.3680859
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.04079861,0.05263751,0.04829932,0.3333333,0.2727273,0.4545455,0.0,0.1428571,0.0,0.4545455,0.75,0.5714286,0.3,0.0,0.5,0.0,0.1428571,0.7142857,0.0,0.0
50%,0.07093254,0.06627592,0.09378037,0.8333333,0.6363636,0.4545455,0.0,0.4285714,0.3333333,0.6363636,0.75,0.5714286,0.9,0.0,0.5,0.0,0.1428571,0.7142857,0.0,0.3333333
75%,0.1018105,0.08352119,0.151895,1.0,0.8181818,0.8181818,0.0,0.7142857,1.0,0.9090909,0.75,0.5714286,0.9,0.0,0.5,0.0,0.1428571,0.7142857,0.25,0.6666667
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [96]:
test_df[numeric_cols].describe()

Unnamed: 0,cap-diameter,stem-height,stem-width,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
count,2077964.0,2077964.0,2077964.0,2077964.0,2077964.0,2077964.0,2077964.0,2077964.0,2077964.0,2077964.0,2077964.0,2077964.0,2077964.0,2077964.0,2077964.0,2077964.0,2077964.0,2077964.0,2077964.0,2077964.0
mean,0.07782977,0.07153414,0.1083418,0.6768382,0.5686309,0.548998,0.1752807,0.4395138,0.4731508,0.6682512,0.7111223,0.5929752,0.6173561,0.05119097,0.5311614,0.2405042,0.2308218,0.6890412,0.1459084,0.3511604
std,0.05810335,0.03042129,0.07871896,0.3649727,0.2998027,0.2793189,0.3802071,0.355162,0.4601191,0.2818084,0.1844336,0.1915896,0.3304735,0.2203871,0.1144538,0.4273898,0.2351206,0.1253816,0.2778578,0.3681928
min,-0.0003720238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0406746,0.05263751,0.04829932,0.3333333,0.2727273,0.4545455,0.0,0.1428571,0.0,0.4545455,0.75,0.5714286,0.3,0.0,0.5,0.0,0.1428571,0.7142857,0.0,0.0
50%,0.07080853,0.06627592,0.09368319,0.8333333,0.6363636,0.4545455,0.0,0.4285714,0.3333333,0.6363636,0.75,0.5714286,0.9,0.0,0.5,0.0,0.1428571,0.7142857,0.0,0.3333333
75%,0.1016865,0.08352119,0.1517979,1.0,0.8181818,0.8181818,0.0,0.7142857,1.0,0.9090909,0.75,0.5714286,0.9,0.0,0.5,0.0,0.1428571,0.7142857,0.25,0.6666667
max,7.52691,0.6457394,1.000097,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [97]:
input_cols = numeric_cols
target_col = 'class'

In [103]:
inputs = raw_df[input_cols]
targets = raw_df[target_col].apply(lambda x: 1 if x=='p' else 0)
test_inputs = test_df[input_cols]

In [99]:
from sklearn.model_selection import train_test_split

In [104]:
train_inputs, val_inputs, train_targets, val_targets = train_test_split(inputs, targets,
                                                                        test_size=0.25, random_state=42)

## Model

### Random Forest

In [102]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [114]:
sample_df = raw_df[input_cols+[target_col]].sample(600000)

In [115]:
sample_train_inputs, sample_val_inputs, sample_train_targets, sample_val_targets = train_test_split(
    sample_df[input_cols],
    sample_df[target_col].apply(lambda x: 1 if x=='p' else 0),
    test_size=0.2,
    random_state=42
)

In [116]:
classifiers = {
    'RF': RandomForestClassifier(n_jobs=-1),
    'XGB': XGBClassifier(n_jobs=-1),
    'LGBM': LGBMClassifier(n_jobs=-1)
}
result = {'Classifier': [], 'Accuracy': []}

In [117]:
for clf_name, clf in classifiers.items():
    clf.fit(sample_train_inputs, sample_train_targets)
    acc = clf.score(sample_val_inputs, sample_val_targets)
    result['Classifier'].append(clf_name)
    result['Accuracy'].append(acc)

result_df = pd.DataFrame(result)
result_df

Unnamed: 0,Classifier,Accuracy
0,RF,0.992108
1,XGB,0.991558
2,LGBM,0.989417


In [118]:
def test_params(**params):
    model = RandomForestClassifier(**params)
    model.fit(sample_train_inputs, sample_train_targets)
    train_acc = model.score(sample_train_inputs, sample_train_targets)
    val_acc = model.score(sample_val_inputs, sample_val_targets)
    return train_acc, val_acc

In [119]:
for i in [70, 90, 110, 150, 200, 250, 300, 400]:
    train_acc, val_acc = test_params(n_jobs=-1, n_estimators=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 70: train_acc = 0.999942, val_acc = 0.991992
Test 90: train_acc = 0.999979, val_acc = 0.992142
Test 110: train_acc = 0.999988, val_acc = 0.992292
Test 150: train_acc = 0.999998, val_acc = 0.992192
Test 200: train_acc = 1.000000, val_acc = 0.992133
Test 250: train_acc = 1.000000, val_acc = 0.992200
Test 300: train_acc = 1.000000, val_acc = 0.992142
Test 400: train_acc = 1.000000, val_acc = 0.992192


In [121]:
for i in range(33, 42):
    train_acc, val_acc = test_params(n_jobs=-1, n_estimators=110,
                                     max_depth=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 33: train_acc = 0.999471, val_acc = 0.992117
Test 34: train_acc = 0.999640, val_acc = 0.992142
Test 35: train_acc = 0.999708, val_acc = 0.992217
Test 36: train_acc = 0.999804, val_acc = 0.992158
Test 37: train_acc = 0.999875, val_acc = 0.992100
Test 38: train_acc = 0.999952, val_acc = 0.992125
Test 39: train_acc = 0.999956, val_acc = 0.992175
Test 40: train_acc = 0.999967, val_acc = 0.992142
Test 41: train_acc = 0.999979, val_acc = 0.992150


In [122]:
for i in ["gini", "entropy", "log_loss"]:
    train_acc, val_acc = test_params(n_jobs=-1, n_estimators=110,
                                     max_depth=35, criterion=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test gini: train_acc = 0.999692, val_acc = 0.992175
Test entropy: train_acc = 0.999865, val_acc = 0.992225
Test log_loss: train_acc = 0.999823, val_acc = 0.992242


In [123]:
for i in ["sqrt", "log2", None, 6, 9, 13, 17]:
    train_acc, val_acc = test_params(n_jobs=-1, n_estimators=110,
                                     max_depth=35, criterion='log_loss',
                                     max_features=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test sqrt: train_acc = 0.999867, val_acc = 0.992200
Test log2: train_acc = 0.999838, val_acc = 0.992108
Test None: train_acc = 0.999921, val_acc = 0.990567
Test 6: train_acc = 0.999879, val_acc = 0.991908
Test 9: train_acc = 0.999912, val_acc = 0.991825
Test 13: train_acc = 0.999898, val_acc = 0.991567
Test 17: train_acc = 0.999929, val_acc = 0.991117


In [124]:
for i in [None, 0, 17, 42, 123, 1212]:
    train_acc, val_acc = test_params(n_jobs=-1, n_estimators=110,
                                     max_depth=35, criterion='log_loss',
                                     random_state=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test None: train_acc = 0.999892, val_acc = 0.992192
Test 0: train_acc = 0.999865, val_acc = 0.992183
Test 17: train_acc = 0.999862, val_acc = 0.992258
Test 42: train_acc = 0.999858, val_acc = 0.992167
Test 123: train_acc = 0.999842, val_acc = 0.992167
Test 1212: train_acc = 0.999902, val_acc = 0.992275


In [127]:
for i in ['balanced', 'balanced_subsample', None, {0:1,1:1}, {0:1,1:1.2}, {0:1,1:1.5}, {0:1,1:2}]:
    train_acc, val_acc = test_params(n_jobs=-1, n_estimators=110,
                                     max_depth=35, criterion='log_loss',
                                     random_state=17, class_weight=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test balanced: train_acc = 0.999858, val_acc = 0.992225
Test balanced_subsample: train_acc = 0.999825, val_acc = 0.992108
Test None: train_acc = 0.999862, val_acc = 0.992258
Test {0: 1, 1: 1}: train_acc = 0.999862, val_acc = 0.992258
Test {0: 1, 1: 1.2}: train_acc = 0.999875, val_acc = 0.992167
Test {0: 1, 1: 1.5}: train_acc = 0.999854, val_acc = 0.992133
Test {0: 1, 1: 2}: train_acc = 0.999892, val_acc = 0.992167


In [128]:
rf_model = RandomForestClassifier(n_jobs=-1, random_state=17,
                                  n_estimators=110, max_depth=35,
                                  criterion='log_loss')

In [129]:
rf_model

In [130]:
%%time

rf_model.fit(sample_train_inputs, sample_train_targets)

CPU times: total: 4min 30s
Wall time: 42.7 s


In [131]:
rf_model.score(sample_val_inputs, sample_val_targets)

0.9922583333333334

In [132]:
rf_model.score(val_inputs, val_targets)

0.9929687630335828

In [133]:
preds = rf_model.predict(test_inputs)

In [137]:
preds = pd.Series(preds)
preds = preds.apply(lambda x: 'p' if x==1 else 'e')

In [138]:
sub_df['class'] = preds

In [139]:
sub_df.to_csv('poisonous_mushroom_data/sub1.csv', index=None)

In [140]:
%%time
rf_model.fit(train_inputs, train_targets)

CPU times: total: 36min 26s
Wall time: 6min 12s


In [141]:
rf_model.score(val_inputs, val_targets)

0.9921346137311242

In [142]:
preds = rf_model.predict(test_inputs)

In [143]:
preds = pd.Series(preds)
preds = preds.apply(lambda x: 'p' if x==1 else 'e')

In [144]:
sub_df['class'] = preds
sub_df.to_csv('poisonous_mushroom_data/sub2.csv', index=None)

### XGBoost

In [145]:
from xgboost import XGBClassifier

In [148]:
def test_params(**params):
    model = XGBClassifier(**params)
    model.fit(sample_train_inputs, sample_train_targets)
    train_acc = model.score(sample_train_inputs, sample_train_targets)
    val_acc = model.score(sample_val_inputs, sample_val_targets)
    return train_acc, val_acc

In [149]:
test_params(n_jobs=-1)

(0.9918020833333333, 0.9915583333333333)

In [150]:
for i in [70,90, 120, 150, 250, 350]:
    train_acc, val_acc = test_params(n_jobs=-1, n_estimators=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 70: train_acc = 0.990708, val_acc = 0.990725
Test 90: train_acc = 0.991602, val_acc = 0.991325
Test 120: train_acc = 0.992154, val_acc = 0.991658
Test 150: train_acc = 0.992542, val_acc = 0.991850
Test 250: train_acc = 0.993381, val_acc = 0.991917
Test 350: train_acc = 0.993960, val_acc = 0.991808


In [152]:
for i in range(4,9):
    train_acc, val_acc = test_params(n_jobs=-1, n_estimators=250,
                                     max_depth=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 4: train_acc = 0.991325, val_acc = 0.991300
Test 5: train_acc = 0.992492, val_acc = 0.991633
Test 6: train_acc = 0.993381, val_acc = 0.991917
Test 7: train_acc = 0.994204, val_acc = 0.991792
Test 8: train_acc = 0.995085, val_acc = 0.991717


In [155]:
for i in [0.005,0.01,0.05,0.1,0.2,0.3,0.4]:
    train_acc, val_acc = test_params(n_jobs=-1, n_estimators=250,
                                     learning_rate=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 0.005: train_acc = 0.919094, val_acc = 0.919092
Test 0.01: train_acc = 0.961629, val_acc = 0.961417
Test 0.05: train_acc = 0.987758, val_acc = 0.988058
Test 0.1: train_acc = 0.991242, val_acc = 0.990933
Test 0.2: train_acc = 0.992738, val_acc = 0.991975
Test 0.3: train_acc = 0.993381, val_acc = 0.991917
Test 0.4: train_acc = 0.993781, val_acc = 0.991800


In [158]:
for i in [0, 0.1, 0.2, 0.4, 0.7, 1, 1.2, 1.5, 2]:
    train_acc, val_acc = test_params(n_jobs=-1, n_estimators=250,
                                     learning_rate=0.2, reg_alpha=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 0: train_acc = 0.992738, val_acc = 0.991975
Test 0.1: train_acc = 0.992706, val_acc = 0.991758
Test 0.2: train_acc = 0.992731, val_acc = 0.991750
Test 0.4: train_acc = 0.992663, val_acc = 0.991967
Test 0.7: train_acc = 0.992715, val_acc = 0.991925
Test 1: train_acc = 0.992752, val_acc = 0.992033
Test 1.2: train_acc = 0.992767, val_acc = 0.992058
Test 1.5: train_acc = 0.992687, val_acc = 0.991817
Test 2: train_acc = 0.992644, val_acc = 0.991825


In [159]:
for i in [0, 0.1, 0.2, 0.4, 0.7, 1, 1.1, 1.2, 1.5, 2]:
    train_acc, val_acc = test_params(n_jobs=-1, n_estimators=250,
                                     learning_rate=0.2, reg_alpha=1.2,
                                     reg_lambda=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 0: train_acc = 0.992827, val_acc = 0.991892
Test 0.1: train_acc = 0.992750, val_acc = 0.991908
Test 0.2: train_acc = 0.992848, val_acc = 0.991783
Test 0.4: train_acc = 0.992783, val_acc = 0.991933
Test 0.7: train_acc = 0.992673, val_acc = 0.991842
Test 1: train_acc = 0.992767, val_acc = 0.992058
Test 1.1: train_acc = 0.992708, val_acc = 0.991842
Test 1.2: train_acc = 0.992610, val_acc = 0.991742
Test 1.5: train_acc = 0.992650, val_acc = 0.992000
Test 2: train_acc = 0.992687, val_acc = 0.991933


In [162]:
xgb_model = XGBClassifier(n_jobs=-1, n_estimators=250,
                          learning_rate=0.2, reg_alpha=1.2)

In [163]:
%%time
xgb_model.fit(sample_train_inputs, sample_train_targets)

CPU times: total: 30 s
Wall time: 6.8 s


In [164]:
xgb_model.score(sample_val_inputs, sample_val_targets)

0.9920583333333334

In [165]:
xgb_model.score(val_inputs, val_targets)

0.9916944395607498

In [166]:
preds = xgb_model.predict(test_inputs)

In [167]:
preds = pd.Series(preds)
preds = preds.apply(lambda x: 'p' if x==1 else 'e')
preds

0          e
1          p
2          p
3          p
4          e
          ..
2077959    p
2077960    p
2077961    p
2077962    e
2077963    e
Length: 2077964, dtype: object

In [168]:
sub_df['class'] = preds
sub_df.to_csv('poisonous_mushroom_data/sub3.csv', index=None)

In [169]:
%%time
xgb_model.fit(train_inputs, train_targets)

CPU times: total: 3min 51s
Wall time: 52.5 s


In [170]:
xgb_model.score(val_inputs, val_targets)

0.9918150703829516

In [171]:
preds = xgb_model.predict(test_inputs)
preds = pd.Series(preds)
preds = preds.apply(lambda x: 'p' if x==1 else 'e')
preds

0          e
1          p
2          p
3          p
4          e
          ..
2077959    p
2077960    p
2077961    p
2077962    e
2077963    e
Length: 2077964, dtype: object

In [172]:
sub_df['class'] = preds
sub_df.to_csv('poisonous_mushroom_data/sub4.csv', index=None)