In [1]:
import pandas as pd
import numpy as np
import category_encoders as ce


In [2]:
df = pd.read_csv("./Mushroom/raw_mushroom.csv")
df.head()

Unnamed: 0,p,x,s,n,t,p.1,f,c,n.1,k,...,s.2,w,w.1,p.2,w.2,o,p.3,k.1,s.3,u
0,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
1,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
2,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
3,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
4,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g


In [3]:
type_map = {'e':'edible','p':'poisonous'}
cap_shape_map = {'b':'bell','c':'conical','x':'convex','f':'flat','k':'knobbed','s':'sunken'} 
cap_surface_map = {'f':'fibrous','g':'grooves','y':'scaly','s':'smooth'} 
cap_color_map = {'n':'brown','b':'buff','c':'cinnamon','g':'gray','r':'green','p':'pink','u':'purple','e':'red','w':'white','y':'yellow'} 
bruises_map = {'t':'bruises','f':'no'} 
odor_map = {'a':'almond','l':'anise','c':'creosote','y':'fishy','f':'foul','m':'musty','n':'none','p':'pungent','s':'spicy'} 
gill_attachment_map = {'a':'attached','d':'descending','f':'free','n':'notched'} 
gill_spacing_map = {'c':'close','w':'crowded','d':'distant'} 
gill_size_map = {'b':'broad','n':'narrow'} 
gill_color_map = {'k':'black','n':'brown','b':'buff','h':'chocolate','g':'gray','r':'green','o':'orange','p':'pink','u':'purple','e':'red','w':'white','y':'yellow'} 
stalk_shape_map = {'e':'enlarging','t':'tapering'} 
stalk_root_map = {'b':'bulbous','c':'club','u':'cup','e':'equal','z':'rhizomorphs','r':'rooted','missing':'?'}
stalk_surface_above_ring_map = {'f':'fibrous','y':'scaly','k':'silky','s':'smooth'} 
stalk_surface_below_ring_map = {'f':'fibrous','y':'scaly','k':'silky','s':'smooth'} 
stalk_color_above_ring_map = {'n':'brown','b':'buff','c':'cinnamon','g':'gray','o':'orange','p':'pink','e':'red','w':'white','y':'yellow'} 
stalk_color_below_ring_map = {'n':'brown','b':'buff','c':'cinnamon','g':'gray','o':'orange','p':'pink','e':'red','w':'white','y':'yellow'} 
veil_type_map = {'p':'partial','u':'universal'} 
veil_color_map = {'n':'brown','o':'orange','w':'white','y':'yellow'} 
ring_number_map = {'n':'none','o':'one','t':'two'} 
ring_type_map = {'c':'cobwebby','e':'evanescent','f':'flaring','l':'large','n':'none','p':'pendant','s':'sheathing','z':'zone'} 
spore_print_color_map = {'k':'black','n':'brown','b':'buff','h':'chocolate','r':'green','o':'orange','u':'purple','w':'white','y':'yellow'} 
population_map = {'a':'abundant','c':'clustered','n':'numerous','s':'scattered','v':'several','y':'solitary'} 
habitat_map = {'g':'grasses','l':'leaves','m':'meadows','p':'paths','u':'urban','w':'waste','d':'woods'}

features = ['type',
            'cap-shape','cap-surface','cap-color','bruises','odor',
            'gill-attachment', 'gill-spacing','gill-size','gill-color','stalk-shape',
            'stalk-root','stalk-surface-above-ring', 'stalk-surface-below-ring','stalk-color-above-ring','stalk-color-below-ring',
            'veil-type','veil-color','ring-number','ring-type','spore-print-color',
            'population','habitation'
           ]

In [4]:
df.columns = features

df['type']=df['type'].replace(type_map)
df['cap-shape']=df['cap-shape'].replace(cap_shape_map)
df['cap-surface'] = df['cap-surface'].replace(cap_surface_map)
df['cap-color']=df['cap-color'].replace(cap_color_map)
df['bruises'] = df['bruises'].replace(bruises_map)
df['odor']=df['odor'].replace(odor_map)

df['gill-attachment'] = df['gill-attachment'].replace(gill_attachment_map)
df['gill-spacing'] = df['gill-spacing'].replace(gill_spacing_map)
df['gill-size'] = df['gill-size'].replace(gill_size_map)
df['gill-color'] = df['gill-color'].replace(gill_color_map)
df['stalk-shape'] = df['stalk-shape'].replace(stalk_shape_map)

df['stalk-root'] = df['stalk-root'].replace(stalk_root_map)
df['stalk-surface-above-ring'] = df['stalk-surface-above-ring'].replace(stalk_surface_above_ring_map)
df['stalk-surface-below-ring'] = df['stalk-surface-below-ring'].replace(stalk_surface_below_ring_map)
df['stalk-color-above-ring'] = df['stalk-color-above-ring'].replace(stalk_color_above_ring_map)
df['stalk-color-below-ring'] = df['stalk-color-below-ring'].replace(stalk_color_below_ring_map)

df['veil-type'] = df['veil-type'].replace(veil_type_map)
df['veil-color']= df['veil-color'].replace(veil_color_map)
df['ring-number'] =  df['ring-number'].replace(ring_number_map)
df['ring-type'] =  df['ring-type'].replace(ring_type_map)
df['spore-print-color'] =  df['spore-print-color'].replace(spore_print_color_map)

df['population'] =  df['population'].replace(population_map)
df['habitation'] =  df['habitation'].replace(habitat_map)

In [5]:
df.head()

Unnamed: 0,type,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitation
0,edible,convex,smooth,yellow,bruises,almond,free,close,broad,black,...,smooth,white,white,partial,white,one,pendant,brown,numerous,grasses
1,edible,bell,smooth,white,bruises,anise,free,close,broad,brown,...,smooth,white,white,partial,white,one,pendant,brown,numerous,meadows
2,poisonous,convex,scaly,white,bruises,pungent,free,close,narrow,brown,...,smooth,white,white,partial,white,one,pendant,black,scattered,urban
3,edible,convex,smooth,gray,no,none,free,crowded,broad,black,...,smooth,white,white,partial,white,one,evanescent,brown,abundant,grasses
4,edible,convex,scaly,yellow,bruises,almond,free,close,broad,brown,...,smooth,white,white,partial,white,one,pendant,black,numerous,grasses


In [6]:
df_type = pd.DataFrame()
df_type['type'] = df['type']
df_type.insert(0, 'ID', range(len(df)))
df_type = df_type.set_index('ID')
df_type.head()

Unnamed: 0_level_0,type
ID,Unnamed: 1_level_1
0,edible
1,edible
2,poisonous
3,edible
4,edible


In [7]:
df_type.to_csv("./Mushroom/mushroom_labels.csv")

In [8]:
df =  df.drop(['type'],axis=1)
df.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitation
0,convex,smooth,yellow,bruises,almond,free,close,broad,black,enlarging,...,smooth,white,white,partial,white,one,pendant,brown,numerous,grasses
1,bell,smooth,white,bruises,anise,free,close,broad,brown,enlarging,...,smooth,white,white,partial,white,one,pendant,brown,numerous,meadows
2,convex,scaly,white,bruises,pungent,free,close,narrow,brown,enlarging,...,smooth,white,white,partial,white,one,pendant,black,scattered,urban
3,convex,smooth,gray,no,none,free,crowded,broad,black,tapering,...,smooth,white,white,partial,white,one,evanescent,brown,abundant,grasses
4,convex,scaly,yellow,bruises,almond,free,close,broad,brown,enlarging,...,smooth,white,white,partial,white,one,pendant,black,numerous,grasses


In [11]:
df_onehot = pd.get_dummies(df, columns=features[1:], prefix = features[1:])
df_onehot.head()

Unnamed: 0,cap-shape_bell,cap-shape_conical,cap-shape_convex,cap-shape_flat,cap-shape_knobbed,cap-shape_sunken,cap-surface_fibrous,cap-surface_grooves,cap-surface_scaly,cap-surface_smooth,...,population_scattered,population_several,population_solitary,habitation_grasses,habitation_leaves,habitation_meadows,habitation_paths,habitation_urban,habitation_waste,habitation_woods
0,0,0,1,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,0,0,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
3,0,0,1,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0


In [12]:
df_onehot.to_csv("./Mushroom/mushroom_data.csv", index=False)


In [13]:
len(df_onehot)

8123