## Decision Tree Classifier를 이용한 버섯 식용 여부 예측 

원본 데이터 셋 링크: https://www.kaggle.com/datasets/uciml/mushroom-classification/data  

버섯에 대한 메타 데이터가 주어지고, 먹을 수 있는지 없는지 여부를 표시한 데이터가 있습니다. 지금까지 배운 내용들을 활용해서 예측 모델을 만들고, 아래 포맷으로 submission을 만들어서 제출해보세요.

### 요구사항

- 데이터 특성 파악하기
- 데이터 전처리하기 
- decision tree 계열 알고리즘을 사용하여 분류 모델 학습시키고 교차 검증하기
- 테스트 셋에 대하여 인퍼런스 한 뒤, 적어도 한번 결과 제출하기 

In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

Attribute Information: (classes: edible=e, poisonous=p)

- cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
- cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
- cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
- bruises: bruises=t,no=f
- odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s
- gill-attachment: attached=a,descending=d,free=f,notched=n
- gill-spacing: close=c,crowded=w,distant=d
- gill-size: broad=b,narrow=n
- gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y
- stalk-shape: enlarging=e,tapering=t
- stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?
- stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
- stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
- stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
- stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
- veil-type: partial=p,universal=u
- veil-color: brown=n,orange=o,white=w,yellow=y
- ring-number: none=n,one=o,two=t
- ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z
- spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y
- population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y
- habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d

### 데이터 가져오기

In [2]:
# Locations of the raw train / test CSV files (placeholder strings).
train_path = "mushroom train set 경로"
test_path = "mushroom test set 경로"

# Read both files with the same call and unpack them in train, test order.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [1]:
train_df

In [2]:
test_df

In [3]:
train_df.info()

In [4]:
train_df.isnull().sum()

### 전처리

In [7]:
# Stack the train and test frames row-wise so both go through the same preprocessing.
# The original row labels are kept here, so index values repeat until the index is reset.
raw = pd.concat([train_df, test_df])

In [8]:
# Work on an independent copy of `raw`. Unlike a list, a `[:]` slice of a
# DataFrame may share its data with the source, so in-place edits to `df`
# could leak back into `raw`; `.copy()` makes the separation explicit.
df = raw.copy()

In [10]:
# Replace the current index with a fresh 0..n-1 range and discard the old labels.
df = df.reset_index(drop=True)

In [6]:
df

In [7]:
df.shape

In [8]:
df.columns

In [9]:
# Count plots, split by class, for the first nine features on a 3x3 grid.
first_grid_features = [
    "cap-shape", "cap-surface", "cap-color",
    "bruises", "odor", "gill-attachment",
    "gill-spacing", "gill-size", "gill-color",
]

# Set the figure size.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 13))

# `axes.flat` walks the grid row by row, matching the feature order above.
for feature_name, axis in zip(first_grid_features, axes.flat):
    sns.countplot(x=feature_name, hue="class", data=df, ax=axis)

fig.tight_layout()
plt.show()

In [10]:
# Count plots, split by class, for the stalk / veil features on a 3x3 grid.
second_grid_features = [
    "stalk-shape", "stalk-root", "stalk-surface-above-ring",
    "stalk-surface-below-ring", "stalk-color-above-ring", "stalk-color-below-ring",
    "veil-type", "veil-color",
]

# Set the figure size.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 13))

# The eight features fill the grid row by row, leaving the bottom-right cell free.
for feature_name, axis in zip(second_grid_features, axes.flat):
    sns.countplot(x=feature_name, hue="class", data=df, ax=axis)

# Bottom-right cell: count of each class value, with no hue split.
sns.countplot(x="class", data=df, ax=axes[2][2])

fig.tight_layout()
plt.show()

In [11]:
# Count plots, split by class, for the remaining five features on a 2x3 grid.
third_grid_features = [
    "ring-number", "ring-type", "spore-print-color",
    "population", "habitat",
]

# Set the figure size.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))

# Five features on six axes: the last cell of the grid stays empty.
for feature_name, axis in zip(third_grid_features, axes.flat):
    sns.countplot(x=feature_name, hue="class", data=df, ax=axis)

fig.tight_layout()
plt.show()

### 빈도 상위 3개 범주만 남기고 나머지는 "ETC"로 묶은 뒤 Label Encoding

In [12]:
## cap-shape
# df["cap-shape"]

In [13]:
# sns.countplot(x="cap-color", data=df)

In [20]:
# cap-shape: collapse the listed codes into a single "ETC" bucket.
merged_cap_shapes = ["b", "s", "c"]
df.loc[df["cap-shape"].isin(merged_cap_shapes), "cap-shape"] = "ETC"

In [21]:
# cap-color: collapse the listed codes into a single "ETC" bucket.
merged_cap_colors = ["y", "w", "b", "p", "c", "u", "r"]
df.loc[df["cap-color"].isin(merged_cap_colors), "cap-color"] = "ETC"

In [22]:
# odor: collapse the listed codes into a single "ETC" bucket.
merged_odors = ["s", "a", "l", "p", "c", "m"]
df.loc[df["odor"].isin(merged_odors), "odor"] = "ETC"

In [23]:
# gill-color: collapse the listed codes into a single "ETC" bucket.
merged_gill_colors = ["n", "g", "h", "u", "k", "e", "y", "o", "r"]
df.loc[df["gill-color"].isin(merged_gill_colors), "gill-color"] = "ETC"

In [24]:
# stalk-root: collapse the listed codes into a single "ETC" bucket.
merged_stalk_roots = ["c", "r"]
df.loc[df["stalk-root"].isin(merged_stalk_roots), "stalk-root"] = "ETC"

In [25]:
# stalk-color-above-ring: collapse the listed codes into a single "ETC" bucket.
merged_colors_above_ring = ["n", "b", "o", "e", "c", "y"]
df.loc[
    df["stalk-color-above-ring"].isin(merged_colors_above_ring),
    "stalk-color-above-ring",
] = "ETC"

In [26]:
# stalk-color-below-ring: collapse the listed codes into a single "ETC" bucket.
merged_colors_below_ring = ["n", "b", "o", "e", "c", "y"]
df.loc[
    df["stalk-color-below-ring"].isin(merged_colors_below_ring),
    "stalk-color-below-ring",
] = "ETC"

In [27]:
# ring-type: collapse the listed codes into a single "ETC" bucket.
merged_ring_types = ["f", "n"]
df.loc[df["ring-type"].isin(merged_ring_types), "ring-type"] = "ETC"

In [28]:
# spore-print-color: collapse the listed codes into a single "ETC" bucket.
# Each code appears once here; repeating one would select the same rows.
merged_spore_colors = ["h", "r", "u", "o", "y", "b"]
df.loc[df["spore-print-color"].isin(merged_spore_colors), "spore-print-color"] = "ETC"

In [29]:
# population: collapse the listed codes into a single "ETC" bucket.
merged_populations = ["n", "a", "c"]
df.loc[df["population"].isin(merged_populations), "population"] = "ETC"

In [30]:
# habitat: collapse the listed codes into a single "ETC" bucket.
merged_habitats = ["l", "u", "m", "w"]
df.loc[df["habitat"].isin(merged_habitats), "habitat"] = "ETC"

In [14]:
# Show the value counts of habitat after the replacement above.
df["habitat"].value_counts()

In [15]:
df

In [20]:
# label encoding 
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
for i, column in enumerate(df.columns):
    if i < 2:
        continue
    df[column] = label_encoder.fit_transform(df[column])
    print(column, label_encoder.classes_)

### heatmap

In [16]:
# Label encoding for the heatmap: encode every column except the first one,
# so the second column is turned into integer codes here as well.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
for encoded_name in df.columns[1:]:
    # Refit the shared encoder per column and print the classes it found.
    df[encoded_name] = label_encoder.fit_transform(df[encoded_name])
    print(encoded_name, label_encoder.classes_)

In [17]:
df

In [37]:
# Select every column except the first one as input for the correlation heatmap.
df_heatmap = df.iloc[:, 1:]

In [18]:
df_heatmap.corr()

In [19]:
# Heatmap of the pairwise correlations between the columns of df_heatmap.
plt.figure(figsize=(10, 8))

# Diverging palette between hue 240 and hue 10 with 9 steps, returned as a colormap.
# NOTE(review): meant as blue for negative, red for positive and light near zero — confirm.
cmap = sns.diverging_palette(240, 10, n=9, as_cmap=True)

correlations = df_heatmap.corr()
sns.heatmap(
    data=correlations,
    linewidth=0.5,
    annot_kws={"size": 8},  # annotation text settings; no annot flag is passed here
    cmap=cmap,
)

### scaling

In [40]:
from sklearn.preprocessing import StandardScaler

standard_scaler = StandardScaler()

In [41]:
column = df.columns

In [42]:
# Drop the first two entries so only the remaining columns are selected for scaling.
column = column[2:]

In [21]:
column

In [44]:
# Standardise the selected columns in place; the scaler is fitted on every row of df.
# NOTE(review): df still holds the train and test rows together at this point, so the
# scaler statistics include the test rows — confirm this is intended.
df[column] = standard_scaler.fit_transform(df[column])

In [22]:
df

### train, test set 나누기

In [46]:
# Split the combined frame back into its train and test parts.
# The train rows come first because the train frame was first in the
# concatenation. The boundary is taken from the current train frame instead of
# a hard-coded 6500, so the split stays correct if the input files change size
# (re-running this cell is safe: the train frame keeps the same length).
n_train_rows = len(train_df)
train_df = df[:n_train_rows]
test_df = df[n_train_rows:]

In [47]:
train_df

Unnamed: 0,mushroom_id,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,1,1.020564,0.140128,1.271378,1.185917,-1.481934,0.162896,-0.438864,1.494683,...,0.586385,0.753195,0.767702,0.0,0.142037,-0.256132,0.944826,-0.524335,-0.836896,-1.395586
1,1,0,1.020564,0.140128,-1.239645,1.185917,-1.481934,0.162896,-0.438864,-0.669038,...,0.586385,0.753195,0.767702,0.0,0.142037,-0.256132,0.944826,0.354315,-1.909285,0.689324
2,2,0,-1.873013,0.140128,-1.239645,1.185917,-1.481934,0.162896,-0.438864,-0.669038,...,0.586385,0.753195,0.767702,0.0,0.142037,-0.256132,0.944826,0.354315,-1.909285,-1.395586
3,3,1,1.020564,0.953270,-1.239645,1.185917,-1.481934,0.162896,-0.438864,1.494683,...,0.586385,0.753195,0.767702,0.0,0.142037,-0.256132,0.944826,-0.524335,-0.836896,-1.395586
4,4,0,1.020564,0.140128,0.434370,-0.843230,0.718218,0.162896,2.278612,-0.669038,...,0.586385,0.753195,0.767702,0.0,0.142037,-0.256132,-1.217362,0.354315,-1.909285,0.689324
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6495,6495,1,-0.908487,0.953270,1.271378,-0.843230,1.818294,0.162896,-0.438864,1.494683,...,0.586385,0.753195,-0.136184,0.0,0.142037,-0.256132,-1.217362,1.232966,0.235493,-0.353131
6496,6496,1,-0.908487,0.140128,-0.402637,-0.843230,1.818294,0.162896,-0.438864,1.494683,...,-0.893053,0.753195,-0.136184,0.0,0.142037,-0.256132,-1.217362,1.232966,0.235493,-0.353131
6497,6497,1,-0.908487,0.140128,-0.402637,-0.843230,-0.381858,0.162896,-0.438864,1.494683,...,-0.893053,0.753195,-0.136184,0.0,0.142037,-0.256132,-1.217362,1.232966,0.235493,-0.353131
6498,6498,1,-0.908487,0.953270,1.271378,-0.843230,-1.481934,0.162896,-0.438864,1.494683,...,-0.893053,-0.165567,-0.136184,0.0,0.142037,-0.256132,-1.217362,1.232966,0.235493,-0.353131


In [48]:
# Remove the "class" column from the test split.
test_df = test_df.drop(columns=["class"])

In [49]:
test_df

Unnamed: 0,mushroom_id,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
6500,6500,-0.908487,0.140128,1.271378,-0.84323,-0.381858,0.162896,-0.438864,1.494683,-0.021527,...,0.586385,-0.165567,-0.136184,0.0,0.142037,-0.256132,-1.217362,1.232966,0.235493,-0.353131
6501,6501,0.056038,0.953270,-0.402637,-0.84323,-1.481934,0.162896,-0.438864,1.494683,-0.021527,...,0.586385,-0.165567,-0.136184,0.0,0.142037,-0.256132,-1.217362,1.232966,0.235493,-0.353131
6502,6502,-0.908487,0.953270,-0.402637,-0.84323,-0.381858,0.162896,-0.438864,1.494683,-0.021527,...,-0.893053,0.753195,0.767702,0.0,0.142037,-0.256132,-1.217362,1.232966,0.235493,-0.353131
6503,6503,-0.908487,0.140128,-0.402637,-0.84323,-0.381858,0.162896,-0.438864,1.494683,-0.021527,...,-0.893053,0.753195,-0.136184,0.0,0.142037,-0.256132,-1.217362,1.232966,0.235493,-0.353131
6504,6504,-0.908487,0.953270,-0.402637,-0.84323,1.818294,0.162896,-0.438864,1.494683,-0.021527,...,0.586385,0.753195,-0.136184,0.0,0.142037,-0.256132,-1.217362,1.232966,0.235493,-0.353131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,8119,0.056038,0.140128,1.271378,-0.84323,0.718218,-6.138869,-0.438864,-0.669038,-0.922978,...,0.586385,-2.003092,-1.943955,0.0,-3.979055,-0.256132,0.944826,-1.402985,-1.909285,-1.395586
8120,8120,1.020564,0.140128,1.271378,-0.84323,0.718218,-6.138869,-0.438864,-0.669038,-0.922978,...,0.586385,-2.003092,-1.943955,0.0,-8.100146,-0.256132,0.944826,-1.402985,0.235493,-1.395586
8121,8121,-0.908487,0.140128,1.271378,-0.84323,0.718218,-6.138869,-0.438864,-0.669038,-0.922978,...,0.586385,-2.003092,-1.943955,0.0,-3.979055,-0.256132,0.944826,-1.402985,-1.909285,-1.395586
8122,8122,0.056038,0.953270,1.271378,-0.84323,1.818294,0.162896,-0.438864,1.494683,-0.021527,...,-0.893053,0.753195,0.767702,0.0,0.142037,-0.256132,-1.217362,1.232966,0.235493,-1.395586


### csv 파일로 저장

In [23]:
# Output locations for the preprocessed splits (placeholder strings).
preprocessed_train_path = "preprocessed train set path"
preprocessed_test_path = "preprocessed test set path"

# Write each split to its CSV file without the index column.
for frame, csv_path in (
    (train_df, preprocessed_train_path),
    (test_df, preprocessed_test_path),
):
    frame.to_csv(csv_path, index=False)

### train, test set 읽어오기

In [51]:
import pandas as pd

In [52]:
# Load the preprocessed train and test splits from ./data, in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/mushroom_preprocessed_train.csv",
        "./data/mushroom_preprocessed_test.csv",
    )
)

In [24]:
test_df

### RandomForest 모델 학습시키기

In [54]:
# Training inputs: everything except the id and the target column.
non_feature_columns = ["mushroom_id", "class"]
x_train = train_df.drop(columns=non_feature_columns)

# Training target: the "class" column.
y_train = train_df["class"]

# Test inputs: everything except the id column.
x_test = test_df.drop(columns=["mushroom_id"])

In [55]:
from sklearn.model_selection import cross_val_score, KFold

# 10-fold splitter that shuffles the rows first; the fixed seed keeps the folds reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=1234)

In [56]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters of the first random forest, collected in one place.
baseline_rf_params = {
    "n_estimators": 20,
    "criterion": "entropy",
    "max_depth": 7,
    "max_features": 3,
    "random_state": 1234,
}
model = RandomForestClassifier(**baseline_rf_params)

In [57]:
# Cross-validate the model on the training data, using the folds produced by `kf`.
scores = cross_val_score(model, x_train, y_train, cv=kf)

In [25]:
scores

In [59]:
scores.mean()

1.0

In [26]:
model.fit(x_train, y_train)

### 시각화

In [27]:
from sklearn.tree import plot_tree
from matplotlib import pyplot as plt

# Draw one tree of the fitted forest: the estimator stored at index 9.
tree_index = 9
plt.figure(figsize=(30, 10))
feature_names = x_train.columns.to_list()
_ = plot_tree(
    model.estimators_[tree_index],
    feature_names=feature_names,
    filled=True,
    rounded=True,
)

In [62]:
# Stratified K-fold validation: 10 shuffled folds with a fixed seed for reproducibility.
from sklearn.model_selection import StratifiedKFold

stratified_kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

In [65]:
# Grid search over a small set of random-forest hyper-parameters, validated
# with the stratified folds. GridSearchCV is imported in this cell because no
# other cell imports it; without the import this cell fails with a NameError.
from sklearn.model_selection import GridSearchCV

# Candidate values tried for each hyper-parameter (every combination is evaluated).
param_grid = {
    "n_estimators": [10, 20, 30, 50],
    "max_depth": [3, 5, 7, 10],
    "max_features": [3, 5, 9],
}

# `model` supplies the remaining, fixed settings of the estimator.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=stratified_kf,
)

In [28]:
grid_search.fit(x_train, y_train)

In [29]:
grid_search.best_params_

In [30]:
grid_search.best_score_

In [106]:
# Hyper-parameters of the second random forest.
# NOTE(review): these values are written out by hand rather than read from
# grid_search.best_params_ — confirm they match the search result.
final_rf_params = {
    "n_estimators": 20,
    "criterion": "entropy",
    "max_depth": 5,
    "max_features": 3,
    "random_state": 1234,
}
model = RandomForestClassifier(**final_rf_params)

In [107]:
scores = cross_val_score(model, x_train, y_train, cv=stratified_kf)

In [31]:
scores

In [32]:
scores.mean()

In [33]:
model.fit(x_train, y_train)

In [34]:
model.feature_importances_

In [36]:
# Rank the fitted forest's feature importances and keep the ten largest.
# The labels come from x_train, the frame the model was fitted on, instead of
# the preprocessing-time `df`: `df` is not defined when the notebook is resumed
# from the saved CSV files, and x_train is guaranteed to line up one-to-one
# with feature_importances_.
ser = pd.Series(model.feature_importances_, index=x_train.columns)

top10 = ser.sort_values(ascending=False)[:10]
print(top10)

In [37]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart of the ten largest feature importances.
importance_fig, importance_ax = plt.subplots(figsize=(8, 6))
importance_ax.set_title('Feature Importances Top 10')
sns.barplot(x=top10, y=top10.index, ax=importance_ax)
plt.show()

### 제출(RandomForest Version)

In [77]:
# Predict the class of every test row with the fitted model.
# NOTE(review): predictions use whatever label values y_train holds — confirm the
# submission expects that form.
y_test = model.predict(x_test)

In [78]:
# Store the predictions on the test frame as its "class" column.
test_df["class"] = y_test

In [79]:
# The submission holds only the id and the predicted class, written without the index.
submission = test_df[["mushroom_id", "class"]]
submission.to_csv("./data/mushroom_submission_rf.csv", index=False)