## Decision Tree Classifier를 이용한 버섯 식용 여부 예측 

원본 데이터 셋 링크: https://www.kaggle.com/datasets/uciml/mushroom-classification/data  

버섯의 특징을 담은 메타 데이터와, 각 버섯이 식용(e)인지 독성(p)인지를 표시한 `class` 레이블이 주어집니다. 지금까지 배운 내용을 활용해 예측 모델을 만들고, `mushroom_id`와 `class` 두 컬럼으로 구성된 submission 파일을 만들어 제출해보세요.

In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

Attribute Information: (classes: edible=e, poisonous=p)

- cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
- cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
- cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
- bruises: bruises=t,no=f
- odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s
- gill-attachment: attached=a,descending=d,free=f,notched=n
- gill-spacing: close=c,crowded=w,distant=d
- gill-size: broad=b,narrow=n
- gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y
- stalk-shape: enlarging=e,tapering=t
- stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?
- stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
- stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
- stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
- stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
- veil-type: partial=p,universal=u
- veil-color: brown=n,orange=o,white=w,yellow=y
- ring-number: none=n,one=o,two=t
- ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z
- spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y
- population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y
- habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d

### train, test set 읽어오기

In [80]:
import pandas as pd

# Load the preprocessed train / test splits.
train_df = pd.read_csv("./data/mushroom_preprocessed_train.csv")
test_df = pd.read_csv("./data/mushroom_preprocessed_test.csv")

# Target: the "class" column of the training set.
y_train = train_df["class"]

# Features: everything except the row identifier (and, for train, the target).
x_train = train_df.drop(columns=["mushroom_id", "class"])
x_test = test_df.drop(columns=["mushroom_id"])

In [1]:
# Display the training DataFrame as the cell output.
train_df

In [2]:
# Display the test DataFrame as the cell output.
test_df

### Boosting Tree

In [83]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified splitter; rows are shuffled with a fixed seed so the fold
# assignment is the same on every run.
stratified_kf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=1234,
)

In [84]:
import lightgbm as lgb

# LightGBM classifier created with no arguments, i.e. library-default
# hyperparameters (the same object is re-created a few cells below).
model = lgb.LGBMClassifier()

In [85]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space for the LightGBM classifier:
# 2 * 2 * 3 * 2 * 1 = 24 combinations for the grid search to evaluate.
param_grid = {
    "n_estimators": [10, 30],
    "learning_rate": [0.01, 0.001],
    "num_leaves": [10, 20, 30],
    "subsample": [0.8, 1.0],
    # LightGBM only applies row subsampling (bagging) when the bagging
    # frequency is positive. With the default subsample_freq=0 the two
    # "subsample" values above would train identical models, so bagging is
    # enabled on every iteration here.
    "subsample_freq": [1],
}

In [86]:
import lightgbm as lgb

# Fresh, unfitted LGBMClassifier with default hyperparameters; this is the
# base estimator handed to GridSearchCV in the next cell.
model = lgb.LGBMClassifier()

In [3]:
# Exhaustively evaluate every combination in param_grid with the stratified
# folds, ranking candidates by accuracy.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring="accuracy",
    cv=stratified_kf,
)

# Kept as the last expression so the fitted search object is shown as output.
grid_search.fit(x_train, y_train)

In [4]:
# Show the parameter combination that the grid search selected as best.
grid_search.best_params_

In [5]:
# Show the cross-validated score (accuracy, per the scoring setting) of the
# best parameter combination.
grid_search.best_score_

In [6]:
# GridSearchCV (refit=True by default) has already retrained the best
# parameter combination on the full training set, so reuse that fitted
# estimator instead of fitting an identical model a second time.
model = grid_search.best_estimator_

# Show the fitted model as the cell output.
model

In [91]:
# importance
from lightgbm import plot_importance

In [7]:
# Plot the feature importances of the fitted model.
plot_importance(model)

### 제출(BoostingTree version)

In [8]:
# Predict labels for the test features. Despite its name, y_test holds the
# model's predictions, not ground-truth labels.
y_test = model.predict(x_test)

In [9]:
# Store the predictions in a new "class" column of test_df so they sit next to
# each row's mushroom_id for the submission export.
test_df["class"] = y_test

In [10]:
# Write the submission file: only the id and predicted class, without the
# DataFrame index.
test_df.to_csv(
    "./data/mushroom_submission_bt.csv",
    columns=["mushroom_id", "class"],
    index=False,
)