# 랜덤포레스트 분류

In [None]:
import os
import pandas as pd
import numpy as np
import hds
from plt_rcs import *
# Set the default figure size to 4x4 for the plots drawn in this notebook.
# NOTE(review): `plt` is not imported by name in this file; presumably it is
# provided by the star import from `plt_rcs` (likely matplotlib.pyplot) — confirm.
plt.rc(group='figure', figsize=(4, 4))

In [None]:
# Show the current working directory before changing it.
os.getcwd()

In [None]:
# Switch to the data directory so the pickle below can be opened by bare file name.
# NOTE(review): the path is relative, so re-running this cell resolves it
# against the already-changed directory rather than the original one.
os.chdir('../../data')

In [None]:
# First entry returned by os.listdir() whose name contains 'Wine'
# (raises IndexError when no entry matches).
list(filter(lambda name: 'Wine' in name, os.listdir()))[0]

In [None]:
# Load the object stored in 'WhiteWine.pkl' from the current working directory.
# NOTE(review): unpickling can execute arbitrary code; only load files from a
# trusted source.
objs = pd.read_pickle('WhiteWine.pkl')

In [None]:
# Publish the entries of `objs` as notebook-level variables.
# Assumes `objs` is a mapping of variable names to objects — confirm.
# The names used below (X_train, y_train, ...) are never assigned explicitly,
# so they presumably come from this call.
globals().update(objs)

In [None]:
# IPython magic: list the variables currently defined in the session.
%whos

In [None]:
# Bind the train/validation splits explicitly from the loaded bundle instead of
# re-assigning the names to themselves (which did nothing). The names no longer
# rely solely on the globals().update(objs) injection, and a missing entry
# fails right here with a KeyError rather than as a NameError further down.
# Assumes `objs` is a mapping keyed by these four names — confirm.
X_train, X_valid = objs['X_train'], objs['X_valid']
y_train, y_valid = objs['y_train'], objs['y_valid']

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with a fixed seed. oob_score=True makes the fitted estimator
# expose `oob_score_`, which later cells read.
model = RandomForestClassifier(oob_score=True, random_state=0)
# Fit on the training split; as the cell's last expression, the fitted
# estimator is also displayed.
model.fit(X_train, y_train)

In [None]:
# Score on the training split (recorded output on the next line).
model.score(X_train, y_train)
# 1.0
# Score on the held-out validation split (recorded output on the next line).
model.score(X_valid, y_valid)
# 0.8653061224489796

In [None]:
# Feature importances of the fitted forest, labelled by feature name and
# ordered from largest to smallest (recorded output below).
(
    pd.Series(model.feature_importances_, index=model.feature_names_in_)
    .sort_values(ascending=False)
)
# alcohol                 0.181730
# density                 0.126302
# volatile acidity        0.094409
# chlorides               0.094020
# pH                      0.093747
# total sulfur dioxide    0.090092
# residual sugar          0.088768
# sulphates               0.085994
# citric acid             0.074391
# fixed acidity           0.070548
# dtype: float64

In [None]:
# Hand the fitted model to the `hds` plotting helper.
# NOTE(review): `hds` is not shown here; judging by its name it charts the
# model's feature importances — confirm.
hds.plot.feature_importance(model)

## OOB 정확도 시각화

In [None]:
# Out-of-bag score of the current `model` (recorded output on the next line).
# The attribute exists because the forest was created with oob_score=True.
model.oob_score_
# 0.8771878646441074

In [None]:
def oob_score(ntree):
    """Return the out-of-bag score of a forest built with ``ntree`` trees.

    An unfitted copy of the global ``model`` (same hyper-parameters, including
    its seed) is configured with ``n_estimators=ntree`` and fitted on the
    global ``X_train`` / ``y_train``. Working on a copy leaves ``model``
    itself untouched, so calling this helper in a sweep no longer changes the
    forest that the rest of the notebook evaluates.

    Args:
        ntree: Number of trees to grow in the forest.

    Returns:
        The ``oob_score_`` of the freshly fitted copy.

    Raises:
        AttributeError: If ``model`` was created without ``oob_score=True``,
            because the fitted copy then has no ``oob_score_`` attribute.
    """
    # Local import keeps this cell self-contained; scikit-learn is already a
    # dependency of this notebook.
    from sklearn.base import clone

    forest = clone(model).set_params(n_estimators=ntree)
    forest.fit(X_train, y_train)
    return forest.oob_score_

In [None]:
# Forest sizes to evaluate: 1, 11, 21, ..., 91 (the stop value 101 is exclusive).
ntrees = range(1, 101, 10)

In [None]:
# One out-of-bag score per forest size, in the same order as `ntrees`.
oob_acc = list(map(oob_score, ntrees))

In [None]:
# Thin red line of out-of-bag score (y) against the number of trees (x).
# NOTE(review): `sns` and `plt` are not imported by name in this file;
# presumably both come from the star import of `plt_rcs` — confirm.
sns.lineplot(x=ntrees, y=oob_acc, color='red', linewidth=1)
plt.show()

## 분류 모델 성능 평가

In [None]:
# Predicted class labels for the validation split from the current `model`.
y_pred = model.predict(X_valid)

In [None]:
# Compare the predictions with the true validation labels via the `hds` helper.
# NOTE(review): which metrics `clfmetrics` reports is not visible here — confirm.
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred)

## ROC 및 PR 곡선 시각화

In [None]:
# Predicted class probabilities for the validation split from the current `model`.
y_prob = model.predict_proba(X_valid)

In [None]:
# ROC curve for the validation split via the `hds` helper.
# NOTE(review): `y_prob` is passed as returned by predict_proba; how `hds`
# selects the positive-class column is not visible here — confirm.
hds.plot.roc_curve(y_true=y_valid, y_prob=y_prob)

In [None]:
# PR curve for the same labels and probabilities
# (presumably precision-recall, going by the helper's name — confirm).
hds.plot.pr_curve(y_true=y_valid, y_prob=y_prob)

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# SMOTE over-sampler using 5 nearest neighbours and a fixed seed.
smote = SMOTE(k_neighbors=5, random_state=0)

In [None]:
# Resample the training split only; the validation split is not passed in
# and therefore stays as loaded.
X_bal, y_bal = smote.fit_resample(X_train, y_train)

In [None]:
# Rebind `model` to a new forest. `y_pred` and `y_prob` computed earlier keep
# the outputs of the previous forest. This one is created without
# oob_score=True, so no `oob_score_` is requested for it.
model = RandomForestClassifier(random_state=0)

In [None]:
# Fit the new forest on the resampled training data.
model.fit(X_bal, y_bal)

In [None]:
# Score of the new forest on the resampled training data it was fitted on.
model.score(X_bal, y_bal)

In [None]:
# Score of the new forest on the untouched validation split.
model.score(X_valid, y_valid)

In [None]:
# Validation-split class predictions from the forest fitted on resampled data.
y_pred_bal = model.predict(X_valid)

In [None]:
# Same `hds` metrics helper as before, now applied to the new predictions.
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_bal)

In [None]:
# Validation-split class probabilities from the forest fitted on resampled data.
y_prob_bal = model.predict_proba(X_valid)

In [None]:
# ROC comparison on the validation split: red uses the probabilities from the
# earlier forest (`y_prob`), blue those from the forest fitted on SMOTE-resampled
# data (`y_prob_bal`).
# NOTE(review): whether the two calls draw onto the same axes depends on `hds` — confirm.
hds.plot.roc_curve(y_true=y_valid, y_prob=y_prob, color='red')
hds.plot.roc_curve(y_true=y_valid, y_prob=y_prob_bal, color='blue')

In [None]:
# PR-curve comparison with the same colour coding (red: earlier forest,
# blue: forest fitted on resampled data).
hds.plot.pr_curve(y_true=y_valid, y_prob=y_prob, color='red')
hds.plot.pr_curve(y_true=y_valid, y_prob=y_prob_bal, color='blue')