#### 의사결정나무 clssification
- data : breast-cancer-wisconsin.csv 
- 암 환자 예측

In [2]:
# 데이터 준비
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
data = pd.read_csv('breast-cancer-wisconsin.csv')
X = data[data.columns[1:10]]
y = data[['Class']]

In [4]:
# train, test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state=42)

In [18]:
# 정규화
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_scaled_train = scaler.transform(X_train)
X_scaled_test = scaler.transform(X_test)

In [19]:
# 모델적용
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_scaled_train, y_train)
pred_train = model.predict(X_scaled_train)
model.score(X_scaled_train, y_train)

1.0

In [15]:
# 오차행렬 confusionmatrix - train data
from sklearn.metrics import confusion_matrix
confusion_train = confusion_matrix(y_train, pred_train)
print('confusion matrix :\n', confusion_train)

confusion matrix :
 [[333   0]
 [  0 179]]


In [16]:
from sklearn.metrics import classification_report
cfreport_train = classification_report(y_train, pred_train)
print("분류예측 report :\n", cfreport_train)

분류예측 report :
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       333
           1       1.00      1.00      1.00       179

    accuracy                           1.00       512
   macro avg       1.00      1.00      1.00       512
weighted avg       1.00      1.00      1.00       512



In [20]:
pred_test = model.predict(X_scaled_test)
model.score(X_scaled_test, y_test)

0.9473684210526315

In [21]:
# train data 에는 100% 정확도를 보였고, test data 는 94.73% 의 에측결과가 나타난다. train data에 과대적합 된 것 같다.

In [22]:
# 오차행렬 -test data
confusion_test = confusion_matrix(y_test, pred_test)
print("confusion matrix:\n", confusion_test)

confusion matrix:
 [[105   6]
 [  3  57]]


In [24]:
# 분류예측 레포트 - test data
cfreport_test = classification_report(y_test, pred_test)
print("분류에측 report:\n", cfreport_test)

분류에측 report:
               precision    recall  f1-score   support

           0       0.97      0.95      0.96       111
           1       0.90      0.95      0.93        60

    accuracy                           0.95       171
   macro avg       0.94      0.95      0.94       171
weighted avg       0.95      0.95      0.95       171



In [26]:
# Grid Search
param_grid = {'max_depth':range(2,20,2),
             'min_samples_leaf':range(1,50,2)}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
grid_search.fit(X_scaled_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(2, 20, 2),
                         'min_samples_leaf': range(1, 50, 2)})

In [28]:
print("Best Parameter: {}".format(grid_search.best_params_))
print("Best Score: {:.4f}".format(grid_search.best_score_))
print("Testset Score {:.4f}".format(grid_search.score(X_scaled_test,y_test)))

Best Parameter: {'max_depth': 18, 'min_samples_leaf': 1}
Best Score: 0.9628
Testset Score 0.9591


In [32]:
# random search
from scipy.stats import randint
param_distribs = {'max_depth': randint(low=1, high=20),
                 'min_samples_leaf': randint(low=1, high=50)}
from sklearn.model_selection import RandomizedSearchCV
random_search = RandomizedSearchCV(DecisionTreeClassifier(),
                                  param_distributions = param_distribs,
                                  n_iter=20, cv=5)
random_search.fit(X_scaled_train, y_train)

RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_iter=20,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc0b5ea1390>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc0b5ea1610>})

In [34]:
print("Best Parameter:{}".format(random_search.best_params_))
print("Best Score:{:.4f}".format(random_search.best_score_))
print("Testset Score:{:.4f}".format(random_search.score(X_scaled_train,y_train)))

Best Parameter:{'max_depth': 6, 'min_samples_leaf': 1}
Best Score:0.9569
Testset Score:0.9961
