In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

import sys, warnings

## Load Data

In [2]:
dat = pd.read_csv('dat.csv')
dat = dat.drop(columns = "Unnamed: 0")
dat.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,is_cancer
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [3]:
dfX = dat.drop(columns = "is_cancer")
y_tr = dat['is_cancer']

## Train test split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(dfX, y_tr, test_size=0.3, random_state=0)

In [5]:
display('X_train shape', X_train.shape, 'X_test shape', X_test.shape, 'y_train shape', y_train.shape, 'y_test shape', y_test.shape)

'X_train shape'

(398, 30)

'X_test shape'

(171, 30)

'y_train shape'

(398,)

'y_test shape'

(171,)

## modeling

In [6]:
rf = RandomForestClassifier(oob_score = True)

In [7]:
rf.fit(X_train, y_train)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [8]:
print('Oob score', rf.oob_score_)

Oob score 0.9221105527638191


In [9]:
print('test score',accuracy_score(y_test, rf.predict(X_test)))

test score 0.9415204678362573


## parameter tuning

In [10]:
rf_params = {
    'n_estimators' : [10, 20, 50, 100], ## ntree
    'max_features' : [5, 10, 15, 20] ## mtry
}

In [11]:
model_rf = GridSearchCV(rf, param_grid = rf_params, scoring='accuracy', cv = 5, n_jobs=-1) 

In [12]:
model_rf.fit(X_train, y_train)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=10, n_jobs=None,
                                              oob_score=True, random_state=None,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'max_f

In [13]:
print('best_score', model_rf.best_score_, 'best params', model_rf.best_params_)

best_score 0.957286432160804 best params {'max_features': 5, 'n_estimators': 10}


#### **최적의 파라미터를 찾았다면 최적의 파라미터로 새로 모델을 적합시키고 예측을 해야한다.**

In [14]:
best_rf = model_rf.best_estimator_

In [15]:
best_rf.fit(X_train, y_train)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=5, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [16]:
print('test score',accuracy_score(y_test, best_rf.predict(X_test)))

test score 0.9473684210526315


## confusion matrix

In [18]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, best_rf.predict(X_test))

array([[ 61,   2],
       [  7, 101]], dtype=int64)

In [20]:
from sklearn.metrics import classification_report

print(classification_report(y_test, best_rf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.90      0.97      0.93        63
           1       0.98      0.94      0.96       108

    accuracy                           0.95       171
   macro avg       0.94      0.95      0.94       171
weighted avg       0.95      0.95      0.95       171



## End