In [1]:
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# 데이터 불러오기

In [2]:
data = pd.read_csv('5th_dat.csv')

In [3]:
data.head() #y : is_cancer

Unnamed: 0.1,Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,is_cancer
0,0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [4]:
data.is_cancer.value_counts()

1    357
0    212
Name: is_cancer, dtype: int64

# 데이터 전처리

### train-test split

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data.drop(columns = ['is_cancer']), data.is_cancer, test_size = 0.3, random_state = 0)

# 모델링

In [26]:
%%time
gbm = GradientBoostingClassifier()
gbm_score = gbm.fit(X_train, y_train).score(X_test, y_test)
print(gbm_score)

0.9649122807017544
Wall time: 159 ms


In [22]:
%%time
xgb = XGBClassifier()
xgb_score = xgb.fit(X_train, y_train).score(X_test, y_test)
print(xgb_score)

Wall time: 127 ms


In [30]:
%%time
lgbm = LGBMClassifier()
lgbm_score = lgbm.fit(X_train, y_train).score(X_test, y_test)
print(lgbm_score)

0.9590643274853801
Wall time: 146 ms


In [33]:
%%time
#RandomForest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_score = rf.fit(X_train, y_train).score(X_test, y_test)
print(rf_score)

0.9415204678362573
Wall time: 23.9 ms




# 파라미터 튜닝

In [7]:
from sklearn.model_selection import StratifiedKFold
kfold = StratifiedKFold(n_splits=10)

### GBM

In [8]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
gbm = GradientBoostingClassifier()

gbm_param_grid = {'learning_rate' : [0.1,0.3,0.5],
                 'max_depth' : [3,6,9],
                 'min_samples_split' : [5,10,15],
                 'n_estimators' : [100,500,1500]}

GBM = RandomizedSearchCV(gbm,param_distributions = gbm_param_grid, n_iter = 10, cv=kfold, scoring="accuracy", verbose = 1)
GBM.fit(X_train,y_train)
GBM_best = GBM.best_estimator_

# Best score
display(GBM.best_score_, GBM_best)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   20.3s finished


0.957286432160804

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.3, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=15,
              min_weight_fraction_leaf=0.0, n_estimators=1500,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

### XGB

In [9]:
xgb = XGBClassifier()

xgb_param_grid = {'n_estimators' : [100,200],
                 'learniing_rate' : [0.05,0.1],
                 'gamma' : [0,5],
                 'max_depth' : [6,8],
                 'min_child_weight' : [6,8],
                 'colsample_bytree' : [0.5,1],
                 'subsample' : [0.5,1]}

XGB = RandomizedSearchCV(xgb,param_distributions = xgb_param_grid, n_iter = 10, cv=kfold, scoring="accuracy", verbose = 1)
XGB.fit(X_train,y_train)
XGB_best = XGB.best_estimator_

# Best score
display(XGB.best_score_, XGB_best)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    8.8s finished


0.9597989949748744

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0, learniing_rate=0.05,
       learning_rate=0.1, max_delta_step=0, max_depth=8,
       min_child_weight=6, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

### LGBM

In [14]:
lgbm = LGBMClassifier()

lgb_param_grid = {'num_leaves' : [70,80],
                'min_data_in_leaf' : [100,200],
                 'max_depth' : [7,8]}

LGB = RandomizedSearchCV(lgbm,param_distributions = lgb_param_grid, n_iter = 10, cv=kfold, scoring="accuracy", verbose = 1)
LGB.fit(X_train,y_train)
LGB_best = LGB.best_estimator_

# Best score
display(LGB.best_score_, LGB_best)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    3.1s finished


0.9623115577889447

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=7,
        min_child_samples=20, min_child_weight=0.001, min_data_in_leaf=100,
        min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=70,
        objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0)

Wall time: 3.26 s


In [12]:
gbm_score = gbm.fit(X_train, y_train).score(X_test, y_test)
xgb_score = xgb.fit(X_train, y_train).score(X_test, y_test)
lgbm_score = lgbm.fit(X_train, y_train).score(X_test, y_test)

In [13]:
display(gbm_score, xgb_score, lgbm_score)

0.9649122807017544

0.9707602339181286

0.9590643274853801