In [1]:
from lightgbm import LGBMClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [2]:
#데이터를 불러오고 형식 확인

df = pd.read_csv('./cancer_dataset_ver1.csv')
print(df.head())
df.info()

       A1BG      A1CF     A2LD1       A2M     A2ML1    A4GALT     A4GNT  \
0  3.575803  1.815535  3.407354  6.848847  1.949370  3.389573  1.935099   
1  3.602704  1.903611  3.400847  6.504326  2.367093  3.035315  1.941405   
2  4.089806  1.998825  3.733770  4.904531  2.208550  3.328281  1.923295   
3  3.929732  1.930019  3.668371  3.957686  2.845188  2.855448  1.855836   
4  3.645441  1.917237  3.594290  5.312689  2.115304  2.795185  1.731550   

       AAA1      AAAS      AACS  ...     ZWINT      ZXDA      ZXDB      ZXDC  \
0  1.982839  4.669173  3.067800  ...  3.395375  6.346860  3.659673  2.529919   
1  1.807694  4.240750  3.758705  ...  3.265353  5.657011  4.127937  2.186070   
2  1.946061  4.597667  3.490880  ...  3.559725  5.708799  4.387543  1.922107   
3  1.727771  4.024512  2.849850  ...  3.847016  6.144403  4.096720  2.144641   
4  1.644886  4.463236  3.022631  ...  3.386223  5.636870  3.211165  1.893340   

     ZYG11A    ZYG11B       ZYX     ZZEF1      ZZZ3  label  
0  1.78

In [3]:
#전처리(오버샘플링)

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

X = df.iloc[:,:-1]
y = df.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y)
smote = SMOTE()
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)
print('SMOTE 적용 전 학습용 피처/레이블 데이터 세트: ', X_train.shape, y_train.shape)
print('SMOTE 적용 후 학습용 피처/레이블 데이터 세트: ', X_train_over.shape, y_train_over.shape)

SMOTE 적용 전 학습용 피처/레이블 데이터 세트:  (339, 20228) (339,)
SMOTE 적용 후 학습용 피처/레이블 데이터 세트:  (422, 20228) (422,)


In [4]:
#성능지표 함수

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test,pred)

    print('오차 행렬')
    print(confusion)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1:{3:.4f}'.format(accuracy,precision, recall,f1))

In [5]:
#랜덤포레스트 구현

rf_clf = RandomForestClassifier(n_estimators = 1000, max_depth = 100)
rf_clf.fit(X_train_over, y_train_over)
pred = rf_clf.predict(X_test)
get_clf_eval(y_test,pred)

오차 행렬
[[24  8]
 [ 6 47]]
정확도: 0.8353, 정밀도: 0.8545, 재현율: 0.8868, F1:0.8704


In [6]:
#랜덤포레스트를 이용하여 변수 중요도 추출
ftr_importances_values = rf_clf.feature_importances_
ftr_importances = pd.Series(ftr_importances_values, index = X_train_over.columns)
ftr_top5000 = ftr_importances.sort_values(ascending=False)[:5000]

In [7]:
#top5000을 이용해 LGBM 모델 구현

df = pd.read_csv('./cancer_dataset_ver1.csv')
df_new = df[ftr_top5000.index].copy()
df_new['label'] = df['label']
df_new.head()
X = df_new.iloc[:, :-1]
y = df_new.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y)
smote = SMOTE()
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)
X_tr, X_val, y_tr, y_val = train_test_split(X_train_over, y_train_over, test_size = 0.2)
lgbm = LGBMClassifier(n_estimators = 500, learning_rate = 0.1, max_depth = 100)
evals = [(X_tr, y_tr), (X_val, y_val)]
lgbm.fit(X_tr, y_tr, early_stopping_rounds = 100, eval_metric = "logloss", eval_set = evals, verbose = True)
preds = lgbm.predict(X_test)
get_clf_eval(y_test, preds)



[1]	training's binary_logloss: 0.628102	valid_1's binary_logloss: 0.680244
[2]	training's binary_logloss: 0.575942	valid_1's binary_logloss: 0.637026
[3]	training's binary_logloss: 0.530128	valid_1's binary_logloss: 0.621354
[4]	training's binary_logloss: 0.490561	valid_1's binary_logloss: 0.612378
[5]	training's binary_logloss: 0.451589	valid_1's binary_logloss: 0.594998
[6]	training's binary_logloss: 0.417421	valid_1's binary_logloss: 0.577629
[7]	training's binary_logloss: 0.3881	valid_1's binary_logloss: 0.55921
[8]	training's binary_logloss: 0.360496	valid_1's binary_logloss: 0.544647
[9]	training's binary_logloss: 0.333716	valid_1's binary_logloss: 0.536104
[10]	training's binary_logloss: 0.308153	valid_1's binary_logloss: 0.523808
[11]	training's binary_logloss: 0.287626	valid_1's binary_logloss: 0.51293
[12]	training's binary_logloss: 0.267442	valid_1's binary_logloss: 0.517546
[13]	training's binary_logloss: 0.248798	valid_1's binary_logloss: 0.50678
[14]	training's binary_log

[108]	training's binary_logloss: 0.000641751	valid_1's binary_logloss: 0.41118
[109]	training's binary_logloss: 0.00060456	valid_1's binary_logloss: 0.411875
[110]	training's binary_logloss: 0.000567988	valid_1's binary_logloss: 0.41003
[111]	training's binary_logloss: 0.000536591	valid_1's binary_logloss: 0.414702
[112]	training's binary_logloss: 0.000499434	valid_1's binary_logloss: 0.412798
[113]	training's binary_logloss: 0.000468815	valid_1's binary_logloss: 0.41271
[114]	training's binary_logloss: 0.000440706	valid_1's binary_logloss: 0.40936
[115]	training's binary_logloss: 0.000416882	valid_1's binary_logloss: 0.408749
[116]	training's binary_logloss: 0.000391238	valid_1's binary_logloss: 0.410169
[117]	training's binary_logloss: 0.0003691	valid_1's binary_logloss: 0.407345
[118]	training's binary_logloss: 0.000347508	valid_1's binary_logloss: 0.414392
[119]	training's binary_logloss: 0.000324296	valid_1's binary_logloss: 0.412624
[120]	training's binary_logloss: 0.000305731	va

In [8]:
#SKFold 이용하여 측정

from sklearn.model_selection import StratifiedKFold

df = pd.read_csv('./cancer_dataset_ver1.csv')
df_new = df[ftr_top5000.index].copy()
df_new['label'] = df['label']
df_new.head()
X = df_new.iloc[:, :-1]
y = df_new.iloc[:,-1]

skfold = StratifiedKFold(n_splits = 5)

scores = []
n_iter=0
for train_index, test_index in skfold.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index],y.iloc[test_index]
    
    lgbm_clf = LGBMClassifier(n_estimators = 300, max_depth = 100, learning_rate = 0.1)
    lgbm_clf.fit(X_train, y_train)
    pred = lgbm_clf.predict(X_test)
    n_iter +=1
    accuracy = np.round(accuracy_score(y_test, pred), 4)
    train_size = X_train.shape[0]
    test_size = X_test.shape[0]
    print('\n{0} 교차 검증 정확도: {1}, 학습 데이터 크기: {2}, 검증 데이터 크기: {3}'.format(n_iter, accuracy, train_size, test_size))
    scores.append(accuracy)
print('\n## 교차 검증별 정확도: ', np.round(scores, 4))
print('\n## 평균 검증 정확도: ', np.round(np.mean(scores),4))




1 교차 검증 정확도: 0.8353, 학습 데이터 크기: 339, 검증 데이터 크기: 85

2 교차 검증 정확도: 0.8706, 학습 데이터 크기: 339, 검증 데이터 크기: 85

3 교차 검증 정확도: 0.8353, 학습 데이터 크기: 339, 검증 데이터 크기: 85

4 교차 검증 정확도: 0.8235, 학습 데이터 크기: 339, 검증 데이터 크기: 85

5 교차 검증 정확도: 0.7976, 학습 데이터 크기: 340, 검증 데이터 크기: 84

## 교차 검증별 정확도:  [0.8353 0.8706 0.8353 0.8235 0.7976]

## 평균 검증 정확도:  0.8325
