In [14]:
import os
import pandas as pd

In [33]:
#!pip install lightgbm

Collecting lightgbm
  Downloading https://files.pythonhosted.org/packages/00/37/a392e669a83fef72b916009c438a924d2a9d70bc8aea62662b207105ed98/lightgbm-2.2.3-py2.py3-none-win_amd64.whl (515kB)
Installing collected packages: lightgbm
Successfully installed lightgbm-2.2.3


In [15]:
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

## 전처리를 완료하여 저장된 데이터를 읽어서 다양한 모델을 적용

In [16]:
datafilename = "data/oasis_longitudinal_processed.csv"

existfile = os.path.isfile(datafilename)
if existfile :
    df = pd.read_csv(datafilename, header=0, encoding='utf-8')
else :
    print("파일 '{}'가 존재하지 않습니다. 확인 후 진행해주세요".format(datafilename))
    
# print the concise summery of the dataset
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 15 columns):
Subject ID    373 non-null int64
MRI ID        373 non-null int64
Group         373 non-null int64
Visit         373 non-null int64
MR Delay      373 non-null int64
M/F           373 non-null int64
Hand          373 non-null int64
Age           373 non-null int64
EDUC          373 non-null int64
SES           373 non-null int64
MMSE          373 non-null int64
CDR           373 non-null int64
eTIV          373 non-null int64
nWBV          373 non-null int64
ASF           373 non-null int64
dtypes: int64(15)
memory usage: 43.8 KB


## 모델링을 위한 변수 선택

In [17]:
# 모델링을 위한 변수 선택
feature_col_names = ["M/F", "Age", "EDUC", "SES", "MMSE", "eTIV", "nWBV", "ASF"]
predicted_class_names = ['Group']

X = df[feature_col_names].values
y = df[predicted_class_names].values


## train, test set 분리

In [18]:
# train, test set 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
print("total count : {}, df_train count : {}, df_test count : {}".format(len(df), len(X_train), len(X_test)))

total count : 373, df_train count : 261, df_test count : 112


## Modeling

### 다양한 모델을 사용한 코드 작성

#### Logistic Regression 
- Logistic Regression을 sklearn 으로 부터 불러 들임 (import)
- 클래스의 인스턴스 생성 및 하이퍼 파라미터 세팅
- 적합 (fit) 및 예측(predict) 함수 실행
- 교차 검증 (cross-validation) 방법을 사용하여 최적의 하이퍼 파라미터 선택

In [19]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(penalty='l2', C=10.0)
LR = LR.fit(X_train, y_train.ravel())
y_test_pred = LR.predict(X_test)

print('\n=== Modeling : Logistic Regression ===')
print('Accuracy Score : {}'.format(metrics.accuracy_score(y_test, y_test_pred)))
print('AUC Score : {}'.format(metrics.roc_auc_score(y_test, y_test_pred)))
print('<Confusion Matrix>')
print(metrics.confusion_matrix(y_test, y_test_pred))
print('<Classiffication Report>')
print(metrics.classification_report(y_test, y_test_pred))



=== Modeling : Logistic Regression ===
Accuracy Score : 0.7410714285714286
AUC Score : 0.742948717948718
<Confusion Matrix>
[[40 12]
 [17 43]]
<Classiffication Report>
             precision    recall  f1-score   support

          0       0.70      0.77      0.73        52
          1       0.78      0.72      0.75        60

avg / total       0.74      0.74      0.74       112



In [20]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1.0, 10.0,100.0]
}
gridsearch = GridSearchCV(estimator=LogisticRegression(random_state=1234), param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)
gridsearch.fit(X_train, y_train.ravel())     

print(gridsearch.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    5.8s finished


{'C': 10.0, 'penalty': 'l1'}


#### k-NN (k-Nearest Neighbors)
- k Neighbors Classifier 를 sklearn 으로 부터 불러 들임 (import)
- 클래스의 인스턴스 생성, 및 하이퍼 파라미터 세팅
- 적합 (fit) 및 예측(predict) 함수 실행
- 교차 검증 (cross-validation) 방법을 사용하여 최적의 하이퍼 파라미터 선택

In [21]:
from sklearn.neighbors import KNeighborsClassifier

KNN = KNeighborsClassifier(n_neighbors=3, weights='uniform')
KNN = KNN.fit(X_train, y_train.ravel())
y_test_pred = KNN.predict(X_test)

print('\n=== Modeling : k-Nearerst Neighbors Classifier ===')
print('Accuracy Score : {}'.format(metrics.accuracy_score(y_test, y_test_pred)))
print('AUC Score : {}'.format(metrics.roc_auc_score(y_test, y_test_pred)))
print('<Confusion Matrix>')
print(metrics.confusion_matrix(y_test, y_test_pred))
print('<Classiffication Report>')
print(metrics.classification_report(y_test, y_test_pred))


=== Modeling : k-Nearerst Neighbors Classifier ===
Accuracy Score : 0.6607142857142857
AUC Score : 0.6666666666666666
<Confusion Matrix>
[[39 13]
 [25 35]]
<Classiffication Report>
             precision    recall  f1-score   support

          0       0.61      0.75      0.67        52
          1       0.73      0.58      0.65        60

avg / total       0.67      0.66      0.66       112



#### Decision Tree Classifier
- Decision Tree Classifier 를 sklearn 으로 부터 불러 들임 (import)
- 클래스의 인스턴스 생성, 및 하이퍼 파라미터 세팅
- 적합 (fit) 및 예측(predict) 함수 실행
- 교차 검증 (cross-validation) 방법을 사용하여 최적의 하이퍼 파라미터 선택

In [22]:
from sklearn.tree import DecisionTreeClassifier

DTC = DecisionTreeClassifier(criterion='gini',
                             max_features=None, 
                             max_depth=None,
                             min_samples_split=2,
                             min_samples_leaf=1)
DTC = DTC.fit(X_train, y_train.ravel())
y_test_pred = DTC.predict(X_test)

print('\n=== Modeling : Decision Tree Classifier ===')
print('Accuracy Score : {}'.format(metrics.accuracy_score(y_test, y_test_pred)))
print('AUC Score : {}'.format(metrics.roc_auc_score(y_test, y_test_pred)))
print('<Confusion Matrix>')
print(metrics.confusion_matrix(y_test, y_test_pred))
print('<Classiffication Report>')
print(metrics.classification_report(y_test, y_test_pred))


=== Modeling : Decision Tree Classifier ===
Accuracy Score : 0.7589285714285714
AUC Score : 0.7583333333333333
<Confusion Matrix>
[[39 13]
 [14 46]]
<Classiffication Report>
             precision    recall  f1-score   support

          0       0.74      0.75      0.74        52
          1       0.78      0.77      0.77        60

avg / total       0.76      0.76      0.76       112



#### Random Forrest Classifier
- Random Forrest Classifier 를 sklearn 으로 부터 불러 들임 (import)
- 클래스의 인스턴스 생성, 및 하이퍼 파라미터 세팅
- 적합 (fit) 및 예측(predict) 함수 실행
- 교차 검증 (cross-validation) 방법을 사용하여 최적의 하이퍼 파라미터 선택

In [23]:
from sklearn.ensemble import RandomForestClassifier

RC = RandomForestClassifier(n_estimators=100, max_features=3)
RC = RC.fit(X_train, y_train.ravel())
y_test_pred = RC.predict(X_test)

print('\n=== Modeling : Random Forrest Classifier ===')
print('Accuracy Score : {}'.format(metrics.accuracy_score(y_test, y_test_pred)))
print('AUC Score : {}'.format(metrics.roc_auc_score(y_test, y_test_pred)))
print('<Confusion Matrix>')
print(metrics.confusion_matrix(y_test, y_test_pred))
print('<Classiffication Report>')
print(metrics.classification_report(y_test, y_test_pred))


=== Modeling : Random Forrest Classifier ===
Accuracy Score : 0.8482142857142857
AUC Score : 0.8506410256410255
<Confusion Matrix>
[[46  6]
 [11 49]]
<Classiffication Report>
             precision    recall  f1-score   support

          0       0.81      0.88      0.84        52
          1       0.89      0.82      0.85        60

avg / total       0.85      0.85      0.85       112



In [24]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'max_features': ['auto', 'log2']
}
gridsearch = GridSearchCV(estimator=RandomForestClassifier(random_state=1234), param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)
gridsearch.fit(X_train, y_train.ravel())     

print(gridsearch.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.3s


{'max_features': 'auto', 'n_estimators': 45}


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    6.1s finished


#### Gradient Boosting Classifier
- Gradient Boosting Classifier 를 sklearn 으로 부터 불러 들임 (import)
- 클래스의 인스턴스 생성, 및 하이퍼 파라미터 세팅
- 적합 (fit) 및 예측(predict) 함수 실행
- 교차 검증 (cross-validation) 방법을 사용하여 최적의 하이퍼 파라미터 선택

In [25]:
from sklearn.ensemble import GradientBoostingClassifier

GBC = GradientBoostingClassifier(learning_rate=0.1,
                                 max_features=3,
                                 subsample=0.5,
                                 n_estimators=200)
GBC = GBC.fit(X_train, y_train.ravel())
y_test_pred = GBC.predict(X_test)

print('\n=== Modeling : Gradient Boosting Classifier ===')
print('Accuracy Score : {}'.format(metrics.accuracy_score(y_test, y_test_pred)))
print('AUC Score : {}'.format(metrics.roc_auc_score(y_test, y_test_pred)))
print('<Confusion Matrix>')
print(metrics.confusion_matrix(y_test, y_test_pred))
print('<Classiffication Report>')
print(metrics.classification_report(y_test, y_test_pred))


=== Modeling : Gradient Boosting Classifier ===
Accuracy Score : 0.8392857142857143
AUC Score : 0.8384615384615384
<Confusion Matrix>
[[43  9]
 [ 9 51]]
<Classiffication Report>
             precision    recall  f1-score   support

          0       0.83      0.83      0.83        52
          1       0.85      0.85      0.85        60

avg / total       0.84      0.84      0.84       112



#### XgBoost Classifier
- XgBoost Classifier 를 xgnoost 으로 부터 불러 들임 (import)
- 클래스의 인스턴스 생성, 및 하이퍼 파라미터 세팅
- 적합 (fit) 및 예측(predict) 함수 실행
- 교차 검증 (cross-validation) 방법을 사용하여 최적의 하이퍼 파라미터 선택

In [26]:
from xgboost import XGBClassifier

XGB = XGBClassifier(learning_rate=0.1,
                    max_features=3,
                    subsample=0.5,
                    n_estimators=200)
XGB = XGB.fit(X_train, y_train.ravel())
y_test_pred = XGB.predict(X_test)

print('\n=== Modeling : Xgboost Classifier ===')
print('Accuracy Score : {}'.format(metrics.accuracy_score(y_test, y_test_pred)))
print('AUC Score : {}'.format(metrics.roc_auc_score(y_test, y_test_pred)))
print('<Confusion Matrix>')
print(metrics.confusion_matrix(y_test, y_test_pred))
print('<Classiffication Report>')
print(metrics.classification_report(y_test, y_test_pred))


=== Modeling : Xgboost Classifier ===
Accuracy Score : 0.8214285714285714
AUC Score : 0.8179487179487179
<Confusion Matrix>
[[40 12]
 [ 8 52]]
<Classiffication Report>
             precision    recall  f1-score   support

          0       0.83      0.77      0.80        52
          1       0.81      0.87      0.84        60

avg / total       0.82      0.82      0.82       112



  if diff:


#### Linear SVM (Support Vector Machine)
- Linear SVM Classifier 를 sklearn 으로 부터 불러 들임 (import)
- 클래스의 인스턴스 생성, 및 하이퍼 파라미터 세팅
- 적합 (fit) 및 예측(predict) 함수 실행
- 교차 검증 (cross-validation) 방법을 사용하여 최적의 하이퍼 파라미터 선택

In [27]:
from sklearn.svm import LinearSVC

LinSVC = LinearSVC(penalty='l2', C=10.0)
LinSVC = LinSVC.fit(X_train, y_train.ravel())
y_test_pred = LinSVC.predict(X_test)

print('\n=== Modeling : Linear SVC ===')
print('Accuracy Score : {}'.format(metrics.accuracy_score(y_test, y_test_pred)))
print('AUC Score : {}'.format(metrics.roc_auc_score(y_test, y_test_pred)))
print('<Confusion Matrix>')
print(metrics.confusion_matrix(y_test, y_test_pred))
print('<Classiffication Report>')
print(metrics.classification_report(y_test, y_test_pred))



=== Modeling : Linear SVC ===
Accuracy Score : 0.5357142857142857
AUC Score : 0.5
<Confusion Matrix>
[[ 0 52]
 [ 0 60]]
<Classiffication Report>
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        52
          1       0.54      1.00      0.70        60

avg / total       0.29      0.54      0.37       112



  'precision', 'predicted', average, warn_for)


#### Non-linear SVM (Support Vector Machine)
- Non-linear SVM Classifier 를 sklearn 으로 부터 불러 들임 (import)
- 클래스의 인스턴스 생성, 및 하이퍼 파라미터 세팅
- 적합 (fit) 및 예측(predict) 함수 실행
- 교차 검증 (cross-validation) 방법을 사용하여 최적의 하이퍼 파라미터 선택

In [28]:
from sklearn.svm import SVC

rbfSVC = SVC(kernel='rbf', gamma=1.0, C=10.0)
rbfSVC = rbfSVC.fit(X_train, y_train.ravel())
y_test_pred = rbfSVC.predict(X_test)

print('\n=== Modeling : Non-linear SVC ===')
print('Accuracy Score : {}'.format(metrics.accuracy_score(y_test, y_test_pred)))
print('AUC Score : {}'.format(metrics.roc_auc_score(y_test, y_test_pred)))
print('<Confusion Matrix>')
print(metrics.confusion_matrix(y_test, y_test_pred))
print('<Classiffication Report>')
print(metrics.classification_report(y_test, y_test_pred))


=== Modeling : Non-linear SVC ===
Accuracy Score : 0.4642857142857143
AUC Score : 0.5
<Confusion Matrix>
[[52  0]
 [60  0]]
<Classiffication Report>
             precision    recall  f1-score   support

          0       0.46      1.00      0.63        52
          1       0.00      0.00      0.00        60

avg / total       0.22      0.46      0.29       112



  'precision', 'predicted', average, warn_for)


#### Voting Classifier
- Voting Classifier 를 sklearn 으로 부터 불러 들임 (import)
- 클래스의 인스턴스 생성, 및 하이퍼 파라미터 세팅
- 적합 (fit) 및 예측(predict) 함수 실행
- 교차 검증 (cross-validation) 방법을 사용하여 최적의 하이퍼 파라미터 선택

In [36]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from lightgbm import LGBMClassifier

vote_est = [('etc', ExtraTreesClassifier()),
            ('gb', GradientBoostingClassifier()),
            ('abc', AdaBoostClassifier()),
            ('rfc', RandomForestClassifier(criterion='gini', max_depth=8, max_features='auto', n_estimators=200)),
            ('svc', SVC(probability=True)),
            ('xgb', XGBClassifier())
#            ('lgbm', LGBMClassifier())
           ]

vClf = VotingClassifier(estimators=vote_est, voting='hard')
vClf = vClf.fit(X_train, y_train.ravel())
y_test_pred = vClf.predict(X_test)

print('\n=== Modeling : Voting Classifier ===')
print('Accuracy Score : {}'.format(metrics.accuracy_score(y_test, y_test_pred)))
print('AUC Score : {}'.format(metrics.roc_auc_score(y_test, y_test_pred)))
print('<Confusion Matrix>')
print(metrics.confusion_matrix(y_test, y_test_pred))
print('<Classiffication Report>')
print(metrics.classification_report(y_test, y_test_pred))


=== Modeling : Voting Classifier ===
Accuracy Score : 0.8303571428571429
AUC Score : 0.8326923076923077
<Confusion Matrix>
[[45  7]
 [12 48]]
<Classiffication Report>
             precision    recall  f1-score   support

          0       0.79      0.87      0.83        52
          1       0.87      0.80      0.83        60

avg / total       0.83      0.83      0.83       112



  if diff:
  if diff:


#### ExtraTrees Classifier
- ExtraTrees Classifier 를 sklearn 으로 부터 불러 들임 (import)
- 클래스의 인스턴스 생성, 및 하이퍼 파라미터 세팅
- 적합 (fit) 및 예측(predict) 함수 실행
- 교차 검증 (cross-validation) 방법을 사용하여 최적의 하이퍼 파라미터 선택

In [29]:
from sklearn.ensemble import ExtraTreesClassifier

etClf = ExtraTreesClassifier(n_estimators=100, max_features=8)
etClf = etClf.fit(X_train, y_train.ravel())
y_test_pred = etClf.predict(X_test)

print('\n=== Modeling : ExtraTrees Classifier ===')
print('Accuracy Score : {}'.format(metrics.accuracy_score(y_test, y_test_pred)))
print('AUC Score : {}'.format(metrics.roc_auc_score(y_test, y_test_pred)))
print('<Confusion Matrix>')
print(metrics.confusion_matrix(y_test, y_test_pred))
print('<Classiffication Report>')
print(metrics.classification_report(y_test, y_test_pred))


=== Modeling : ExtraTrees Classifier ===
Accuracy Score : 0.8392857142857143
AUC Score : 0.841025641025641
<Confusion Matrix>
[[45  7]
 [11 49]]
<Classiffication Report>
             precision    recall  f1-score   support

          0       0.80      0.87      0.83        52
          1       0.88      0.82      0.84        60

avg / total       0.84      0.84      0.84       112

