# AdaBoost

In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

def get_new_feature_name_df(old_feature_name_df):
    """Return a copy of the feature-name table with duplicate names made unique.

    The first occurrence of each ``column_name`` is kept as is; every later
    occurrence gets ``_<n>`` appended, where ``n`` counts the repeats of that
    name seen so far (``a, a, a`` becomes ``a, a_1, a_2``).

    Parameters
    ----------
    old_feature_name_df : pandas.DataFrame
        Table containing a ``column_name`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The input columns followed by ``dup_cnt`` (the per-name repeat
        counter), with ``column_name`` de-duplicated and a fresh 0..n-1 index.
    """
    # cumcount() numbers the rows inside each name group: 0, 1, 2, ...
    dup_cnt = old_feature_name_df.groupby('column_name').cumcount()

    # assign() returns a new frame, so the caller's table stays untouched.
    new_feature_name_df = (
        old_feature_name_df
        .assign(dup_cnt=dup_cnt)
        .reset_index(drop=True)
    )

    names = new_feature_name_df['column_name']
    counts = new_feature_name_df['dup_cnt']
    # Vectorised replacement for the row-wise apply: no positional x[0]/x[1]
    # lookups on a label-indexed Series (deprecated in recent pandas).
    new_feature_name_df['column_name'] = names.mask(
        counts > 0, names + '_' + counts.astype(str)
    )
    return new_feature_name_df

# Helper that loads the human-activity train/test dataset
def get_human_dataset(data_dir='./Data/human_activity'):
    """Load the train/test features and labels stored under ``data_dir``.

    Parameters
    ----------
    data_dir : str, optional
        Root directory containing ``features.txt``, ``train/X_train.txt``,
        ``train/y_train.txt``, ``test/X_test.txt`` and ``test/y_test.txt``.
        Defaults to the location the notebook originally hard-coded.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test, y_train, y_test)``. The label frames have a single
        column named ``action``.
    """
    # Raw strings keep the regex separator free of invalid-escape warnings.
    whitespace = r'\s+'

    feature_name_df = pd.read_csv('{}/features.txt'.format(data_dir), sep=whitespace,
                                  header=None, names=['column_index', 'column_name'])

    # The names are de-duplicated before being used as column labels below.
    new_feature_name_df = get_new_feature_name_df(feature_name_df)
    feature_name = new_feature_name_df.iloc[:, 1].values.tolist()

    X_train = pd.read_csv('{}/train/X_train.txt'.format(data_dir), sep=whitespace,
                          names=feature_name)
    X_test = pd.read_csv('{}/test/X_test.txt'.format(data_dir), sep=whitespace,
                         names=feature_name)

    y_train = pd.read_csv('{}/train/y_train.txt'.format(data_dir), sep=whitespace,
                          names=['action'])
    y_test = pd.read_csv('{}/test/y_test.txt'.format(data_dir), sep=whitespace,
                         names=['action'])

    return X_train, X_test, y_train, y_test

In [17]:
# Load the train/test features and labels returned by get_human_dataset().
X_train, X_test, y_train, y_test = get_human_dataset()

In [22]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

# AdaBoost with 30 boosting rounds and a fixed seed for repeatable results.
clf = AdaBoostClassifier(n_estimators=30, random_state=10, learning_rate=0.1)
# y_train is a single-column DataFrame; scikit-learn expects a 1-D label
# array, so flatten it explicitly instead of relying on silenced warnings.
clf.fit(X_train, y_train.values.ravel())
pred = clf.predict(X_test)
print('정확도 : {0:.2f} '.format(accuracy_score(y_test, pred)))

정확도 : 0.77 


In [None]:
# n_estimators(또는 learning_rate)를 늘리고, learning_rate(또는 n_estimators)를 줄인다면 서로 효과가 상쇄됩니다.
# → 때문에 이 두 파라미터를 잘 조정하는 것이 알고리즘의 핵심입니다.

# GBM

In [23]:
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.metrics import accuracy_score
import time
import warnings
warnings.filterwarnings('ignore')

In [28]:
# Record the wall-clock start; a later cell subtracts this from time.time()
# to report the GBM run time, so the name must stay as spelled here.
strat_time = time.time()

In [25]:
# Fix the seed so repeated runs of this cell give the same model.
gb_clf = GradientBoostingClassifier(random_state=0)
# y_train is a single-column DataFrame; pass the labels as a 1-D array,
# which is the shape scikit-learn estimators expect for the target.
gb_clf.fit(X_train, y_train.values.ravel())
gb_pred = gb_clf.predict(X_test)


NameError: name 'accuracy_scorec' is not defined

In [30]:
# Score the GBM predictions against the held-out labels.
gb_accuracy = accuracy_score(y_true=y_test, y_pred=gb_pred)
print('GBM 정확도 : {:.2f}'.format(gb_accuracy))

# Seconds elapsed since `strat_time` was recorded in an earlier cell.
elapsed_seconds = time.time() - strat_time
print('GBM 수행 시간 : {:.1f}초'.format(elapsed_seconds))

GBM 정확도 : 0.94
GBM 수행 시간 : 15.0초


In [33]:
from sklearn.model_selection import GridSearchCV
# Candidate hyper-parameters: 2 x 2 = 4 combinations.
param = {
    'n_estimators': [100, 500],
    'learning_rate': [0.05, 0.1]
}

# 2-fold cross-validation over every combination, using all CPU cores.
grid_cv = GridSearchCV(gb_clf, param_grid=param, cv=2, verbose=1, n_jobs=-1)
# Labels are flattened to 1-D, the shape scikit-learn expects for the target.
grid_cv.fit(X_train, y_train.values.ravel())
# Fitted attributes carry a trailing underscore: best_params_ / best_score_.
print('최적 파라미터 : ', grid_cv.best_params_)
print('최고 예측 정확도 : {0:.4f}'.format(grid_cv.best_score_))

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed: 18.5min remaining: 55.4min
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed: 78.3min finished


KeyboardInterrupt: 

# XGBoost

In [None]:
import xgboost as xgb ## XGBoost 불러오기
from xgboost import plot_importance ## Feature Importance를 불러오기 위함
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the breast-cancer dataset (the imported loader is load_breast_cancer).
dataset = load_breast_cancer()
X_features = dataset.data
y_label = dataset.target

# Build a DataFrame of the features and append the label as a 'target' column.
cancer_df = pd.DataFrame(data=X_features, columns=dataset.feature_names)
cancer_df['target'] = y_label
cancer_df.head(3)

In [None]:
# Class labels of the dataset; the attribute is `target_names` (plural).
print(dataset.target_names)
# Number of samples per target value.
print(cancer_df['target'].value_counts())