In [1]:
import warnings
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including deprecation and data-conversion warnings from pandas / scikit-learn.
warnings.filterwarnings("ignore")

# Use the public entry point; `sklearn._config` is a private module whose
# location is not part of scikit-learn's supported API.
from sklearn import set_config
# Show every estimator parameter in reprs, not only the non-default ones.
set_config(print_changed_only=False)

In [2]:
import pandas as pd

# Load the (index, name) table of feature names. The separator is a regex, so it
# is written as a raw string: '\s' in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
feature_name_df = pd.read_csv(
    './human_activity/features.txt',
    sep=r'\s+',
    header=None,
    names=['column_index', 'column_name'],
)

In [3]:
def get_new_feature_name_df(old_feature_name_df):
    """Return a copy of the feature-name table in which duplicate names are unique.

    The first occurrence of a name is kept unchanged; the k-th repeat (k >= 1)
    is renamed to ``"<name>_<k>"``.

    Parameters
    ----------
    old_feature_name_df : pandas.DataFrame
        Table containing a ``column_name`` column of string names. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        The input columns in their original order (with ``column_name``
        de-duplicated) plus a trailing ``dup_cnt`` column holding the number of
        earlier rows with the same original name. The index is a fresh
        0..n-1 range.
    """
    # Per-row count of earlier rows sharing the same name (0 for first occurrence).
    dup_cnt = old_feature_name_df.groupby('column_name').cumcount().to_numpy()

    # reset_index(drop=True) returns a new frame, so the caller's frame is untouched.
    new_feature_name_df = old_feature_name_df.reset_index(drop=True)
    new_feature_name_df['dup_cnt'] = dup_cnt

    # Build the names directly instead of a row-wise apply with positional
    # Series indexing (x[0], x[1]), which is deprecated for label-indexed
    # Series and fails outright on an empty frame.
    new_feature_name_df['column_name'] = [
        name + '_' + str(cnt) if cnt > 0 else name
        for name, cnt in zip(new_feature_name_df['column_name'], dup_cnt)
    ]
    return new_feature_name_df

# De-duplicate the feature names so they can be used as DataFrame column labels.
new_feature_name_df = get_new_feature_name_df(feature_name_df)
# Select the names by label rather than by column position.
feature_name = new_feature_name_df['column_name'].values.tolist()

# The data files are whitespace-delimited with no header row. The separator is a
# regex, written as a raw string to avoid the invalid '\s' escape sequence.
train_X = pd.read_csv('./human_activity/train/X_train.txt', sep=r'\s+', header=None, names=feature_name)
test_X = pd.read_csv('./human_activity/test/X_test.txt', sep=r'\s+', header=None, names=feature_name)

train_y = pd.read_csv('./human_activity/train/y_train.txt', sep=r'\s+', header=None, names=['action'])
test_y = pd.read_csv('./human_activity/test/y_test.txt', sep=r'\s+', header=None, names=['action'])

train_X.shape, test_X.shape, train_y.shape, test_y.shape

((7352, 561), (2947, 561), (7352, 1), (2947, 1))

In [4]:
from sklearn.ensemble import GradientBoostingClassifier

# IPython help syntax (not plain Python): the trailing "?" displays the class's
# init signature and docstring instead of executing anything. Interactive aid only.
GradientBoostingClassifier?

Init signature:
GradientBoostingClassifier(
    *,
    loss='log_loss',
    learning_rate=0.1,
    n_estimators=100,
    subsample=1.0,
    criterion='friedman_mse',
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_depth=3,
    min_impurity_decrease=0.0,

In [5]:
from sklearn.metrics import accuracy_score
import time

# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()

# Baseline model: default hyperparameters, fixed seed for reproducibility.
gb_clf = GradientBoostingClassifier(random_state=42)
# Pass the labels as a 1-D column; an (n, 1) DataFrame makes scikit-learn emit a
# DataConversionWarning and flatten it internally.
gb_clf.fit(train_X, train_y['action'])

predict = gb_clf.predict(test_X)
accuracy = accuracy_score(test_y['action'], predict)

print('GBM 정확도: {0:.4f}'.format(accuracy))
# The reported time covers both fitting and prediction.
print("GBM 수행 시간: {0:.1f} 초 ".format(time.perf_counter() - start_time))

GBM 정확도: 0.9389
GBM 수행 시간: 764.3 초 


In [6]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the search.
# NOTE(review): an earlier version searched n_estimators in [100, 500]; presumably
# the smaller values are used here to keep the search fast - confirm before
# drawing conclusions from the reported scores.
params = {
    'n_estimators': [10, 50],
    'learning_rate': [0.05, 0.1],
}

# GridSearchCV clones the estimator for every fit, so the already-fitted
# gb_clf only contributes its parameter settings here.
grid_cv = GridSearchCV(gb_clf, param_grid=params, cv=2, verbose=1)
# Pass the labels as a 1-D column rather than an (n, 1) DataFrame to avoid
# scikit-learn's DataConversionWarning.
grid_cv.fit(train_X, train_y['action'])

print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

Fitting 2 folds for each of 4 candidates, totalling 8 fits
최적 하이퍼 파라미터:
 {'learning_rate': 0.1, 'n_estimators': 50}
최고 예측 정확도: 0.8977


In [7]:
# Score the estimator selected by the grid search on the held-out test set.
best_gb_clf = grid_cv.best_estimator_
predict = best_gb_clf.predict(test_X)

accuracy = accuracy_score(y_true=test_y, y_pred=predict)
print('GBM 정확도: {0:.4f}'.format(accuracy))

GBM 정확도: 0.9301
