In [5]:
import pandas as pd

url = "https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/features.txt"
feature_name_df = pd.read_csv(url, sep='\s+', header=None, names=['column_ndex', 'column_name'])
feature_name = feature_name_df.iloc[:, 1].values.tolist()
X_train = pd.read_csv('https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/train/X_train.txt', sep='\s+', header=None)
X_test = pd.read_csv('https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/test/X_test.txt', sep='\s+', header=None)
X_train.columns = feature_name
X_test.columns = feature_name

In [7]:
y_train_url = "https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/train/y_train.txt"
y_test_url = "https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/test/y_test.txt"

y_train = pd.read_csv(y_train_url, sep='\s+', header=None, names=['action'])
y_test = pd.read_csv(y_test_url, sep='\s+', header=None, names=['action'])

In [8]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import time
import warnings

warnings.filterwarnings('ignore')

In [9]:
start_time = time.time()
gb_clf = GradientBoostingClassifier(random_state=13)
gb_clf.fit(X_train, y_train)
gb_pred = gb_clf.predict(X_test)

print('ACC : ', accuracy_score(y_test, gb_pred))
print('Fit time : ', time.time() - start_time)

ACC :  0.9385816084153377
Fit time :  593.3393287658691


In [10]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators' : [100, 500],
    'learning_rate' : [0.05, 0.1]
}

start_time = time.time()
grid = GridSearchCV(gb_clf, param_grid=params, cv=2, verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)
print('Fit time : ', time.time() - start_time)

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed: 25.7min remaining:  8.6min
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed: 30.3min finished


Fit time :  4795.241358757019


In [11]:
grid.best_score_

0.9011153427638738

In [12]:
grid.best_params_

{'learning_rate': 0.1, 'n_estimators': 500}

In [15]:
accuracy_score(y_test, grid.best_estimator_.predict(X_test))

0.9419748897183576

In [16]:
from xgboost import XGBClassifier

start_time = time.time()
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depths=3)
xgb.fit(X_train.values, y_train)
print('Fit time : ', time.time() - start_time)

Parameters: { max_depths } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Fit time :  65.03522944450378


In [17]:
accuracy_score(y_test, grid.best_estimator_.predict(X_test.values))

0.9419748897183576

In [19]:
from xgboost import XGBClassifier

evals = [(X_test.values, y_test)]

start_time = time.time()
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
xgb.fit(X_train.values, y_train, early_stopping_rounds=10, eval_set=evals)
print('Fit time : ', time.time() - start_time)

[0]	validation_0-merror:0.17916
Will train until validation_0-merror hasn't improved in 10 rounds.
[1]	validation_0-merror:0.16288
[2]	validation_0-merror:0.15100
[3]	validation_0-merror:0.14388
[4]	validation_0-merror:0.14252
[5]	validation_0-merror:0.13336
[6]	validation_0-merror:0.12521
[7]	validation_0-merror:0.12453
[8]	validation_0-merror:0.11978
[9]	validation_0-merror:0.11707
[10]	validation_0-merror:0.11130
[11]	validation_0-merror:0.10892
[12]	validation_0-merror:0.10960
[13]	validation_0-merror:0.10655
[14]	validation_0-merror:0.10485
[15]	validation_0-merror:0.10248
[16]	validation_0-merror:0.10078
[17]	validation_0-merror:0.09773
[18]	validation_0-merror:0.09841
[19]	validation_0-merror:0.09637
[20]	validation_0-merror:0.09569
[21]	validation_0-merror:0.09603
[22]	validation_0-merror:0.09637
[23]	validation_0-merror:0.09671
[24]	validation_0-merror:0.09399
[25]	validation_0-merror:0.09264
[26]	validation_0-merror:0.09196
[27]	validation_0-merror:0.09230
[28]	validation_0-m

In [20]:
accuracy_score(y_test, grid.best_estimator_.predict(X_test.values))

0.9419748897183576

In [21]:
from lightgbm import LGBMClassifier

start_time = time.time()
lgbm = LGBMClassifier(n_estimators=400)
lgbm.fit(X_train.values, y_train, early_stopping_rounds=100, eval_set=evals)
print('Fit time : ', time.time() - start_time)

[1]	valid_0's multi_logloss: 1.4404
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_logloss: 1.21574
[3]	valid_0's multi_logloss: 1.04795
[4]	valid_0's multi_logloss: 0.913299
[5]	valid_0's multi_logloss: 0.812686
[6]	valid_0's multi_logloss: 0.725964
[7]	valid_0's multi_logloss: 0.652995
[8]	valid_0's multi_logloss: 0.591598
[9]	valid_0's multi_logloss: 0.539383
[10]	valid_0's multi_logloss: 0.499944
[11]	valid_0's multi_logloss: 0.462273
[12]	valid_0's multi_logloss: 0.429676
[13]	valid_0's multi_logloss: 0.401908
[14]	valid_0's multi_logloss: 0.377718
[15]	valid_0's multi_logloss: 0.357455
[16]	valid_0's multi_logloss: 0.339918
[17]	valid_0's multi_logloss: 0.325799
[18]	valid_0's multi_logloss: 0.314716
[19]	valid_0's multi_logloss: 0.301914
[20]	valid_0's multi_logloss: 0.292755
[21]	valid_0's multi_logloss: 0.284754
[22]	valid_0's multi_logloss: 0.276745
[23]	valid_0's multi_logloss: 0.270387
[24]	valid_0's multi_logloss: 0.265765
[25]	valid_0's 

In [22]:
accuracy_score(y_test, grid.best_estimator_.predict(X_test.values))

0.9419748897183576