## Data loading

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.pyplot import rcParams
rcParams['figure.figsize'] = 12,6

import pandas as pd
import numpy as np

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from sklearn.base import clone

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:
def modelfit(alg, X, y, cv=5, test_size=0.2, random_state=0):
    """Print a short accuracy report for a scikit-learn style classifier.

    Three numbers are reported: the mean cross-validation score computed on
    the whole of ``X``/``y``, and the accuracy on the training and test
    portions of one hold-out split.

    Parameters
    ----------
    alg : estimator
        Object exposing ``fit`` and ``predict``. It is fitted in place on the
        training portion of the hold-out split.
    X, y : array-like
        Features and labels, passed unchanged to ``cross_val_score`` and
        ``train_test_split``.
    cv : int or cross-validation splitter, default 5
        Forwarded to ``cross_val_score``.
    test_size : float, default 0.2
        Fraction of samples held out for the test accuracy.
    random_state : int, default 0
        Seed of the hold-out split. The defaults reproduce the values that
        used to be hard-coded in this function.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    cv_scores = cross_val_score(alg, X, y, cv=cv)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Fit on the training portion only; the test portion stays unseen.
    alg.fit(X_train, y_train)

    train_accuracy = accuracy_score(y_train, alg.predict(X_train))
    test_accuracy = accuracy_score(y_test, alg.predict(X_test))

    # Print model report:
    print('\nModel Report')
    print('Training Accuracy : %.8g' % train_accuracy)
    print('Testing  Accuracy : %.8g' % test_accuracy)
    print('Cross  val  score : %.8g' % cv_scores.mean())

#     xgb.plot_importance(alg,max_num_features=20)
    

In [3]:
%%time
# Load the feature table; the 'label' column is the target and every other
# column is used as a feature.
df = pd.read_csv('./data/processed_data/all/resample_to_50_per_act/SVM/SVM_w50.csv')
df_labels = df['label']
df_data = np.array(df.drop(labels='label', axis=1))

CPU times: user 108 ms, sys: 4 ms, total: 112 ms
Wall time: 113 ms


## XGBoost

In [4]:
# NOTE(review): disabled label-encoding step, kept for reference.
# labels = df_labels.unique()
# # xgboost requires that labels are not strings, so replace them with integer ids
# label_index = 0
# for label in labels:
#     df_labels = df_labels.replace(label, label_index)
#     label_index += 1
# df_labels.unique()

### 调参

这里的调参参看的是余音大神的 [GitHub分享](https://github.com/lytforgood/MachineLearningTrick/blob/master/xgboost%E8%B0%83%E5%8F%82%E6%BC%94%E7%A4%BA.md) 和 [机器学习系列(12)_XGBoost参数调优完全指南（附Python代码）](http://blog.csdn.net/han_xiaoyang/article/details/52665396)。

#### 默认Xgboost参数

In [5]:
%%time
# Baseline: XGBoost with the library's default hyper-parameters.
cv0 = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
xgb0 = XGBClassifier()
modelfit(xgb0, df_data, df_labels, cv=cv0)


Model Report
Training Accuracy : 0.95298507
Testing  Accuracy : 0.89860835
Cross  val  score : 0.89403579
CPU times: user 44.6 s, sys: 0 ns, total: 44.6 s
Wall time: 44.6 s


#### 第一步：确定学习速率和tree_based 参数调优的估计器数目

In [6]:
%%time
# Step 1: with the learning rate fixed at 0.1, search the number of trees.
param_test1 = {'n_estimators': [100, 200, 500, 1000, 1500]}
est1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    # The target is multi-class, so state a classification objective instead
    # of the regression objective 'reg:linear'.
    objective='multi:softprob',
    scale_pos_weight=1,
    seed=27)
cv1 = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
# `iid` is no longer passed: scikit-learn deprecated it in 0.22 and removed it
# in 0.24. All test folds have the same size here, so the mean score is the same.
gsearch1 = GridSearchCV(est1, param_grid=param_test1, n_jobs=4, cv=cv1)
gsearch1.fit(df_data, df_labels)

CPU times: user 44.7 s, sys: 64 ms, total: 44.8 s
Wall time: 5min 44s


In [7]:
gsearch1.best_params_

{'n_estimators': 500}

In [8]:
gsearch1.best_score_

0.92743538767395628

In [9]:
%%time
# Refine n_estimators around the previous best value.
param_test1_2 = {'n_estimators': [400, 500, 600]}
# `iid` is no longer passed: deprecated in scikit-learn 0.22, removed in 0.24.
gsearch1_2 = GridSearchCV(est1, param_grid=param_test1_2, n_jobs=4, cv=cv1)
gsearch1_2.fit(df_data, df_labels)

CPU times: user 49.9 s, sys: 64 ms, total: 50 s
Wall time: 3min 28s


In [10]:
gsearch1_2.best_params_

{'n_estimators': 600}

In [11]:
gsearch1_2.best_score_

0.92823061630218684

In [13]:
%%time
# Check whether more than 600 trees helps.
param_test1_3 = {'n_estimators': [600, 700, 800, 900]}
# `iid` is no longer passed: deprecated in scikit-learn 0.22, removed in 0.24.
gsearch1_3 = GridSearchCV(est1, param_grid=param_test1_3, n_jobs=4, cv=cv1)
gsearch1_3.fit(df_data, df_labels)

CPU times: user 49.9 s, sys: 72 ms, total: 50 s
Wall time: 5min 10s


In [14]:
gsearch1_3.best_params_

{'n_estimators': 600}

In [15]:
gsearch1_3.best_score_

0.92823061630218684

In [16]:
%%time
# Narrow the n_estimators search to +/-50 around 600.
param_test1_4 = {'n_estimators': [550, 600, 650]}
# `iid` is no longer passed: deprecated in scikit-learn 0.22, removed in 0.24.
gsearch1_4 = GridSearchCV(est1, param_grid=param_test1_4, n_jobs=4, cv=cv1)
gsearch1_4.fit(df_data, df_labels)

CPU times: user 48.8 s, sys: 84 ms, total: 48.9 s
Wall time: 3min 44s


In [17]:
gsearch1_4.best_params_

{'n_estimators': 600}

In [18]:
gsearch1_4.best_score_

0.92823061630218684

In [19]:
%%time
# Narrow the n_estimators search to +/-20 around 600.
param_test1_5 = {'n_estimators': [580, 600, 620]}
# `iid` is no longer passed: deprecated in scikit-learn 0.22, removed in 0.24.
gsearch1_5 = GridSearchCV(est1, param_grid=param_test1_5, n_jobs=4, cv=cv1)
gsearch1_5.fit(df_data, df_labels)

CPU times: user 48.7 s, sys: 72 ms, total: 48.8 s
Wall time: 3min 41s


In [20]:
gsearch1_5.best_params_

{'n_estimators': 600}

In [21]:
gsearch1_5.best_score_

0.92823061630218684

In [22]:
%%time
# Narrow the n_estimators search to +/-10 around 600.
param_test1_6 = {'n_estimators': [590, 600, 610]}
# `iid` is no longer passed: deprecated in scikit-learn 0.22, removed in 0.24.
gsearch1_6 = GridSearchCV(est1, param_grid=param_test1_6, n_jobs=4, cv=cv1)
gsearch1_6.fit(df_data, df_labels)

CPU times: user 49.5 s, sys: 104 ms, total: 49.6 s
Wall time: 3min 40s


In [23]:
gsearch1_6.best_params_

{'n_estimators': 600}

In [24]:
gsearch1_6.best_score_

0.92823061630218684

#### 第二步： max_depth 和 min_child_weight 参数调优

In [25]:
%%time
# Step 2: joint search over tree depth and minimum child weight,
# with n_estimators fixed at 600.
param_test2 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2)
}
est2 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=600,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    # The target is multi-class, so state a classification objective instead
    # of the regression objective 'reg:linear'.
    objective='multi:softprob',
    scale_pos_weight=1,
    seed=27)
cv2 = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
# `iid` is no longer passed: deprecated in scikit-learn 0.22, removed in 0.24.
gsearch2 = GridSearchCV(est2, param_grid=param_test2, n_jobs=4, cv=cv2)
gsearch2.fit(df_data, df_labels)

CPU times: user 55.5 s, sys: 136 ms, total: 55.7 s
Wall time: 12min 3s


In [26]:
gsearch2.best_params_

{'max_depth': 5, 'min_child_weight': 1}

In [27]:
gsearch2.best_score_

0.92823061630218684

In [28]:
%%time
# Refine max_depth around the previous best value.
param_test2_2 = {
    'max_depth': [4, 5, 6]
}
# `iid` is no longer passed: deprecated in scikit-learn 0.22, removed in 0.24.
gsearch2_2 = GridSearchCV(est2, param_grid=param_test2_2, n_jobs=4, cv=cv2)
gsearch2_2.fit(df_data, df_labels)

CPU times: user 49.9 s, sys: 104 ms, total: 50 s
Wall time: 3min 42s


In [29]:
gsearch2_2.best_params_

{'max_depth': 5}

In [30]:
gsearch2_2.best_score_

0.92823061630218684

#### 第三步：gamma参数调优

In [31]:
%%time
# Step 3: search gamma over 0.0 .. 0.6 in steps of 0.1.
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 7)]
}
est3 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=600,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    # The target is multi-class, so state a classification objective instead
    # of the regression objective 'reg:linear'.
    objective='multi:softprob',
    scale_pos_weight=1,
    seed=27)
cv3 = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
# `iid` is no longer passed: deprecated in scikit-learn 0.22, removed in 0.24.
gsearch3 = GridSearchCV(est3, param_grid=param_test3, n_jobs=4, cv=cv3)
gsearch3.fit(df_data, df_labels)

CPU times: user 49.3 s, sys: 168 ms, total: 49.5 s
Wall time: 8min 34s


In [32]:
gsearch3.best_params_

{'gamma': 0.0}

In [33]:
gsearch3.best_score_

0.92823061630218684

#### 第四步：调整subsample 和 colsample_bytree 参数

In [34]:
%%time
# Step 4: joint search over row and column subsampling ratios (0.6 .. 0.9).
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)]
}
est4 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=600,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    # The target is multi-class, so state a classification objective instead
    # of the regression objective 'reg:linear'.
    objective='multi:softprob',
    scale_pos_weight=1,
    seed=27)
cv4 = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
# `iid` is no longer passed: deprecated in scikit-learn 0.22, removed in 0.24.
gsearch4 = GridSearchCV(est4, param_grid=param_test4, n_jobs=4, cv=cv4)
gsearch4.fit(df_data, df_labels)

CPU times: user 50.2 s, sys: 240 ms, total: 50.4 s
Wall time: 14min 38s


In [35]:
gsearch4.best_params_

{'colsample_bytree': 0.8, 'subsample': 0.6}

In [36]:
gsearch4.best_score_

0.92842942345924462

In [37]:
%%time
# Refine both subsampling ratios in steps of 0.05 around the previous best.
param_test4_2 = {
    'subsample': [0.55, 0.6, 0.65],
    'colsample_bytree': [0.75, 0.8, 0.85]
}
# `iid` is no longer passed: deprecated in scikit-learn 0.22, removed in 0.24.
gsearch4_2 = GridSearchCV(est4, param_grid=param_test4_2, n_jobs=4, cv=cv4)
gsearch4_2.fit(df_data, df_labels)

CPU times: user 48.2 s, sys: 176 ms, total: 48.4 s
Wall time: 9min 12s


In [38]:
gsearch4_2.best_params_

{'colsample_bytree': 0.8, 'subsample': 0.6}

In [39]:
gsearch4_2.best_score_

0.92842942345924462

#### 第五步：正则化参数调优

In [40]:
%%time
# Step 5: coarse search over the L1 regularisation strength.
param_test5 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
}
est5 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=600,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.6,
    colsample_bytree=0.8,
    # The target is multi-class, so state a classification objective instead
    # of the regression objective 'reg:linear'.
    objective='multi:softprob',
    scale_pos_weight=1,
    seed=27)
cv5 = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
# `iid` is no longer passed: deprecated in scikit-learn 0.22, removed in 0.24.
gsearch5 = GridSearchCV(est5, param_grid=param_test5, n_jobs=4, cv=cv5)
gsearch5.fit(df_data, df_labels)

CPU times: user 51.2 s, sys: 132 ms, total: 51.3 s
Wall time: 5min 19s


In [41]:
gsearch5.best_params_

{'reg_alpha': 1e-05}

In [42]:
gsearch5.best_score_

0.92644135188866783

In [43]:
%%time
# Finer reg_alpha search near zero, including no L1 regularisation at all.
param_test5_1 = {
    'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05]
}
est5_1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=600,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.6,
    colsample_bytree=0.8,
    # The target is multi-class, so state a classification objective instead
    # of the regression objective 'reg:linear'.
    objective='multi:softprob',
    nthread=4,
    scale_pos_weight=1,
    seed=27)
# `iid` is no longer passed: deprecated in scikit-learn 0.22, removed in 0.24.
gsearch5_1 = GridSearchCV(est5_1, param_grid=param_test5_1, n_jobs=4, cv=cv5)
gsearch5_1.fit(df_data, df_labels)

CPU times: user 58.2 s, sys: 120 ms, total: 58.3 s
Wall time: 2min 40s


In [44]:
gsearch5_1.best_params_

{'reg_alpha': 0}

In [45]:
gsearch5_1.best_score_

0.92842942345924462

#### 第六步：进一步 降低学习速率 增加更多的树

In [4]:
%%time
# Step 6: compare learning rates with every other parameter fixed.
param_test6 = {
    'learning_rate': [0.01, 0.1, 0.3]
}
est6 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=600,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.6,
    colsample_bytree=0.8,
    # The target is multi-class, so state a classification objective instead
    # of the regression objective 'reg:linear'.
    objective='multi:softprob',
    nthread=4,
    scale_pos_weight=1,
    reg_alpha=0,
    reg_lambda=1,
    seed=27)
cv6 = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
# `iid` is no longer passed: deprecated in scikit-learn 0.22, removed in 0.24.
gsearch6 = GridSearchCV(est6, param_grid=param_test6, n_jobs=4, cv=cv6)
gsearch6.fit(df_data, df_labels)

CPU times: user 58.4 s, sys: 52 ms, total: 58.4 s
Wall time: 1min 47s


In [5]:
gsearch6.best_params_

{'learning_rate': 0.1}

In [6]:
gsearch6.best_score_

0.92842942345924462

In [4]:
%%time
# Re-check the number of trees with all other tuned parameters in place.
param_test6_1 = {
    'n_estimators': [600, 800, 1000, 1500]
}
est6_1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=600,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.6,
    colsample_bytree=0.8,
    # The target is multi-class, so state a classification objective instead
    # of the regression objective 'reg:linear'.
    objective='multi:softprob',
    nthread=4,
    scale_pos_weight=1,
    reg_alpha=0,
    reg_lambda=1,
    seed=27)
cv6 = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
# `iid` is no longer passed: deprecated in scikit-learn 0.22, removed in 0.24.
gsearch6_1 = GridSearchCV(est6_1, param_grid=param_test6_1, n_jobs=4, cv=cv6)
gsearch6_1.fit(df_data, df_labels)

CPU times: user 1min, sys: 56 ms, total: 1min
Wall time: 3min 24s


In [5]:
gsearch6_1.best_params_

{'n_estimators': 600}

In [6]:
gsearch6_1.best_score_

0.92842942345924462

### 调参后交叉验证查看最终效果

In [6]:
%%time
# Evaluate the tuned configuration with the same report used for the baseline.
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
# Named `xgb_tuned` rather than `xgb` so the `xgboost` module alias imported
# at the top of the notebook is not overwritten by this cell.
xgb_tuned = XGBClassifier(
    learning_rate=0.1,
    n_estimators=600,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.6,
    colsample_bytree=0.8,
    # The target is multi-class, so state a classification objective instead
    # of the regression objective 'reg:linear'.
    objective='multi:softprob',
    nthread=4,
    scale_pos_weight=1,
    reg_alpha=0,
    reg_lambda=1,
    seed=27)
modelfit(xgb_tuned, df_data, df_labels, cv)


Model Report
Training Accuracy : 1
Testing  Accuracy : 0.92942346
Cross  val  score : 0.92842942
CPU times: user 4min 37s, sys: 160 ms, total: 4min 37s
Wall time: 1min 9s


### 组合特征实践

In [4]:
# Split the data into a training set and a test set (20% held out, fixed seed).
train_data, test_data, train_labels, test_labels = train_test_split(
    df_data, df_labels, test_size=0.2, random_state=1)

In [5]:
train_data.shape

(4020, 57)

In [6]:
test_data.shape

(1006, 57)

In [7]:
# Train model clf1 on the training set and check how it performs on the test set.
# NOTE(review): the name `xgb` shadows the `xgboost` module alias imported at
# the top of the notebook; it is kept because later cells refer to it.
xgb = XGBClassifier(
    learning_rate=0.1,
    n_estimators=600,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.6,
    colsample_bytree=0.8,
    # The target is multi-class, so state a classification objective instead
    # of the regression objective 'reg:linear'.
    objective='multi:softprob',
    nthread=4,
    scale_pos_weight=1,
    reg_alpha=0,
    reg_lambda=1,
    seed=27)
# Fit a clone: `xgb.fit(...)` would hand back `xgb` itself, so refitting `xgb`
# in a later cell would silently replace the model behind `clf1`.
clf1 = clone(xgb).fit(train_data, train_labels)
clf1_labels = clf1.predict(test_data)

In [8]:
print(classification_report(test_labels, clf1_labels, digits=5))

             precision    recall  f1-score   support

      bweep    0.94481   0.92675   0.93569       314
      clean    0.94030   0.96183   0.95094       131
      daily    0.90395   0.89888   0.90141       178
       dump    0.91837   0.91837   0.91837        98
        run    0.92308   0.85714   0.88889        14
      sweep    0.93671   0.95279   0.94468       233
       walk    0.89744   0.92105   0.90909        38

avg / total    0.93045   0.93042   0.93035      1006



#### 生成新特征

In [9]:
# Use clf1 (trained on the original training set) to turn the original
# training set into a new training set of per-tree leaf indices.
train_data2 = clf1.apply(train_data)
# Printed explicitly: a bare `.shape` expression in mid-cell is never displayed.
print(train_data2.shape)
train_data2

array([[50, 23, 49, ...,  0,  5,  0],
       [46, 27, 30, ...,  0,  2,  0],
       [46, 25, 30, ...,  0,  6,  0],
       ..., 
       [41, 25, 30, ...,  0,  4,  0],
       [41, 26, 30, ...,  0,  5,  0],
       [53, 28, 42, ...,  0,  2,  0]], dtype=int32)

In [10]:
# Use clf1 (trained on the original training set) to turn the original
# test set into a new test set of per-tree leaf indices.
test_data2 = clf1.apply(test_data)
# Printed explicitly: a bare `.shape` expression in mid-cell is never displayed.
print(test_data2.shape)
test_data2

array([[45, 25, 30, ...,  0,  6,  0],
       [32, 25, 32, ...,  0,  5,  0],
       [45, 25, 30, ...,  0,  4,  0],
       ..., 
       [46, 25, 30, ...,  0,  6,  0],
       [41, 25, 30, ...,  0,  5,  0],
       [39, 25, 33, ...,  0,  4,  0]], dtype=int32)

In [11]:
# Train a new model clf2 on the new training set and test it on the new test set.
# A clone is fitted because `xgb.fit(...)` returns `xgb` itself: without it,
# clf2 would be the very same object as any model fitted through `xgb` earlier,
# and that earlier model would be overwritten by this fit.
clf2 = clone(xgb).fit(train_data2, train_labels)
clf2_labels = clf2.predict(test_data2)

In [12]:
print(classification_report(test_labels, clf2_labels, digits=5))

             precision    recall  f1-score   support

      bweep    0.94156   0.92357   0.93248       314
      clean    0.91912   0.95420   0.93633       131
      daily    0.89944   0.90449   0.90196       178
       dump    0.92553   0.88776   0.90625        98
        run    0.92308   0.85714   0.88889        14
      sweep    0.92405   0.93991   0.93191       233
       walk    0.89744   0.92105   0.90909        38

avg / total    0.92364   0.92346   0.92340      1006



#### 原特征和新特征合并

In [13]:
# Merge helper
def mergeToOne(X,X2):
    """Concatenate two 2-D feature matrices column-wise.

    Row ``i`` of the result is row ``i`` of ``X`` followed by row ``i`` of
    ``X2``. The two inputs may have different numbers of columns.

    Parameters
    ----------
    X, X2 : array-like, shape (n_samples, n_features_*)
        Matrices with the same number of rows.

    Returns
    -------
    numpy.ndarray, shape (n_samples, X.shape[1] + X2.shape[1])

    Raises
    ------
    ValueError
        If the inputs do not have the same number of rows.
    """
    X = np.asarray(X)
    X2 = np.asarray(X2)
    if X.shape[0] != X2.shape[0]:
        raise ValueError(
            'mergeToOne: row counts differ (%d vs %d)' % (X.shape[0], X2.shape[0]))
    # One vectorised call replaces the per-row Python loop, which built a
    # ragged intermediate array whenever the two inputs had different widths.
    return np.hstack((X, X2))

In [14]:
# Merge the original training set with the leaf-index training set to form
# the combined training set.
train_data3 = mergeToOne(train_data, train_data2)
# Printed explicitly: a bare `.shape` expression in mid-cell is never displayed.
print(train_data3.shape)
train_data3

array([[  9.14897172e-02,   1.17917280e+00,  -2.48756373e-01, ...,
          0.00000000e+00,   5.00000000e+00,   0.00000000e+00],
       [ -3.51275036e-01,  -1.85045880e-01,   1.70630355e+00, ...,
          0.00000000e+00,   2.00000000e+00,   0.00000000e+00],
       [ -9.49873944e-01,  -5.68356785e-01,  -1.01270796e+00, ...,
          0.00000000e+00,   6.00000000e+00,   0.00000000e+00],
       ..., 
       [ -7.86117223e-01,  -5.45791445e-01,  -1.30422767e+00, ...,
          0.00000000e+00,   4.00000000e+00,   0.00000000e+00],
       [  3.69263126e-01,   1.10268977e+00,   6.08600078e-04, ...,
          0.00000000e+00,   5.00000000e+00,   0.00000000e+00],
       [  4.70093450e-01,  -4.09916843e-01,   2.06607679e+00, ...,
          0.00000000e+00,   2.00000000e+00,   0.00000000e+00]])

In [15]:
# Merge the original test set with the leaf-index test set to form the
# combined test set.
test_data3 = mergeToOne(test_data, test_data2)
# Printed explicitly: a bare `.shape` expression in mid-cell is never displayed.
print(test_data3.shape)
test_data3

array([[-1.02867869, -0.54883891, -0.50517894, ...,  0.        ,
         6.        ,  0.        ],
       [-0.93697866, -0.19744733, -0.52745753, ...,  0.        ,
         5.        ,  0.        ],
       [-1.08969337, -0.29251743,  0.09483577, ...,  0.        ,
         4.        ,  0.        ],
       ..., 
       [-1.05418781, -0.51831651, -0.28008058, ...,  0.        ,
         6.        ,  0.        ],
       [-0.88977841, -1.2161169 , -1.05505567, ...,  0.        ,
         5.        ,  0.        ],
       [-0.34316468,  0.79940605, -1.15294356, ...,  0.        ,
         4.        ,  0.        ]])

In [16]:
# Train the newest model clf3 on the combined training set and test it on the
# combined test set.
# A clone is fitted because `xgb.fit(...)` returns `xgb` itself: without it,
# clf3 would be the very same object as any model fitted through `xgb` earlier,
# and that earlier model would be overwritten by this fit.
clf3 = clone(xgb).fit(train_data3, train_labels)
clf3_labels = clf3.predict(test_data3)

In [17]:
print(classification_report(test_labels, clf3_labels, digits=5))

             precision    recall  f1-score   support

      bweep    0.94426   0.91720   0.93053       314
      clean    0.91176   0.94656   0.92884       131
      daily    0.88827   0.89326   0.89076       178
       dump    0.91579   0.88776   0.90155        98
        run    0.92308   0.85714   0.88889        14
      sweep    0.92050   0.94421   0.93220       233
       walk    0.89744   0.92105   0.90909        38

avg / total    0.91978   0.91948   0.91945      1006



## Random Forest

In [3]:
# # Grid search for the best parameter combination; very time-consuming
# rfc = RandomForestClassifier(n_estimators=200)
# parameters = {'max_features':np.arange(8,30,2)}
# clf = GridSearchCV(rfc, parameters)
# clf.fit(df_data, df_labels)
# clf.best_score_
# clf.best_params_
# clf.best_estimator_

In [4]:
# Seed the forest so the cross-validation scores can be reproduced on re-run;
# without `random_state` every execution gives slightly different numbers.
clf = RandomForestClassifier(n_estimators=200, max_features=10, random_state=0)

In [5]:
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

In [6]:
%%time
scores = cross_val_score(clf, df_data, df_labels, cv=cv)

Wall time: 1min 50s


In [7]:
scores
scores.mean()

array([ 0.91053678,  0.92644135,  0.9304175 ,  0.92047714,  0.92047714,
        0.92644135,  0.91053678,  0.91650099,  0.92644135,  0.92047714])

0.92087475149105358

In [8]:
# # Run cross-validation T times and take the average
# T = 10
# ave_scores =[]
# for t in range(T):
#     ave_scores.append( cross_val_score(clf, df_data, df_labels, cv=cv).mean() )
# ave_scores = np.array(ave_scores)

## Gradient Boosting Tree

In [9]:
# clf = GradientBoostingClassifier(n_estimators=100,learning_rate=0.1)
# Library defaults, with a fixed seed so the cross-validation scores can be
# reproduced on re-run.
clf = GradientBoostingClassifier(random_state=0)

In [10]:
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

In [11]:
%%time
scores = cross_val_score(clf, df_data, df_labels, cv=cv)

Wall time: 4min 3s


In [12]:
scores
scores.mean()

array([ 0.87872763,  0.89264414,  0.89860835,  0.88667992,  0.90059642,
        0.89860835,  0.88469185,  0.88866799,  0.89463221,  0.88866799])

0.8912524850894632