In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
data = pd.read_csv('./ufc_data/data_final.csv')

In [3]:
data.dtypes[:30]

Title                            bool
R_hst_SIG_STR._attempted      float64
R_hst_SIG_STR._succeeded      float64
R_hst_TOTAL_STR._attempted    float64
R_hst_TOTAL_STR._succeeded    float64
R_hst_TD_attempted            float64
R_hst_TD_succeeded            float64
R_hst_HEAD_attempted          float64
R_hst_HEAD_succeeded          float64
R_hst_BODY_attempted          float64
R_hst_BODY_succeeded          float64
R_hst_LEG_attempted           float64
R_hst_LEG_succeeded           float64
R_hst_DISTANCE_attempted      float64
R_hst_DISTANCE_succeeded      float64
R_hst_CLINCH_attempted        float64
R_hst_CLINCH_succeeded        float64
R_hst_GROUND_attempted        float64
R_hst_GROUND_succeeded        float64
R_hst_KD                      float64
R_hst_SIG_STR_pct             float64
R_hst_TD_pct                  float64
R_hst_SUB_ATT                 float64
R_hst_PASS                    float64
R_hst_REV                     float64
B_hst_SIG_STR._attempted      float64
B_hst_SIG_ST

In [4]:
data.dtypes[30:]

B_hst_TD_succeeded          float64
B_hst_HEAD_attempted        float64
B_hst_HEAD_succeeded        float64
B_hst_BODY_attempted        float64
B_hst_BODY_succeeded        float64
B_hst_LEG_attempted         float64
B_hst_LEG_succeeded         float64
B_hst_DISTANCE_attempted    float64
B_hst_DISTANCE_succeeded    float64
B_hst_CLINCH_attempted      float64
B_hst_CLINCH_succeeded      float64
B_hst_GROUND_attempted      float64
B_hst_GROUND_succeeded      float64
B_hst_KD                    float64
B_hst_SIG_STR_pct           float64
B_hst_TD_pct                float64
B_hst_SUB_ATT               float64
B_hst_PASS                  float64
B_hst_REV                   float64
R_Height(cm)                float64
R_Reach(cm)                 float64
B_Height(cm)                float64
B_Reach(cm)                 float64
R_age                       float64
B_age                       float64
R_experience                  int64
B_experience                  int64
R_title_experience          

首先训练一个xgboost模型作为基础模型，在训练xgboost模型时，我们注意到这些数据的差比起数据本身对结果更有预测性。

In [5]:
columns_atp_suc = ['SIG_STR.', 'TOTAL_STR.', 'TD', 'HEAD', 'BODY', 'LEG', 'DISTANCE', 'CLINCH', 'GROUND']
xgb_data = pd.DataFrame(index=data.index)
for col in columns_atp_suc:
    xgb_data['diff_'+col+'_attempted'] = data['R_hst_'+col+'_attempted'] - data['B_hst_'+col+'_attempted']
    xgb_data['diff_'+col+'_succeeded'] = data['R_hst_'+col+'_succeeded'] - data['B_hst_'+col+'_succeeded']
columns_hst = ['KD', 'SIG_STR_pct', 'TD_pct', 'SUB_ATT', 'PASS', 'REV' ]
for col in columns_hst:
    xgb_data['diff_'+col] = data['R_hst_'+col] - data['B_hst_'+col]
columns_other = ['Height(cm)', 'Reach(cm)', 'age', 'experience', 'title_experience', 'KO', 'sub', 
                 'winning', 'title_winning']
for col in columns_other:
    xgb_data['diff_'+col] = data['R_'+col] - data['B_'+col]

In [6]:
xgb_data['Title'] = data['Title']
xgb_data['red_won'] = data['red_won']
xgb_data['new_weight_class'] = data['new_weight_class'].map({'Light':1, 'Middle': 2, 'Heavy': 3})
xgb_data['In_NA'] = data['In_NA']
#xgb_data['fight_ID'] = data['fight_ID']

为了加快训练我们的基础模型，以及防止过拟合，对训练基础模型的数据不选取有'_attempted'后缀的特征

In [7]:
base_columns = [col for col in xgb_data.columns if '_attempted' not in col]

In [8]:
base_data = xgb_data[base_columns]

In [9]:
base_data.columns

Index(['diff_SIG_STR._succeeded', 'diff_TOTAL_STR._succeeded',
       'diff_TD_succeeded', 'diff_HEAD_succeeded', 'diff_BODY_succeeded',
       'diff_LEG_succeeded', 'diff_DISTANCE_succeeded',
       'diff_CLINCH_succeeded', 'diff_GROUND_succeeded', 'diff_KD',
       'diff_SIG_STR_pct', 'diff_TD_pct', 'diff_SUB_ATT', 'diff_PASS',
       'diff_REV', 'diff_Height(cm)', 'diff_Reach(cm)', 'diff_age',
       'diff_experience', 'diff_title_experience', 'diff_KO', 'diff_sub',
       'diff_winning', 'diff_title_winning', 'Title', 'red_won',
       'new_weight_class', 'In_NA'],
      dtype='object')

In [10]:
def get_model(data, pars, train_par, num_round=200):
    """给定数据集， 参数， 返回一个训练好的xgboost模型"""
    y = data['red_won']
    X_train, X_valid, y_train, y_valid = train_test_split(data.drop('red_won', axis=1), y, test_size=0.2,
                                                         random_state=0)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
    model = xgb.train(pars, dtrain, num_round, evals=watchlist, **train_par)
    return model

def get_importance(model):
    """return a DataFrame indicating feature importance"""
    feature_importance = model.get_fscore()
    feature_importance = pd.DataFrame({'featurn': list(feature_importance.keys()), 
                                   'importance': list(feature_importance.values())}
                                 ).sort_values(by='importance', ascending=False)
    return feature_importance

In [11]:
xgb_para = {'eta': 0.1, 'min_child_weight': 10, 'colsample_bytree': 0.8, 'max_depth': 6,  'gamma': 0,
            'subsample': 0.8, 'lambda': 1, 'booster' : 'gbtree', 'verbosity': 1, 'alpha': 0,
            'eval_metric': ['error', 'auc'], 'objective': 'binary:logistic', 'scale_pos_weight': 0.3/0.7,
            } 
train_par = { 'early_stopping_rounds': 50, 'maximize': False, 'verbose_eval': 10, }

In [12]:
base_model = get_model(base_data, xgb_para, train_par)

[0]	train-error:0.418955	train-auc:0.654422	valid-error:0.463557	valid-auc:0.580713
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[10]	train-error:0.352855	train-auc:0.750864	valid-error:0.40622	valid-auc:0.657972
[20]	train-error:0.333171	train-auc:0.777358	valid-error:0.398445	valid-auc:0.664027
[30]	train-error:0.309599	train-auc:0.797364	valid-error:0.380952	valid-auc:0.666341
[40]	train-error:0.296719	train-auc:0.819242	valid-error:0.381924	valid-auc:0.669982
[50]	train-error:0.280437	train-auc:0.835057	valid-error:0.382896	valid-auc:0.666821
[60]	train-error:0.26294	train-auc:0.852156	valid-error:0.375121	valid-auc:0.665626
[70]	train-error:0.244957	train-auc:0.86802	valid-error:0.372206	valid-auc:0.666502
[80]	train-error:0.231349	train-auc:0.880719	valid-error:0.376093	valid-auc:0.658573
Stopping. Best iteration:
[36]	train-error:0.302552	train-auc:0.810533	valid-error:0.381924	vali

In [13]:

get_importance(base_model)

Unnamed: 0,featurn,importance
4,diff_SIG_STR_pct,96
2,diff_LEG_succeeded,88
11,diff_CLINCH_succeeded,79
7,diff_GROUND_succeeded,77
20,diff_TD_succeeded,72
0,diff_age,70
18,diff_DISTANCE_succeeded,69
21,diff_BODY_succeeded,68
8,diff_PASS,68
3,diff_TD_pct,68


In [14]:
model1 = get_model(xgb_data, xgb_para, train_par)

[0]	train-error:0.426974	train-auc:0.657756	valid-error:0.474247	valid-auc:0.578168
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[10]	train-error:0.351397	train-auc:0.759529	valid-error:0.400389	valid-auc:0.651814
[20]	train-error:0.327339	train-auc:0.781239	valid-error:0.399417	valid-auc:0.671843
[30]	train-error:0.300608	train-auc:0.809355	valid-error:0.372206	valid-auc:0.678043
[40]	train-error:0.28627	train-auc:0.825894	valid-error:0.379009	valid-auc:0.673955
[50]	train-error:0.26853	train-auc:0.843018	valid-error:0.372206	valid-auc:0.671828
[60]	train-error:0.25079	train-auc:0.860077	valid-error:0.368319	valid-auc:0.671043
[70]	train-error:0.23548	train-auc:0.873631	valid-error:0.370262	valid-auc:0.666106
Stopping. Best iteration:
[26]	train-error:0.307169	train-auc:0.803679	valid-error:0.382896	valid-auc:0.678263



In [15]:
get_importance(model1)

Unnamed: 0,featurn,importance
0,diff_age,59
8,diff_DISTANCE_succeeded,52
25,diff_BODY_attempted,49
4,diff_SIG_STR_pct,49
10,diff_TD_attempted,49
7,diff_GROUND_attempted,48
32,diff_KO,48
21,diff_LEG_attempted,47
16,diff_winning,47
23,diff_GROUND_succeeded,45


In [16]:
data ['new_weight_class'] = data['new_weight_class'].map({'Light': 1, 'Middle': 2, 'Heavy': 3})
data.drop('fight_ID', axis=1, inplace=True)
model2 = get_model(data, xgb_para, train_par)

[0]	train-error:0.358445	train-auc:0.698834	valid-error:0.396501	valid-auc:0.639351
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[10]	train-error:0.334629	train-auc:0.785474	valid-error:0.393586	valid-auc:0.686533
[20]	train-error:0.309356	train-auc:0.810795	valid-error:0.389699	valid-auc:0.687217
[30]	train-error:0.288457	train-auc:0.831307	valid-error:0.379981	valid-auc:0.689094
[40]	train-error:0.265128	train-auc:0.854356	valid-error:0.375121	valid-auc:0.6916
[50]	train-error:0.247631	train-auc:0.871024	valid-error:0.359572	valid-auc:0.695234
[60]	train-error:0.231349	train-auc:0.886507	valid-error:0.355685	valid-auc:0.695736
[70]	train-error:0.215796	train-auc:0.89882	valid-error:0.347911	valid-auc:0.696559
[80]	train-error:0.205832	train-auc:0.910145	valid-error:0.356657	valid-auc:0.696611
[90]	train-error:0.195383	train-auc:0.918925	valid-error:0.367347	valid-auc:0.693633
[100]	train

这个模型比起基础的模型提高了，跟据之前数据探索的结论我们知道年龄优势， 经验优势，是否参加过头衔争夺赛， 是否为金腰带获得者都对比赛结果有较大影响。

In [17]:
data['age_adv'] = data['B_age'] - data['R_age']
data['experience_adv'] = data['R_experience'] - data['B_experience']
data['title_experience_adv'] = data['R_title_experience'] - data['B_title_experience']
data['title_winning_adv'] = data['R_title_winning'] - data['B_title_winning']
data['R_title_holder'] = data['R_title_winning'] > 0
data['B_title_holder'] = data['B_title_winning'] > 0

In [18]:

model3 = get_model(data, xgb_para, train_par)

[0]	train-error:0.365492	train-auc:0.701477	valid-error:0.400389	valid-auc:0.648032
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[10]	train-error:0.332928	train-auc:0.782426	valid-error:0.397473	valid-auc:0.685622
[20]	train-error:0.306197	train-auc:0.810187	valid-error:0.379009	valid-auc:0.692229
[30]	train-error:0.287971	train-auc:0.827523	valid-error:0.362488	valid-auc:0.699164
[40]	train-error:0.264642	train-auc:0.847864	valid-error:0.379009	valid-auc:0.700231
[50]	train-error:0.249332	train-auc:0.866665	valid-error:0.365403	valid-auc:0.701733
[60]	train-error:0.230863	train-auc:0.884284	valid-error:0.365403	valid-auc:0.703152
[70]	train-error:0.214581	train-auc:0.896556	valid-error:0.357629	valid-auc:0.705176
[80]	train-error:0.199757	train-auc:0.911675	valid-error:0.361516	valid-auc:0.704345
[90]	train-error:0.184204	train-auc:0.922513	valid-error:0.358601	valid-auc:0.702479
[100]	tr

相比前一个模型，这个模型的valid-error和valid-auc都有提高，看看加入身高优势会怎样

In [19]:
data['height_adv'] = data['R_Height(cm)'] - data['B_Height(cm)']
model4 = get_model(data, xgb_para, train_par)

[0]	train-error:0.387363	train-auc:0.698948	valid-error:0.414966	valid-auc:0.638392
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[10]	train-error:0.325881	train-auc:0.783248	valid-error:0.376093	valid-auc:0.684546
[20]	train-error:0.299879	train-auc:0.810288	valid-error:0.373178	valid-auc:0.686548
[30]	train-error:0.287242	train-auc:0.830433	valid-error:0.367347	valid-auc:0.687629
[40]	train-error:0.265128	train-auc:0.855964	valid-error:0.367347	valid-auc:0.688643
[50]	train-error:0.246902	train-auc:0.870144	valid-error:0.368319	valid-auc:0.685237
[60]	train-error:0.225759	train-auc:0.887439	valid-error:0.358601	valid-auc:0.689015
[70]	train-error:0.212151	train-auc:0.899048	valid-error:0.36346	valid-auc:0.690115
[80]	train-error:0.199271	train-auc:0.911324	valid-error:0.359572	valid-auc:0.687919
[90]	train-error:0.187849	train-auc:0.920281	valid-error:0.367347	valid-auc:0.685534
[100]	tra

成绩反倒有点下降， 显而易见我们的模型发生了严重的过拟合，来看看特征重要性

In [20]:
get_importance(model4).iloc[:40,]

Unnamed: 0,featurn,importance
6,age_adv,56
10,R_hst_TD_pct,52
31,B_hst_TD_attempted,50
21,R_hst_BODY_attempted,45
0,R_age,44
25,R_hst_LEG_succeeded,41
47,B_hst_TOTAL_STR._succeeded,40
52,R_hst_TD_attempted,40
40,R_hst_GROUND_attempted,39
17,R_hst_SIG_STR_pct,38


In [21]:
get_importance(model4).iloc[40:,]

Unnamed: 0,featurn,importance
70,B_Height(cm),22
15,B_hst_LEG_succeeded,22
63,B_hst_GROUND_succeeded,22
61,B_hst_BODY_succeeded,22
59,R_hst_REV,22
58,B_winning,22
44,B_hst_PASS,21
2,new_weight_class,20
55,R_hst_SIG_STR._attempted,19
26,B_hst_REV,18


### 调节超参数

In [22]:
# 由于这是不平衡的分类，先试试减小scale_positive_weight
xgb_para1 = {'eta': 0.1, 'min_child_weight': 10, 'colsample_bytree': 0.8, 'max_depth': 6,  'gamma': 0,
            'subsample': 0.8, 'lambda': 1, 'booster' : 'gbtree', 'verbosity': 0, 'alpha': 0,
            'eval_metric': ['error', 'auc'], 'objective': 'binary:logistic', 'scale_pos_weight': 0.3,
            } 
model5 = get_model(data, xgb_para1, train_par)

[0]	train-error:0.444228	train-auc:0.694367	valid-error:0.480078	valid-auc:0.641426
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[10]	train-error:0.426002	train-auc:0.770915	valid-error:0.469388	valid-auc:0.685638
[20]	train-error:0.403402	train-auc:0.794833	valid-error:0.453839	valid-auc:0.688577
[30]	train-error:0.378372	train-auc:0.811816	valid-error:0.443149	valid-auc:0.691208
[40]	train-error:0.364763	train-auc:0.831128	valid-error:0.424684	valid-auc:0.693076
[50]	train-error:0.343135	train-auc:0.848916	valid-error:0.421769	valid-auc:0.693151
[60]	train-error:0.326853	train-auc:0.866088	valid-error:0.425656	valid-auc:0.695703
[70]	train-error:0.309113	train-auc:0.878081	valid-error:0.418853	valid-auc:0.695844
[80]	train-error:0.290887	train-auc:0.893086	valid-error:0.413022	valid-auc:0.693754
[90]	train-error:0.280923	train-auc:0.901108	valid-error:0.413994	valid-auc:0.691848
[100]	tr

In [23]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
clf = XGBClassifier(learning_rate=0.1, colsample_bytree=0.8, gamma=0, n_estimators=200,
                    subsample=1, reg_lambda=1, booster='gbtree', reg_alpha=0, 
                    objective='binary:logistic', scale_pos_weight=0.3/0.7)
grid_pars = { 'min_child_weight': list(range(5, 50, 5)), 
             'max_depth':list(range(3, 11))} 
clf = GridSearchCV(clf, grid_pars, scoring='roc_auc')
y = data['red_won']
X = data.drop('red_won', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train )

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.8, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=200, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=0.4285714285714286,
                                     seed=None, silent=None, subsample=1,
                                     verbosity=1),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
                         'min_child_weigh

In [24]:
clf.best_score_

0.6805280746707483

竟然比先前更差， 减小colsample_bytree 试试

In [25]:
xgb_para1 = {'eta': 0.1, 'min_child_weight': 10, 'colsample_bytree': 0.6, 'max_depth': 6,  'gamma': 0,
            'subsample': 0.8, 'lambda': 1, 'booster' : 'gbtree', 'verbosity': 0, 'alpha': 0,
            'eval_metric': ['error', 'auc'], 'objective': 'binary:logistic', 'scale_pos_weight': 0.3,
            } 
model6 = get_model(data, xgb_para1, train_par)

[0]	train-error:0.444471	train-auc:0.693097	valid-error:0.48105	valid-auc:0.63063
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[10]	train-error:0.434265	train-auc:0.77206	valid-error:0.466472	valid-auc:0.684645
[20]	train-error:0.410207	train-auc:0.793101	valid-error:0.458698	valid-auc:0.683897
[30]	train-error:0.398299	train-auc:0.811268	valid-error:0.450923	valid-auc:0.689153
[40]	train-error:0.376185	train-auc:0.829497	valid-error:0.43829	valid-auc:0.690036
[50]	train-error:0.352855	train-auc:0.845879	valid-error:0.431487	valid-auc:0.690027
[60]	train-error:0.339004	train-auc:0.861843	valid-error:0.419825	valid-auc:0.694911
[70]	train-error:0.316889	train-auc:0.878911	valid-error:0.414966	valid-auc:0.694224
[80]	train-error:0.303038	train-auc:0.890803	valid-error:0.413022	valid-auc:0.693388
[90]	train-error:0.291859	train-auc:0.900394	valid-error:0.410107	valid-auc:0.689582
[100]	train-

In [26]:
xgb_para = {'eta': 0.1, 'min_child_weight': 10, 'colsample_bytree': 0.9, 'max_depth': 6,  'gamma': 0,
            'subsample': 0.8, 'lambda': 1, 'booster' : 'gbtree', 'verbosity': 0, 'alpha': 0,
            'eval_metric': ['error', 'auc'], 'objective': 'binary:logistic', 'scale_pos_weight': 0.3,
            } 
model5 = get_model(data, xgb_para1, train_par)

[0]	train-error:0.444471	train-auc:0.693097	valid-error:0.48105	valid-auc:0.63063
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[10]	train-error:0.434265	train-auc:0.77206	valid-error:0.466472	valid-auc:0.684645
[20]	train-error:0.410207	train-auc:0.793101	valid-error:0.458698	valid-auc:0.683897
[30]	train-error:0.398299	train-auc:0.811268	valid-error:0.450923	valid-auc:0.689153
[40]	train-error:0.376185	train-auc:0.829497	valid-error:0.43829	valid-auc:0.690036
[50]	train-error:0.352855	train-auc:0.845879	valid-error:0.431487	valid-auc:0.690027
[60]	train-error:0.339004	train-auc:0.861843	valid-error:0.419825	valid-auc:0.694911
[70]	train-error:0.316889	train-auc:0.878911	valid-error:0.414966	valid-auc:0.694224
[80]	train-error:0.303038	train-auc:0.890803	valid-error:0.413022	valid-auc:0.693388
[90]	train-error:0.291859	train-auc:0.900394	valid-error:0.410107	valid-auc:0.689582
[100]	train-

In [27]:
xgb_para = {'eta': 0.1, 'min_child_weight': 50, 'colsample_bytree': 0.9, 'max_depth': 5,  'gamma': 0,
            'subsample': 0.8, 'lambda': 1, 'booster' : 'gbtree', 'verbosity': 0, 'alpha': 0,
            'eval_metric': ['error', 'auc'], 'objective': 'binary:logistic', 'scale_pos_weight': 0.3,
            } 
train_par['verbose_eval'] = 40
model = get_model(data, xgb_para, train_par, 2000)

[0]	train-error:0.57497	train-auc:0.639555	valid-error:0.594752	valid-auc:0.634755
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[40]	train-error:0.440097	train-auc:0.727605	valid-error:0.455782	valid-auc:0.700448
[80]	train-error:0.408019	train-auc:0.757469	valid-error:0.434402	valid-auc:0.705684
[120]	train-error:0.390522	train-auc:0.776389	valid-error:0.431487	valid-auc:0.708578
[160]	train-error:0.374727	train-auc:0.79254	valid-error:0.420797	valid-auc:0.711339
[200]	train-error:0.361847	train-auc:0.809822	valid-error:0.41691	valid-auc:0.712355
[240]	train-error:0.350425	train-auc:0.821662	valid-error:0.413994	valid-auc:0.714443
Stopping. Best iteration:
[227]	train-error:0.350911	train-auc:0.817958	valid-error:0.413022	valid-auc:0.715582



经过反复调参，发现上面的模型能获得最大auc，将它保存。

In [28]:
model.save_model('./best_xgb_model.csv')

In [38]:
import category_encoders as ce
ohe = ce.OneHotEncoder(cols=['Title', 'In_NA', 'R_title_holder', 'B_title_holder'])
data = ohe.fit_transform(data)


KeyError: 'Title'

In [39]:
y = data['red_won']
X = data.drop('red_won', axis=1)

In [43]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras import callbacks
from tensorflow.keras import optimizers
from tensorflow.keras import utils





In [54]:
def make_mpl(input_dim):
    model = Sequential()
    
    model.add(Dense(64, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    
    model.add(Dense(2, activation='softmax'))
    
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [55]:
rlr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

In [57]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

In [58]:
model = make_mpl(X.shape[1])
model.fit(X_train, utils.to_categorical(y_train), batch_size=32, 
          validation_data=(X_valid, utils.to_categorical(y_valid)),
          verbose=1, callbacks=[rlr], epochs=100)

Train on 4115 samples, validate on 1029 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100


Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7f59b589afd0>

In [59]:
def make_mpl(input_dim):
    model = Sequential()
    
    model.add(Dense(64, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [60]:
model = make_mpl(X.shape[1])
model.fit(X_train, y_train, batch_size=32, 
          validation_data=(X_valid, y_valid),
          verbose=1, callbacks=[rlr], epochs=100)

Train on 4115 samples, validate on 1029 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100


Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7f59af5c8f90>

In [61]:
from sklearn.linear_model import LogisticRegression as log_reg
log_model = log_reg()
log_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [62]:
y_valid_pred = log_model.predict(X_valid)

In [63]:
from sklearn.metrics import accuracy_score
accuracy_score(y_valid, y_valid_pred)

0.6987366375121478

可以看到， xgboost分类器的效果最好， logistic其次， 而mlp模型最差