In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
import model
import evaluation
import preprocess
from collections import defaultdict
from IPython.display import display, Markdown

%matplotlib inline

模型cross validation 1次的时间大概在 4s 左右

In [2]:
X, y = preprocess.load_data()

In [3]:
models = {
    'DecisionTree': model.DecisionTree,
    'SVM': model.SVM,
    'LR': model.LinearModel,
    'XGBoost': model.XGBoost
}

## Model View

In [None]:
dfs_results = defaultdict(lambda: pd.DataFrame(columns=['f1', 'roc_auc']))

In [None]:
# 1m 39s
for name, md in models.items():
    for use_balanced_trick in (True, False):
        result = evaluation.cross_validation(md(balanced_learning=use_balanced_trick), X, y, scoring='both', n_jobs=-1)
        display(Markdown("**{}**".format(name)))
        display(Markdown("- f1: {}".format(result['f1'])))
        display(Markdown("- roc_auc: {}".format(result['roc_auc'])))
        if use_balanced_trick:
            dfs_results['origin'].loc[name] = result['f1'].mean(), result['roc_auc'].mean()
        else:
            dfs_results['model_balance'].loc[name] = result['f1'].mean(), result['roc_auc'].mean()


## Data View

In [None]:
# 5min
sample_methods = {
    'smote': preprocess.get_smote('regular'),
    'sm_bl1': preprocess.get_smote('borderline1'),
    'sm_bl2': preprocess.get_smote('borderline2'),
    'sm_svm': preprocess.get_smote('svm'),
    'upsampling': preprocess.upsampling,
    'downsampling': preprocess.downsampling
}

df_sample = defaultdict(lambda: pd.DataFrame(columns=['f1', 'roc_auc']))
for name, md in models.items():
    for method_name, method in sample_methods.items():
        result = evaluation.cross_validation(md(balanced_learning=False, sample_method=method), X, y, scoring='both', n_jobs=-1)
        display(Markdown("**{}_{}**".format(name, method_name)))
        display(Markdown("- f1: {}".format(result['f1'])))
        display(Markdown("- roc_auc: {}".format(result['roc_auc'])))
        dfs_results[method_name].loc[name] = result['f1'].mean(), result['roc_auc'].mean()



In [None]:
df_overall = pd.DataFrame(columns=('f1', 'roc_auc'))
for name, df in dfs_results.items():
    for i, row in df.iterrows():
        df_overall.loc[i + '_' + name] = row

In [None]:
df_overall.sort_index(inplace=True)

sns.plt.figure(figsize=(10, 10))
sns.plt.xlim((0.92, 1))
colors = sns.color_palette(n_colors=len(df_overall))
palette = np.concatenate([[colors[x]]* (len(sample_methods) + 2) for x in range(len(models))])
sns.barplot(x='roc_auc', y=df_overall.index.values, data=df_overall, palette=palette)

In [None]:
sns.plt.figure(figsize=(10, 10))
sns.plt.xlim((0.6, 1))
colors = sns.color_palette(n_colors=len(df_overall))
palette = np.concatenate([[colors[x]]* (len(sample_methods) + 2) for x in range(len(models))])
sns.barplot(x='f1', y=df_overall.index.values, data=df_overall, palette=palette)

In [8]:
evaluation.cross_validation(model.MultiClassesLearner('KNN'), X, y, scoring='both', n_jobs=-1)

{'f1': array([ 0.86075949,  0.8583691 ,  0.88655462]),
 'roc_auc': array([ 0.94523086,  0.95091916,  0.96111451])}

In [15]:
output = []
for a in range(15, 25):
    a /= 100
    for b in (800, 1000, 1200):
        scores = evaluation.cross_validation(model.XGBoost(colsample_bytree=0.8, subsample=0.8,
                                                           max_depth=9, min_child_weight=1, reg_alpha=reg,
                                                           learning_rate=0.2, silent=False, gamma=0.0001,
                                                           n_estimators=1000, normalizer_name='standard'),
                                             X, y, scoring='f1', n_jobs=-1, n_splits=5)
        print("a:{} b{} \tscore:{}".format(a, b, scores.mean()))
        output.append([a, b, scores.mean()])



a:0.15 b800 	score:0.8879845810405949
a:0.15 b1000 	score:0.8878510373889916
a:0.15 b1200 	score:0.8881521335604614
a:0.16 b800 	score:0.8906637380565098
a:0.16 b1000 	score:0.8893851663766477
a:0.16 b1200 	score:0.8902548231790135
a:0.17 b800 	score:0.8871552294959855
a:0.17 b1000 	score:0.8864939455852399
a:0.17 b1200 	score:0.8865550629045487
a:0.18 b800 	score:0.8884270513238677
a:0.18 b1000 	score:0.8875241542727071
a:0.18 b1200 	score:0.88755145015859
a:0.19 b800 	score:0.8888288425744438
a:0.19 b1000 	score:0.8893213278112146
a:0.19 b1200 	score:0.8895080630638199
a:0.2 b800 	score:0.8905400090923532
a:0.2 b1000 	score:0.8907662802782687
a:0.2 b1200 	score:0.890759680831283
a:0.21 b800 	score:0.886388675109313
a:0.21 b1000 	score:0.8868033675103328
a:0.21 b1200 	score:0.8867738255025716
a:0.22 b800 	score:0.8874717817844904
a:0.22 b1000 	score:0.887619478782511
a:0.22 b1200 	score:0.8878491855461462
a:0.23 b800 	score:0.8883600294923883
a:0.23 b1000 	score:0.8881059368585342
a:0

In [18]:
from sklearn.svm import SVC
SVC?

In [19]:
from sklearn.model_selection import cross_val_score

In [21]:
a = {'C': 123}
b = {'D: 123'}
c = {'C: 13'}

In [26]:
a.

TypeError: update() argument after ** must be a mapping, not set

In [16]:
output.sort(key=lambda x: -x[2])

In [17]:
output


[[0.2, 1000, 0.89076628027826865],
 [0.2, 1200, 0.89075968083128299],
 [0.24, 1000, 0.89068761884339553],
 [0.16, 800, 0.89066373805650978],
 [0.2, 800, 0.8905400090923532],
 [0.16, 1200, 0.8902548231790135],
 [0.24, 800, 0.88973884252691116],
 [0.24, 1200, 0.88965996193780261],
 [0.19, 1200, 0.88950806306381991],
 [0.16, 1000, 0.88938516637664766],
 [0.19, 1000, 0.88932132781121465],
 [0.19, 800, 0.88882884257444383],
 [0.18, 800, 0.88842705132386768],
 [0.23, 800, 0.88836002949238835],
 [0.15, 1200, 0.88815213356046141],
 [0.23, 1000, 0.88810593685853423],
 [0.23, 1200, 0.88804654434546448],
 [0.15, 800, 0.88798458104059486],
 [0.15, 1000, 0.88785103738899163],
 [0.22, 1200, 0.88784918554614622],
 [0.22, 1000, 0.88761947878251102],
 [0.18, 1200, 0.88755145015859005],
 [0.18, 1000, 0.88752415427270714],
 [0.22, 800, 0.88747178178449038],
 [0.17, 800, 0.88715522949598546],
 [0.21, 1000, 0.88680336751033284],
 [0.21, 1200, 0.88677382550257156],
 [0.17, 1200, 0.88655506290454866],
 [0.17

In [14]:
import xgboost as xgb

def modelfit(alg, X, y, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X, label=y)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(X, y,eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(X)
    dtrain_predprob = alg.predict_proba(X)[:,1]
        
    #Print model report:
    print ("\nModel Report")
    print ("Accuracy : %.4g" % metrics.accuracy_score(y, dtrain_predictions))
    print ("AUC Score (Train): %f" % metrics.roc_auc_score(y, dtrain_predprob))
                    
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

xtr = model.XGBoost()

feat_imp = pd.Series(xtr.xgb.get_booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')

In [7]:
ll = model.LinearEnsemble(
    [
        model.LinearModel(),
        model.LinearModel(),
        model.LinearModel(),
        model.LinearModel(),
        model.LinearModel(),
        model.LinearModel(),
        model.LinearModel(),
        model.LinearModel(),
        model.LinearModel(),
    ]
) 

tt = model.VotingEnsemble([
    model.DecisionTree(),
    model.DecisionTree(),
    model.DecisionTree(),
    model.DecisionTree(),
    model.DecisionTree(),
])

evaluation.cross_validation(
    model.LinearEnsemble(
        [
            ll,
            model.XGBoost(),
            tt,
        ]
    ),
    X, y, scoring='both', n_jobs=-1
)

{'f1': array([ 0.87292818,  0.87887888,  0.90137858]),
 'roc_auc': array([ 0.97345762,  0.97588045,  0.97862445])}

In [6]:
best_params, scores = evaluation.best_param_search(
    model.SVM(),
    [{'kernel': ['rbf', 'linear', 'sigmoid', 'poly'], 'C': [0.001, 0.01, 0.1, 1, 10]}],
    X,
    y
)



In [8]:
best_params

{'C': 0.001, 'kernel': 'rbf'}

In [7]:
scores

Unnamed: 0,test_score,train_score,fit_time,score_time
C:0.001;kernel:rbf,0.85694,0.849521,14.516547,0.524687
C:0.001;kernel:linear,0.85694,0.849521,16.636302,0.526856
C:0.001;kernel:sigmoid,0.85694,0.849521,17.641749,0.528024
C:0.001;kernel:poly,0.85694,0.849521,17.967802,0.521853
C:0.01;kernel:rbf,0.85694,0.849521,18.532183,0.530692
C:0.01;kernel:linear,0.85694,0.849521,18.845895,0.543201
C:0.01;kernel:sigmoid,0.85694,0.849521,18.48732,0.526689
C:0.01;kernel:poly,0.85694,0.849521,18.347225,0.538531
C:0.1;kernel:rbf,0.85694,0.849521,17.984647,0.533527
C:0.1;kernel:linear,0.85694,0.849521,18.276678,0.542199


In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [5]:
clf = GridSearchCV(
    SVC(),
    {'kernel': ['rbf', 'poly'], 'C': [0.001, 1, 10]},
    scoring='f1',
    n_jobs=-1
)

In [None]:
clf.fit(X, y)

In [None]:
clf.cv_results_

In [13]:
clf.cv_results_



{'mean_fit_time': array([ 11.92215776,  13.93207614,  15.42875417,  15.76948428,
         16.00447615,  16.01548378,  16.1870997 ,  15.77065142,
         14.59854555,  12.38040058]),
 'mean_score_time': array([ 0.56821767,  0.56338064,  0.58272696,  0.5708859 ,  0.57422225,
         0.5640475 ,  0.55554183,  0.56488156,  0.47282688,  0.36459247]),
 'mean_test_score': array([ 0.84263926,  0.84263926,  0.84263926,  0.84263926,  0.84263926,
         0.84263926,  0.84263926,  0.84263926,  0.84263926,  0.84263926]),
 'mean_train_score': array([ 0.84832612,  0.84832612,  0.84832612,  0.84832612,  0.84832612,
         0.84832612,  0.84832612,  0.84832612,  0.84832612,  0.84832612]),
 'param_C': masked_array(data = [0.001 0.001 0.01 0.01 0.1 0.1 1 1 10 10],
              mask = [False False False False False False False False False False],
        fill_value = ?),
 'param_kernel': masked_array(data = ['rbf' 'poly' 'rbf' 'poly' 'rbf' 'poly' 'rbf' 'poly' 'rbf' 'poly'],
              mask = [Fals