In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
import model
import evaluation
import preprocess
from collections import defaultdict
from IPython.display import display, Markdown

%matplotlib inline

模型cross validation 1次的时间大概在 4s 左右

In [None]:
# Load the feature matrix and labels via the project's preprocess module.
X, y = preprocess.load_data()
# Standardise every column: subtract the per-column mean, then divide by the
# per-column standard deviation. The mean has to be computed over X itself;
# `np.mean(0)` is just the scalar 0.0 and would leave the data uncentred.
X_norm = X - np.mean(X, axis=0)
X_norm /= np.std(X_norm, axis=0)
# Append the exponential of the standardised features to the original ones.
X = np.concatenate([X, np.exp(X_norm)], axis=1)

In [None]:
# Registry of the model constructors under comparison, keyed by the short
# name used as the row label in the result tables.
models = dict(
    DecisionTree=model.DecisionTree,
    SVM=model.SVM,
    LR=model.LinearModel,
    XGBoost=model.XGBoost,
)

## Model View

In [None]:
def _empty_score_frame():
    """Return a new, empty DataFrame with 'f1' and 'roc_auc' columns."""
    return pd.DataFrame(columns=['f1', 'roc_auc'])


# Score tables keyed by experiment label; looking up a missing label creates
# a fresh empty table for it.
dfs_results = defaultdict(_empty_score_frame)

In [None]:
# Takes roughly 1m 39s.
# Cross-validate every model twice: once with its built-in balanced-learning
# option switched on and once with it switched off.
for name, md in models.items():
    for use_balanced_trick in (True, False):
        result = evaluation.cross_validation(md(balanced_learning=use_balanced_trick), X, y, scoring='both', n_jobs=-1)
        display(Markdown("**{}**".format(name)))
        display(Markdown("- f1: {}".format(result['f1'])))
        display(Markdown("- roc_auc: {}".format(result['roc_auc'])))
        # Runs with the balancing option enabled belong in 'model_balance';
        # the plain runs are the 'origin' baseline. (The two labels were
        # previously swapped.)
        table_key = 'model_balance' if use_balanced_trick else 'origin'
        dfs_results[table_key].loc[name] = result['f1'].mean(), result['roc_auc'].mean()


## Data View

In [None]:
# Takes roughly 5 minutes.
# Resampling strategies under comparison, keyed by the label that names
# their result table.
sample_methods = {
    label: preprocess.get_smote(kind)
    for label, kind in (
        ('smote', 'regular'),
        ('sm_bl1', 'borderline1'),
        ('sm_bl2', 'borderline2'),
        ('sm_svm', 'svm'),
    )
}
sample_methods['upsampling'] = preprocess.upsampling
sample_methods['downsampling'] = preprocess.downsampling

# NOTE: nothing in this cell writes to df_sample; the scores below are
# stored in dfs_results instead.
df_sample = defaultdict(lambda: pd.DataFrame(columns=['f1', 'roc_auc']))

for name, md in models.items():
    for method_name, method in sample_methods.items():
        estimator = md(sample_method=method)
        result = evaluation.cross_validation(estimator, X, y, scoring='both', n_jobs=-1)
        f1_scores = result['f1']
        auc_scores = result['roc_auc']
        # Show the scores returned for this model / sampler pair.
        display(Markdown("**{}_{}**".format(name, method_name)))
        display(Markdown("- f1: {}".format(f1_scores)))
        display(Markdown("- roc_auc: {}".format(auc_scores)))
        # One table per sampling method, one row per model (mean scores).
        dfs_results[method_name].loc[name] = f1_scores.mean(), auc_scores.mean()



In [None]:
# Flatten the per-experiment tables into one frame whose row labels have the
# form "<row label>_<experiment key>".
df_overall = pd.DataFrame(columns=('f1', 'roc_auc'))
for experiment, scores in dfs_results.items():
    suffix = '_' + experiment
    for model_name, row in scores.iterrows():
        df_overall.loc[model_name + suffix] = row

In [None]:
# Sort by row label so that all variants of one model sit next to each other.
df_overall = df_overall.sort_index()

# `sns.plt` is no longer part of seaborn's top-level namespace; use the
# pyplot module that the notebook already imports as `plt`.
plt.figure(figsize=(10, 10))
plt.xlim((0.92, 1))  # restrict the x-axis to the 0.92-1 range
colors = sns.color_palette(n_colors=len(df_overall))
# One colour per model, repeated once for each of its rows: every sampling
# method plus the two tables filled by the model-view cell.
palette = np.concatenate([[colors[x]] * (len(sample_methods) + 2) for x in range(len(models))])
sns.barplot(x='roc_auc', y=df_overall.index.values, data=df_overall, palette=palette)

In [None]:
# `sns.plt` is no longer part of seaborn's top-level namespace; use the
# pyplot module that the notebook already imports as `plt`.
plt.figure(figsize=(10, 10))
plt.xlim((0.6, 1))  # restrict the x-axis to the 0.6-1 range
colors = sns.color_palette(n_colors=len(df_overall))
# One colour per model, repeated once for each of its rows: every sampling
# method plus the two tables filled by the model-view cell.
palette = np.concatenate([[colors[x]] * (len(sample_methods) + 2) for x in range(len(models))])
sns.barplot(x='f1', y=df_overall.index.values, data=df_overall, palette=palette)

In [None]:
# Cross-validate the 'KNN' multi-class learner on its own; the result is the
# cell's displayed output.
knn_learner = model.MultiClassesLearner('KNN')
evaluation.cross_validation(knn_learner, X, y, scoring='both', n_jobs=-1)

In [None]:
# Members handed to the linear ensemble. Each entry is a separately
# constructed instance, created in the same order as listed:
# 4 x LinearModel, 2 x linear-kernel SVM, 1 x RBF-kernel SVM, KNN, XGBoost.
base_learners = (
    [model.LinearModel() for _ in range(4)]
    + [model.SVM(kernel='linear') for _ in range(2)]
    + [
        model.SVM(kernel='rbf'),
        model.MultiClassesLearner('KNN'),
        model.XGBoost(),
    ]
)
# Cross-validate the ensemble; the result is the cell's displayed output.
evaluation.cross_validation(model.LinearEnsemble(base_learners), X, y, scoring='both', n_jobs=-1)