### 模型训练与评估

In [1]:
# 性能评价函数
# 每个优惠券核销预测的平均AUC作为评价标准
# 对每个优惠券coupon_id单独计算核销预测的AUC值
# 再对所有优惠券AUC值求平均作为最终的评价函数
# coupon平均AUC计算
from sklearn.metrics import roc_auc_score
import numpy as np
def myauc(test):
    """Return the mean of the per-coupon ROC AUC values.

    Args:
        test: DataFrame providing the columns ``coupon_id``, ``label`` and
            ``pred`` (the predicted score).

    Returns:
        ``np.average`` over the AUC computed separately for every
        ``coupon_id`` group. Groups whose ``label`` column holds fewer than
        two distinct values are skipped, because AUC is undefined for them.
    """
    per_coupon_auc = [
        roc_auc_score(group['label'], group['pred'])
        for _, group in test.groupby(['coupon_id'])
        # AUC needs at least two classes in the group
        if len(group['label'].unique()) >= 2
    ]
    return np.average(per_coupon_auc)

In [2]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
######### Global parameters ############
# Name of the label column and of the columns that identify a sample.
target_col_name = 'label'
id_col_names = ['user_id', 'coupon_id', 'date_received']
# Every column that is NOT a feature: the identifiers followed by the label.
id_target_cols = id_col_names + [target_col_name]
# Scoring name handed to scikit-learn utilities (see plot_learning_curve).
myeval = 'roc_auc'
cvscore = 0

######### Directory layout ############
datapath = './dataset/'
featurepath = './feature/'
resultpath = './result/'
tmppath = './tmp/'
scorepath = './score/'

######### 工具函数 ############
# 返回ID列
def get_id_df(df):
    """Return the identifier columns of ``df``.

    The columns are those listed in the module-level ``id_col_names``.
    """
    id_part = df[id_col_names]
    return id_part

# 返回target列
def get_target_df(df):
    """Return the label column of ``df``.

    The column name is taken from the module-level ``target_col_name``.
    """
    label_part = df[target_col_name]
    return label_part

# 返回特征列
def get_predictors_df(df):
    """Return the feature columns of ``df``.

    A column counts as a feature when its name does not appear in the
    module-level ``id_target_cols``; the original column order is kept.
    """
    feature_cols = []
    for col in df.columns:
        if col in id_target_cols:
            continue
        feature_cols.append(col)
    return df[feature_cols]

# 按特征名读取训练集
def read_featurefile_train(featurename):
    """Load the training feature table for ``featurename``.

    Reads ``featurepath + 'train_' + featurename + '.csv'`` and returns the
    resulting DataFrame with every missing value replaced by 0.
    """
    csv_file = featurepath + 'train_' + featurename + '.csv'
    features = pd.read_csv(csv_file, sep=',', encoding='utf-8')
    return features.fillna(0)

# 按特征名读取测试集
def read_featurefile_test(featurename):
    """Load the test feature table for ``featurename``.

    Reads ``featurepath + 'test_' + featurename + '.csv'`` and returns the
    resulting DataFrame with every missing value replaced by 0.
    """
    csv_file = featurepath + 'test_' + featurename + '.csv'
    features = pd.read_csv(csv_file, sep=',', encoding='utf-8')
    return features.fillna(0)

# 按特征名读取数据
def read_data(featurename):
    """Load both feature tables for ``featurename``.

    Returns:
        A ``(train_df, test_df)`` tuple; the training table is read first.
    """
    return (
        read_featurefile_train(featurename),
        read_featurefile_test(featurename),
    )

# 调用分类算法
def get_sklearn_model(model_name):
    """Create a new, unfitted classifier for a short model code.

    Supported codes and the estimator each one builds:
        'NB'   -> MultinomialNB(alpha=0.01)
        'LR'   -> LogisticRegression(penalty='l2')
        'KNN'  -> KNeighborsClassifier()
        'RF'   -> RandomForestClassifier()
        'DT'   -> DecisionTreeClassifier()
        'SVC'  -> SVC(kernel='rbf')
        'GBDT' -> GradientBoostingClassifier()
        'XGB'  -> XGBClassifier()
        'LGB'  -> LGBMClassifier()

    Args:
        model_name: one of the codes listed above.

    Returns:
        A freshly constructed estimator instance.

    Raises:
        ValueError: if ``model_name`` is not a supported code. Previously an
            unknown code silently produced ``None``, which only failed later
            (and less clearly) when ``.fit`` was called on it.
    """
    # Factories are wrapped in lambdas so that only the requested estimator
    # is instantiated.
    builders = {
        'NB': lambda: MultinomialNB(alpha=0.01),
        'LR': lambda: LogisticRegression(penalty='l2'),
        'KNN': lambda: KNeighborsClassifier(),
        'RF': lambda: RandomForestClassifier(),
        'DT': lambda: DecisionTreeClassifier(),
        'SVC': lambda: SVC(kernel='rbf'),
        'GBDT': lambda: GradientBoostingClassifier(),
        'XGB': lambda: XGBClassifier(),
        'LGB': lambda: LGBMClassifier(),
    }
    try:
        build = builders[model_name]
    except (KeyError, TypeError):
        raise ValueError(
            'unknown model name: ' + repr(model_name)
            + ' (expected one of ' + ', '.join(builders) + ')'
        ) from None
    return build()

import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
# 画学习曲线
def plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=None,
                        cv=None,
                        n_jobs=1,
                        train_sizes=[0.01, 0.02, 0.05, 0.1, 0.2, 0.3]):
    """Draw a learning curve for ``estimator`` on a new matplotlib figure.

    Args:
        estimator: estimator passed to ``learning_curve``.
        title: figure title.
        X: feature data passed to ``learning_curve``.
        y: target data passed to ``learning_curve``.
        ylim: optional ``(low, high)`` pair applied to the y axis.
        cv: forwarded to ``learning_curve`` as ``cv``.
        n_jobs: forwarded to ``learning_curve`` as ``n_jobs``.
        train_sizes: forwarded to ``learning_curve`` as ``train_sizes``.

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can show or save the
        figure.

    Scoring uses the module-level ``myeval``. For each curve the mean score
    is plotted as a line and +/- one standard deviation as a shaded band
    (red: training score, green: cross-validation score).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('training examples')
    plt.ylabel('score')

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y,
        cv=cv, scoring=myeval, n_jobs=n_jobs, train_sizes=train_sizes,
    )

    # (colour, legend label, mean over folds, std over folds) for each curve
    curves = (
        ('r', 'training score',
         np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)),
        ('g', 'cross-validation score',
         np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1)),
    )

    plt.grid()
    # Shaded bands first, then the lines, matching the original draw order.
    for colour, _, centre, spread in curves:
        plt.fill_between(sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)
    for colour, legend_label, centre, _ in curves:
        plt.plot(sizes, centre, 'o-', color=colour, label=legend_label)
    plt.legend(loc='best')
    return plt

# 画算法的学习曲线，为加快画图速度，默认最多选30%的数据（见train_sizes默认值）
def plot_curve_single(traindf,
                      classifier,
                      cvnum,
                      train_sizes=[0.01, 0.02, 0.05, 0.1, 0.2, 0.3]):
    """Plot the learning curve of one classifier on ``traindf``.

    Args:
        traindf: training DataFrame; features and label are extracted with
            ``get_predictors_df`` and ``get_target_df``.
        classifier: model code understood by ``get_sklearn_model``.
        cvnum: value forwarded to ``plot_learning_curve`` as ``cv``.
        train_sizes: training-set sizes forwarded to ``plot_learning_curve``.

    The y axis is fixed to ``(0, 1.01)``. Nothing is returned.
    """
    X = get_predictors_df(traindf)
    y = get_target_df(traindf)
    estimator = get_sklearn_model(classifier)
    # Space after "of" so the title reads "learning curve of LR", not "ofLR".
    title = 'learning curve of ' + classifier + ', cv:' + str(cvnum)
    plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=(0, 1.01),
                        cv=cvnum,
                        train_sizes=train_sizes)

# 按照日期分割
def test_model(traindf, classifier, split_date=20160515):
    """Evaluate a classifier with a date-based train/validation split.

    Rows with ``date_received < split_date`` are used for fitting; rows with
    ``date_received >= split_date`` are used for evaluation. Both the overall
    ROC AUC and the per-coupon mean AUC (``myauc``) are printed.

    Args:
        traindf: labelled DataFrame containing a ``date_received`` column
            plus the id, label and feature columns.
        classifier: model code understood by ``get_sklearn_model``.
        split_date: cut-off compared against ``date_received``. Defaults to
            20160515, the value that was previously hard-coded.

    Returns:
        None; the scores are only printed.
    """
    # Copies keep the added 'pred' column from touching the caller's frame.
    train = traindf[traindf.date_received < split_date].copy()
    test = traindf[traindf.date_received >= split_date].copy()

    train_data = get_predictors_df(train)
    train_target = get_target_df(train)
    test_data = get_predictors_df(test)
    test_target = get_target_df(test)

    clf = get_sklearn_model(classifier)
    clf.fit(train_data, train_target)
    # Column 1 of predict_proba is the probability of the positive class.
    result = clf.predict_proba(test_data)[:, 1]
    test['pred'] = result

    score = roc_auc_score(test_target, result)
    print(classifier + '总体AUC：', score)
    score_coupon = myauc(test)
    print(classifier + 'Coupon AUC: ', score_coupon)

from sklearn.model_selection import train_test_split
def test_model_split(traindf, classifier):
    """Evaluate a classifier on a random 80/20 hold-out split.

    ``traindf`` is split with ``train_test_split(test_size=0.2,
    random_state=0)``. The model named by ``classifier`` is fitted on the
    larger part and scored on the smaller one. The overall ROC AUC and the
    per-coupon mean AUC (``myauc``) are printed; nothing is returned.
    """
    labels = get_target_df(traindf).copy()
    fit_rows, holdout_rows, fit_labels, holdout_labels = train_test_split(
        traindf, labels, test_size=0.2, random_state=0
    )

    fit_features = get_predictors_df(fit_rows).copy()
    holdout_features = get_predictors_df(holdout_rows).copy()

    model = get_sklearn_model(classifier)
    model.fit(fit_features, fit_labels)
    # Probability of the positive class for every hold-out row
    holdout_proba = model.predict_proba(holdout_features)[:, 1]

    # Score on a copy so the split frame itself is left untouched
    scored = holdout_rows.copy()
    scored['pred'] = holdout_proba

    overall_auc = roc_auc_score(holdout_labels, holdout_proba)
    print(classifier + '总体AUC：', overall_auc)
    coupon_auc = myauc(scored)
    print(classifier + 'Coupon AUC: ', coupon_auc)

### 不同算法模型的性能对比

In [3]:
# 1. 朴素贝叶斯

In [4]:
# 2. 逻辑回归

In [5]:
# 3. 决策树

In [6]:
# 4. 随机森林

In [7]:
# 5. XGBoost

In [8]:
# 6. LightGBM

### 结果输出

In [None]:
# 预测函数
def classifier_df_simple(train_feat, test_feat, classifier):
    """Fit a classifier on ``train_feat`` and score ``test_feat``.

    Args:
        train_feat: labelled training DataFrame.
        test_feat: DataFrame to predict on.
        classifier: model code understood by ``get_sklearn_model``.

    Returns:
        A single-column DataFrame holding column 1 of ``predict_proba``
        (the positive-class probability) for every row of ``test_feat``.
    """
    clf = get_sklearn_model(classifier)
    fit_features = get_predictors_df(train_feat)
    fit_labels = get_target_df(train_feat)
    clf.fit(fit_features, fit_labels)
    positive_proba = clf.predict_proba(get_predictors_df(test_feat))[:, 1]
    return pd.DataFrame(positive_proba)

# 输出结果函数
def output_predicted(predicted, resultfile, test_feat):
    """Build the submission table: id columns plus a ``Probability`` column.

    Args:
        predicted: predicted probabilities, one per row of ``test_feat``.
            May be a single-column DataFrame (as returned by
            ``classifier_df_simple``), a Series or an array.
        resultfile: accepted for interface compatibility; this function does
            not write any file and does not use the value.
        test_feat: DataFrame the predictions belong to; its id columns are
            copied into the result.

    Returns:
        A new DataFrame with the id columns of ``test_feat`` and the
        probabilities rounded to 3 decimals in ``Probability``.

    Raises:
        ValueError: if the number of predictions differs from the number of
            rows in ``test_feat``.
    """
    resultdf = get_id_df(test_feat).copy()
    # Assign by position instead of by index label: the predictions carry a
    # fresh 0..n-1 index, so label alignment would yield NaN or misplaced
    # rows whenever ``test_feat`` has a non-default index.
    probabilities = np.asarray(predicted).reshape(-1)
    resultdf['Probability'] = np.round(probabilities, 3)
    return resultdf

# Fit the 'LGB' model on train_f3 and predict probabilities for test_f3.
# NOTE(review): train_f3 / test_f3 are not defined in the visible cells;
# presumably they are loaded earlier with read_data(...) - confirm.
predicted = classifier_df_simple(train_f3, test_f3, 'LGB')

# Build the result table (id columns + rounded probability).
# NOTE(review): the file name passed here is not used by output_predicted.
result = output_predicted(predicted, 'sf3_LGB.csv', test_f3)
# Write the result as a header-less, index-less CSV. The relative path
# 'sf3_lgb.csv' is used as-is; the resultpath setting is not applied here.
result.to_csv('sf3_lgb.csv', header=False, index=False, sep=',')

## 模型验证

### 交叉验证

In [9]:
# Simple hold-out validation: one random 80/20 split, logistic regression.
# NOTE(review): train_f3 must already be loaded before this cell runs; it is
# not defined in the visible cells.
from sklearn.model_selection import train_test_split

target = get_target_df(train_f3).copy()
traindf = train_f3.copy()
train_all, test_all, train_target, test_target = train_test_split(
    traindf, target, test_size=0.2, random_state=0
)
# Work on explicit copies so that adding the 'pred' column below does not
# write into (or warn about) slices of traindf.
train_all = train_all.copy()
test_all = test_all.copy()

train_data = get_predictors_df(train_all).copy()
test_data = get_predictors_df(test_all).copy()
clf = LogisticRegression()
clf.fit(train_data, train_target)
# Column 1 of predict_proba is the probability of the positive class.
train_pred = clf.predict_proba(train_data)[:, 1]
test_pred = clf.predict_proba(test_data)[:, 1]

score_train = roc_auc_score(train_target, train_pred)
score_test = roc_auc_score(test_target, test_pred)

print('logistic regression train' + '总体AUC：', score_train)
print('logistic regression test' + '总体AUC：', score_test)

train_all['pred'] = train_pred
test_all['pred'] = test_pred

# The first line reports the TRAIN split (it was previously labelled "test").
print('logistic regression train' + 'coupon AUC：', myauc(train_all))
print('logistic regression test' + 'coupon AUC：', myauc(test_all))

NameError: name 'train_f3' is not defined

In [None]:
# K-fold cross-validation (5 folds) with logistic regression.
# NOTE(review): train_f3 must already be loaded before this cell runs; it is
# not defined in the visible cells.
train = train_f3.copy()
target = get_target_df(train_f3).copy()

from sklearn.model_selection import KFold
kf = KFold(n_splits=5)

for k, (train_index, test_index) in enumerate(kf.split(train)):
    # kf.split yields positional row indices. Indexing a DataFrame with
    # train[train_index] would look them up as column labels and fail, so
    # rows are selected with .iloc. The frames are copied because a 'pred'
    # column is added to them below.
    X_train = train.iloc[train_index].copy()
    X_test = train.iloc[test_index].copy()
    y_train = target.iloc[train_index]
    y_test = target.iloc[test_index]

    clf = LogisticRegression()
    clf = clf.fit(get_predictors_df(X_train), y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    train_pred = clf.predict_proba(get_predictors_df(X_train))[:, 1]
    test_pred = clf.predict_proba(get_predictors_df(X_test))[:, 1]
    score_train = roc_auc_score(y_train, train_pred)
    score_test = roc_auc_score(y_test, test_pred)
    X_train['pred'] = train_pred
    X_test['pred'] = test_pred
    print(k+1, '折' + 'logistic regression train' + '总体AUC：', score_train)
    print(k+1, '折' + 'logistic regression test' + '总体AUC：', score_test)
    print(k+1, '折' + 'logistic regression train' + 'coupon AUC：', myauc(X_train))
    print(k+1, '折' + 'logistic regression test' + 'coupon AUC：', myauc(X_test))

In [10]:
# 留一法交叉验证和留P法交叉验证

In [11]:
# StratifiedKFold 和 StratifiedShuffleSplit

### 模型比较

### 验证结果可视化

### 模型调参

In [12]:
# 网格搜索

In [13]:
# 随机搜索

In [14]:
# 启发式搜索

### 实际方案

In [None]:
# 1. 确定最佳学习率和迭代次数
# 2. 确定max_depth和num_leaves
# 3. 确定min_data_in_leaf和max_bin
# 4. 确定feature_fraction，bagging_fraction，bagging_freq
# 5. 确定lambda_l1和lambda_l2
# 6. 确定min_split_gain
# 7. 降低学习率，增加迭代次数，验证模型
