In [26]:
import pandas as pd
import xgboost as xgb
import warnings

In [27]:
def prepare(dataset):
    # 源数据
    data = dataset.copy()
    # 标记是否为满减
    data['is_manjian'] = data['Discount_rate'].map(lambda x: 1 if ':' in str(x) else 0)
    #计算折扣率
    data['discount_rate'] = data['Discount_rate'].map(lambda x: float(x)
    if ':' not in str(x)
    else
    (float(str(x).split(':')[0]) - float(str(x).split(':')[1])) / float(str(x).split(':')[0]))
    # 计算满减的最低消费
    data['min_cost_of_manjian'] = data['Discount_rate'].map(
        lambda x: -1 if ':' not in str(x) else int(str(x).split(':')[0]))
    # 距离处理，空距离填充为-1
    data['Distance'].fillna(-1, inplace=True)
    data['null_distance'] = data['Distance'].map(lambda x: 1 if x == -1 else 0)
    # 时间处理
    data['date_received'] = pd.to_datetime(data['Date_received'], format='%Y%m%d')
    #对训练集特殊处理
    #将训练集中的每一个 column 转换成列表
    if 'Date' in data.columns.tolist():
        data['date'] = pd.to_datetime(data['Date'], format='%Y%m%d')
    # 返回
    return data


def get_label(dataset):
    # 源数据
    data = dataset.copy()
    # 打标:领券后 15 天内消费为 1,否则为 0
    data['label'] = list(map(lambda x, y: 1 if (x - y).total_seconds() / (60 * 60 * 24)
                                               <= 15 else 0, data['date'], data['date_received']))
    # 返回
    return data


def get_feature(label_field):
    # 源数据
    data = label_field.copy()
    # 将 Coupon_id 列中 float 类型的元素转换为 int 类型
    data['Coupon_id'] = data['Coupon_id'].map(int)
    # 将 Date_received 列中 float 类型的元素转换为 int 类型
    data['Date_received'] = data['Date_received'].map(int)
    data['cnt'] = 1  # 方便特征提取
    # 返回的特征数据集
    feature = data.copy()
    ######################1、用户领券数#################
    keys = ['User_id']  # 主键为用户
    prefixs = 'simple_' + '_'.join(keys) + '_'
    pivot = pd.pivot_table(data, index=keys, values='cnt', aggfunc=len)
    pivot = pd.DataFrame(pivot).rename(columns={'cnt': prefixs +
                                                       'receive_cnt'}).reset_index()
    # 将 id 列与特征列左连
    feature = pd.merge(feature, pivot, on=keys, how='left')
    ######################2、用户领取特定优惠券数####################
    keys = ['User_id', 'Coupon_id']  # 主键
    prefixs = 'simple_' + '_'.join(keys) + '_'
    pivot = pd.pivot_table(data, index=keys, values='cnt', aggfunc=len)
    pivot = pd.DataFrame(pivot).rename(columns={'cnt': prefixs +
                                                       'receive_cnt'}).reset_index()
    feature = pd.merge(feature, pivot, on=keys, how='left')
    ######################3、用户当天领券数######################
    keys = ['User_id', 'Date_received']  # 主键
    prefixs = 'simple_' + '_'.join(keys) + '_'
    pivot = pd.pivot_table(data, index=keys, values='cnt', aggfunc=len)
    pivot = pd.DataFrame(pivot).rename(columns={'cnt': prefixs +
                                                       'receive_cnt'}).reset_index()
    feature = pd.merge(feature, pivot, on=keys, how='left')
    ######################4、用户当天领取特定优惠券数######################
    keys = ['User_id', 'Coupon_id', 'Date_received']  # 主键
    prefixs = 'simple_' + '_'.join(keys) + '_'
    pivot = pd.pivot_table(data, index=keys, values='cnt', aggfunc=len)
    pivot = pd.DataFrame(pivot).rename(columns={'cnt': prefixs +
                                                       'receive_cnt'}).reset_index()
    feature = pd.merge(feature, pivot, on=keys, how='left')
    ####################5、用户是否在同一天重复领取了特定优惠券####################
    keys = ['User_id', 'Coupon_id', 'Date_received']  # 主键
    prefixs = 'simple_' + '_'.join(keys) + '_'
    pivot = pd.pivot_table(data, index=keys, values='cnt', aggfunc=lambda x: 1 if len(x) >
                                                                                  1 else 0)
    pivot = pd.DataFrame(pivot).rename(columns={'cnt': prefixs +
                                                       'repeat_receive'}).reset_index()
    feature = pd.merge(feature, pivot, on=keys, how='left')
    # 删除辅助提特征的'cnt'
    feature.drop(['cnt'], axis=1, inplace=True)
    # 返回
    return feature

In [28]:
def get_shiyanwu_feature(label_field):
    # 源数据
    data = label_field.copy()
    data['Coupon_id'] = data['Coupon_id'].map(int)
    data['Date_received'] = data['Date_received'].map(int)
    data['cnt'] = 1  # 方便特征提取
    # 主键
    keys = ['User_id']
    # 特征名前缀,由 history_field 和主键组成
    prefixs = 'history_field_' + '_'.join(keys) + '_'
    # 返回的特征数据集
    u_feat = label_field[keys].drop_duplicates(keep='first')
    ######################1、用户领券数######################
    pivot = pd.pivot_table(data, index=keys, values='cnt', aggfunc=len)
    pivot = pd.DataFrame(pivot).rename(columns={'cnt': prefixs +
                                                       'receive_cnt'}).reset_index()
    u_feat = pd.merge(u_feat, pivot, on=keys, how='left')
    u_feat.fillna(0, downcast='infer', inplace=True)

    ######################2、在多少不同商家领取优惠券######################
    pivot = pd.pivot_table(data, index=keys, values='Merchant_id', aggfunc=lambda x:
    len(str(x)))
    pivot = pd.DataFrame(pivot).rename(columns={'Merchant_id': prefixs +
                                                               'receive_differ_Merchant_cnt'}).reset_index()
    u_feat = pd.merge(u_feat, pivot, on=keys, how='left')
    u_feat.fillna(0, downcast='infer', inplace=True)

    ######################3、领券并消费数######################
    pivot = pd.pivot_table(data[data['Date'].map(lambda x: str(x) != 'nan')],
                           index=keys, values='cnt', aggfunc=len)
    pivot = pd.DataFrame(pivot).rename(columns={'cnt': prefixs +
                                                       'receive_and_consume_cnt'}).reset_index()
    u_feat = pd.merge(u_feat, pivot, on=keys, how='left')
    u_feat.fillna(0, downcast='infer', inplace=True)

    ######################4、领券未消费数######################
    pivot = pd.pivot_table(data[data['Date'].map(lambda x: str(x) == 'nan')],
                           index=keys, values='cnt', aggfunc=len)
    pivot = pd.DataFrame(pivot).rename(columns={'cnt': prefixs +
                                                       'receive_not_consume_cnt'}).reset_index()
    u_feat = pd.merge(u_feat, pivot, on=keys, how='left')
    u_feat.fillna(0, downcast='infer', inplace=True)

    ######################5、核销率######################
    u_feat[prefixs + 'receive_and rate'] = list(map(lambda x, y: x / y if y != 0 else 0,
                                                    u_feat[prefixs + 'receive_and_consume_cnt'],
                                                    u_feat[prefixs + 'receive_cnt']))

    ######################6、领取并消费优惠券的平均折扣率######################
    pivot = pd.pivot_table(data[data['Date'].map(lambda x: str(x) == 'nan')],
                           index=keys, values='cnt', aggfunc="mean")
    pivot = pd.DataFrame(pivot).rename(columns={'discount_rate': prefixs +
                                                                 'receive_and_consume_mean_discount_rate'}).reset_index()
    u_feat = pd.merge(u_feat, pivot, on=keys, how='left')
    u_feat.fillna(0, downcast='infer', inplace=True)

    ######################7、领取并消费优惠券的平均距离######################
    data_half = data[data['Distance'].map(lambda x: int(x) != -1)]
    data_ = data_half[data_half['Date'].map(lambda x: str(x) != 'nan')]
    pivot = pd.pivot_table(data_, index=keys, values='Distance', aggfunc="mean")
    pivot = pd.DataFrame(pivot).rename(columns={'Distance': prefixs +
                                                            'receive_and_consume_mean_distance'}).reset_index()
    u_feat = pd.merge(u_feat, pivot, on=keys, how='left')
    u_feat.fillna(0, downcast='infer', inplace=True)
    ######################8、在多少不同商家领取并消费优惠券######################
    pivot = pd.pivot_table(data[data['Date'].map(lambda x: str(x) == 'nan')],
                           index=keys, values='Merchant_id', aggfunc=lambda x: len(str(x)))
    pivot = pd.DataFrame(pivot).rename(columns={'Merchant_id': prefixs +
                                                               'receive_and_consume_differ_Merchant_cnt'}).reset_index()
    u_feat = pd.merge(u_feat, pivot, on=keys, how='left')
    u_feat.fillna(0, downcast='infer', inplace=True)

    ######################9、在多少不同商家领取并消费优惠券######################
    u_feat[prefixs + 'receive_differ_Merchant_consume_rate'] = list(map(lambda x, y: x / y
    if y != 0 else 0, u_feat[prefixs +
                             'receive_and_consume_differ_Merchant_cnt'], u_feat[
                                                                            prefixs + 'receive_differ_Merchant_cnt']))
    #添加特征
    history_feat = label_field.copy()
    #添加用户特征
    history_feat = pd.merge(history_feat, u_feat, on=['User_id'], how='left')
    #返回
    return history_feat

In [32]:
def get_week_feature(label_field):
    # 源数据
    data = label_field.copy()
    data['Coupon_id'] = data['Coupon_id'].map(int)
    data['Date_received'] = data['Date_received'].map(int)
    # 返回的处理好的特征数据集
    feature = data.copy()
    ######################1、星期几######################
    feature['week'] = feature['date_received'].map(lambda x: x.weekday())
    ######################2、判断领券日是否为休息日######################
    feature['is_weekend'] = feature['week'].map(lambda x: 1 if x == 5 or x == 6 else 0)
    ######################3、one-hot 离散星期几，前缀使用 week_######################
    feature = pd.concat([feature, pd.get_dummies(feature['week'], prefix='week')],
                        axis=1)
    feature.index = range(len(feature))  # 重置 index
    # 返回
    return feature


def get_dataset(history_field, middle_field, label_field):
    # 特征工程
    week_feat = get_week_feature(label_field)  # 日期特征
    simple_feat = get_feature(label_field)  # 示例简单特征
    history_feat = get_shiyanwu_feature(label_field)  #实验五需要的特征
    # 构造数据集'''
    share_characters = list(
        set(simple_feat.columns.tolist()) & set(week_feat.columns.tolist()) &
        set(history_feat.columns.tolist()) & set(label_field.columns.tolist()))
    label_field.index = range(len(label_field))
    dataset = pd.concat([label_field, simple_feat.drop(share_characters, axis=1)],
                        axis=1)
    dataset = pd.concat([dataset, history_feat.drop(share_characters, axis=1)], axis=1)
    dataset = pd.concat([dataset, week_feat.drop(share_characters, axis=1)], axis=1)
    # 删除无用属性并将 label 置于最后一列
    if 'Date' in dataset.columns.tolist():  # 表示训练集和验证集
        dataset.drop(['Merchant_id', 'Discount_rate', 'Date', 'date_received', 'date'],
                     axis=1, inplace=True)
        label = dataset['label'].tolist()
        dataset.drop(['label'], axis=1, inplace=True)
        dataset['label'] = label
    else:
        dataset.drop(['Merchant_id', 'Discount_rate', 'date_received'], axis=1,
                     inplace=True)
    # 修正数据类型
    dataset = dataset.dropna()
    dataset = dataset.dropna(how='any', subset=['User_id'], inplace=False)
    dataset['User_id'] = dataset['User_id'].map(int)
    dataset['Coupon_id'] = dataset['Coupon_id'].map(int)
    dataset['Date_received'] = dataset['Date_received'].map(int)
    dataset['Distance'] = dataset['Distance'].map(int)
    if 'label' in dataset.columns.tolist():
        dataset['label'] = dataset['label'].map(int)
    # 去重
    dataset.drop_duplicates(keep='first', inplace=True)
    dataset.index = range(len(dataset))
    # 返回
    return dataset

In [33]:
def model_xgb(train, test):
    # xgb 参数
    params = {'booster': 'gbtree',
              'objective': 'binary:logistic',
              'eval_metric': 'auc',
              'silent': 1,
              'eta': 0.01,
              'max_depth': 5,
              'min_child_weight': 3,
              'gamma': 0.4,
              'lambda': 1,
              'colsample_bylevel': 0.8,
              'colsample_bytree': 0.7,
              'subsample': 0.9,
              'scale_pos_weight': 1}
    # 数据集
    dtrain = xgb.DMatrix(train.drop(['User_id', 'Coupon_id', 'Date_received', 'label'],
                                    axis=1), label=train['label'])
    dtest = xgb.DMatrix(test.drop(['User_id', 'Coupon_id', 'Date_received'], axis=1))
    # 训练
    watchlist = [(dtrain, 'train')]
    model = xgb.train(params, dtrain, num_boost_round=10, evals=watchlist)
    # 预测
    predict = model.predict(dtest)
    # 处理结果
    predict = pd.DataFrame(predict, columns=['prob'])
    result = pd.concat([test[['User_id', 'Coupon_id', 'Date_received']], predict],
                       axis=1)
    # 特征重要性
    feat_importance = pd.DataFrame(columns=['feature_name', 'importance'])
    feat_importance['feature_name'] = model.get_score().keys()
    feat_importance['importance'] = model.get_score().values()
    feat_importance.sort_values(['importance'], ascending=False, inplace=True)
    # 返回
    return result, feat_importance


In [34]:
if __name__ == '__main__':
    # 源数据
    off_train = pd.read_csv('ccf_offline_stage1_train.csv')
    off_test = pd.read_csv('ccf_offline_stage1_test_revised.csv')
    # 预处理
    off_train = prepare(off_train)
    off_test = prepare(off_test)
    # 打标
    off_train = get_label(off_train)
    # 划分区间
    # 训练集历史区间、中间区间、标签区间
    train_history_field = off_train[
        off_train['date_received'].isin(pd.date_range('2016/3/2', periods=60))]
    train_middle_field = off_train[off_train['date'].isin(pd.date_range('2016/5/1',
                                                                        periods=15))]
    train_label_field = off_train[
        off_train['date_received'].isin(pd.date_range('2016/5/16', periods=31))]
    # 验证集历史区间、中间区间、标签区间
    validate_history_field = off_train[
        off_train['date_received'].isin(pd.date_range('2016/1/16', periods=60))]
    validate_middle_field = off_train[
        off_train['date'].isin(pd.date_range('2016/3/16', periods=15))]
    validate_label_field = off_train[
        off_train['date_received'].isin(pd.date_range('2016/3/31', periods=31))]
    # 测试集历史区间、中间区间、标签区间
    test_history_field = off_train[
        off_train['date_received'].isin(pd.date_range('2016/4/17', periods=60))]
    test_middle_field = off_train[off_train['date'].isin(pd.date_range('2016/6/16', periods=15))]
    test_label_field = off_test.copy()
    # 构造训练集、验证集、测试集
    print('构造训练集')
    train = get_dataset(train_history_field, train_middle_field, train_label_field)
    print('构造验证集')
    validate = get_dataset(validate_history_field, validate_middle_field, validate_label_field)
    print('构造测试集')
    test = get_dataset(test_history_field, test_middle_field, test_label_field)
    # 线上训练
    big_train = pd.concat([train, validate], axis=0)
    result, feat_importance = model_xgb(big_train, test)
    # 保存
    result.to_csv(r'easy.csv', index=False, header=None)

构造训练集
构造验证集
构造测试集


KeyError: 'Date'