<a href="https://colab.research.google.com/github/zcy20051117/machine-learning-code/blob/main/o2o%E4%BB%A3%E7%A0%81.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import warnings

warnings.filterwarnings('ignore')  # 不显示警告

In [None]:
# Mount Google Drive inside the Colab runtime; the CSV paths used by the
# driver cell of this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def prepare(dataset):
    """Add discount, distance and datetime helper columns to an offline table.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``Discount_rate``, ``Distance`` and ``Date_received``;
        ``Date`` is optional (it is only converted when present).

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataset`` (the input is never modified) with these changes:

        * ``is_manjian``: 1 when ``Discount_rate`` has the ``"X:Y"`` form.
        * ``discount_rate``: ``"X:Y"`` converted to ``(X - Y) / X``; any other
          value passed through ``float``.
        * ``min_cost_of_manjian``: ``X`` of ``"X:Y"`` as int, otherwise -1.
        * ``Distance``: missing values replaced by -1.
        * ``null_distance``: 1 where ``Distance`` is -1 after filling.
        * ``date_received`` / ``date``: ``Date_received`` / ``Date`` parsed
          with the ``%Y%m%d`` format.
    """
    data = dataset.copy()

    def _is_full_reduction(raw):
        return 1 if ':' in str(raw) else 0

    def _as_rate(raw):
        text = str(raw)
        if ':' not in text:
            return float(raw)
        parts = text.split(':')
        threshold, reduction = float(parts[0]), float(parts[1])
        return (threshold - reduction) / threshold

    def _threshold(raw):
        text = str(raw)
        return int(text.split(':')[0]) if ':' in text else -1

    # Discount handling
    data['is_manjian'] = data['Discount_rate'].map(_is_full_reduction)
    data['discount_rate'] = data['Discount_rate'].map(_as_rate)
    data['min_cost_of_manjian'] = data['Discount_rate'].map(_threshold)

    # Distance handling. Assign the filled column back instead of calling
    # fillna(inplace=True) on the column selection: the chained in-place form
    # operates on a temporary under pandas Copy-on-Write and leaves `data`
    # untouched.
    data['Distance'] = data['Distance'].fillna(-1)
    data['null_distance'] = data['Distance'].map(lambda d: 1 if d == -1 else 0)

    # Date handling
    data['date_received'] = pd.to_datetime(data['Date_received'], format='%Y%m%d')
    if 'Date' in data.columns:  # only tables that record a consumption date
        data['date'] = pd.to_datetime(data['Date'], format='%Y%m%d')

    return data

In [None]:
def prepare_on(dataset):
    """Add discount and datetime helper columns to a table without ``Distance``.

    Works on a copy of ``dataset``. Adds ``is_manjian`` (1 for ``"X:Y"``
    discounts), ``discount_rate`` (``(X - Y) / X`` for ``"X:Y"``, otherwise the
    value as float), ``min_cost_of_manjian`` (``X`` as int, otherwise -1),
    ``date_received`` and, when a ``Date`` column exists, ``date``.
    """
    frame = dataset.copy()
    raw_discount = frame['Discount_rate']

    def _flag(value):
        # 1 for the "X:Y" (spend X, save Y) form, 0 otherwise
        return int(':' in str(value))

    def _rate(value):
        text = str(value)
        if ':' in text:
            pieces = text.split(':')
            return (float(pieces[0]) - float(pieces[1])) / float(pieces[0])
        return float(value)

    def _minimum_spend(value):
        text = str(value)
        if ':' in text:
            return int(text.split(':')[0])
        return -1

    frame['is_manjian'] = raw_discount.map(_flag)
    frame['discount_rate'] = raw_discount.map(_rate)
    frame['min_cost_of_manjian'] = raw_discount.map(_minimum_spend)

    # Parse the yyyymmdd stamps into real datetimes
    frame['date_received'] = pd.to_datetime(frame['Date_received'], format='%Y%m%d')
    if 'Date' in frame.columns.tolist():
        frame['date'] = pd.to_datetime(frame['Date'], format='%Y%m%d')

    return frame

In [None]:
def get_label(dataset):
    """Return a copy of ``dataset`` with a binary ``label`` column.

    ``label`` is 1 when ``date`` falls at most 15 days after ``date_received``
    and 0 otherwise; rows whose difference is not a number (a missing date on
    either side) therefore get 0.
    """
    data = dataset.copy()
    seconds_per_day = 60 * 60 * 24

    labels = []
    for consumed_on, received_on in zip(data['date'], data['date_received']):
        elapsed_days = (consumed_on - received_on).total_seconds() / seconds_per_day
        if elapsed_days <= 15:
            labels.append(1)
        else:
            labels.append(0)

    data['label'] = labels
    return data

In [None]:
def _attach_group_stat(feature, source, keys, column, aggfunc=len):
    """Aggregate ``cnt`` per ``keys`` in ``source`` and left-join it onto ``feature``.

    The aggregated column is named ``column``; after the join every NaN in the
    whole frame is replaced by 0.
    """
    pivot = pd.pivot_table(source, index=keys, values='cnt', aggfunc=aggfunc)
    # pivot_table moves the keys into the index and names the value column
    # 'cnt'; rename it to the feature name and restore the keys as columns.
    pivot = pd.DataFrame(pivot).rename(columns={'cnt': column}).reset_index()
    feature = pd.merge(feature, pivot, on=keys, how='left')
    feature.fillna(0, inplace=True)
    return feature


def get_simple_feature(label_field):
    """Build count-style features from the label window itself.

    Parameters
    ----------
    label_field : pandas.DataFrame
        Needs ``User_id``, ``Merchant_id``, ``Coupon_id``, ``Distance``,
        ``Date_received``, ``min_cost_of_manjian`` (all castable to int) and
        ``null_distance``.

    Returns
    -------
    pandas.DataFrame
        The input columns (minus ``Date_received``) plus a helper ``cnt``
        column, one ``simple_<keys>_<suffix>`` column per grouping listed
        below, and two rank columns (``用户组内排序`` / ``商户组内排序``: rank of
        the receive date within the user / merchant, ``method='min'``).
        Rows come back sorted by ``Date_received`` but keep the positional
        index labels produced by the merges, so callers can re-align by index.

    Notes
    -----
    None of the features may depend on the label. The function prints the
    working column list once, as the original did.
    """
    data = label_field.copy()
    # NaN anywhere in a column makes pandas store it as float; restore ints.
    for column in ('User_id', 'Merchant_id', 'Distance', 'min_cost_of_manjian',
                   'Coupon_id', 'Date_received'):
        data[column] = data[column].map(int)
    data['cnt'] = 1  # constant helper column that every aggregation counts
    feature = data.copy()
    print(data.columns.tolist())

    def _is_repeated(group):
        return 1 if len(group) > 1 else 0

    first_ten_days = data[data['Date_received'] % 100 <= 10]  # day of month <= 10
    known_distance = data[data['null_distance'] == 0]

    # (group keys, column suffix, rows to aggregate, aggregation)
    specs = [
        (['User_id'], 'receive_cnt', data, len),                                  # coupons per user
        (['User_id', 'Coupon_id'], 'receive_cnt', data, len),                     # per user and coupon
        (['User_id', 'Date_received'], 'receive_cnt', data, len),                 # per user and day
        (['User_id', 'Coupon_id', 'Date_received'], 'receive_cnt', data, len),    # per user, coupon, day
        (['User_id', 'Coupon_id', 'Date_received'], 'repeat_receive', data, _is_repeated),
        (['Coupon_id', 'Date_received'], 'receive_cnt', data, len),               # per coupon and day
        (['User_id', 'Merchant_id'], 'receive_cnt', data, len),                   # per user and merchant
        (['Coupon_id', 'User_id', 'Merchant_id'], 'receive_cnt', data, len),
        (['Coupon_id', 'Merchant_id'], 'receive_cnt', data, len),
        (['Coupon_id', 'Merchant_id', 'Date_received'], 'receive_cnt', data, len),
        (['Merchant_id', 'Date_received'], 'receive_cnt', data, len),
        (['User_id', 'Coupon_id'], 'receive_cnt_pre', first_ten_days, len),       # early-month receipts
        (['Merchant_id'], 'receive_cnt', data, len),                              # per merchant
        (['User_id', 'min_cost_of_manjian'], 'receive_cnt_mincost', data, len),
        # Median of the constant `cnt`, i.e. 1 wherever the group exists. Kept
        # because the original author reported a score gain from it.
        (['User_id', 'Merchant_id', 'Distance'], 'receive_cnt_mid', known_distance, 'median'),
    ]
    for keys, suffix, source, aggfunc in specs:
        column = 'simple_' + '_'.join(keys) + '_' + suffix
        feature = _attach_group_stat(feature, source, keys, column, aggfunc)

    # Receipts per (Distance, User_id). The original code never refreshed its
    # name prefix for this grouping, so the column carries the previous
    # grouping's prefix; the name is kept as-is so existing feature names
    # stay stable.
    feature = _attach_group_stat(
        feature, known_distance, ['Distance', 'User_id'],
        'simple_User_id_Merchant_id_Distance_receive_cnt', len)

    # Rank of each receipt date within its user and within its merchant.
    feature = feature.sort_values(by='Date_received')
    feature['用户组内排序'] = feature.groupby('User_id')['Date_received'].rank(method='min')
    feature['商户组内排序'] = feature.groupby('Merchant_id')['Date_received'].rank(method='min')
    feature = feature[[name for name in feature.columns if name not in ['Date_received']]]

    return feature

In [None]:
def get_week_feature(label_field):
    """Attach weekday features derived from ``date_received``.

    Returns a copy of ``label_field`` with ``Coupon_id`` / ``Date_received``
    cast to int, plus ``week`` (0 = Monday ... 6 = Sunday), ``is_weekend``
    (1 for Saturday/Sunday), one ``week_<n>`` dummy column per weekday that
    occurs in the data, and a fresh 0..n-1 index.
    """
    feature = label_field.copy()
    # NaN elsewhere in the source table forces these id columns to float
    for column in ('Coupon_id', 'Date_received'):
        feature[column] = feature[column].map(int)

    feature['week'] = feature['date_received'].map(lambda stamp: stamp.weekday())
    feature['is_weekend'] = feature['week'].map(lambda day: 1 if day in (5, 6) else 0)

    weekday_dummies = pd.get_dummies(feature['week'], prefix='week')
    feature = pd.concat([feature, weekday_dummies], axis=1)

    feature = feature.reset_index(drop=True)
    return feature

In [None]:
def get_user_feature(history_field, label_field):
    """Per-user redemption statistics computed on the history window.

    Parameters
    ----------
    history_field : pandas.DataFrame
        Needs ``User_id``, ``label`` (1 = redeemed) and ``is_manjian``
        (1 = "spend X save Y" coupon).
    label_field : pandas.DataFrame
        Needs ``User_id``; one output row is produced per distinct user, in
        order of first appearance.

    Returns
    -------
    pandas.DataFrame
        Columns ``User_id``, ``user_hexiao_rate`` (redeemed / received),
        ``unsimple_User_id_user_manjian`` (number of full-reduction coupons
        received) and ``manjian_hexiao_rate`` (redeemed / received among
        full-reduction coupons). Users absent from the history get 0
        everywhere.
    """
    data = history_field.copy()
    data['User_id'] = data['User_id'].map(int)

    keys = ['User_id']
    prefix = 'unsimple_' + '_'.join(keys) + '_'
    user_feat = label_field[keys].drop_duplicates(keep='first').reset_index(drop=True)

    is_redeemed = data['label'] == 1
    is_full_reduction = data['is_manjian'] == 1

    def _count_per_user(rows):
        # Rows per user, looked up for every user of the label window;
        # users that never occur in `rows` count as 0.
        return user_feat['User_id'].map(rows.groupby('User_id').size()).fillna(0)

    received = _count_per_user(data)
    redeemed = _count_per_user(data[is_redeemed])
    manjian_received = _count_per_user(data[is_full_reduction])
    manjian_redeemed = _count_per_user(data[is_redeemed & is_full_reduction])

    # 0 / 0 (user without history) yields NaN, which is reported as rate 0.
    user_feat['user_hexiao_rate'] = (redeemed / received).fillna(0)
    user_feat[prefix + 'user_manjian'] = manjian_received
    user_feat['manjian_hexiao_rate'] = (manjian_redeemed / manjian_received).fillna(0)

    return user_feat

In [None]:
def get_merchant_feat(history_field, label_field):
    """Per-merchant coupon statistics computed on the history window.

    Parameters
    ----------
    history_field : pandas.DataFrame
        Needs ``Merchant_id``, ``Coupon_id``, ``Distance`` and ``label``
        (1 = redeemed).
    label_field : pandas.DataFrame
        Needs ``Merchant_id``; one output row per distinct merchant, in order
        of first appearance.

    Returns
    -------
    pandas.DataFrame
        ``Merchant_id`` followed by, in this order (``P`` stands for the
        prefix ``unsimple_Merchant_id_``):

        * ``P + coupon_count``: history rows of the merchant.
        * ``P + shagnjiabeilingquancishu``: the same row count (the original
          computed it twice under two names; both are kept).
        * ``P + shagnjiabeilingquanhouhexiaocishu``: redeemed rows.
        * ``shangjiayonghuhexiaolv``: redeemed / received.
        * ``P + AverageDistanceAfterRedeem`` / ``MinDistanceAfterRedeem`` /
          ``MaxDistanceAfterRedeem``: ``Distance`` statistics over redeemed
          rows. If ``Distance`` still carries a -1 sentinel for "unknown",
          it takes part in these statistics.
        * ``P + averange_coupon``: number of distinct coupon ids.
        * ``AverageRedemptionPerCoupon``: mean redeemed count per redeemed
          coupon.

        Merchants absent from the history (or without redemptions) get 0.
    """
    data = history_field.copy()
    data['Merchant_id'] = data['Merchant_id'].map(int)
    data['Coupon_id'] = data['Coupon_id'].map(int)

    keys = ['Merchant_id']
    prefix = 'unsimple_' + '_'.join(keys) + '_'
    merchant_feat = label_field[keys].drop_duplicates(keep='first').reset_index(drop=True)
    # Keep the key dtype in line with the int-cast history keys.
    merchant_feat['Merchant_id'] = merchant_feat['Merchant_id'].astype(data['Merchant_id'].dtype)

    redeemed = data[data['label'] == 1]

    def _lookup(per_merchant):
        # Align a Merchant_id-indexed statistic with the output rows.
        return merchant_feat['Merchant_id'].map(per_merchant).fillna(0)

    received_cnt = _lookup(data.groupby('Merchant_id').size())
    redeemed_cnt = _lookup(redeemed.groupby('Merchant_id').size())

    merchant_feat[prefix + 'coupon_count'] = received_cnt
    merchant_feat[prefix + 'shagnjiabeilingquancishu'] = received_cnt
    merchant_feat[prefix + 'shagnjiabeilingquanhouhexiaocishu'] = redeemed_cnt
    # 0 / 0 (merchant without history) yields NaN, reported as rate 0.
    merchant_feat['shangjiayonghuhexiaolv'] = (redeemed_cnt / received_cnt).fillna(0)

    redeemed_distance = redeemed.groupby('Merchant_id')['Distance']
    merchant_feat[prefix + 'AverageDistanceAfterRedeem'] = _lookup(redeemed_distance.mean())
    merchant_feat[prefix + 'MinDistanceAfterRedeem'] = _lookup(redeemed_distance.min())
    merchant_feat[prefix + 'MaxDistanceAfterRedeem'] = _lookup(redeemed_distance.max())

    # Number of distinct coupons the merchant handed out.
    merchant_feat[prefix + 'averange_coupon'] = _lookup(
        data.groupby('Merchant_id')['Coupon_id'].nunique())

    # Redemptions per (merchant, coupon), averaged over the merchant's coupons.
    per_coupon = redeemed.groupby(['Merchant_id', 'Coupon_id']).size()
    merchant_feat['AverageRedemptionPerCoupon'] = _lookup(
        per_coupon.groupby(level='Merchant_id').mean())

    return merchant_feat

In [None]:
def get_coupon_feature(history_field, label_field):
    """Per-coupon statistics computed on the history window.

    Every column is looked up by ``Coupon_id``. The previous implementation
    assigned Series that were indexed by history rows (or by a fresh
    RangeIndex) into a frame indexed by label-window rows, so values were
    matched by unrelated index labels.

    Parameters
    ----------
    history_field : pandas.DataFrame
        Needs ``Coupon_id``, ``Discount_rate``, ``date_received`` (datetime)
        and ``label`` (1 = redeemed).
    label_field : pandas.DataFrame
        Needs ``Coupon_id``; one output row per distinct coupon, in order of
        first appearance.

    Returns
    -------
    pandas.DataFrame
        ``Coupon_id`` plus

        * ``unsimple_Coupon_id_coupon_hexiaolv``: redeemed / received.
        * ``zhekoulv``: ``(X - Y) / X`` for an ``"X:Y"`` discount (taken from
          the coupon's first history row); 0 for any other discount form,
          matching what the original arithmetic produced for those rows.
        * ``day_of_week``: most frequent weekday (0 = Monday) of receipt.
        * ``day_of_month``: most frequent calendar *month* of receipt (the
          column name is historical; the original read ``.month``).
        * ``unsimple_Coupon_id_zhoujilingyhq``: most frequent weekday of
          receipt (same definition as ``day_of_week``; both names are kept
          so the column set is unchanged).

        Coupons that never occur in the history get 0 everywhere.

    Notes
    -----
    NOTE(review): the original code did not define how the row-level weekday
    and month should be reduced to one value per coupon; the mode is used
    here because it is the only reduction the original applied to receipt
    dates. Confirm before enabling this feature block in the pipeline.
    """
    data = history_field.copy()
    data['Coupon_id'] = data['Coupon_id'].map(int)

    keys = ['Coupon_id']
    prefix = 'unsimple_' + '_'.join(keys) + '_'
    coupon_feat = label_field[keys].drop_duplicates(keep='first').reset_index(drop=True)

    def _lookup(per_coupon):
        # Align a Coupon_id-indexed statistic with the output rows.
        return coupon_feat['Coupon_id'].map(per_coupon)

    received = data[data['date_received'].notnull()]
    redeemed = data[data['label'] == 1]

    # Redemption rate; 0 / 0 (coupon without history) is reported as 0.
    received_cnt = _lookup(received.groupby('Coupon_id').size()).fillna(0)
    redeemed_cnt = _lookup(redeemed.groupby('Coupon_id').size()).fillna(0)
    coupon_feat[prefix + 'coupon_hexiaolv'] = (redeemed_cnt / received_cnt).fillna(0)

    def _discount_ratio(raw):
        # str() keeps NaN / numeric discounts from raising on the ':' test.
        text = str(raw)
        if ':' not in text:
            return 0.0
        pieces = text.split(':')
        try:
            threshold, reduction = float(pieces[0]), float(pieces[1])
        except ValueError:
            return 0.0
        return (threshold - reduction) / threshold if threshold else 0.0

    first_discount = (data.drop_duplicates(subset='Coupon_id', keep='first')
                          .set_index('Coupon_id')['Discount_rate'])
    coupon_feat['zhekoulv'] = _lookup(first_discount).map(_discount_ratio)

    def _mode_per_coupon(values):
        # Most frequent value per coupon; ties resolve to the smallest value.
        return values.groupby(received['Coupon_id']).agg(lambda group: group.mode().iloc[0])

    weekday_mode = _mode_per_coupon(received['date_received'].dt.weekday)
    month_mode = _mode_per_coupon(received['date_received'].dt.month)

    coupon_feat['day_of_week'] = _lookup(weekday_mode)
    coupon_feat['day_of_month'] = _lookup(month_mode)
    coupon_feat[prefix + 'zhoujilingyhq'] = _lookup(weekday_mode)

    return coupon_feat.fillna(0)

In [None]:
def get_user_merchant_feature(history_field, label_field):
    """Per (user, merchant) redemption statistics computed on the history window.

    Parameters
    ----------
    history_field : pandas.DataFrame
        Needs ``User_id``, ``Merchant_id`` and ``label`` (1 = redeemed,
        0 = not redeemed).
    label_field : pandas.DataFrame
        Needs ``User_id`` and ``Merchant_id``; one output row per distinct
        pair, in order of first appearance.

    Returns
    -------
    pandas.DataFrame
        ``User_id``, ``Merchant_id`` and, in this order (``P`` stands for the
        prefix ``unsimple_User_id_Merchant_id_``):

        * ``P + yonghu_lignqu_shangjia_yhq_cishu``: coupons the user received
          from the merchant.
        * ``P + yonghu_lignqu_shangjia_yhq_hexiao_cishu``: of those, redeemed.
        * ``yong_gai_yhq_hexiaolv``: redeemed / received for the pair.
        * ``P + yonghu_lignqu_shangjia_yhq_buhexiao_cishu``: not redeemed.
        * ``nmsl``: pair's unredeemed count / user's total unredeemed count.
        * ``nmsll``: pair's redeemed count / user's total redeemed count.
        * ``mlgb``: pair's unredeemed count / merchant's total unredeemed count.
        * ``mlgbb``: pair's redeemed count / merchant's total redeemed count.

        The short ratio names are the established feature names and are kept
        unchanged. Missing history and 0 / 0 ratios are reported as 0.
    """
    data = history_field.copy()
    data['User_id'] = data['User_id'].map(int)
    data['Merchant_id'] = data['Merchant_id'].map(int)

    keys = ['User_id', 'Merchant_id']
    prefix = 'unsimple_' + '_'.join(keys) + '_'
    received_col = prefix + 'yonghu_lignqu_shangjia_yhq_cishu'
    redeemed_col = prefix + 'yonghu_lignqu_shangjia_yhq_hexiao_cishu'
    unredeemed_col = prefix + 'yonghu_lignqu_shangjia_yhq_buhexiao_cishu'

    redeemed = data[data['label'] == 1]
    unredeemed = data[data['label'] == 0]

    um_feat = label_field[keys].drop_duplicates(keep='first')

    def _attach_pair_count(frame, rows, column):
        # Left-join the number of `rows` per (user, merchant) pair.
        counts = rows.groupby(keys).size().rename(column).reset_index()
        return pd.merge(frame, counts, on=keys, how='left')

    um_feat = _attach_pair_count(um_feat, data, received_col)
    um_feat = _attach_pair_count(um_feat, redeemed, redeemed_col)
    um_feat = um_feat.fillna(0)
    um_feat['yong_gai_yhq_hexiaolv'] = um_feat[redeemed_col] / um_feat[received_col]
    um_feat = _attach_pair_count(um_feat, unredeemed, unredeemed_col)
    um_feat = um_feat.fillna(0)

    def _total(rows, key):
        # Total per user or per merchant, looked up by key for each output row
        # so the denominators cannot drift out of alignment with um_feat.
        return um_feat[key].map(rows.groupby(key).size())

    um_feat['nmsl'] = um_feat[unredeemed_col] / _total(unredeemed, 'User_id')
    um_feat['nmsll'] = um_feat[redeemed_col] / _total(redeemed, 'User_id')
    um_feat['mlgb'] = um_feat[unredeemed_col] / _total(unredeemed, 'Merchant_id')
    um_feat['mlgbb'] = um_feat[redeemed_col] / _total(redeemed, 'Merchant_id')

    # A missing or zero denominator leaves NaN; report those ratios as 0.
    return um_feat.fillna(0)

In [None]:
def get_history_feature(dataset, history_field, label_field):
    """Left-join the history-window feature blocks onto ``dataset``.

    Joins, in order, the user block (on ``User_id``), the merchant block (on
    ``Merchant_id``) and the user-merchant block (on both keys). The coupon
    block (``get_coupon_feature``) is deliberately not part of the pipeline.
    """
    builders = (
        (get_user_feature, ['User_id']),
        (get_merchant_feat, ['Merchant_id']),
        (get_user_merchant_feature, ['User_id', 'Merchant_id']),
    )
    # Build every block before joining, then attach them one after another.
    blocks = [(build(history_field, label_field), join_keys) for build, join_keys in builders]

    final_feat = dataset
    for block, join_keys in blocks:
        final_feat = pd.merge(final_feat, block, on=join_keys, how='left')
    return final_feat

In [None]:
def get_dataset(history_field, middle_field, label_field):
    """Assemble the model-ready table for one (history, middle, label) split.

    Combines the weekday and simple features of ``label_field`` with the
    history-window features, drops helper columns, casts the id columns to
    int, moves ``label`` (when present) to the last column, removes duplicate
    rows and resets the index. ``middle_field`` is accepted but not used.
    """
    week_feat = get_week_feature(label_field)      # weekday features
    simple_feat = get_simple_feature(label_field)  # count features of the label window

    # Columns present in both blocks (ids and base attributes) are taken from
    # week_feat only.
    week_columns = set(week_feat.columns.tolist())
    overlap = [name for name in simple_feat.columns.tolist() if name in week_columns]
    dataset = pd.concat([week_feat, simple_feat.drop(overlap, axis=1)], axis=1)
    dataset = get_history_feature(dataset, history_field, label_field)

    # A 'Date' column marks a train/validation split; the test split lacks it.
    has_outcome = 'Date' in dataset.columns.tolist()
    if has_outcome:
        dataset = dataset.drop(['Merchant_id', 'Discount_rate', 'Date', 'date_received', 'date'], axis=1)
        label_values = dataset['label'].tolist()
        dataset = dataset.drop(['label'], axis=1)
        dataset['label'] = label_values  # re-append so label is the last column
    else:
        dataset = dataset.drop(['Merchant_id', 'Discount_rate', 'date_received'], axis=1)

    # Normalise dtypes
    for column in ('User_id', 'Coupon_id', 'Date_received', 'Distance'):
        dataset[column] = dataset[column].map(int)
    if 'label' in dataset.columns.tolist():
        dataset['label'] = dataset['label'].map(int)

    # De-duplicate and renumber rows
    dataset = dataset.drop_duplicates(keep='first')
    dataset.index = range(len(dataset))
    return dataset

In [None]:
def model_xgb(train, test):
    """Train an XGBoost binary classifier on ``train`` and score ``test``.

    Parameters
    ----------
    train : pandas.DataFrame
        Feature table containing ``User_id``, ``Coupon_id``,
        ``Date_received`` and ``label``; every other column is a feature.
    test : pandas.DataFrame
        Same layout without ``label``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``result`` with columns ``User_id``, ``Coupon_id``, ``Date_received``,
        ``prob`` (one row per row of ``test``, in order), and
        ``feat_importance`` with columns ``feature_name`` / ``importance``
        sorted by descending importance as reported by ``Booster.get_score``.
    """
    id_columns = ['User_id', 'Coupon_id', 'Date_received']

    params = {'booster': 'gbtree',
              'objective': 'binary:logistic',
              'eval_metric': 'auc',
              # 'silent' is no longer an XGBoost parameter (2.x ignores it and
              # warns); 'verbosity': 0 is the supported way to keep it quiet.
              'verbosity': 0,
              'eta': 0.01,
              'max_depth': 5,
              'min_child_weight': 1,
              'gamma': 0,
              'lambda': 1,
              'colsample_bylevel': 0.7,
              'colsample_bytree': 0.7,
              'subsample': 0.9,
              'scale_pos_weight': 1}

    # Identifier columns are carried through to the result but not trained on.
    dtrain = xgb.DMatrix(train.drop(id_columns + ['label'], axis=1), label=train['label'])
    dtest = xgb.DMatrix(test.drop(id_columns, axis=1))

    # Only the training AUC is tracked here; there is no early stopping.
    watchlist = [(dtrain, 'train')]
    model = xgb.train(params, dtrain, num_boost_round=2500, evals=watchlist, verbose_eval=50)

    # Attach predictions positionally; resetting the index first keeps the
    # pairing correct even when `test` does not carry a 0..n-1 index.
    result = test[id_columns].reset_index(drop=True)
    result['prob'] = model.predict(dtest)

    scores = model.get_score()
    feat_importance = pd.DataFrame({'feature_name': list(scores.keys()),
                                    'importance': list(scores.values())},
                                   columns=['feature_name', 'importance'])
    feat_importance = feat_importance.sort_values(['importance'], ascending=False)

    return result, feat_importance

In [None]:
# Driver: load the offline train/test CSVs from Google Drive, build the
# train / validation / test feature tables from sliding date windows, train
# on train + validation, and write the test predictions back to Drive.
if __name__ == '__main__':
    # Source data (earlier path variants kept for reference)
    #off_train = pd.read_csv(r'/content/drive/aliyun_o2o/ccf_offline_stage1_train.csv')
    #on_train=pd.read_csv(r'D:\o2o\ccf_online_stage1_train.csv')
    #off_test = pd.read_csv(r'/content/drive/aliyun_o2o/ccf_offline_stage1_test.csv')
    train_path = "/content/drive/My Drive/aliyun_o2o/ccf_offline_stage1_train.csv"
    test_path  = "/content/drive/My Drive/aliyun_o2o/ccf_offline_stage1_test_revised.csv"

    # Read the CSV files
    off_train = pd.read_csv(train_path)
    off_test = pd.read_csv(test_path)
    # Preprocess
    off_train = prepare(off_train)
    off_test = prepare(off_test)
    #on_train=prepare_on(on_train)
    # Label the training table
    off_train = get_label(off_train)

    # Split into windows. Each split has a history window (selected by receive
    # date), a middle window (selected by consumption date) and a label window.
    # Training split
    train_history_field = off_train[
        off_train['date_received'].isin(pd.date_range('2016/3/2', periods=60))]  # [20160302,20160501)
    train_middle_field = off_train[off_train['date'].isin(pd.date_range('2016/5/1', periods=15))]  # [20160501,20160516)
    train_label_field = off_train[
        off_train['date_received'].isin(pd.date_range('2016/5/16', periods=31))]  # [20160516,20160616)
    # Validation split
    validate_history_field = off_train[
        off_train['date_received'].isin(pd.date_range('2016/1/16', periods=60))]  # [20160116,20160316)
    validate_middle_field = off_train[
        off_train['date'].isin(pd.date_range('2016/3/16', periods=15))]  # [20160316,20160331)
    validate_label_field = off_train[
        off_train['date_received'].isin(pd.date_range('2016/3/31', periods=31))]  # [20160331,20160501)
    # Test split; its label window is the whole test file
    test_history_field = off_train[
        off_train['date_received'].isin(pd.date_range('2016/4/17', periods=60))]  # [20160417,20160616)
    test_middle_field = off_train[off_train['date'].isin(pd.date_range('2016/6/16', periods=15))]  # [20160616,20160701)
    test_label_field = off_test.copy()  # [20160701,20160801)

    # Build the training, validation and test tables
    print('构造训练集')
    train = get_dataset(train_history_field, train_middle_field, train_label_field)
    print('构造验证集')
    validate = get_dataset(validate_history_field, validate_middle_field, validate_label_field)
    print('构造测试集')
    test = get_dataset(test_history_field, test_middle_field, test_label_field)

    # Offline validation
    # NOTE(review): no hold-out evaluation happens here; `validate` is only
    # concatenated into the training data below.

    # Train on train + validation and predict the test window
    big_train = pd.concat([train, validate], axis=0)
    result, feat_importance = model_xgb(big_train, test)
    # Save the submission (no header, no index)
    #result.to_csv('/content/drive/aliyun_o2o/easy.csv', index=False, header=None)
    result.to_csv('/content/drive/My Drive/aliyun_o2o/easy.csv', index=False, header=None)


构造训练集
['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance', 'Date_received', 'Date', 'is_manjian', 'discount_rate', 'min_cost_of_manjian', 'null_distance', 'date_received', 'date', 'label', 'cnt']
构造验证集
['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance', 'Date_received', 'Date', 'is_manjian', 'discount_rate', 'min_cost_of_manjian', 'null_distance', 'date_received', 'date', 'label', 'cnt']
构造测试集
['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance', 'Date_received', 'is_manjian', 'discount_rate', 'min_cost_of_manjian', 'null_distance', 'date_received', 'cnt']
[0]	train-auc:0.84754
[50]	train-auc:0.87294
[100]	train-auc:0.87626
[150]	train-auc:0.87854
[200]	train-auc:0.88109
[250]	train-auc:0.88306
[300]	train-auc:0.88522
[350]	train-auc:0.88703
[400]	train-auc:0.88858
[450]	train-auc:0.88991
[500]	train-auc:0.89103
[550]	train-auc:0.89216
[600]	train-auc:0.89312
[650]	train-auc:0.89397
[700]	train-auc:0.89474
[750]	train-auc:0.89546
[800]	tr

In [None]:
# Print the installed XGBoost version so the run can be reproduced.
print(xgb.__version__)

2.1.4
