In [None]:
import numpy as np
import pandas as pd
from scipy.stats import mode
import gc
import xgboost as xgb
import datetime
import lightgbm as lgb 
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the space-delimited training and test tables from the working directory.
data_train = pd.read_csv('ijcai_train_final.txt', sep=' ')
data_test = pd.read_csv('ijcai_test_final.txt', sep=' ')

In [None]:
#1.基于time的特征提取（一次构建）

def MeanVarFeature(data,co,Ftype,time):
    """Attach a per-``co`` aggregate of the ``time`` column as a new feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``co`` and ``time`` columns.
    co : str
        Column whose values define the groups.
    Ftype : str
        ``'mean'`` or ``'var'`` (sample variance). Any other value returns
        ``data`` unchanged.
    time : str
        Column aggregated inside each group.

    Returns
    -------
    pandas.DataFrame
        Result of left-merging the aggregate back onto ``data``; it has one
        extra column named ``'<prefix>_<Ftype>_<time>'``, where ``<prefix>``
        is ``co`` with a trailing ``'_id'`` removed.
    """
    if Ftype not in ('mean', 'var'):
        return data

    # Only strip a literal '_id' suffix. Blindly dropping the last three
    # characters maps e.g. 'x_list_id0' and 'x_list_id1' to the same name, and
    # the merge then silently emits '_x' / '_y' suffixed columns.
    prefix = co[:-3] if co.endswith('_id') else co
    feature = '{}_{}_{}'.format(prefix, Ftype, time)

    # 'var' is a real variance so the values match the column name and the
    # sibling MeanVarDoubleFeature (a standard deviation was computed before).
    MVF = data.groupby(co)[time].agg(Ftype).reset_index()
    MVF.columns = [co, feature]
    return data.merge(MVF, how='left', on=co)

def MeanVarDoubleFeature(data,co,Ftype,time1='date',time2='hour'):
    """Attach an aggregate of ``time2`` computed per ``(co, time1)`` pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``co``, ``time1`` and ``time2`` columns.
    co : str
        First grouping column.
    Ftype : str
        ``'mean'`` or ``'var'`` (sample variance). Any other value returns
        ``data`` unchanged.
    time1 : str, default ``'date'``
        Second grouping column.
    time2 : str, default ``'hour'``
        Column aggregated inside each ``(co, time1)`` group.

    Returns
    -------
    pandas.DataFrame
        Result of left-merging the aggregate back onto ``data``; it has one
        extra column named ``'<prefix>_<Ftype>_<time1>_<time2>'``, where
        ``<prefix>`` is ``co`` with a trailing ``'_id'`` removed.
    """
    if Ftype not in ('mean', 'var'):
        return data

    # Only strip a literal '_id' suffix; dropping the last three characters
    # unconditionally makes names such as 'x_list_id0' / 'x_list_id1' collide.
    prefix = co[:-3] if co.endswith('_id') else co
    feature = '{}_{}_{}_{}'.format(prefix, Ftype, time1, time2)

    keys = [co, time1]
    MVF = data.groupby(keys)[time2].agg(Ftype).reset_index()
    MVF.columns = keys + [feature]
    return data.merge(MVF, how='left', on=keys)

def TimeFeaturePick(data, id_cols=None):
    """Derive calendar columns from ``context_timestamp`` and per-id time statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``context_timestamp`` column (passed to
        ``datetime.datetime.fromtimestamp``) and every column listed in
        ``id_cols``.
    id_cols : list of str, optional
        Columns to group by. When omitted, the notebook-level ``id_columns``
        list is used, which keeps existing one-argument calls working.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with ``time``, ``date`` (day of month), ``hour`` and
        ``minute`` added, plus the columns produced by ``MeanVarFeature`` and
        ``MeanVarDoubleFeature`` for every id column.
    """
    cols = id_columns if id_cols is None else id_cols

    data = data.copy()
    # NOTE(review): fromtimestamp() converts using the local timezone of the
    # machine running the notebook, so 'date' / 'hour' can differ between hosts.
    data['time'] = pd.to_datetime(
        data['context_timestamp'].apply(datetime.datetime.fromtimestamp)
    )
    # Vectorised field extraction instead of one Python lambda call per row.
    data['date'] = data['time'].dt.day.astype('int')
    data['hour'] = data['time'].dt.hour.astype('int')
    data['minute'] = data['time'].dt.minute.astype('int')

    for co in cols:
        for Ftype in ('mean', 'var'):
            for time in ('date', 'hour'):
                data = MeanVarFeature(data, co, Ftype, time)
            data = MeanVarDoubleFeature(data, co, Ftype)
    return data

# NOTE: the module-level call `dt = TimeFeaturePick(dt)` that used to sit here was
# removed. No `dt` frame exists at notebook scope, so it raised NameError on a
# fresh kernel; FeaturePick() performs this step for the train and test tables.

In [None]:
#2.基于点击者user的特征提取（二次构建）

def UserFeature(dt, id_cols=None, level_cols=None):
    """Broadcast summary statistics of ``dt`` onto every one of its rows.

    The notebook calls this through ``groupby('user_id').apply``, so ``dt`` is
    the set of rows belonging to one user.

    Parameters
    ----------
    dt : pandas.DataFrame
        Rows to summarise. Must contain ``context_timestamp``, ``shop_id``,
        ``item_id`` and every column named in ``id_cols`` / ``level_cols``.
    id_cols : list of str, optional
        Columns for which the most frequent value is recorded. Defaults to the
        notebook-level ``id_columns`` list.
    level_cols : list of str, optional
        Columns for which mean / variance / max / min are recorded. Defaults
        to the notebook-level ``level_columns`` list.

    Returns
    -------
    pandas.DataFrame
        New frame with the rows and index of ``dt`` plus the ``user_*``
        columns; ``dt`` itself is not modified.
    """
    id_cols = id_columns if id_cols is None else id_cols
    level_cols = level_columns if level_cols is None else level_cols

    # 1.1 click-based features
    first_click = dt['context_timestamp'].min()
    last_click = dt['context_timestamp'].max()
    feats = {
        'user_first_click_time': first_click,  # earliest click timestamp
        'user_last_click_time': last_click,    # latest click timestamp
        # Span between first and last click. Computed as last - first so it is
        # non-negative (the operands used to be swapped).
        'user_last-first_time': last_click - first_click,
        'user_count': len(dt),                        # number of rows (clicks)
        'user_shop_count': dt['shop_id'].nunique(),   # distinct shops (NaN not counted)
        'user_item_count': dt['item_id'].nunique(),   # distinct items (NaN not counted)
    }

    # 1.2.1 most frequent value of every id column
    for co in id_cols:
        modes = dt[co].mode()
        # mode() is empty when the column holds only NaN; fall back to NaN
        # rather than failing on the [0] lookup.
        feats['user_mode_{}'.format(co)] = modes.iloc[0] if len(modes) else np.nan

    # 1.2.2 distribution of every level column
    for co in level_cols:
        col = dt[co]
        feats['user_mean_{}'.format(co)] = col.mean()  # mean
        feats['user_var_{}'.format(co)] = col.var()    # sample variance
        feats['user_max_{}'.format(co)] = col.max()    # maximum
        feats['user_min_{}'.format(co)] = col.min()    # minimum

    # Build all constant columns at once instead of inserting them one by one.
    stats = pd.DataFrame(feats, index=dt.index)
    base = dt.drop(columns=[c for c in stats.columns if c in dt.columns])
    return pd.concat([base, stats], axis=1)

# NOTE: the module-level call `dt = dt.groupby('user_id').apply(UserFeature)...`
# that used to sit here was removed. No `dt` frame exists at notebook scope, so
# it raised NameError on a fresh kernel; FeaturePick() performs this step.


In [None]:
#3.基于点击对象item的特征提取（三次构建）

def ItemFeature(dt, id_cols=None, level_cols=None):
    """Broadcast item-level summary statistics of ``dt`` onto every one of its rows.

    The notebook calls this through ``groupby('item_id').apply``, so ``dt`` is
    the set of rows belonging to one item.

    Parameters
    ----------
    dt : pandas.DataFrame
        Rows to summarise. Must contain ``context_timestamp``, ``user_id``,
        ``item_price_level``, a ``user_mode_<co>`` column for every ``co`` in
        ``id_cols``, and every column named in ``level_cols``.
    id_cols : list of str, optional
        Id columns whose ``user_mode_*`` counterparts are summarised by their
        most frequent value. Defaults to the notebook-level ``id_columns``.
    level_cols : list of str, optional
        Columns summarised by mean / variance / max / min, together with
        every existing column whose name contains ``'mean'``, ``'var'``,
        ``'user_max'`` or ``'user_min'``. Defaults to the notebook-level
        ``level_columns``.

    Returns
    -------
    pandas.DataFrame
        New frame with the rows and index of ``dt`` plus the ``item_*``
        columns; ``dt`` itself is not modified.
    """
    id_cols = id_columns if id_cols is None else id_cols
    level_cols = level_columns if level_cols is None else level_cols

    # Previously derived statistic columns, selected purely by name.
    describe_columns = [
        x for x in dt.columns
        if 'mean' in x or 'var' in x or 'user_max' in x or 'user_min' in x
    ]

    # "Recent" means the 86400 seconds (one day) before the latest timestamp
    # found in these rows.
    max_time = dt['context_timestamp'].max()
    last_day_time = max_time - 86400
    normal_price = dt.loc[dt['context_timestamp'] <= last_day_time, 'item_price_level'].mean()
    abnormal_price = dt.loc[dt['context_timestamp'] > last_day_time, 'item_price_level'].mean()
    # A zero recent price level used to yield +/-inf here; report it as missing.
    if pd.isna(abnormal_price) or abnormal_price == 0:
        discount = np.nan
    else:
        discount = normal_price / abnormal_price

    # 2.1 item profile features
    feats = {
        'item_count': len(dt),                        # number of rows (clicks) for the item
        'item_user_count': dt['user_id'].nunique(),   # distinct users (NaN not counted)
        'item_normal_price_level': normal_price,      # mean price level before the last day
        'item_abnormal_price_level': abnormal_price,  # mean price level within the last day
        'abnormal_discount': discount,                # earlier / recent price-level ratio
    }

    # 2.2 item x user interaction features
    for co in id_cols:
        modes = dt['user_mode_{}'.format(co)].mode()
        # mode() is empty when the column holds only NaN.
        feats['item_user_mode_{}'.format(co)] = modes.iloc[0] if len(modes) else np.nan

    for co in list(level_cols) + describe_columns:
        col = dt[co]
        feats['item_user_mean_{}'.format(co)] = col.mean()  # mean
        feats['item_user_var_{}'.format(co)] = col.var()    # sample variance
        feats['item_user_max_{}'.format(co)] = col.max()    # maximum
        feats['item_user_min_{}'.format(co)] = col.min()    # minimum

    # Hundreds of columns are produced per call; building them in one frame
    # avoids the repeated single-column inserts that fragment a DataFrame.
    stats = pd.DataFrame(feats, index=dt.index)
    base = dt.drop(columns=[c for c in stats.columns if c in dt.columns])
    return pd.concat([base, stats], axis=1)

# NOTE: the module-level call `dt = dt.groupby('item_id').apply(ItemFeature)...`
# that used to sit here was removed. No `dt` frame exists at notebook scope, so
# it raised NameError on a fresh kernel; FeaturePick() performs this step.


In [None]:
def FeaturePick(dt):
    """Run the whole feature pipeline on one raw table.

    Steps, in order: split every ``*list*`` column and
    ``predict_category_property`` into length / leading-token columns, classify
    the remaining columns by name, then apply ``TimeFeaturePick``,
    ``UserFeature`` (per ``user_id``) and ``ItemFeature`` (per ``item_id``).

    Parameters
    ----------
    dt : pandas.DataFrame
        Raw table. The list columns and ``predict_category_property`` must hold
        strings; ``user_id`` and ``item_id`` must be present.

    Returns
    -------
    pandas.DataFrame
        New frame with the engineered columns and without the helper ``time``
        column. The input frame is left untouched.

    Notes
    -----
    Sets the module-level ``id_columns`` and ``level_columns`` lists as a side
    effect, because the three helper stages read them from module scope.
    """
    # The helper stages look these names up as globals; keeping them local to
    # this function made every helper call fail with NameError.
    global id_columns, level_columns

    dt = dt.copy()  # do not mutate the caller's frame

    # Placeholder used when a value has no second token.
    # NOTE(review): presumably '-1' matches the data's own missing marker - confirm.
    missing = '-1'

    # Columns holding ';'-separated lists: keep length and the first two tokens.
    list_columns = [x for x in dt.columns if 'list' in x]
    for co in list_columns:
        tokens = dt[co].apply(lambda x: x.split(';'))
        dt['{}_len'.format(co)] = tokens.apply(len)
        dt['{}_id0'.format(co)] = tokens.apply(lambda t: t[0])
        # A single-token list used to raise IndexError here.
        dt['{}_id1'.format(co)] = tokens.apply(lambda t: t[1] if len(t) > 1 else missing)
        del dt[co]

    # 'a:b;c:d;...' -> number of ';' entries, plus both halves of the first entry.
    pairs = dt['predict_category_property'].apply(lambda x: x.split(';'))
    dt['len_predict_category_property'] = pairs.apply(len)
    first = pairs.apply(lambda p: p[0].split(':'))
    dt['predict_category_id0'] = first.apply(lambda kv: kv[0])
    # A first entry without ':' used to raise IndexError here.
    dt['predict_property_id0'] = first.apply(lambda kv: kv[1] if len(kv) > 1 else missing)
    del dt['predict_category_property']

    # 0.2 classify columns by name; must happen before derived columns exist.
    id_columns = [x for x in dt.columns if 'id' in x]        # grouped on / mode taken
    level_columns = [x for x in dt.columns if 'level' in x]  # mean/var/max/min taken

    dt = TimeFeaturePick(dt)
    dt = dt.groupby('user_id').apply(UserFeature).reset_index(drop=True)
    dt = dt.groupby('item_id').apply(ItemFeature).reset_index(drop=True)
    del dt['time']
    return dt

# Push the training table, then the test table, through the feature pipeline.
data_train = data_train.pipe(FeaturePick)
data_test = data_test.pipe(FeaturePick)

# Author's note: after feature engineering there are 752 feature dimensions in total.

In [None]:
# Split the engineered tables into model inputs and the training target.
train_label = data_train['is_trade']
# The target must not stay among the inputs: leaving 'is_trade' in the feature
# matrix lets the models read the answer directly (target leakage).
train_feature = data_train.drop(columns=['is_trade'])
# Give the test matrix exactly the training columns, in the same order. A
# column missing from the test table raises KeyError here instead of failing
# later inside predict_proba.
test_feature = data_test[train_feature.columns]
# NOTE(review): the feature matrices still contain identifier columns such as
# 'instance_id' and the string-valued split-out ids; confirm the estimators are
# meant to receive them.

In [None]:
# Model training: fit three classifiers on the training features and keep, for
# each, column 1 of predict_proba on the test features (for a 0/1 target this
# is presumably the positive-class probability).
# NOTE(review): lightgbm and xgboost are already imported in the first cell;
# the two imports below are redundant.
import lightgbm as lgb
import xgboost as xgb
# NOTE(review): DeepFM is used further down but this import is commented out and
# no other definition is visible in the notebook, so the "NN" section raises
# NameError on a fresh kernel. The correct module / class path is not visible
# here - restore the proper import before running.
# import DeepFM

# LightGBM gradient-boosted trees.
lgb_train = lgb.LGBMClassifier(num_leaves=1024, learning_rate=0.01,n_estimators=4200,colsample_bytree = 0.8,
                    subsample = 0.8)
lgb_train.fit(train_feature,train_label)
lgb_pre_test_y = lgb_train.predict_proba(test_feature)[:,1]

# XGBoost gradient-boosted trees.
xgb_train = xgb.XGBClassifier(max_depth=10,learning_rate=0.01,n_estimators=4000,colsample_bylevel=0.5,colsample_bytree=0.8,subsample=0.8)
xgb_train.fit(train_feature,train_label)
xgb_pre_test_y = xgb_train.predict_proba(test_feature)[:,1]

# Neural model (DeepFM); see the NOTE above about the missing import.
DeepFM_train = DeepFM(learning_rate = 0.0001,drop_out=True,batch_size = 32*16, h_depth = 4)
DeepFM_train.fit(train_feature,train_label)
nn_pre_test_y = DeepFM_train.predict_proba(test_feature)[:,1]

In [None]:
# Write the submission file.
# Blend the three models' scores with fixed weights 0.6 (LightGBM), 0.2 (XGBoost)
# and 0.2 (DeepFM).
ans = lgb_pre_test_y*0.6+xgb_pre_test_y*0.2+nn_pre_test_y*0.2
# `DF` was never defined in this notebook (NameError); build the frame through
# the pandas namespace that is already imported.
submit = pd.DataFrame({
    'instance_id': data_test['instance_id'],
    'predicted_score': ans,
})
submit.to_csv('ans.txt',sep=' ',index=False)