In [3]:
# !pip install gensim

# 导入库

In [77]:
import pandas as pd
import numpy as np
import re
from scipy import stats
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# 导入数据集

In [2]:
common_path = '/home/kesci/input/round28100/' 
train_df = pd.read_csv(common_path + 'train_round_2.csv')
test_df = pd.read_csv(common_path + 'test_round_2_v4.csv')

In [3]:
test_df.shape

(10087, 344)

In [4]:
train_df.shape

(33000, 346)

# 多天启动特征

In [5]:
def get_time_gap(strs,parm):
    time = strs.split(":")
    time = list(set(time))
    time = sorted(list(map(lambda x:int(x),time)))
    time_gap = []
    #用户只在当天活跃
    if len(time) == 1: return -20

    for index, value in enumerate(time):
        if index <= len(time) - 2:
            gap = abs(time[index] - time[index + 1])
            time_gap.append(gap)

    if parm == '1': return np.mean(time_gap)
    elif parm == '2': return np.max(time_gap)
    elif parm == '3': return np.min(time_gap)
    elif parm == '4': return np.std(time_gap)
    elif parm == '5': return stats.skew(time_gap)
    elif parm == '6': return stats.kurtosis(time_gap)
    
def get_week(day):
    day = int(day)
    if day >= 1 and day <= 7: return 1
    if day >= 8 and day <= 14: return 2
    if day >= 15 and day <= 21: return 3
    if day >= 22 and day <= 28: return 4
    if day >= 28: return 5

def cur_day_repeat_count(strs):
    time = strs.split(":")
    time = dict(Counter(time))
    time = sorted(time.items(), key=lambda x: x[1], reverse=False)
    # 一天一次启动
    if (len(time) == 1) & (time[0][1] == 1): return 0
    # 一天多次启动
    elif (len(time) == 1) & (time[0][1] > 1): return 1
    # 多天多次启动
    elif (len(time) > 1) & (time[0][1] >= 2): return 2
    else: return 3

def get_continue_day(day_list):
    time = day_list.split(":")
    time = list(map(lambda x:int(x),time))
    m = np.array(time)
    if len(set(m)) == 1:
        return -1
    m = list(set(m))
    if len(m) == 0:
        return -20
    n = np.where(np.diff(m) == 1)[0]
    i = 0
    result = []
    while i < len(n) - 1:
        state = 1
        while n[i + 1] - n[i] == 1:
            state += 1
            i += 1
            if i == len(n) - 1:
                break
        if state == 1:
            i += 1
            result.append(2)
        else:
            i += 1
            result.append(state + 1)
    if len(n) == 1:
        result.append(2)
    if len(result) != 0:
        # print(result)
        return np.max(result)

def get_continue_launch_count(strs,parm):
    time = strs.split(":")
    time = dict(Counter(time))
    time = sorted(time.items(), key=lambda x: x[0], reverse=False)
    key_list = []
    value_list = []
    if len(time) == 1:
        return -2
    for key,value in dict(time).items():
        key_list.append(int(key))
        value_list.append(int(value))

    if np.mean(np.diff(key_list, 1)) == 1:
        if parm == '1': return np.mean(value_list)
        elif parm == '2': return np.max(value_list)
        elif parm == '3': return np.min(value_list)
        elif parm == '4': return np.sum(value_list)
        elif parm == '5': return np.std(value_list)
    else:
        return -1

def get_weekend(day):
    day = int(day)
    if day in [6,7,13,14,20,21,27,28]:
        return 1
    else:
        return 0

# 特征

In [6]:
train_df['mark'] = 'train'
test_df['mark'] = 'test'
total_df = train_df.append(test_df)

# 简单加减特征
total_df['seller_procduction'] = total_df['seller']+total_df['Product_id']
total_df['user_procduction'] = total_df['user_id']+total_df['Product_id']
total_df['seller_user'] = total_df['seller']+total_df['user_id']
total_df['seller_user_procduction'] = total_df['seller']+total_df['user_id']+total_df['Product_id']
# ?
total_df['production_action'] = total_df['Product_id']+str(total_df['action_type'])

feat1 = total_df.groupby(['seller_procduction'],as_index=False)['day'].agg({'s_p_count':'count'})
total_df = pd.merge(total_df, feat1, on=['seller_procduction'], how='left')

# feat1 = total_df.groupby(['user_procduction'],as_index=False)['day'].agg({'user_procduction_count':'count'})
# total_df = pd.merge(total_df, feat1, on=['user_procduction'], how='left')

feat1 = total_df.groupby(['seller_user'],as_index=False)['day'].agg({'seller_user_count':'count'})
total_df = pd.merge(total_df, feat1, on=['seller_user'], how='left')

# 用户是否多天有多次启动(均值)
feat1 = total_df.groupby(['user_id'],as_index=False)['day'].agg({'user_count':'count'})
feat3 = total_df[['user_id', 'day']]
feat3['day'] = feat3['day'].astype('str')
feat3 = feat3.groupby(['user_id'])['day'].agg(lambda x: ':'.join(x)).reset_index()
feat3.rename(columns={'day': 'act_list'}, inplace=True)
feat3['time_gap_mean'] = feat3['act_list'].apply(get_time_gap,args=('1'))
# 最大
feat3['time_gap_max'] = feat3['act_list'].apply(get_time_gap,args=('2'))
# 最小
feat3['time_gap_min'] = feat3['act_list'].apply(get_time_gap,args=('3'))
# 方差
feat3['time_gap_std'] = feat3['act_list'].apply(get_time_gap,args=('4'))
# 锋度
feat3['time_gap_skew'] = feat3['act_list'].apply(get_time_gap, args=('5'))
# 偏度
feat3['time_gap_kurt'] = feat3['act_list'].apply(get_time_gap, args=('6'))
# 平均每天启动次数
feat3['mean_act_count'] = feat3['act_list'].apply(lambda x: len(x.split(":")) / len(set(x.split(":"))))
# 平均行为日期
feat3['act_mean_date'] = \
feat3['act_list'].apply(lambda x: np.sum([int(ele) for ele in x.split(":")]) / len(x.split(":")))
feat3['con_act_max'] = feat3['act_list'].apply(get_continue_day)
del feat3['act_list']

total_df = pd.merge(total_df, feat1, on=['user_id'], how='left')
total_df = pd.merge(total_df, feat3, on=['user_id'], how='left')

# 商品是否多天有多次启动(均值)
feat1 = total_df.groupby(['Product_id'],as_index=False)['day'].agg({'Product_count':'count'})
feat3 = total_df[['Product_id', 'day']]
feat3['day'] = feat3['day'].astype('str')
feat3 = feat3.groupby(['Product_id'])['day'].agg(lambda x: ':'.join(x)).reset_index()
feat3.rename(columns={'day': 'act_list'}, inplace=True)
feat3['time_gap_mean1'] = feat3['act_list'].apply(get_time_gap,args=('1'))
# 最大
feat3['time_gap_max1'] = feat3['act_list'].apply(get_time_gap,args=('2'))
# 最小
feat3['time_gap_min1'] = feat3['act_list'].apply(get_time_gap,args=('3'))
# 方差
feat3['time_gap_std1'] = feat3['act_list'].apply(get_time_gap,args=('4'))
# 锋度
feat3['time_gap_skew1'] = feat3['act_list'].apply(get_time_gap, args=('5'))
# 偏度
feat3['time_gap_kurt1'] = feat3['act_list'].apply(get_time_gap, args=('6'))
# 平均每天启动次数
feat3['mean_act_count1'] = feat3['act_list'].apply(lambda x: len(x.split(":")) / len(set(x.split(":"))))
# 平均行为日期
feat3['act_mean_date1'] = feat3['act_list'].apply(lambda x: np.sum([int(ele) for ele in x.split(":")]) / len(x.split(":")))

feat3['con_act_max1'] = feat3['act_list'].apply(get_continue_day)
del feat3['act_list']

total_df = pd.merge(total_df, feat1, on=['Product_id'], how='left')
total_df = pd.merge(total_df, feat3, on=['Product_id'], how='left')

#卖家是否多天有多次启动(均值)
feat1 = total_df.groupby(['seller'],as_index=False)['day'].agg({'seller_count':'count'})
feat3 = total_df[['seller', 'day']]
feat3['day'] = feat3['day'].astype('str')
feat3 = feat3.groupby(['seller'])['day'].agg(lambda x: ':'.join(x)).reset_index()
feat3.rename(columns={'day': 'act_list'}, inplace=True)
feat3['time_gap_mean2'] = feat3['act_list'].apply(get_time_gap,args=('1'))
# 最大
feat3['time_gap_max2'] = feat3['act_list'].apply(get_time_gap,args=('2'))
# 最小
feat3['time_gap_min2'] = feat3['act_list'].apply(get_time_gap,args=('3'))
# 方差
feat3['time_gap_std2'] = feat3['act_list'].apply(get_time_gap,args=('4'))
# 锋度
feat3['time_gap_skew2'] = feat3['act_list'].apply(get_time_gap, args=('5'))
# 偏度
feat3['time_gap_kurt2'] = feat3['act_list'].apply(get_time_gap, args=('6'))
# 平均每天启动次数
feat3['mean_act_count2'] = feat3['act_list'].apply(lambda x: len(x.split(":")) / len(set(x.split(":"))))
# 平均行为日期
feat3['act_mean_date2'] = feat3['act_list'].apply(lambda x: np.sum([int(ele) for ele in x.split(":")]) / len(x.split(":")))

feat3['con_act_max2'] = feat3['act_list'].apply(get_continue_day)
del feat3['act_list']

total_df = pd.merge(total_df, feat1, on=['seller'], how='left')
total_df = pd.merge(total_df, feat3, on=['seller'], how='left')

total_df['week'] = total_df['day'].apply(get_week)
temp = pd.get_dummies(total_df['week'], prefix = 'week')
total_df['week_new'] = list(map(lambda x: 'week_new' + str(x), total_df.week))
temp = pd.crosstab(total_df.user_id,total_df.week_new).reset_index()
del total_df['week_new']
total_df = pd.merge(total_df, temp, on=['user_id'], how='left')
del total_df['week']

# *********************************************************************************
#时间差分
def timeFeatures(dataset):
    dataset.sort_values(by = ['day'], ascending = True, inplace = True)
    dataset['shift'] = dataset.groupby(['user_id'])['day'].apply(lambda x:x.shift(1))
    dataset['user_span'] = dataset['day'] - dataset['shift']
    # dataset['user_span'] =  dataset['user_span'].fillna(0).astype('int')
    dataset.sort_index(axis = 0, inplace = True)
    del dataset['shift']
    
    return dataset

def action_feat(strs,parm):
    action = strs.split(":")
    count0 = action.count("0")
    count1 = action.count("1")
    count2 = action.count("2")
    count3 = action.count("3")
    count5 = action.count("5")
    if parm == 999: return len(action)
    if parm == 0: return count0/len(action)
    if parm == 1: return count1/len(action)
    if parm == 2: return count2/len(action)
    if parm == 3: return count3/len(action)
    if parm == 5: return count5/len(action)

# day相关
total_df['product_start'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['day'].agg(min))
total_df['product_end'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['day'].agg(max))
# total_df['product_skew'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['day'].agg(stats.skew))
# total_df['product_kurtosis'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['day'].agg(stats.kurtosis))
# total_df['product_std'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['day'].agg(stats.kurtosis))
# total_df['product_mean'] = total_df['product_end']-total_df['Product_id'].map(total_df.groupby(['Product_id'])['day'].agg("mean"))
total_df['seller_start'] = total_df['seller'].map(total_df.groupby(['seller'])['day'].agg(min))
total_df['seller_end'] = total_df['seller'].map(total_df.groupby(['seller'])['day'].agg(max))
# total_df['seller_skew'] = total_df['seller'].map(total_df.groupby(['seller'])['day'].agg(stats.skew))
# total_df['seller_kurtosis'] = total_df['seller'].map(total_df.groupby(['seller'])['day'].agg(stats.kurtosis))
# total_df['seller_std'] = total_df['seller'].map(total_df.groupby(['seller'])['day'].agg("std"))
# total_df['seller_mean'] = total_df['seller_end']-total_df['seller'].map(total_df.groupby(['seller'])['day'].agg("mean"))
total_df['user_start'] = total_df['user_id'].map(total_df.groupby(['user_id'])['day'].agg(min))
total_df['user_end'] = total_df['user_id'].map(total_df.groupby(['user_id'])['day'].agg(max))
# total_df['user_skew'] = total_df['user_id'].map(total_df.groupby(['user_id'])['day'].agg(stats.skew))
# total_df['user_kurtosis'] = total_df['user_id'].map(total_df.groupby(['user_id'])['day'].agg(stats.kurtosis))
# total_df['user_std'] = total_df['user_id'].map(total_df.groupby(['user_id'])['day'].agg("std"))
# total_df['user_mean'] = total_df['user_end']-total_df['user_id'].map(total_df.groupby(['user_id'])['day'].agg("mean"))
# total_df['product_in'] = total_df['product_end'] - total_df['product_start']
# total_df['seller_in'] = total_df['seller_end'] - total_df['seller_start']
# total_df['user_in'] = total_df['user_end'] - total_df['user_start']

total_df['user_from_start'] = total_df['day'] - total_df['user_start']
total_df['user_to_end'] = total_df['user_end'] - total_df['day']
total_df['seller_from_start'] = total_df['day'] - total_df['seller_start']
total_df['seller_to_end'] = total_df['seller_end'] - total_df['day']
total_df['Product_id_from_start'] = total_df['day'] - total_df['product_start']
total_df['Product_id_to_end'] = total_df['product_end'] - total_df['day']
# total_df['user_to_mean'] = total_df['day'] - total_df['act_mean_date']
# total_df['Product_to_mean'] = total_df['day'] - total_df['act_mean_date1']
# total_df['seller_to_mean'] = total_df['day'] - total_df['act_mean_date2']

# action_type
# action_type_0 = train_df[train_df['action_type'] == 0]['favorite'].mean()
# action_type_1 = train_df[train_df['action_type'] == 1]['favorite'].mean()
# action_type_2 = train_df[train_df['action_type'] == 2]['favorite'].mean()
# action_type_3 = train_df[train_df['action_type'] == 3]['favorite'].mean()
# action_type_5 = train_df[train_df['action_type'] == 5]['favorite'].mean()
# print(action_type_0,action_type_1,action_type_2,action_type_3,action_type_5)
# total_df['action_type_'] = total_df['action_type'].map({2:action_type_2, 3:action_type_3, 0:action_type_0, 1:action_type_1, 5:action_type_5})

# total_df['action_type'] = total_df['action_type'].map({5:1, 2:2, 0:3, 1:4, 4:5})
# action_type = pd.get_dummies(total_df['action_type'], prefix = 'action_type')
# total_df = total_df.join(action_type)

# feat1 = total_df[['user_id', 'action_type']]
# feat1['action_type'] = feat1['action_type'].astype('str')
# feat1 = feat1.groupby(['user_id'])['action_type'].agg(lambda x: ':'.join(x)).reset_index(name='action_list')
    
# feat1['count'] = feat1['action_list'].apply(lambda x: action_feat(x,999))
# feat1['action_0_rate'] = feat1['action_list'].apply(lambda x: action_feat(x,0))
# feat1['action_1_rate'] = feat1['action_list'].apply(lambda x: action_feat(x,1))
# feat1['action_2_rate'] = feat1['action_list'].apply(lambda x: action_feat(x,2))
# feat1['action_3_rate'] = feat1['action_list'].apply(lambda x: action_feat(x,3))
# feat1['action_5_rate'] = feat1['action_list'].apply(lambda x: action_feat(x,5))
# del feat1['count']
# del feat1['action_list']
# total_df = pd.merge(total_df,feat1,on = ['user_id'], how ='left')
# del feat1

# feat1 = total_df[['Product_id', 'action_type']]
# feat1['action_type'] = feat1['action_type'].astype('str')
# feat1 = feat1.groupby(['Product_id'])['action_type'].agg(lambda x: ':'.join(x)).reset_index(name='action_list')
    
# feat1['count'] = feat1['action_list'].apply(lambda x: action_feat(x,999))
# feat1['action_0_rate'] = feat1['action_list'].apply(lambda x: action_feat(x,0))
# feat1['action_1_rate'] = feat1['action_list'].apply(lambda x: action_feat(x,1))
# feat1['action_2_rate'] = feat1['action_list'].apply(lambda x: action_feat(x,2))
# feat1['action_3_rate'] = feat1['action_list'].apply(lambda x: action_feat(x,3))
# feat1['action_5_rate'] = feat1['action_list'].apply(lambda x: action_feat(x,5))
# del feat1['count']
# del feat1['action_list']
# total_df = pd.merge(total_df,feat1,on = ['Product_id'], how ='left')
# del feat1

# webInfo
# total_df['user_web_11'] = total_df['user_id'].map(total_df.groupby(['user_id'])['WebInfo_1'].agg("mean"))
# total_df['user_web_12'] = total_df['user_id'].map(total_df.groupby(['user_id'])['WebInfo_1'].agg("max"))
# total_df['user_web_13'] = total_df['user_id'].map(total_df.groupby(['user_id'])['WebInfo_1'].agg("min"))
# total_df['user_web_14'] = total_df['user_id'].map(total_df.groupby(['user_id'])['WebInfo_1'].agg("std"))
# total_df['user_web_21'] = total_df['user_id'].map(total_df.groupby(['user_id'])['WebInfo_2'].agg("mean"))
# total_df['user_web_22'] = total_df['user_id'].map(total_df.groupby(['user_id'])['WebInfo_2'].agg("max"))
# total_df['user_web_23'] = total_df['user_id'].map(total_df.groupby(['user_id'])['WebInfo_2'].agg("min"))
# total_df['user_web_24'] = total_df['user_id'].map(total_df.groupby(['user_id'])['WebInfo_2'].agg("std"))
# total_df['user_web_31'] = total_df['user_id'].map(total_df.groupby(['user_id'])['WebInfo_3'].agg("mean"))
# total_df['user_web_32'] = total_df['user_id'].map(total_df.groupby(['user_id'])['WebInfo_3'].agg("max"))
# total_df['user_web_33'] = total_df['user_id'].map(total_df.groupby(['user_id'])['WebInfo_3'].agg("min"))
# total_df['user_web_34'] = total_df['user_id'].map(total_df.groupby(['user_id'])['WebInfo_3'].agg("std"))
# total_df['WebInfo_123'] = total_df['WebInfo_1'] + total_df['WebInfo_2'] + total_df['WebInfo_3']
# total_df['WebInfo_12'] = total_df['WebInfo_1'] + total_df['WebInfo_2']
# total_df['WebInfo_23'] = total_df['WebInfo_2'] + total_df['WebInfo_3']
# total_df['WebInfo_13'] = total_df['WebInfo_1'] + total_df['WebInfo_3']

In [7]:
# 389 + 6 + 6 + 13(12) = 414 
temp_df = total_df.copy()
total_df.shape

(43087, 401)

# 特征2

In [114]:
total_df = temp_df.copy()

# total_df['weekend'] = total_df['day'].apply(get_weekend)

# 两两组合
# total_df['user_day'] = total_df['user_id'].map(total_df.groupby(['user_id'])['day'].nunique())
# total_df['day_user'] = total_df['day'].map(total_df.groupby(['day'])['user_id'].nunique())
# total_df['seller_day'] = total_df['seller'].map(total_df.groupby(['seller'])['day'].nunique())
# total_df['day_seller'] = total_df['day'].map(total_df.groupby(['day'])['seller'].nunique())
# total_df['product_day'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['day'].nunique())
# total_df['day_product'] = total_df['day'].map(total_df.groupby(['day'])['Product_id'].nunique())
total_df['seller_user_'] = total_df['seller'].map(total_df.groupby(['seller'])['user_id'].nunique())
total_df['user_seller'] = total_df['user_id'].map(total_df.groupby(['user_id'])['seller'].nunique())
total_df['seller_product'] = total_df['seller'].map(total_df.groupby(['seller'])['Product_id'].nunique())
total_df['product_seller'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['seller'].nunique())
total_df['product_user'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['user_id'].nunique())
total_df['user_product'] = total_df['user_id'].map(total_df.groupby(['user_id'])['Product_id'].nunique())
# total_df['product_action'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['action_type'].nunique())
# total_df['action_product'] = total_df['action_type'].map(total_df.groupby(['action_type'])['Product_id'].nunique())
# total_df['seller_action'] = total_df['seller'].map(total_df.groupby(['seller'])['action_type'].nunique())
# total_df['action_seller'] = total_df['action_type'].map(total_df.groupby(['action_type'])['seller'].nunique())
# total_df['user_action'] = total_df['user_id'].map(total_df.groupby(['user_id'])['action_type'].nunique())
# total_df['action_user'] = total_df['action_type'].map(total_df.groupby(['action_type'])['user_id'].nunique())

total_df['action_type_'] = total_df['action_type'].map({2:1, 3:2, 0:3, 1:4, 5:5})
total_df['user_action_1_'] = total_df['user_id'].map(total_df.groupby(['user_id'])['action_type_'].agg("mean"))
total_df['user_action_2_'] = total_df['user_id'].map(total_df.groupby(['user_id'])['action_type_'].agg("max"))
total_df['user_action_3_'] = total_df['user_id'].map(total_df.groupby(['user_id'])['action_type_'].agg("min"))
total_df['user_action_4_'] = total_df['user_id'].map(total_df.groupby(['user_id'])['action_type_'].agg("std"))
# total_df['user_action_5_'] = total_df['user_id'].map(total_df.groupby(['user_id'])['action_type_'].agg("sum"))

total_df['seller_action_1_'] = total_df['seller'].map(total_df.groupby(['seller'])['action_type_'].agg("mean"))
total_df['seller_action_2_'] = total_df['seller'].map(total_df.groupby(['seller'])['action_type_'].agg("max"))
total_df['seller_action_3_'] = total_df['seller'].map(total_df.groupby(['seller'])['action_type_'].agg("min"))
total_df['seller_action_4_'] = total_df['seller'].map(total_df.groupby(['seller'])['action_type_'].agg("std"))
# total_df['seller_action_5_'] = total_df['seller'].map(total_df.groupby(['seller'])['action_type_'].agg("sum"))

total_df['product_action_1_'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['action_type_'].agg("mean"))
total_df['product_action_2_'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['action_type_'].agg("max"))
total_df['product_action_3_'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['action_type_'].agg("min"))
total_df['product_action_4_'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['action_type_'].agg("std"))
# total_df['product_action_5_'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['action_type_'].agg("sum"))


# 特征对
# user_seller = total_df.groupby(['user_id','seller']).size().reset_index(name='user_seller_d')
# total_df = pd.merge(total_df,user_seller,on = ['user_id','seller'], how ='left')
# user_seller = total_df.groupby(['user_id','Product_id']).size().reset_index(name='user_Product_d')
# total_df = pd.merge(total_df,user_seller,on = ['user_id','Product_id'], how ='left')
# user_seller = total_df.groupby(['Product_id','seller']).size().reset_index(name='Product_seller_d')
# total_df = pd.merge(total_df,user_seller,on = ['Product_id','seller'], how ='left')
# user_product_seller = total_df.groupby(['user_id','Product_id','seller']).size().reset_index(name='user_product_seller')
# total_df = pd.merge(total_df,user_product_seller,on = ['user_id','Product_id','seller'], how ='left')

# day为粒度
user_per_day = total_df.groupby(['user_id','day']).size().reset_index(name='user_per_day')
# user_per_day['user_per_day_min'] = \
# user_per_day['user_id'].map(user_per_day.groupby(['user_id'])['user_per_day'].agg('min'))
# user_per_day['user_per_day_max'] = \
# user_per_day['user_id'].map(user_per_day.groupby(['user_id'])['user_per_day'].agg('max'))
# user_per_day['user_per_day_std'] = \
# user_per_day['user_id'].map(user_per_day.groupby(['user_id'])['user_per_day'].agg('std'))

seller_per_day = total_df.groupby(['seller','day']).size().reset_index(name='seller_per_day')
# seller_per_day['seller_per_day_min'] = \
# seller_per_day['seller'].map(seller_per_day.groupby(['seller'])['seller_per_day'].agg('min'))
# seller_per_day['seller_per_day_max'] = \
# seller_per_day['seller'].map(seller_per_day.groupby(['seller'])['seller_per_day'].agg('max'))
# seller_per_day['seller_per_day_std'] = \
# seller_per_day['seller'].map(seller_per_day.groupby(['seller'])['seller_per_day'].agg('std'))

product_per_day = total_df.groupby(['Product_id','day']).size().reset_index(name='product_per_day')
# product_per_day['Produc_per_day_min'] = \
# product_per_day['Product_id'].map(product_per_day.groupby(['Product_id'])['product_per_day'].agg('min'))
# product_per_day['Produc_per_day_max'] = \
# product_per_day['Product_id'].map(product_per_day.groupby(['Product_id'])['product_per_day'].agg('max'))
# product_per_day['Product_per_day_std'] = \
# product_per_day['Product_id'].map(product_per_day.groupby(['Product_id'])['product_per_day'].agg('std'))
total_df = pd.merge(total_df,user_per_day,on = ['user_id','day'], how ='left')
total_df = pd.merge(total_df,seller_per_day,on = ['seller','day'], how ='left')
total_df = pd.merge(total_df,product_per_day,on = ['Product_id','day'], how ='left')


# 以week为粒度
# total_df['week'] = total_df['day'].apply(get_week)
# total_df['product_start_week'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['week'].agg(min))
# total_df['product_end_week'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['week'].agg(max))
# total_df['seller_start_week'] = total_df['seller'].map(total_df.groupby(['seller'])['week'].agg(min))
# total_df['seller_end_week'] = total_df['seller'].map(total_df.groupby(['seller'])['week'].agg(max))
# total_df['user_start_week'] = total_df['user_id'].map(total_df.groupby(['user_id'])['week'].agg(min))
# total_df['user_end_week'] = total_df['user_id'].map(total_df.groupby(['user_id'])['week'].agg(max))

# user_per_week = total_df.groupby(['user_id','week']).size().reset_index(name='user_per_week')
# user_per_week['user_per_week_min'] = \
# user_per_week['user_id'].map(user_per_week.groupby(['user_id'])['user_per_week'].agg('min'))
# user_per_week['user_per_week_max'] = \
# user_per_week['user_id'].map(user_per_week.groupby(['user_id'])['user_per_week'].agg('max'))
# user_per_week['user_per_week_std'] = \
# user_per_week['user_id'].map(user_per_week.groupby(['user_id'])['user_per_week'].agg('std'))

# seller_per_week = total_df.groupby(['seller','week']).size().reset_index(name='seller_per_week')
# seller_per_week['seller_per_week_min'] = \
# seller_per_week['seller'].map(seller_per_week.groupby(['seller'])['seller_per_week'].agg('min'))
# seller_per_week['seller_per_week_max'] = \
# seller_per_week['seller'].map(seller_per_week.groupby(['seller'])['seller_per_week'].agg('max'))
# seller_per_week['seller_per_week_std'] = \
# seller_per_week['seller'].map(seller_per_week.groupby(['seller'])['seller_per_week'].agg('std'))

# product_per_week = total_df.groupby(['Product_id','week']).size().reset_index(name='product_per_week')
# product_per_week['product_per_week_min'] = \
# product_per_week['Product_id'].map(product_per_week.groupby(['Product_id'])['product_per_week'].agg('min'))
# product_per_week['product_per_week_max'] = \
# product_per_week['Product_id'].map(product_per_week.groupby(['Product_id'])['product_per_week'].agg('max'))
# product_per_week['product_per_week_std'] = \
# product_per_week['Product_id'].map(product_per_week.groupby(['Product_id'])['product_per_week'].agg('std'))

# total_df = pd.merge(total_df,user_per_week,on = ['user_id','week'], how ='left')
# total_df = pd.merge(total_df,seller_per_week,on = ['seller','week'], how ='left')
# total_df = pd.merge(total_df,product_per_week,on = ['Product_id','week'], how ='left')

# 星期几
# total_df['workday'] = total_df['day'].apply(lambda x: x%7)
# user_per_week = total_df.groupby(['user_id','workday']).size().reset_index(name='user_per_week')
# seller_per_week = total_df.groupby(['seller','workday']).size().reset_index(name='seller_per_week')
# product_per_week = total_df.groupby(['Product_id','workday']).size().reset_index(name='product_per_week')
# total_df = pd.merge(total_df,user_per_week,on = ['user_id','workday'], how ='left')
# total_df = pd.merge(total_df,seller_per_week,on = ['seller','workday'], how ='left')
# total_df = pd.merge(total_df,product_per_week,on = ['Product_id','workday'], how ='left')

# # [UserInfo_4,UserInfo_186,UserInfo_229,UserInfo_169,UserInfo_121]
# total_df['user_4_1'] = total_df['user_id'].map(total_df.groupby(['user_id'])['UserInfo_4'].agg("mean"))
# total_df['user_4_2'] = total_df['user_id'].map(total_df.groupby(['user_id'])['UserInfo_4'].agg("max"))
# total_df['user_4_3'] = total_df['user_id'].map(total_df.groupby(['user_id'])['UserInfo_4'].agg("min"))
# total_df['user_4_4'] = total_df['user_id'].map(total_df.groupby(['user_id'])['UserInfo_4'].agg("std"))
# total_df['UserInfo_4_186'] = total_df['UserInfo_4'] + total_df['UserInfo_186']
# total_df['UserInfo_186_229'] = total_df['UserInfo_186'] + total_df['UserInfo_229']
# total_df['UserInfo_229_169'] = total_df['UserInfo_229'] + total_df['UserInfo_169']
# total_df['UserInfo_169_121'] = total_df['UserInfo_169'] + total_df['UserInfo_121']

#web
# total_df['is_web3_56'] = total_df['WebInfo_3'].apply(lambda x: 1 if x == 0  else 0  )
# total_df['user_web_11'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['WebInfo_1'].agg("mean"))
# total_df['user_web_12'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['WebInfo_1'].agg("max"))
# total_df['user_web_13'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['WebInfo_1'].agg("min"))
# total_df['user_web_14'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['WebInfo_1'].agg("std"))
# total_df['user_web_15'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['WebInfo_1'].nunique())
# total_df['user_web_21'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['WebInfo_2'].agg("mean"))
# total_df['user_web_22'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['WebInfo_2'].agg("max"))
# total_df['user_web_23'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['WebInfo_2'].agg("min"))
# total_df['user_web_24'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['WebInfo_2'].agg("std"))
# total_df['user_web_25'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['WebInfo_2'].nunique())
# total_df['user_web_31'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['WebInfo_3'].agg("mean"))
# total_df['user_web_32'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['WebInfo_3'].agg("max"))
# total_df['user_web_33'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['WebInfo_3'].agg("min"))
# total_df['user_web_34'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['WebInfo_3'].agg("std"))
# total_df['user_web_35'] = total_df['Product_id'].map(total_df.groupby(['Product_id'])['WebInfo_3'].nunique())

# 权重
# total_df['weight'] = total_df['day'].map(train_df.groupby(['day'])['favorite'].agg('mean'))

# w2w_user_product = total_df.groupby(['user_id'])['Product_id'] \
#                     .agg(lambda x: ''.join(x)).reset_index(name='w2w_user_product')
# total_df = pd.merge(total_df,w2w_user_product,on = ['user_id'], how ='left')
# from gensim.models import word2vec
# sentences = [[x] for x in total_df['w2w_user_product']]

# model = word2vec.Word2Vec(sentences,size=5,min_count=1,)
# w2w0,w2w1,w2w2,w2w3,w2w4 = [],[],[],[],[]

# for i in range(len(sentences)):
#     w2w0.append(model[sentences[i][0]][0])
#     w2w1.append(model[sentences[i][0]][1])
#     w2w2.append(model[sentences[i][0]][2])
#     w2w3.append(model[sentences[i][0]][3])
#     w2w4.append(model[sentences[i][0]][4])
# total_df['w2w0'] = pd.DataFrame(w2w0)
# total_df['w2w1'] = pd.DataFrame(w2w1)
# total_df['w2w2'] = pd.DataFrame(w2w2)
# total_df['w2w3'] = pd.DataFrame(w2w3)
# total_df['w2w4'] = pd.DataFrame(w2w4)
# del total_df['w2w_user_product']

# time
# total_df['start'] = total_df['user_start'] + total_df['seller_start'] + total_df['product_start']
# total_df['end'] = total_df['user_end'] + total_df['seller_end'] + total_df['product_end']

# total_df['test1'] = total_df['user_id'].map(total_df.groupby(['user_id'])['UserInfo_148'].agg('max'))
# total_df['test2'] = total_df['user_id'].map(total_df.groupby(['user_id'])['UserInfo_148'].agg('min'))
# total_df['test3'] = total_df['user_id'].map(total_df.groupby(['user_id'])['UserInfo_148'].agg('mean'))
# total_df['test4'] = total_df['user_id'].map(total_df.groupby(['user_id'])['UserInfo_148'].agg('std'))

# # cumcount
# # total_df = temp_df.copy()
# day_df = temp_df.copy()
# day_df = total_df[['user_id','Product_id','seller','day']]
# day_df.sort_values("day",inplace=True)
# day_df['user_cumcount'] = day_df.groupby('user_id').cumcount()
# day_df['seller_cumcount'] = day_df.groupby('seller').cumcount()
# day_df['Product_id_cumcount'] = day_df.groupby('Product_id').cumcount()
# day_df['user_cumcount_'] = day_df[::-1].groupby('user_id').cumcount()
# day_df['seller_cumcount_'] = day_df[::-1].groupby('seller').cumcount()
# day_df['Product_id_cumcount_'] = day_df[::-1].groupby('Product_id').cumcount()
# day_df.sort_index(inplace=True)
# # day_df[day_df['user_id'] == 'dgijam'][['user_cumcount','user_id','day']]
# # total_df = pd.merge(total_df,day_df, how ='left')
# total_df['user_cumcount'] = day_df['user_cumcount']
# total_df['seller_cumcount'] = day_df['seller_cumcount']
# total_df['Product_id_cumcount'] = day_df['Product_id_cumcount']
# total_df['user_cumcount_'] = day_df['user_cumcount_']
# total_df['seller_cumcount_'] = day_df['seller_cumcount_']
# total_df['Product_id_cumcount_'] = day_df['Product_id_cumcount_']
# # total_df.shape
# # day_df[day_df['user_id'] == 'dgijam'][['user_cumcount','user_id','day']]

# 和前面重复
# ten_days = total_df[(total_df.day >=20 ) & (total_df.day <= 30)]
# user_size_10 = ten_days.groupby(['user_id']).size().reset_index(name='user_size_10')
# seller_size_10 = ten_days.groupby(['seller']).size().reset_index(name='seller_size_10')
# Product_id_size_10 = ten_days.groupby(['Product_id']).size().reset_index(name='Product_id_size_10')
# total_df = pd.merge(total_df,user_size_10,on = ['user_id'], how ='left')
# total_df = pd.merge(total_df,seller_size_10,on = ['seller'], how ='left')
# total_df = pd.merge(total_df,Product_id_size_10,on = ['Product_id'], how ='left')

user_size = total_df.groupby(['user_id']).size().reset_index(name='user_size')
seller_size = total_df.groupby(['seller']).size().reset_index(name='seller_size')
Product_id_size = total_df.groupby(['Product_id']).size().reset_index(name='Product_id_size')
# seller_user_size = total_df.groupby(['seller_user']).size().reset_index(name='seller_user_size')
total_df = pd.merge(total_df,user_size,on = ['user_id'], how ='left')
total_df = pd.merge(total_df,seller_size,on = ['seller'], how ='left')
total_df = pd.merge(total_df,Product_id_size,on = ['Product_id'], how ='left')
# total_df = pd.merge(total_df,seller_user_size,on = ['seller_user'], how ='left')

train_df = total_df[total_df['mark']=='train']
test_df = total_df[total_df['mark']=='test']

train_df['mean_favorite'] = train_df['user_id'].map(train_df.groupby(['user_id'])['favorite'].agg('mean'))
train_df['mean(favorite)'] = (train_df['mean_favorite'] * train_df['user_size'] - train_df['favorite'])/(train_df['user_size']-1)
data_dict = train_df.groupby('user_id')['favorite'].agg('mean').to_dict()
test_df['mean(favorite)'] = test_df['user_id'].map(data_dict)

train_df['mean_favorite_seller'] = train_df['seller'].map(train_df.groupby(['seller'])['favorite'].agg('mean'))
train_df['mean(favorite)_seller'] = (train_df['mean_favorite_seller'] * train_df['seller_size'] - train_df['favorite'])/(train_df['seller_size']-1)
data_dict = train_df.groupby('seller')['favorite'].agg('mean').to_dict()
test_df['mean(favorite)_seller'] = test_df['seller'].map(data_dict)

# train_df['mean_favorite_seller_user'] = train_df['seller_user'].map(train_df.groupby(['seller_user'])['favorite'].agg('mean'))
# train_df['mean(favorite)_seller_user'] = (train_df['mean_favorite_seller_user'] * train_df['seller_user_size'] \
# - train_df['favorite'])/(train_df['seller_user_size']-1)
# data_dict = train_df.groupby('seller_user')['favorite'].agg('mean').to_dict()
# test_df['mean(favorite)_seller_user'] = test_df['seller_user'].map(data_dict)

# train_df['mean_favorite_product'] = train_df['Product_id'].map(train_df.groupby(['Product_id'])['favorite'].agg('mean'))
# train_df['mean(favorite)_product'] = (train_df['mean_favorite_product'] * train_df['Product_id_size'] - train_df['favorite'])/(train_df['Product_id_size']-1)
# data_dict = train_df.groupby('Product_id')['favorite'].agg('mean').to_dict()
# test_df['mean(favorite)_product'] = test_df['Product_id'].map(data_dict)

# train_df['mean_purchase'] = train_df['user_id'].map(train_df.groupby(['user_id'])['purchase'].agg('mean'))
# train_df['mean(purchase)'] = (train_df['mean_purchase'] * train_df['user_size'] - train_df['purchase'])/(train_df['user_size']-1)
# data_dict = train_df.groupby('user_id')['purchase'].agg('mean').to_dict()
# test_df['mean(purchase)'] = test_df['user_id'].map(data_dict)

train_df['mean_purchase_seller'] = train_df['seller'].map(train_df.groupby(['seller'])['purchase'].agg('mean'))
train_df['mean(purchase)_seller'] = (train_df['mean_purchase_seller'] * train_df['seller_size'] - train_df['purchase'])/(train_df['seller_size']-1)
data_dict = train_df.groupby('seller')['purchase'].agg('mean').to_dict()
test_df['mean(purchase)_seller'] = test_df['seller'].map(data_dict)

# train_df['mean_purchase_product'] = train_df['Product_id'].map(train_df.groupby(['Product_id']) \
# ['purchase'].agg('mean'))
# train_df['mean(purchase)_product'] = (train_df['mean_purchase_product'] * train_df['Product_id_size']  \
# - train_df['purchase'])/(train_df['Product_id_size']-1)
# data_dict = train_df.groupby('Product_id')['purchase'].agg('mean').to_dict()
# test_df['mean(purchase)_product'] = test_df['Product_id'].map(data_dict)

# train_df['mean(favorite)_day'] =  train_df['mean(favorite)'] * train_df['user_start']
# test_df['mean(favorite)_day'] =  test_df['mean(favorite)'] * test_df['user_start']
# train_df['mean(favorite)_seller_day'] =  train_df['mean(favorite)_seller'] * train_df['seller_start']
# test_df['mean(favorite)_seller_day'] =  test_df['mean(favorite)_seller'] * test_df['seller_start']
# train_df['mean(purchase)_seller_day'] =  train_df['mean(purchase)_seller'] * train_df['product_start']
# test_df['mean(purchase)_seller_day'] =  test_df['mean(purchase)_seller'] * test_df['product_start']

# woe
# favorite_df = train_df[train_df['favorite'] == 1]
# train_df['events'] = train_df['user_id'].map(favorite_df.groupby(['user_id']).size())
# non_favorite_df = train_df[train_df['favorite'] == 0]
# train_df['non_events'] = train_df['user_id'].map(non_favorite_df.groupby(['user_id']).size())
# train_df['non_events'] = train_df['non_events'].fillna(0)
# train_df['events'] = train_df['events'].fillna(0)
# import math
# train_df['woe'] = ((train_df['non_events'] + 0.5)/len(non_favorite_df))/ \
# ((train_df['events'] + 0.5)/len(favorite_df))
# train_df['woe'] = train_df['woe'].apply(lambda x: math.log(x))
# data_dict = train_df.groupby('user_id')['woe'].agg('mean').to_dict()
# test_df['woe'] = test_df['user_id'].map(data_dict)

# train_df['events_rate'] = train_df['events']/len(favorite_df)
# train_df['non_events_rate'] = train_df['non_events']/len(non_favorite_df)
# train_df['IV'] = (train_df['events_rate'] - train_df['non_events_rate'])*train_df['woe']

# del train_df['events']
# del train_df['non_events']



del train_df['mark']
del test_df['mark']
del train_df['mean_favorite']
del train_df['mean_favorite_seller']
# del train_df['mean_favorite_seller_user']

# del train_df['mean_purchase']
del train_df['mean_purchase_seller']
# del train_df['mean_purchase_product']
train_df.shape
# 394 406 412 418 419 422 423 424 425 428 

(33000, 428)

In [62]:
test_df.shape

(10087, 430)

In [26]:
# def count_batplot(feature):
#     f, ax =plt.subplots(4,1, figsize = (20,12))
#     sns.countplot(feature, hue = 'favorite', data = train_df, ax = ax[0])
#     sns.countplot(feature, hue = 'purchase', data = train_df, ax = ax[2])
#     sns.barplot(x = feature, y = 'favorite', data = train_df, palette="Blues_d", ax = ax[1])
#     sns.barplot(x = feature, y = 'purchase', data = train_df, palette="Blues_d", ax = ax[3])
#     plt.show()
# count_batplot('start')

In [109]:
from sklearn.feature_selection import SelectKBest,f_regression
import xgboost as xgb
def xgb_fea_select(train_x, train_y, f_name_list):
    rate_fea = 0.5
    dtrain=xgb.DMatrix(train_x, label=train_y)
    params = {
        'booster':'gbtree',
              'max_depth': 3,
              'colsample_bytree': 0.7,
              'subsample': 0.7, 
              'eta': 0.03,
              'silent': 1,
              # 'objective': 'binary:logistic',
             'objective': 'rank:pairwise',
              'min_child_weight': 3,
              'seed': 10,
              'eval_metric':'auc',
            #   'scale_pos_weight': 17410 / 15590,
            #   'scale_pos_weight': 3176 / 76824,
              'scale_pos_weight': 4062/28938,
              'verbose': 100,
        
    }
    watchlist = [(dtrain,'train')]
    bst=xgb.train(params,dtrain,num_boost_round=500,evals=watchlist, 
                  early_stopping_rounds=100)
    fscore_dict = bst.get_fscore()
    # print(fscore_dict)
    sorted_fs_dict = sorted(fscore_dict.items(),key = lambda x:x[1],reverse = True)
    # print([sorted_fs_dict[0] for x in sorted_fs_dict])
    fea_id_set = ([(item[0][:]) for item in sorted_fs_dict[:int(len(sorted_fs_dict)*rate_fea)]])
    # print(fea_id_set)
    # f_name_list = [item for i, item in enumerate(f_name_list) if i in fea_id_set]
    return fea_id_set

# 筛选

In [115]:
df_train = train_df
df_test = test_df
target1 = train_df['favorite']
target2 = train_df['purchase']
categorical_features = ['seller','user_id','Product_id']
for col in categorical_features:
        train_df[col] = train_df[col].astype('category')
        test_df[col] = test_df[col].astype('category')
df_train_columns = df_train.columns.tolist()
# 3个object特征
df_train_columns.remove('seller')
df_train_columns.remove('user_id')
df_train_columns.remove('Product_id')
# 2个target
df_train_columns.remove('favorite')
df_train_columns.remove('purchase')

df_train_columns.remove('seller_procduction')
df_train_columns.remove('seller_user')
df_train_columns.remove('user_procduction')
df_train_columns.remove('seller_user_procduction')
df_train_columns.remove('production_action')
# df_train_columns.remove('seller_user_count')
# df_train_columns.remove("mean_act_count2")

# df_train_columns.remove("con_act_max2")
df_train_columns.remove('action_type')
# df_train_columns.remove("seller_count")
df_train_columns1 = df_train_columns.copy()
# df_train_columns1.append('favorite')

# df_train_columns.remove('mean(purchase)')
df_train_columns.remove('mean(purchase)_seller')
# df_train_columns.remove('mean(purchase)_product')

df_train_columns1.remove('seller_user_')
df_train_columns1.remove('user_seller')
df_train_columns1.remove('seller_product')
df_train_columns1.remove('product_seller')
df_train_columns1.remove('product_user')
df_train_columns1.remove('user_product')

df_train_columns1.remove('mean(favorite)')
df_train_columns1.remove('mean(favorite)_seller')
# df_train_columns1.remove('mean_favorite_day')
# df_train_columns1.remove('mean(purchase)_seller')
# df_train_columns1.remove('mean(purchase)_product')

# df_train_columns1 = ['UserInfo_40','UserInfo_197','UserInfo_160','UserInfo_261','UserInfo_8','UserInfo_3','UserInfo_12','UserInfo_127','UserInfo_227']
target_favorite = train_df['favorite']
target_purchase = train_df['purchase']

#df_train_columns = xgb_fea_select(df_train.iloc[:][df_train_columns], df_train['favorite'], df_train_columns)

df_train_columns1 = xgb_fea_select(df_train.iloc[:][df_train_columns1], df_train['purchase'], df_train_columns1)

[0]	train-auc:0.69749
Will train until train-auc hasn't improved in 100 rounds.
[1]	train-auc:0.717612
[2]	train-auc:0.734915
[3]	train-auc:0.739087
[4]	train-auc:0.745396
[5]	train-auc:0.746121
[6]	train-auc:0.750536
[7]	train-auc:0.747089
[8]	train-auc:0.750077
[9]	train-auc:0.751562
[10]	train-auc:0.75096
[11]	train-auc:0.753935
[12]	train-auc:0.755902
[13]	train-auc:0.756468
[14]	train-auc:0.756725
[15]	train-auc:0.758002
[16]	train-auc:0.758003
[17]	train-auc:0.757325
[18]	train-auc:0.756567
[19]	train-auc:0.758243
[20]	train-auc:0.759094
[21]	train-auc:0.759327
[22]	train-auc:0.758621
[23]	train-auc:0.757921
[24]	train-auc:0.758209
[25]	train-auc:0.758273
[26]	train-auc:0.757704
[27]	train-auc:0.757346
[28]	train-auc:0.757601
[29]	train-auc:0.757463
[30]	train-auc:0.757069
[31]	train-auc:0.756411
[32]	train-auc:0.757184
[33]	train-auc:0.757082
[34]	train-auc:0.758947
[35]	train-auc:0.759287
[36]	train-auc:0.758656
[37]	train-auc:0.758354
[38]	train-auc:0.758098
[39]	train-auc:0.7

In [99]:
# from sklearn import feature_selection
# from sklearn.linear_model import LogisticRegression

# df_train = df_train.fillna(df_train.mean())
# lr_selector = feature_selection.SelectFromModel(LogisticRegression(penalty="l1", C=0.1)).fit(df_train[df_train_columns], df_train['favorite'])
# indx = temp._get_support_mask().tolist()
# scores = get_importance(temp.estimator_).tolist()
# result = temp.transform(matrix).tolist()


In [96]:
# temp_columns = df_train_columns.copy()
# df_train_columns = temp_columns.copy()
# # df_train_columns += ['user_action']
print(len(df_train_columns)) #137
print(len(df_train_columns1)) #144
print(df_train.shape)

416
133
(33000, 430)


# lgb_favorite

In [111]:
# favorite
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'auc',
         "lambda_l1": 0.1,
         "verbose": -1,
         "nthread": 4,
         "random_state": 666}
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=666)
oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))
# feature_importance_df = pd.DataFrame()
feature_importance_favorite = pd.DataFrame(df_train_columns, columns = ['feature'])

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train,df_train['favorite'].values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][df_train_columns], label=target1.iloc[trn_idx])#, categorical_feature=categorical_features)
    val_data = lgb.Dataset(df_train.iloc[val_idx][df_train_columns], label=target1.iloc[val_idx])#, categorical_feature=categorical_features)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=200, early_stopping_rounds = 100)
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][df_train_columns], num_iteration=clf.best_iteration)
    feature_importance_favorite[f"importance_fold{fold_}"] = clf.feature_importance()
    # fold_importance_df = pd.DataFrame()
    # fold_importance_df["Feature"] = df_train_columns
    # fold_importance_df["importance"] = clf.feature_importance()
    # fold_importance_df["fold"] = fold_ + 1
    # feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    predictions += clf.predict(df_test[df_train_columns], num_iteration=clf.best_iteration) / folds.n_splits

print(roc_auc_score(target1.values, oof))

fold 0
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.880002	valid_1's auc: 0.876738
[400]	training's auc: 0.904917	valid_1's auc: 0.895944
[600]	training's auc: 0.92116	valid_1's auc: 0.901151
Early stopping, best iteration is:
[678]	training's auc: 0.926169	valid_1's auc: 0.90134
fold 1
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.880626	valid_1's auc: 0.873603
[400]	training's auc: 0.904219	valid_1's auc: 0.890564
[600]	training's auc: 0.921397	valid_1's auc: 0.897312
[800]	training's auc: 0.93437	valid_1's auc: 0.899149
[1000]	training's auc: 0.945192	valid_1's auc: 0.900034
[1200]	training's auc: 0.954434	valid_1's auc: 0.900435
[1400]	training's auc: 0.962192	valid_1's auc: 0.900553
Early stopping, best iteration is:
[1384]	training's auc: 0.961599	valid_1's auc: 0.900628
fold 2
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.879613	valid_1's auc: 0.875999
[400

In [228]:
#0.9035534899726219 线上638
#0.8999157474028441 线上640
#0.8750573848315113 线上637
#0.7044681140320659 线上620
#0.700491297865058 线上619
#0.6993089706468049?
#0.7005724298591971 线上618 6+12+6+time_from_start*6
#0.6949741785758629 线上617 6+12+6
#0.6891717304314795 线上616 6+12 action_type
#0.6783940168424141 线上614 6个特征

In [76]:
# feature_importance_favorite[feature_importance_favorite['feature'] == 'UserInfo_4']

In [75]:
# feature_importance_favorite.head()

In [74]:
# nums = ['UserInfo_4','UserInfo_186','UserInfo_229','UserInfo_169','UserInfo_121']
# # nums = ['ProductInfo_87','ProductInfo_37','ProductInfo_133','ProductInfo_2','ProductInfo_96']
# for i in range(len(nums)):
#     if nums[i] in df_train_columns:
#         print(i)
# # # [ProductInfo_87,ProductInfo_37,ProductInfo_133,ProductInfo_2,ProductInfo_96]

In [25]:
# favorite特征重要度
feature_importance_favorite['importance'] = feature_importance_favorite.iloc[:,1:].mean(axis = 1)
plt.figure(figsize=(14,40))
sns.barplot(x="importance", y="feature", data=feature_importance_favorite.sort_values(by="importance", ascending=False))
plt.title('LightGBM  Features (avg over folds)')
plt.tight_layout()

# lgb_purchase

In [19]:
# df_test['favorite'] = predictions
# df_test['favorite'] = df_test['favorite'].apply(lambda x : 1 if x>0.5 else 0 )

In [116]:
# purchase
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'auc',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4,
         "random_state": 666}
oof = np.zeros(len(df_train))
predictions1 = np.zeros(len(df_test))
# feature_importance_df1 = pd.DataFrame()
df_train_columns1 = df_train_columns1
feature_importance_purchase = pd.DataFrame(df_train_columns1, columns = ['feature'])
for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train,df_train['purchase'].values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][df_train_columns1], label=target2.iloc[trn_idx])#, categorical_feature=categorical_feature)
    val_data = lgb.Dataset(df_train.iloc[val_idx][df_train_columns1], label=target2.iloc[val_idx])#, categorical_feature=categorical_feature)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 100)
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][df_train_columns1], num_iteration=clf.best_iteration)
    feature_importance_purchase[f"importance_fold{fold_}"] = clf.feature_importance()
    # fold_importance_df1 = pd.DataFrame()
    # fold_importance_df1["Feature"] = df_train_columns1
    # fold_importance_df1["importance"] = clf.feature_importance()
    # fold_importance_df1["fold"] = fold_ + 1
    # feature_importance_df1 = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    predictions1 += clf.predict(df_test[df_train_columns1], num_iteration=clf.best_iteration) / folds.n_splits
print('cv score for valid is: ', roc_auc_score(target2,oof))

fold 0
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.806046	valid_1's auc: 0.791778
[200]	training's auc: 0.825885	valid_1's auc: 0.797714
[300]	training's auc: 0.856198	valid_1's auc: 0.808585
[400]	training's auc: 0.883621	valid_1's auc: 0.815789
[500]	training's auc: 0.90853	valid_1's auc: 0.819658
[600]	training's auc: 0.928796	valid_1's auc: 0.82237
[700]	training's auc: 0.944994	valid_1's auc: 0.824262
[800]	training's auc: 0.957138	valid_1's auc: 0.825248
[900]	training's auc: 0.967532	valid_1's auc: 0.826007
[1000]	training's auc: 0.975016	valid_1's auc: 0.826685
[1100]	training's auc: 0.981088	valid_1's auc: 0.827496
[1200]	training's auc: 0.985754	valid_1's auc: 0.828028
[1300]	training's auc: 0.989453	valid_1's auc: 0.827908
[1400]	training's auc: 0.992201	valid_1's auc: 0.827799
Early stopping, best iteration is:
[1355]	training's auc: 0.99106	valid_1's auc: 0.828148
fold 1
Training until validation scores don't improve for 100 round

In [91]:
#0.8150047543877149 640
#0.8528995707864748 625
#0.7253596621228516 620 product_per_day
#0.7245031815417257 618 调整后action_type
#0.7264163278976133 619

#0.7241813760375115 6+12
#0.7249323746494951 线上614 6个特征

In [100]:
# favorite特征重要度
feature_importance_purchase['importance'] = feature_importance_purchase.iloc[:,1:].mean(axis = 1)
plt.figure(figsize=(14,40))
sns.barplot(x="importance", y="feature", data=feature_importance_purchase.sort_values(by="importance", ascending=False))
plt.title('LightGBM  Features (avg over folds)')
plt.tight_layout()

# xgb_favorite

In [43]:
xgb_params = {'eta': 0.01, 
              'max_depth': 10, 
              'subsample': 0.8, 
              'colsample_bytree': 0.8, 
              'objective': 'binary:logistic', 
              'eval_metric': 'auc', 
              'silent': True, 
              'nthread': 4}#xgb的参数，可以自己改

In [44]:
oof_xgb = np.zeros(len(df_train))#用于存放训练集的预测
predictions_xgb = np.zeros(len(df_test))#用于存放测试集的预测

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train,df_train['favorite'].values)):
    print("fold n{}".format(fold_))
    trn_data = xgb.DMatrix(df_train.iloc[trn_idx][df_train_columns], target1.iloc[trn_idx])#训练集的80%
    val_data = xgb.DMatrix(df_train.iloc[val_idx][df_train_columns], target1.iloc[val_idx])#训练集的20%，验证集
 
    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
    clf = xgb.train(dtrain=trn_data, num_boost_round=10000, evals=watchlist, early_stopping_rounds=200, verbose_eval=100, params=xgb_params)
    oof_xgb[val_idx] = clf.predict(xgb.DMatrix(df_train.iloc[val_idx][df_train_columns]), ntree_limit=clf.best_ntree_limit)#预测20%的验证集
    predictions_xgb += clf.predict(xgb.DMatrix(df_test[df_train_columns]), ntree_limit=clf.best_ntree_limit) / folds.n_splits#预测测试集，并且取平均
    

fold n0
[0]	train-auc:0.763047	valid_data-auc:0.727522
Multiple eval metrics have been passed: 'valid_data-auc' will be used for early stopping.

Will train until valid_data-auc hasn't improved in 200 rounds.
[100]	train-auc:0.926909	valid_data-auc:0.882652
[200]	train-auc:0.942527	valid_data-auc:0.889866
[300]	train-auc:0.95879	valid_data-auc:0.90106
[400]	train-auc:0.969719	valid_data-auc:0.906394
[500]	train-auc:0.979397	valid_data-auc:0.909429
[600]	train-auc:0.987587	valid_data-auc:0.911055
[700]	train-auc:0.992935	valid_data-auc:0.911899
[800]	train-auc:0.996302	valid_data-auc:0.912307
[900]	train-auc:0.998152	valid_data-auc:0.913435
[1000]	train-auc:0.999092	valid_data-auc:0.913846
[1100]	train-auc:0.999616	valid_data-auc:0.91399
[1200]	train-auc:0.999811	valid_data-auc:0.914994
[1300]	train-auc:0.999919	valid_data-auc:0.915215
[1400]	train-auc:0.999968	valid_data-auc:0.915388
[1500]	train-auc:0.999986	valid_data-auc:0.915958
[1600]	train-auc:0.999994	valid_data-auc:0.916445
[17

In [45]:
print("CV score: {:<8.8f}".format(roc_auc_score(target1, oof_xgb)))

CV score: 0.91805575


# xgb_purchase

In [46]:
oof_xgb = np.zeros(len(df_train))#用于存放训练集的预测
predictions_xgb1 = np.zeros(len(df_test))#用于存放测试集的预测

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train,df_train['purchase'].values)):
    print("fold n{}".format(fold_))
    trn_data = xgb.DMatrix(df_train.iloc[trn_idx][df_train_columns1], target1.iloc[trn_idx])#训练集的80%
    val_data = xgb.DMatrix(df_train.iloc[val_idx][df_train_columns1], target1.iloc[val_idx])#训练集的20%，验证集
 
    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
    clf = xgb.train(dtrain=trn_data, num_boost_round=10000, evals=watchlist, early_stopping_rounds=200, verbose_eval=100, params=xgb_params)
    oof_xgb[val_idx] = clf.predict(xgb.DMatrix(df_train.iloc[val_idx][df_train_columns1]), ntree_limit=clf.best_ntree_limit)#预测20%的验证集
    predictions_xgb1 += clf.predict(xgb.DMatrix(df_test[df_train_columns1]), ntree_limit=clf.best_ntree_limit) / folds.n_splits#预测测试集，并且取平均
    
# print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb, target)))


fold n0
[0]	train-auc:0.664024	valid_data-auc:0.547299
Multiple eval metrics have been passed: 'valid_data-auc' will be used for early stopping.

Will train until valid_data-auc hasn't improved in 200 rounds.
[100]	train-auc:0.968895	valid_data-auc:0.573016
[200]	train-auc:0.991192	valid_data-auc:0.571091
[300]	train-auc:0.997447	valid_data-auc:0.570898
Stopping. Best iteration:
[102]	train-auc:0.96996	valid_data-auc:0.573988

fold n1
[0]	train-auc:0.666057	valid_data-auc:0.540996
Multiple eval metrics have been passed: 'valid_data-auc' will be used for early stopping.

Will train until valid_data-auc hasn't improved in 200 rounds.
[100]	train-auc:0.976525	valid_data-auc:0.561611
[200]	train-auc:0.994153	valid_data-auc:0.559108
Stopping. Best iteration:
[31]	train-auc:0.924758	valid_data-auc:0.56571

fold n2
[0]	train-auc:0.67675	valid_data-auc:0.522764
Multiple eval metrics have been passed: 'valid_data-auc' will be used for early stopping.

Will train until valid_data-auc hasn't impr

In [47]:
print("CV score: {:<8.8f}".format(roc_auc_score(target2,oof_xgb)))

CV score: 0.51324584


# 提交

In [101]:
submition = pd.DataFrame(test_df, columns= ['user_id','Product_id'])
submition = submition.rename(index=str, columns={"Product_id": "product_id"})
submition['pred_favorite'] = predictions # 
submition['pred_purchase'] = predictions1 #

# submition['pred_favorite'] = submition['pred_favorite'].apply(lambda x : 0.95 if x > 0.95 else x)
# submition['pred_favorite'] = submition['pred_favorite'].apply(lambda x : 0.05 if x < 0.05 else x)
# submition['pred_purchase'] = submition['pred_purchase'].apply(lambda x : 0.95 if x > 0.95 else x)
# submition['pred_purchase'] = submition['pred_purchase'].apply(lambda x : 0.05 if x < 0.05 else x)

# submition['pred_favorite'] = predictions_xgb 
# submition['pred_purchase'] = predictions1

# submition['pred_favorite'] = (predictions*0.64+predictions_xgb*0.64)/(0.64+0.64) # 
# submition['pred_purchase'] = (predictions1*0.64+predictions_xgb1*0.64) /(0.64+0.64)#

# submition['pred_favorite'] = predictions_xgb*0.45+ predictions*0.55 # 0.0.0.5503838727351065
# submition['pred_purchase'] = predictions_xgb1*0.3+ predictions1*0.7 #.0.0.5062587993915597
# submition['pred_favorite'] = submition['pred_favorite'].apply(lambda x:round(x,1))
# submition['pred_purchase'] = submition['pred_purchase'].apply(lambda x:round(x,1))
submition.head()
submition.to_csv('/home/kesci/work/sub.csv',index=False)

In [39]:
# submition[submition['pred_favorite'] > 0.95]

In [69]:
submition.shape

(10087, 4)

In [102]:
!./upgeek_submit_tool -file sub.csv -token eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0YXNrIjoidGFzazIiLCJhdXRoVHlwZSI6InN1Ym1pdCIsInRlYW0iOiI4NjUwMDgiLCJpYXQiOjE1NjQzODQ3NjJ9.gatKmml30p6gkSILk0UkTFQcqOf1XsWcyLgpauo8aAo

Upgeek Submit Tool
Result File: sub.csv (0.529 MiB)
Uploaded.       
Submit Success.
{"message":"提交成功，请等待评审完成"}

