In [1]:
import pandas as pd
from collections import Counter
import datetime
import numpy as np
import warnings
import time
%matplotlib inline

warnings.filterwarnings('ignore')

In [2]:
# Load the four raw tab-separated logs. The files carry no header row, so column names are supplied here.
users = pd.read_csv('../Data/raw_data/user_register_log.txt', header=None,names=['user_id','register_day','register_type','device_type'],sep='\t') # user registration log
user_login = pd.read_csv('../Data/raw_data/app_launch_log.txt', header=None,names=['user_id','day'],sep='\t') # app launch (login) log
user_act = pd.read_csv('../Data/raw_data/user_activity_log.txt', header=None,names=['user_id','day','page','video_id','author_id','action_type'],sep='\t') # user activity log
user_video = pd.read_csv('../Data/raw_data/video_create_log.txt', header=None,names=['user_id','day'],sep='\t') # video creation log

所有数据都是1-30天内的数据

且无缺失值

# 对数据进行滑窗法划分：

data: 

    +-data1 1-15
        --feature:1-15;label:16-22
    +-data2 1-23
        --feature:9-23;label:24-30
    +-data3 1-30
        --feature:16-30;label: 31-37

In [3]:
def cutDataFunc(data, cut_col, start_day, end_day):
    """Return the rows of ``data`` whose ``cut_col`` value lies in ``[start_day, end_day]``.

    Both bounds are inclusive. The original row index is preserved.
    """
    in_window = data[cut_col].between(start_day, end_day)
    return data[in_window]

def cutDataByTime(data_url, start_day, end_day):
    """Slice the module-level raw logs to one time window and save them as CSV files.

    Registration records are kept from day 1 up to ``end_day`` (not from
    ``start_day``); the login, activity and video logs are restricted to
    ``[start_day, end_day]``. Each slice is written to ``data_url + '<name>.csv'``
    without the index column.
    """
    # (file suffix, source frame, day column, first day kept)
    sources = [
        ('users.csv', users, 'register_day', 1),
        ('login.csv', user_login, 'day', start_day),
        ('act.csv', user_act, 'day', start_day),
        ('video.csv', user_video, 'day', start_day),
    ]
    # Cut every log first, then write, so nothing is written if a cut fails.
    windows = [
        (suffix, cutDataFunc(frame, day_col, first_day, end_day))
        for suffix, frame, day_col, first_day in sources
    ]
    for suffix, window in windows:
        window.to_csv(data_url + suffix, index=False)
 

In [6]:
def cutDataProgram():
    """Write the sliding-window train/test CSV files for the three datasets.

    data1 and data2 get a feature window (``train_``) and a label window
    (``test_``); data3 only gets a feature window. Progress messages and the
    total elapsed seconds are printed.
    """
    started = time.time()
    print ("---------START-----------")

    # (message printed when the group is finished, [(path prefix, first day, last day), ...])
    groups = [
        ("-----第1数据集完成-------", [('../Data/data1/train_', 1, 15),
                                 ('../Data/data1/test_', 16, 22)]),
        ("-----第2数据集完成-------", [('../Data/data2/train_', 9, 23),
                                 ('../Data/data2/test_', 24, 30)]),
        ("-----第3数据集完成-------", [('../Data/data3/train_', 16, 30)]),
    ]
    for done_message, windows in groups:
        for prefix, first_day, last_day in windows:
            cutDataByTime(prefix, first_day, last_day)
        print (done_message)

    print ("----------END------------")
    print (time.time() - started)

In [7]:
# Write the windowed train/test CSV files for all three datasets; the function prints the elapsed seconds.
cutDataProgram()

---------START-----------
-----第1数据集完成-------
-----第2数据集完成-------
-----第3数据集完成-------
----------END------------
199.9984393119812


# train集构造和标签

In [8]:
# Collect the users that show any activity in the label window (read from the test_* files)
def getActivityUsers(data_url):
    """Return the sorted, de-duplicated user ids found in the test-window logs.

    Reads ``test_login.csv``, ``test_act.csv`` and ``test_video.csv`` under the
    ``data_url`` prefix and pools their ``user_id`` columns.

    Returns:
        numpy array of unique user ids in ascending order.
    """
    id_columns = [
        pd.read_csv(data_url + file_name)['user_id']
        for file_name in ('test_login.csv', 'test_act.csv', 'test_video.csv')
    ]
    return np.unique(pd.concat(id_columns))

def get_diff_from_ls(x):
    """Return the gaps between consecutive values of ``x`` after sorting ascending.

    The input is not modified: a sorted copy is used, so the caller's list keeps
    its order and any iterable (list, set, tuple, ...) is accepted.

    Args:
        x: iterable of numbers.

    Returns:
        list with ``len(x) - 1`` differences (empty for zero or one value).
    """
    return list(np.diff(sorted(x)))

def _dailyCountFeatures(events, name):
    """Per-user day/count statistics shared by every log type.

    Args:
        events: frame with a ``user_id`` column and a ``day`` column that already
            holds the distance (in days) to the last day of the window.
        name: prefix used for every generated column.

    Returns:
        One row per user (ascending ``user_id``, fresh 0..n-1 index) with
        ``user_id``, the day max/min/std, the per-day record count statistics
        and the record totals over the most recent 1, 3 and 7 days. NaN values
        (e.g. the std of a single active day) are left for the caller to fill.
    """
    # One row per (user, day) holding the number of log records on that day.
    daily = events.groupby(['user_id', 'day']).size().reset_index(name='cnt')

    res_df = daily.groupby('user_id').agg({'day': ['max', 'min', 'std'],
                                           'cnt': ['count', 'sum', 'max', 'var', 'mean']})
    res_df.columns = [name+'_day_max', name+'_day_min', name+'_day_std',
                      name+'_cnt', name+'_sum', name+'_max', name+'_var', name+'_mean']
    res_df = res_df.reset_index()

    for day_len in [1, 3, 7]:
        cnt_col = name + '_' + str(day_len) + '_cnt'
        # Distances are >= 0 by construction, so only the upper bound is needed.
        window_cnt = daily[daily.day < day_len].groupby('user_id')['cnt'].sum()
        # Users without records inside the window get 0.
        res_df[cnt_col] = res_df['user_id'].map(window_cnt).fillna(0)
        if day_len != 1:
            res_df[cnt_col + '_arg'] = res_df[cnt_col] / day_len
    return res_df


def _categoryCountFeatures(events, user_ids, col, cats):
    """Count features for one categorical column of the activity log.

    Args:
        events: frame with ``user_id``, ``day`` (distance to the window end) and ``col``.
        user_ids: Series of user ids; the result is aligned row by row with it.
        col: name of the categorical column (also used as the column prefix).
        cats: category codes to report. Codes absent from ``events`` yield
            all-zero columns; codes outside this list are ignored.

    Returns:
        Frame indexed like ``user_ids`` with, in order: the per-code totals over
        the last 1/3/7 days (plus the per-day average for 3 and 7), each code's
        share of the user's records (``*_sigle``), and the raw per-code totals.
    """
    per_day = (events.groupby(['user_id', 'day', col]).size()
               .unstack(fill_value=0)
               .reindex(columns=cats, fill_value=0)
               .astype(float)
               .reset_index())
    feats = {}
    for day_len in [1, 3, 7]:
        window = per_day[per_day.day < day_len].groupby('user_id')[cats].sum()
        for cat in cats:
            cnt_col = col + '_' + str(cat) + '_' + str(day_len) + '_cnt'
            feats[cnt_col] = user_ids.map(window[cat]).fillna(0)
            if day_len != 1:
                feats[cnt_col + '_arg'] = feats[cnt_col] / day_len

    totals = (events.groupby(['user_id', col]).size()
              .unstack(fill_value=0)
              .reindex(columns=cats, fill_value=0)
              .astype(float))
    total_sum = totals.sum(axis=1)
    # The 'sigle' spelling is kept because downstream files and models use these column names.
    for cat in cats:
        feats[col + '_' + str(cat) + '_sigle'] = user_ids.map(totals[cat] / total_sum)
    for cat in cats:
        feats[col + '_' + str(cat)] = user_ids.map(totals[cat])
    return pd.DataFrame(feats, columns=list(feats))


# Feature builder for the video and launch logs:
# 1. convert `day` into the distance to the last day of the window
# 2. descriptive statistics of the active days and of the per-day record counts
# 3. record totals over the most recent 1, 3 and 7 days
def getCountFeature(data, name='login'):
    """Build per-user count features from a log with ``user_id`` and ``day`` columns.

    The input frame is not modified.

    Args:
        data: log frame; ``day`` is the absolute day number inside the window.
        name: prefix for the generated columns.

    Returns:
        Frame with one row per user found in ``data`` and the columns
        ``user_id``, ``<name>_day_max/_day_min/_day_std``,
        ``<name>_cnt/_sum/_max/_var/_mean``, ``<name>_1_cnt``,
        ``<name>_3_cnt``, ``<name>_3_cnt_arg``, ``<name>_7_cnt``,
        ``<name>_7_cnt_arg``; missing values are replaced by 0.
    """
    events = data[['user_id']].assign(day=data['day'].max() - data['day'])
    return _dailyCountFeatures(events, name).fillna(0)


# Feature builder for the activity log:
# 1. convert `day` into the distance to the last day of the window
# 2. descriptive statistics of the active days and of the per-day record counts
# 3. record totals over the most recent 1, 3 and 7 days
# 4. totals and shares for every page / action_type code
# 5. per-code totals over the most recent 1, 3 and 7 days
# 6. whether the user also appears as an author_id
def getCountFeatureAboutAct(data, name='act'):
    """Build per-user features from the activity log.

    The input frame is not modified. ``page`` codes 0-4 and ``action_type``
    codes 0-5 are always reported, even when a code does not occur in ``data``.

    Args:
        data: activity log with ``user_id``, ``day``, ``page``, ``author_id``
            and ``action_type`` columns.
        name: prefix for the day/count statistics columns.

    Returns:
        Frame with one row per user found in ``data``: the same columns as
        ``getCountFeature``, then the ``page_*`` and ``action_type_*`` count
        and share columns, then ``is_author`` (1 when the user id occurs in
        ``author_id``); missing values are replaced by 0.
    """
    events = data[['user_id', 'page', 'action_type']].assign(day=data['day'].max() - data['day'])
    authors = set(data['author_id'])

    res_df = _dailyCountFeatures(events, name)
    blocks = [res_df]
    for col, cats in (('page', [0, 1, 2, 3, 4]), ('action_type', [0, 1, 2, 3, 4, 5])):
        blocks.append(_categoryCountFeatures(events, res_df['user_id'], col, cats))
    res_df = pd.concat(blocks, axis=1)

    res_df['is_author'] = res_df['user_id'].isin(authors).astype('int64')

    return res_df.fillna(0)

In [9]:
# Number of clicks on the most recent day (day == 0)
def getCntOfOneDay(data):
    """Sum the ``cnt`` column over the rows whose ``day`` equals 0."""
    latest_counts = data.loc[data.day == 0, 'cnt']
    return sum(latest_counts.values)

# Number of clicks within the most recent `day_len` days
def getCntOfSomeDay(data, day_len=7):
    """Sum the ``cnt`` column over the rows with ``0 <= day < day_len``."""
    in_window = (data.day >= 0) & (data.day < day_len)
    return sum(data.loc[in_window, 'cnt'].values)

# Number of clicks within the most recent `day_len` days (variant with a selectable column)
def getCntOfSomeDayWithCol(data, day_len=1, col='page'):
    """Sum column ``col`` over the rows with ``0 <= day < day_len``."""
    in_window = (data.day >= 0) & (data.day < day_len)
    return sum(data.loc[in_window, col].values)

In [10]:
# Build the feature table from the video, activity, registration and launch logs
def constructDataFeature(data_url):
    """Assemble one feature row per registered user of a dataset.

    Reads the ``train_*.csv`` files under the ``data_url`` prefix, replaces
    ``register_day`` by its distance to the latest registration day, and
    left-joins the login, video and activity features on ``user_id``.
    Users without records in a log keep NaN in that log's columns.
    """
    login_log = pd.read_csv(data_url+'train_login.csv')
    act_log = pd.read_csv(data_url+'train_act.csv')
    video_log = pd.read_csv(data_url+'train_video.csv')

    # Registration data: express register_day as the distance to the window end.
    feature = pd.read_csv(data_url+'train_users.csv')
    feature['register_day'] = max(feature['register_day']) - feature.register_day

    # Join order fixes the output column order: login, then video, then activity.
    builders = (
        (getCountFeature, login_log, 'login'),
        (getCountFeature, video_log, 'video'),
        (getCountFeatureAboutAct, act_log, 'act'),
    )
    for build, log, prefix in builders:
        feature = pd.merge(feature, build(log, prefix), on='user_id', how='left')

    return feature

# Label the training rows with the users that are active in the following (test) window
def getTrainLabel(data_url, data):
    """Add a binary ``label`` column to ``data`` and return it.

    A user is labelled 1 when its id appears in the test-window logs found
    under ``data_url`` (see ``getActivityUsers``), otherwise 0. The column is
    added to ``data`` in place, and the same frame is returned.

    The membership test is vectorised with ``isin`` instead of a Python loop
    that scanned the whole id array once per row.
    """
    activity_users = getActivityUsers(data_url)
    data['label'] = data['user_id'].isin(activity_users).astype('int64')
    return data
    

In [11]:
def getDataProgram():
    """Build and save the feature tables of the three datasets.

    data1 and data2 are labelled from their test windows and the elapsed
    seconds are printed after each of them; data3 has no label window and is
    saved unlabelled. The total elapsed seconds are printed at the end.
    """
    started = time.time()
    print("----------------构造训练集-------------------")

    # (dataset directory, output file name, has a label window, message once built)
    jobs = [
        ('../Data/data1/', 'data1.csv', True, "---------第一组数据集处理完成----------------"),
        ('../Data/data2/', 'data2.csv', True, "---------第二组数据集处理完成----------------"),
        ('../Data/data3/', 'data3.csv', False, "---------第三组数据集处理完成----------------"),
    ]
    for data_url, out_name, has_label, done_message in jobs:
        dataset = constructDataFeature(data_url)
        if has_label:
            dataset = getTrainLabel(data_url, dataset)
        print(done_message)
        dataset.to_csv(data_url+out_name, index=False)
        if has_label:
            print (time.time() - started)

    print("--------------------END----------------------")
    print (time.time() - started)

In [12]:
# Build the feature (and label) tables and save them as data1.csv / data2.csv / data3.csv.
getDataProgram()

----------------构造训练集-------------------
---------第一组数据集处理完成----------------
39.82827806472778
---------第二组数据集处理完成----------------
110.1863021850586
---------第三组数据集处理完成----------------
--------------------END----------------------
197.82431507110596


# 训练模型并进行预测

In [14]:
import lightgbm as lgb
from sklearn import model_selection
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

In [15]:
# Reload the feature tables that getDataProgram() saved to disk.
data1_in = pd.read_csv('../Data/data1/data1.csv')
data2_in = pd.read_csv('../Data/data2/data2.csv')
data3_in = pd.read_csv('../Data/data3/data3.csv')

In [18]:
# Quick check of the (rows, columns) of the second dataset.
data2_in.shape

(37446, 122)

# merge特征

In [4]:
# Load additional per-user features stored under ../Data/cnt_diff_data/.
# NOTE(review): the code that produces these files is not part of this notebook section -- confirm their origin.
path = '../Data/cnt_diff_data/'
feature_data1 = pd.read_csv(path + 'data1.csv')
feature_data2 = pd.read_csv(path + 'data2.csv')
feature_data3 = pd.read_csv(path + 'data3.csv')

In [5]:
# Left-join the extra features on user_id so every row of the base tables is kept.
# This cell overwrites data*_in, so running it twice merges the extra columns a second time.
data1_in = pd.merge(data1_in, feature_data1, on='user_id', how='left')
data2_in = pd.merge(data2_in, feature_data2, on='user_id', how='left')
data3_in = pd.merge(data3_in, feature_data3, on='user_id', how='left')

In [6]:
# Encode device_type as integer codes, using the values observed in data3 as the shared vocabulary.
cates = pd.Categorical(data3_in['device_type'])
categories = cates.categories
data3_in['device_type'] = cates.codes
# NOTE(review): get_loc raises KeyError for a device_type that never occurs in data3 --
# presumably data3's user table is a superset of data1/data2's; confirm.
# The column is overwritten in place, so this cell must not be run twice on the same frames.
data1_in['device_type'] = data1_in['device_type'].apply(lambda x:categories.get_loc(x))
data2_in['device_type'] = data2_in['device_type'].apply(lambda x:categories.get_loc(x))

----------------------------------------------------------------------------------------------

In [19]:
# List every column name, five per printed line.
# Stepping over the start offsets also prints the final, shorter group; the
# previous loop stopped after the last full group of five and dropped it.
all_columns = list(data1_in.columns)
for first in range(0, len(all_columns), 5):
    print (all_columns[first: first+5])

['user_id', 'register_day', 'register_type', 'device_type', 'login_day_max']
['login_day_min', 'login_day_std', 'login_cnt', 'login_sum', 'login_max']
['login_var', 'login_mean', 'login_1_cnt', 'login_3_cnt', 'login_3_cnt_arg']
['login_7_cnt', 'login_7_cnt_arg', 'video_day_max', 'video_day_min', 'video_day_std']
['video_cnt', 'video_sum', 'video_max', 'video_var', 'video_mean']
['video_1_cnt', 'video_3_cnt', 'video_3_cnt_arg', 'video_7_cnt', 'video_7_cnt_arg']
['act_day_max', 'act_day_min', 'act_day_std', 'act_cnt', 'act_sum']
['act_max', 'act_var', 'act_mean', 'act_1_cnt', 'act_3_cnt']
['act_3_cnt_arg', 'act_7_cnt', 'act_7_cnt_arg', 'page_0_1_cnt', 'page_1_1_cnt']
['page_2_1_cnt', 'page_3_1_cnt', 'page_4_1_cnt', 'page_0_3_cnt', 'page_0_3_cnt_arg']
['page_1_3_cnt', 'page_1_3_cnt_arg', 'page_2_3_cnt', 'page_2_3_cnt_arg', 'page_3_3_cnt']
['page_3_3_cnt_arg', 'page_4_3_cnt', 'page_4_3_cnt_arg', 'page_0_7_cnt', 'page_0_7_cnt_arg']
['page_1_7_cnt', 'page_1_7_cnt_arg', 'page_2_7_cnt', 'page_

In [20]:
# Manually chosen columns to exclude from the model input.
# NOTE(review): 'login_2_cnt', 'device_map', 'page_sum' and 'action_type_sum' are not produced by the
# feature builders in this notebook; presumably leftovers from earlier versions -- confirm.
drop_cols = ['login_sum','login_max','login_var','login_mean','login_3_cnt','login_2_cnt','login_7_cnt','device_map','page_sum','action_type_sum']

In [21]:
# Disabled variant that additionally restricts to `select_cols` (not defined in this notebook section):
# data1 = data1_in[[c for c in data1_in.columns if c not in drop_cols and c in select_cols]]
# data2 = data2_in[[c for c in data2_in.columns if c not in drop_cols and c in select_cols]]
# data3 = data3_in[[c for c in data3_in.columns if c not in drop_cols and c in select_cols]]

# Keep every column that is not listed in drop_cols; names in drop_cols that do not exist are simply ignored.
data1 = data1_in[[c for c in data1_in.columns if c not in drop_cols]]
data2 = data2_in[[c for c in data2_in.columns if c not in drop_cols]]
data3 = data3_in[[c for c in data3_in.columns if c not in drop_cols]]

In [22]:
# Shapes after column selection; data3 is saved without a label column, so it has one column fewer.
print (data1.shape)
print (data2.shape)
print (data3.shape)

(22342, 116)
(37446, 116)
(51709, 115)


In [27]:
# Parameters currently used for the online submission (already tuned).
# NOTE(review): no random seed is set, so repeated fits may not be reproducible -- confirm.
# NOTE(review): presumably `subsample` only takes effect together with a positive
# subsample_freq / bagging_freq -- verify against the LightGBM parameter docs.
LGBM = lgb.LGBMClassifier(  max_depth=6,
                            n_estimators = 280,
                            learning_rate =0.05,     
                            objective = 'binary',
                            num_leaves=25,
                            boosting_type = 'dart',
                            feature_fraction=0.5,
                            lambda_l1=1,
                            lambda_l2=0.5,
                            subsample=0.7
)

In [24]:
# Offline F1 score as defined by the competition rules
def sroceF1(pred, real):
    """Print and return precision, recall and F1 of a predicted user set.

    Both arguments are de-duplicated before scoring.

    Args:
        pred: iterable of predicted active user ids.
        real: iterable of truly active user ids.

    Returns:
        The F1 score as a float. An empty ``pred`` gives precision 0, an empty
        ``real`` gives recall 0, and F1 is 0 when there is no overlap, instead
        of raising ``ZeroDivisionError``.
    """
    M = set(pred)
    N = set(real)
    hits = len(M.intersection(N))
    Precision = hits/len(M) if M else 0.0
    Recall = hits/len(N) if N else 0.0
    # With no hit both precision and recall are 0 and the harmonic mean is undefined; report 0.
    F1 = 2*Precision*Recall/(Precision+Recall) if hits else 0.0

    print("Precision=",Precision,"| Recall=",Recall)
    print("F1=",F1)
    return F1

In [25]:
# Train the model and predict
def buildModelAndPredict(isOnLine=True, isTest=False, yuzhi=0.4, model=LGBM):
    """Fit ``model`` and either predict for submission or score offline.

    Args:
        isOnLine: when true, train on data1 + data2 and predict data3;
            otherwise train on data1 and evaluate on data2.
        isTest: not used by this function.
        yuzhi: probability threshold above which a user counts as active
            (only used in the offline branch).
        model: estimator providing ``fit`` and ``predict_proba``.

    Returns:
        Online: a frame with ``user_id`` and the predicted probability
        ``pre_act`` for every data3 user. Offline: ``None`` (precision, recall
        and F1 are printed by ``sroceF1``).
    """
    if isOnLine:
        # Online prediction: use both labelled datasets for training.
        train_df = pd.concat([data1, data2])
        train_label = train_df['label']
        train_df = train_df.drop(['user_id', 'label'], axis=1)
        model.fit(train_df, train_label)

        test_df = data3.copy()
        user_df = pd.DataFrame(test_df.pop('user_id'))
        user_df['pre_act'] = model.predict_proba(test_df)[:,1]
        return user_df

    # Offline validation: train on data1, score on data2.
    train_df = data1.drop('user_id', axis=1)
    train_label = train_df.pop('label')

    test_df = data2.copy()
    real_user = test_df.loc[test_df.label==1, 'user_id']
    user_df = pd.DataFrame(test_df.pop('user_id'))
    test_df.pop('label')

    model.fit(train_df, train_label)
    user_df['pre_act'] = model.predict_proba(test_df)[:,1]

    user_pre = user_df.loc[user_df.pre_act>yuzhi, 'user_id']
    sroceF1(user_pre, real_user)
    return None

In [30]:
# Sweep the decision threshold offline; precision/recall/F1 are printed for each value, followed by the threshold.
# NOTE(review): every iteration refits the model, and the offline branch returns None,
# so user_df is None after this loop.
for i in np.arange(0.39, 0.43, 0.002):
    user_df = buildModelAndPredict(isOnLine=False, isTest=False, yuzhi=i, model=LGBM)
    print(i)

Precision= 0.7702635006512374 | Recall= 0.8437688635241178
F1= 0.8053424119418621
0.38
Precision= 0.7710571227788374 | Recall= 0.842945727926247
F1= 0.8054004456678464
0.382
Precision= 0.7724342834122269 | Recall= 0.8417384623827032
F1= 0.8055985924739371
0.384
Precision= 0.7730621719822366 | Recall= 0.8406409482522087
F1= 0.8054365256710219
0.386
Precision= 0.7737691646005161 | Recall= 0.8391593041760412
F1= 0.8051387353235403
0.388
Precision= 0.7748236846110914 | Recall= 0.8380069143390221
F1= 0.8051776863861647
0.39
Precision= 0.775489448258327 | Recall= 0.8368545245020029
F1= 0.8050042229729729
0.392
Precision= 0.7761239677846875 | Recall= 0.8355375075454097
F1= 0.8047356042387886
0.394
Precision= 0.7767168751595609 | Recall= 0.8347692476540636
F1= 0.8046974185357597
0.396
Precision= 0.7774424552429667 | Recall= 0.8340558634692422
F1= 0.8047547189791651
0.398
Precision= 0.7781251601660601 | Recall= 0.8331229764583219
F1= 0.8046854295860496
0.4
Precision= 0.7789073731772438 | Recall

In [33]:
# Online run: train on data1 + data2 and get the predicted probability for every data3 user.
# yuzhi is not applied in the online branch; the threshold is chosen in the next cell.
user_df = buildModelAndPredict(isOnLine=True,isTest=False, yuzhi=0.4, model=LGBM)

In [45]:
# Alternative kept for reference: take the 24500 highest-scoring users instead of thresholding.
# user_pre = user_df.sort_values(by='pre_act', axis = 0, ascending = False)['user_id'].head(24500)
# Keep the users whose predicted probability exceeds 0.404.
user_pre = user_df[user_df.pre_act>0.404]['user_id']

In [46]:
# Number of users that will be submitted as active.
len(user_pre)

24700

# 结果数据提交

In [47]:
# Save the selected user ids for submission, without the index column.
# NOTE(review): whether a header line is written depends on the Series.to_csv default of the
# installed pandas version -- confirm against the expected submission format.
user_pre.to_csv('../Output/0624_114_8046854_24700_404.csv', index=False)

// 复现82Line

0624_114_8046854_24832_4    线上：0.819498

0624_114_8046854_24700_404   线上: 