In [1]:
import pandas as pd
from collections import Counter
import datetime
import numpy as np
import warnings
%matplotlib inline

warnings.filterwarnings('ignore')

In [None]:
users = pd.read_csv('../Data/raw_data/user_register_log.txt', header=None,names=['user_id','register_day','register_type','device_type'],sep='\t') # 注册用户数据加载
user_login = pd.read_csv('../Data/raw_data/app_launch_log.txt', header=None,names=['user_id','day'],sep='\t') # app登录日志数据加载
user_act = pd.read_csv('../Data/raw_data/user_activity_log.txt', header=None,names=['user_id','day','page','video_id','author_id','action_type'],sep='\t') # 用户行为日志数据加载
user_video = pd.read_csv('../Data/raw_data/video_create_log.txt', header=None,names=['user_id','day'],sep='\t') # 用户拍摄视频日志数据加载

所有数据都是1-30天内的数据

且无缺失值

# 对数据进行滑窗法划分：

data: 

    +-data1
        --feature:1-15;label:16-22
    +-data2
        --feature:9-23;label:24-30
    +-data3
        --feature:1-30;label: 31-37

In [3]:
def cutDataFunc(data, cut_col ,start_day, end_day):
    return data[(data[cut_col]<=end_day)&(data[cut_col]>=start_day)]

def cutDataByTime(data_url, start_day, end_day):
    temp_users = cutDataFunc(users, 'register_day', start_day, end_day)
    temp_login = cutDataFunc(user_login, 'day', start_day, end_day)
    temp_act = cutDataFunc(user_act, 'day', start_day, end_day)
    temp_video = cutDataFunc(user_video, 'day', start_day, end_day)
    
    temp_users.to_csv(data_url+'users.csv',index=False)
    temp_login.to_csv(data_url+'login.csv',index=False)
    temp_act.to_csv(data_url+'act.csv',index=False)
    temp_video.to_csv(data_url+'video.csv',index=False)
 

In [4]:
def cutDataProgram():
    print ("---------START-----------")
    
    cutDataByTime('../Data/data1/train_', 1, 15)
    cutDataByTime('../Data/data1/test_', 16, 22)
    print ("-----第1数据集完成-------")
    
    cutDataByTime('../Data/data2/train_', 9, 23)
    cutDataByTime('../Data/data2/test_', 24, 30)
    print ("-----第2数据集完成-------")
    
    cutDataByTime('../Data/data3/train_', 1, 30)
    print ("-----第3数据集完成-------")
    
    print ("----------END------------")

In [5]:
cutDataProgram()

---------START-----------
-----第1数据集完成-------
-----第2数据集完成-------
-----第3数据集完成-------
----------END------------


# train集构造和标签

In [2]:
# 获取最近一天的点击数量
def getCntOfOneDay(data):
    return sum(data[(data.day==0)]['cnt'].values)

# 获取最近一段时间内的点击数量
def getCntOfSomeDay(data, day_len=7):
    return sum(data[(data.day>=0) & (data.day<day_len)]['cnt'].values)

# 获取最近一段时间内的点击数量（改进版，可以自由选择col统计）
def getCntOfSomeDayWithCol(data, day_len=1, col='page'):
    return sum(data[(data.day>=0) & (data.day<day_len)][col].values)

In [3]:
# 获取有活跃行为的用户集，（从test集中获取）
def getActivityUsers(data_url):
    test_login = pd.read_csv(data_url+'test_login.csv')
    test_act = pd.read_csv(data_url+'test_act.csv')
    test_video = pd.read_csv(data_url+'test_video.csv')
    
    activity_user = np.unique(pd.concat([test_login['user_id'], test_act['user_id'], test_video['user_id']]))
    return activity_user

# （video and lanuch）data create feature method
# 1、将day转换成距离时间窗口截点的距离
# 2、描述性day、cnt统计特征
# 3、连续1、 2、 3、 7天内的统计特征
def getCountFeature(data, name='login'):
    day_max = max(data['day'])
    data['day'] = day_max - data.day
    df = data.groupby(['user_id','day'],as_index=False).apply(lambda x:x.shape[0])
    df_temp = pd.DataFrame(df, columns=['cnt']).reset_index()
    res_df = df_temp.groupby(['user_id'],as_index=False).agg({'day':['max','min','std'],'cnt':['count','sum','max','var','mean']})
    res_df.columns = ['user_id', name+'_day_max',name+'_day_min',name+'_day_std',name+'_cnt',name+'_sum',name+'_max',name+'var',name+'mean']
    
    res_df[name+'_last_cnt'] = df_temp.groupby(['user_id'], as_index=False).apply(lambda x:getCntOfOneDay(x))
    res_df[name+'_2_cnt'] = df_temp.groupby(['user_id'], as_index=False).apply(lambda x:getCntOfSomeDay(x, 2))
    res_df[name+'_2_arg_cnt'] = res_df[name+'_2_cnt'] / 2
    res_df[name+'_3_cnt'] = df_temp.groupby(['user_id'], as_index=False).apply(lambda x:getCntOfSomeDay(x, 3))
    res_df[name+'_3_arg_cnt'] = res_df[name+'_3_cnt'] / 3
    res_df[name+'_week_cnt'] = df_temp.groupby(['user_id'], as_index=False).apply(lambda x:getCntOfSomeDay(x))
    res_df[name+'_week_arg_cnt'] = res_df[name+'_week_cnt'] / 7
    return res_df.fillna(0)

# act: create feature method
# 1、将day转换成距离时间窗口截点的距离
# 2、描述性day、cnt统计特征
# 3、连续1、 2、 3、 7天内的统计特征
# 4、page和action_type不同类型的所有统计数以及比率特征
# 5、page和action_type不同类型在连续1、 3、 7天内的统计特征
# 6、user_id是否为author_id的成员
def getCountFeatureAboutAct(data, name='act'):
    day_max = max(data['day'])
    data['day'] = day_max - data.day
    authors = set(data['author_id'])
    
    df = data.groupby(['user_id','day'],as_index=False).apply(lambda x:x.shape[0])
    df_temp = pd.DataFrame(df, columns=['cnt']).reset_index()
    res_df = df_temp.groupby(['user_id'],as_index=False).agg({'day':['max','min','std'],'cnt':['count','sum','max','var','mean']})
    res_df.columns = ['user_id', name+'_day_max',name+'_day_min',name+'_day_std',name+'_cnt',name+'_sum',name+'_max',name+'var',name+'mean']
    
    res_df[name+'_last_cnt'] = df_temp.groupby(['user_id'], as_index=False).apply(lambda x:getCntOfOneDay(x))
    res_df[name+'_2_cnt'] = df_temp.groupby(['user_id'], as_index=False).apply(lambda x:getCntOfSomeDay(x, 2))
    res_df[name+'_2_arg_cnt'] = res_df[name+'_2_cnt'] / 2
    res_df[name+'_3_cnt'] = df_temp.groupby(['user_id'], as_index=False).apply(lambda x:getCntOfSomeDay(x, 3))
    res_df[name+'_3_arg_cnt'] = res_df[name+'_3_cnt'] / 3
    res_df[name+'_week_cnt'] = df_temp.groupby(['user_id'], as_index=False).apply(lambda x:getCntOfSomeDay(x))
    res_df[name+'_week_arg_cnt'] = res_df[name+'_week_cnt'] / 7
    
    temp = data.groupby(['user_id','day','page'],as_index=False).apply(lambda x:x.shape[0]).unstack().reset_index().fillna(0)
    res_df['page_0_1_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,1,0))
    res_df['page_0_3_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,3,0))
    res_df['page_0_3_arg_cnt'] = res_df['page_0_3_cnt'] / 3
    res_df['page_0_7_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,7,0))
    res_df['page_0_7_arg_cnt'] = res_df['page_0_7_cnt'] / 7
    
    res_df['page_1_1_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,1,1))
    res_df['page_1_3_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,3,1))
    res_df['page_1_3_arg_cnt'] = res_df['page_1_3_cnt'] / 3
    res_df['page_1_7_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,7,1))
    res_df['page_1_7_arg_cnt'] = res_df['page_1_7_cnt'] / 7
    
    res_df['page_2_1_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,1,2))
    res_df['page_2_3_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,3,2))
    res_df['page_2_3_arg_cnt'] = res_df['page_2_3_cnt'] / 3
    res_df['page_2_7_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,7,2))
    res_df['page_2_7_arg_cnt'] = res_df['page_2_7_cnt'] / 7
    
    res_df['page_3_1_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,1,3))
    res_df['page_3_3_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,3,3))
    res_df['page_3_3_arg_cnt'] = res_df['page_3_3_cnt'] / 3
    res_df['page_3_7_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,7,3))
    res_df['page_3_7_arg_cnt'] = res_df['page_3_7_cnt'] / 7
    
    res_df['page_4_1_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,1,4))
    res_df['page_4_3_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,3,4))
    res_df['page_4_3_arg_cnt'] = res_df['page_4_3_cnt'] / 3
    res_df['page_4_7_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,7,4))
    res_df['page_4_7_arg_cnt'] = res_df['page_4_7_cnt'] / 7
    
    page = data.groupby(['user_id','page'],as_index=False).apply(lambda x:x.shape[0]).unstack().reset_index().fillna(0)
    res_df['page_sum'] = page[0]+page[1]+page[2]+page[3]+page[4]
    res_df['page_0_sigle'] = page[0] / res_df['page_sum']
    res_df['page_1_sigle'] = page[1] / res_df['page_sum']
    res_df['page_2_sigle'] = page[2] / res_df['page_sum']
    res_df['page_3_sigle'] = page[3] / res_df['page_sum']
    res_df['page_4_sigle'] = page[4] / res_df['page_sum']
    res_df[['page_0','page_1','page_2','page_3','page_4']] = page[[0,1,2,3,4]]
    
    temp = data.groupby(['user_id','day','action_type'],as_index=False).apply(lambda x:x.shape[0]).unstack().reset_index().fillna(0)
    res_df['action_type_0_1_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,1,0))
    res_df['action_type_0_3_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,3,0))
    res_df['action_type_0_3_arg_cnt'] = res_df['action_type_0_3_cnt'] / 3
    res_df['action_type_0_7_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,7,0))
    res_df['action_type_0_7_arg_cnt'] = res_df['action_type_0_7_cnt'] / 7
    
    res_df['action_type_1_1_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,1,1))
    res_df['action_type_1_3_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,3,1))
    res_df['action_type_1_3_arg_cnt'] = res_df['action_type_1_3_cnt'] / 3
    res_df['action_type_1_7_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,7,1))
    res_df['action_type_1_7_arg_cnt'] = res_df['action_type_1_7_cnt'] / 7
    
    res_df['action_type_2_1_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,1,2))
    res_df['action_type_2_3_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,3,2))
    res_df['action_type_2_3_arg_cnt'] = res_df['action_type_2_3_cnt'] / 3
    res_df['action_type_2_7_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,7,2))
    res_df['action_type_2_7_arg_cnt'] = res_df['action_type_2_7_cnt'] / 7
    
    res_df['action_type_3_1_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,1,3))
    res_df['action_type_3_3_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,3,3))
    res_df['action_type_3_3_arg_cnt'] = res_df['action_type_3_3_cnt'] / 3
    res_df['action_type_3_7_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,7,3))
    res_df['action_type_3_7_arg_cnt'] = res_df['action_type_3_7_cnt'] / 7
    
    res_df['action_type_4_1_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,1,4))
    res_df['action_type_4_3_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,3,4))
    res_df['action_type_4_3_arg_cnt'] = res_df['action_type_4_3_cnt'] / 3
    res_df['action_type_4_7_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,7,4))
    res_df['action_type_4_7_arg_cnt'] = res_df['action_type_4_7_cnt'] / 7
    
    res_df['action_type_5_1_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,1,5))
    res_df['action_type_5_3_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,3,5))
    res_df['action_type_5_3_arg_cnt'] = res_df['action_type_5_3_cnt'] / 3
    res_df['action_type_5_7_cnt'] = temp.groupby(['user_id']).apply(lambda x:getCntOfSomeDayWithCol(x,7,5))
    res_df['action_type_5_7_arg_cnt'] = res_df['action_type_5_7_cnt'] / 7
    
    action_type = data.groupby(['user_id','action_type'],as_index=False).apply(lambda x:x.shape[0]).unstack().reset_index()
    res_df['action_type_sum'] = action_type[0]+action_type[1]+action_type[2]+action_type[3]+action_type[4]+action_type[5]
    res_df['action_type_0_sigle'] = action_type[0] / res_df['action_type_sum']
    res_df['action_type_1_sigle'] = action_type[1] / res_df['action_type_sum']
    res_df['action_type_2_sigle'] = action_type[2] / res_df['action_type_sum']
    res_df['action_type_3_sigle'] = action_type[3] / res_df['action_type_sum']
    res_df['action_type_4_sigle'] = action_type[4] / res_df['action_type_sum']
    res_df['action_type_5_sigle'] = action_type[5] / res_df['action_type_sum']
    res_df[['action_type_0','action_type_1','action_type_2','action_type_3','action_type_4','action_type_5']] = action_type[[0,1,2,3,4,5]]
    
    res_df['is_author'] = res_df['user_id'].apply(lambda x: 1 if x in authors else 0)
    return res_df.fillna(0)

In [4]:
# 通过video 、 act 、 register 、 lauch来构造特征
def constructDataFeature(data_url):
    train_login = pd.read_csv(data_url+'train_login.csv')
    train_act = pd.read_csv(data_url+'train_act.csv')
    train_video = pd.read_csv(data_url+'train_video.csv')
    
    # register data
    train_user = pd.read_csv(data_url+'train_users.csv')
    feature = train_user
    max_day = max(feature['register_day'])
    feature['register_day'] = max_day - feature.register_day  # day改成时间窗口截点距离
    
    # login data
    train_login_feas = getCountFeature(train_login, 'login')
    feature = pd.merge(feature, train_login_feas, on='user_id', how='left')
    
    # video data
    train_video_feas = getCountFeature(train_video, 'video')
    feature = pd.merge(feature, train_video_feas, on='user_id', how='left')
    
    # act data
    train_act_feas = getCountFeatureAboutAct(train_act, 'act')
    feature = pd.merge(feature, train_act_feas, on='user_id', how='left')
    
    return feature.fillna(0)

# 通过TEST未来几天内的活跃用户来给Train集标签
def getTrainLabel(data_url, data):
    # get activity label of train from test_dataset
    train_label = []
    activity_users = getActivityUsers(data_url)
    for u in data['user_id']:
        if u in activity_users:
            train_label.append(1)
        else:
            train_label.append(0)
    data['label'] = train_label
    return data
    

In [5]:
def getDataProgram():
    print("----------------构造训练集-------------------")
    
    data_url = '../Data/data1/'
    data1 = constructDataFeature(data_url)
    data1 = getTrainLabel(data_url, data1)
    print("---------第一组数据集处理完成----------------")
    data1.to_csv(data_url+'data1.csv',index=False)
    
    data_url = '../Data/data2/'
    data2 = constructDataFeature(data_url)
    data2 = getTrainLabel(data_url, data2)
    print("---------第二组数据集处理完成----------------")
    data2.to_csv(data_url+'data2.csv',index=False)
    
    data_url = '../Data/data3/'
    data3 = constructDataFeature(data_url)
    print("---------第三组数据集处理完成----------------")
    data3.to_csv(data_url+'data3.csv',index=False)
    
    print("--------------------END----------------------")

In [6]:
getDataProgram()

----------------构造训练集-------------------
---------第一组数据集处理完成----------------
---------第二组数据集处理完成----------------
---------第三组数据集处理完成----------------
--------------------END----------------------


# 训练模型并进行预测

In [7]:
import lightgbm as lgb
from sklearn import model_selection
from sklearn.metrics import f1_score

In [8]:
data1_in = pd.read_csv('../Data/data1/data1.csv')
data2_in = pd.read_csv('../Data/data2/data2.csv')
data3_in = pd.read_csv('../Data/data3/data3.csv')

In [None]:
# 列举所有的columns
for i in range(1,26):
    print (list(data1_in.columns)[(i-1)*5: i*5])

In [10]:
# 手动标定不需要的cols
drop_cols = ['login_sum','login_max','loginvar','loginmean','login_3_cnt','login_2_cnt','login_week_cnt','device_map'] + ['page_sum','page_0_sigle','page_1_sigle','page_2_sigle','page_3_sigle','page_4_sigle',
                 'action_type_sum','action_type_0_sigle','action_type_1_sigle','action_type_2_sigle',
                 'action_type_3_sigle','action_type_4_sigle','action_type_5_sigle']

In [11]:
# 特征选择之后的cols， 目前不起作用
select_cols = ['user_id','label'] + ['login_day_min', 'device_type', 'login_week_arg_cnt', 'register_type',
       'act_last_cnt', 'login_day_std', 'action_type_0', 'act_week_cnt',
       'page_1', 'act_3_cnt', 'login_cnt', 'page_0', 'act_day_std', 'actmean',
       'page_2', 'login_3_arg_cnt', 'register_day', 'act_sum', 'act_cnt',
       'action_type_1', 'action_type_2', 'act_day_min', 'actvar',
       'act_day_max', 'act_max', 'page_3', 'video_last_cnt', 'action_type_3',
       'page_4', 'video_3_cnt', 'videomean', 'video_sum', 'is_author',
       'page_3_7_cnt', 'action_type_1_7_cnt', 'video_day_min',
       'video_week_cnt', 'action_type_0_7_cnt', 'video_day_max', 'video_cnt',
       'videovar', 'action_type_2_7_cnt', 'page_3_3_cnt', 'video_day_std',
       'action_type_0_1_cnt', 'action_type_0_3_cnt', 'action_type_5',
       'action_type_1_3_cnt', 'page_0_1_cnt', 'video_max', 'page_0_7_cnt',
       'page_0_3_cnt']

In [12]:
# device_type 目前感觉挺有操作空间的，但是还没找到合适的处理方式
# map不如直接丢进去
def mapDeviceType(thread_value=0.5):
    con_data = pd.concat([data1_in, data2_in])
    index = con_data['label'].groupby(con_data["device_type"]).mean().index
    values = con_data['label'].groupby(con_data["device_type"]).mean().get_values()
    return index[values>thread_value]

good_index = mapDeviceType()

data1_in['device_map'] = data1_in['device_type'].apply(lambda x: int(x in good_index))
data2_in['device_map'] = data1_in['device_type'].apply(lambda x: int(x in good_index))
data3_in['device_map'] = data1_in['device_type'].apply(lambda x: int(x in good_index))

In [13]:
# data1 = data1_in[[c for c in data1_in.columns if c not in drop_cols and c in select_cols]]
# data2 = data2_in[[c for c in data2_in.columns if c not in drop_cols and c in select_cols]]
# data3 = data3_in[[c for c in data3_in.columns if c not in drop_cols and c in select_cols]]

data1 = data1_in[[c for c in data1_in.columns if c not in drop_cols]]
data2 = data2_in[[c for c in data2_in.columns if c not in drop_cols]]
data3 = data3_in[[c for c in data3_in.columns if c not in drop_cols]]

In [14]:
print (data1.shape)
print (data2.shape)
print (data3.shape)

(22342, 110)
(26571, 110)
(51709, 109)


In [None]:
data1.columns

In [15]:
# 目前线上的参数，调过
LGBM = lgb.LGBMClassifier(  max_depth=6,
                            n_estimators = 280,
                            learning_rate =0.05,     
                            objective = 'binary',
                            num_leaves=25,
                            boosting_type = 'dart',
                            feature_fraction=0.5,
                            lambda_l1=1,
                            lambda_l2=0.5,
                            subsample=0.7
)

In [16]:
# 题目规定线下 F1计算方法
def sroceF1(pred, real):
    M = set(pred)
    N = set(real)
    Precision = len(M.intersection(N))/len(M)
    Recall = len(M.intersection(N))/len(N)
    F1 = 2*Precision*Recall/(Precision+Recall)

    print("Precision=",Precision,"| Recall=",Recall)
    print("F1=",F1)

In [34]:
# 训练模型，做预测
def buildModelAndPredict(isOnLine=True, isTest=False, yuzhi=0.4, model=lgb.LGBMClassifier(  max_depth=3,
                                                                    n_estimators = 120,
                                                                    learning_rate = 0.05,     
                                                                    objective = 'binary',
                                                                    subsample = 0.7,
                                                                    colsample_bytree = 0.74,
                                                                    num_leaves = 8)
                        ):
    # 线上预测
    # @return: 返回活跃用户
    if (isOnLine):
        train = pd.concat([data1, data2])
        test = data3.copy()
        train.pop('user_id')
        label = train.pop('label')
        
        model.fit(train, label)
        user_list = test.pop('user_id')
        user_df = pd.DataFrame(user_list)
        user_df['pre_act'] = model.predict_proba(test)[:,1]
        return user_df[user_df.pre_act>yuzhi]['user_id']

    # 线下训练调试
    else: 
        train = data1.copy()
        test = data2.copy()
        train.pop('user_id')
        train_df_label = train.pop('label')
        train_df = train
        
        real_user = test[test.label==1]['user_id']
        user_list = test.pop('user_id')
        test.pop('label')
        test_df = test
        
        user_df = pd.DataFrame(user_list)
        model.fit(train_df, train_df_label)
        user_df['pre_act'] = model.predict_proba(test_df)[:,1]
        
        user_pre = user_df[user_df.pre_act>yuzhi]['user_id']
        sroceF1(user_pre, real_user)
        return None

In [None]:
# 遍历阈值，结合用户个数和F1值取阈值
for i in np.arange(0.4, 0.45, 0.001):
    print (i)
    user_pre = buildModelAndPredict(isOnLine=False, isTest=False, yuzhi=i, model=LGBM)
    user_pre = buildModelAndPredict(isOnLine=True, isTest=False, yuzhi=i, model=LGBM)
    print (len(user_pre))

In [None]:
user_pre = buildModelAndPredict(isOnLine=True, isTest=False, yuzhi=0.4, model=LGBM)

In [32]:
len(user_pre)

24353

# 结果数据提交

In [30]:
user_pre.to_csv('../Output/108_8029_23973_42.csv', index=False)