In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import logging


In [2]:
# Configure the root logger once for the whole notebook: INFO level, with
# timestamp, level name and originating module in front of every message.
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

In [3]:
cd '/home/hisense/wx'

/home/hisense/wx


In [4]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas / LightGBM.
warnings.filterwarnings('ignore')

In [5]:

def extract_day(s):
    """Return the integer day number from a Series of 'D<day>-H<hour>' strings.

    The part before the first '-' is taken, its leading letter is dropped and
    the remainder is converted with ``int``. Index and name of ``s`` are kept.
    """
    def _day_of(stamp):
        day_token = stamp.split('-')[0]  # e.g. 'D3865'
        return int(day_token[1:])        # strip the leading letter

    return s.map(_day_of)


def extract_hour(s):
    """Return the integer hour from a Series of 'D<day>-H<hour>' strings.

    The part after the first '-' is taken, its leading letter is dropped and
    the remainder is converted with ``int``. Index and name of ``s`` are kept.
    """
    def _hour_of(stamp):
        hour_token = stamp.split('-')[1]  # e.g. 'H22'
        return int(hour_token[1:])        # strip the leading letter

    return s.map(_hour_of)


In [6]:
# Load the invitation records (tab-separated, no header row) and name the columns.
train = pd.read_csv('data/invite_info_0926.txt', header=None, sep='\t')
train.columns = ['qid', 'uid', 'dt', 'label']
logging.info("invite %s", train.shape)

train.head()

[2019-12-20 10:25:37,926] INFO in <ipython-input-6-13aaad248983>: invite (9489162, 4)


Unnamed: 0,qid,uid,dt,label
0,Q2166419046,M401693808,D3865-H22,0
1,Q1550017551,M3392373099,D3844-H11,0
2,Q604029601,M2317670257,D3862-H15,0
3,Q2350061229,M1618461867,D3849-H11,0
4,Q2443223942,M3544409350,D3867-H4,0


In [7]:
# Load the evaluation (test) invitations: same layout as above, without a label column.
test = pd.read_csv('data/invite_info_evaluate_2_0926.txt', header=None, sep='\t')
test.columns = ['qid', 'uid', 'dt']
logging.info("test %s", test.shape)

test.head()

[2019-12-20 10:25:39,246] INFO in <ipython-input-7-b0a93c17eb40>: test (1141718, 3)


Unnamed: 0,qid,uid,dt
0,Q3273481096,M1267743167,D3871-H6
1,Q4224184733,M2715893043,D3871-H23
2,Q1832714071,M2244950365,D3874-H15
3,Q3594972263,M2321407666,D3872-H10
4,Q403456350,M1091084170,D3870-H9


In [8]:
# Split the 'D<day>-H<hour>' stamp into numeric `day` / `hour` columns and
# drop the raw string column afterwards.
train = train.assign(day=extract_day(train['dt']),
                     hour=extract_hour(train['dt'])).drop(columns='dt')

train.head()

Unnamed: 0,qid,uid,label,day,hour
0,Q2166419046,M401693808,0,3865,22
1,Q1550017551,M3392373099,0,3844,11
2,Q604029601,M2317670257,0,3862,15
3,Q2350061229,M1618461867,0,3849,11
4,Q2443223942,M3544409350,0,3867,4


In [9]:
# Same stamp split for the evaluation invitations: numeric `day` / `hour`
# columns replace the raw 'D<day>-H<hour>' string.
test = test.assign(day=extract_day(test['dt']),
                   hour=extract_hour(test['dt'])).drop(columns='dt')

test.head()

Unnamed: 0,qid,uid,day,hour
0,Q3273481096,M1267743167,3871,6
1,Q4224184733,M2715893043,3871,23
2,Q1832714071,M2244950365,3874,15
3,Q3594972263,M2321407666,3872,10
4,Q403456350,M1091084170,3870,9


In [10]:
# Load the question table (tab-separated, no header row).
ques = pd.read_csv('data/question_info_0926.txt', sep='\t', header=None)
ques.columns = ['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic']
# The title / description columns are not used; keep id, creation stamp and topics.
ques = ques.drop(columns=['title_t1', 'title_t2', 'desc_t1', 'desc_t2'])
logging.info("ques %s", ques.shape)

# Replace the creation stamp with numeric day / hour columns.
ques = ques.assign(q_day=extract_day(ques['q_dt']),
                   q_hour=extract_hour(ques['q_dt'])).drop(columns='q_dt')

ques.head()

[2019-12-20 10:26:17,798] INFO in <ipython-input-10-771a6cd866da>: ques (1829900, 3)


Unnamed: 0,qid,topic,q_day,q_hour
0,Q2234111670,"T321,T730,T5784,T4389",1018,5
1,Q760329790,"T278,T12673,T4677",1745,20
2,Q741313548,T226,2032,21
3,Q3481466230,"T51,T4468",2185,15
4,Q3966197028,"T54700,T81,T57,T17670,T43574",2269,17


In [11]:
# The text columns were dropped above, so only the topic information remains.

In [12]:
# Load the answer table (tab-separated, no header row).
ans = pd.read_csv('data/answer_info_0926.txt', sep='\t', header=None)
ans.columns = ['aid', 'qid', 'uid', 'ans_dt', 'ans_t1', 'ans_t2', 'is_good', 'is_rec', 'is_dest', 'has_img',
               'has_video', 'word_count', 'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
               'reci_xxx', 'reci_no_help', 'reci_dis']
# The two answer-text columns are not used.
ans = ans.drop(columns=['ans_t1', 'ans_t2'])
logging.info("ans %s", ans.shape)

# Replace the answer stamp with numeric day / hour columns.
ans = ans.assign(a_day=extract_day(ans['ans_dt']),
                 a_hour=extract_hour(ans['ans_dt'])).drop(columns='ans_dt')

ans.head()

[2019-12-20 10:27:27,140] INFO in <ipython-input-12-ab994cc7c995>: ans (4513735, 18)


Unnamed: 0,aid,qid,uid,is_good,is_rec,is_dest,has_img,has_video,word_count,reci_cheer,reci_uncheer,reci_comment,reci_mark,reci_tks,reci_xxx,reci_no_help,reci_dis,a_day,a_hour
0,A2502060945,Q1867533817,M625498202,0,0,0,0,0,41,1,0,1,0,1,0,0,0,3808,7
1,A2847829478,Q3366788616,M142330444,0,0,0,0,0,204,1,0,0,3,1,0,0,0,3810,17
2,A2005999231,Q4264694221,M771499642,0,0,0,0,0,54,2,0,1,0,0,0,0,0,3853,13
3,A14821523,Q1088851650,M2282072267,0,0,0,0,0,42,1,0,1,0,1,0,0,0,3859,18
4,A731550034,Q1023877868,M2282072267,0,0,0,0,0,44,0,0,0,0,0,0,0,0,3855,22


In [13]:
# Attach each answer's question attributes (topic, q_day, q_hour) by joining on qid.
# The default inner join keeps only answers whose question appears in `ques`.
ans = ans.merge(ques, on='qid')
del ques  # not referenced again after the merge

ans.head()

Unnamed: 0,aid,qid,uid,is_good,is_rec,is_dest,has_img,has_video,word_count,reci_cheer,...,reci_mark,reci_tks,reci_xxx,reci_no_help,reci_dis,a_day,a_hour,topic,q_day,q_hour
0,A2502060945,Q1867533817,M625498202,0,0,0,0,0,41,1,...,0,1,0,0,0,3808,7,"T381,T8211,T3144,T4936,T823",3806,21
1,A247365975,Q1867533817,M279074599,0,0,0,0,0,123,0,...,0,0,0,0,1,3821,9,"T381,T8211,T3144,T4936,T823",3806,21
2,A2661909364,Q1867533817,M3094956744,0,0,0,0,0,27,1,...,0,3,0,0,0,3808,5,"T381,T8211,T3144,T4936,T823",3806,21
3,A2881302290,Q1867533817,M2926469058,0,0,0,0,0,144,0,...,1,0,0,0,1,3821,13,"T381,T8211,T3144,T4936,T823",3806,21
4,A2847829478,Q3366788616,M142330444,0,0,0,0,0,204,1,...,3,1,0,0,0,3810,17,"T5490,T2180,T17098",3808,9


In [14]:
# Days elapsed between the question's creation day and the answer's day.
ans['diff_qa_days'] = ans['a_day'].sub(ans['q_day'])


[2019-12-20 10:27:43,616] INFO in utils: Note: NumExpr detected 24 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[2019-12-20 10:27:43,617] INFO in utils: NumExpr defaulting to 8 threads.


In [15]:

# Time-window split. All values are day numbers as produced by extract_day.
# One month of invitation data is treated as the training period.
train_start = 3838
train_end = 3867

# The following week (the evaluation data) is treated as the validation period.
val_start = 3868
val_end = 3874

# Label window: the last 7 days of the training period (3861..3867).
label_end = 3867
label_start = label_end - 6

# Invitation history for the training samples: the 23 days ending the day
# before the label window (3838..3860).
train_label_feature_end = label_end - 7
train_label_feature_start = train_label_feature_end - 22

# Answer history for the training samples: the 51 days ending the day before
# the label window (3810..3860).
train_ans_feature_end = label_end - 7
train_ans_feature_start = train_ans_feature_end - 50

# Same-length invitation history for the evaluation samples, ending the day
# before val_start (3845..3867).
val_label_feature_end = val_start - 1
val_label_feature_start = val_label_feature_end - 22

# Same-length answer history for the evaluation samples (3817..3867).
val_ans_feature_end = val_start - 1
val_ans_feature_start = val_ans_feature_end - 50

In [16]:
# Invitation history used to compute label statistics for the training samples
# (both window bounds inclusive).
train_label_feature = train[train['day'].between(train_label_feature_start, train_label_feature_end)]
logging.info("train_label_feature %s", train_label_feature.shape)

# Same-length invitation history for the evaluation samples.
val_label_feature = train[train['day'].between(val_label_feature_start, val_label_feature_end)]
logging.info("val_label_feature %s", val_label_feature.shape)

# Training samples: every invitation dated after the training history window.
train_label = train[train['day'].gt(train_label_feature_end)]

logging.info("train feature start %s end %s, label start %s end %s",
             train_label_feature['day'].min(), train_label_feature['day'].max(),
             train_label['day'].min(), train_label['day'].max())

logging.info("test feature start %s end %s, label start %s end %s",
             val_label_feature['day'].min(), val_label_feature['day'].max(),
             test['day'].min(), test['day'].max())

[2019-12-20 10:27:44,463] INFO in <ipython-input-16-db9ac0b937b4>: train_label_feature (6895493, 5)
[2019-12-20 10:27:44,864] INFO in <ipython-input-16-db9ac0b937b4>: val_label_feature (7583553, 5)
[2019-12-20 10:27:45,089] INFO in <ipython-input-16-db9ac0b937b4>: train feature start 3838 end 3860, label start 3861 end 3867
[2019-12-20 10:27:45,111] INFO in <ipython-input-16-db9ac0b937b4>: test feature start 3845 end 3867, label start 3868 end 3874


In [17]:
# Restrict the answers to the two history windows (bounds inclusive).
# Original note on the day range of `ans`: 3807~3874.
train_ans_feature = ans[ans['a_day'].between(train_ans_feature_start, train_ans_feature_end)]

val_ans_feature = ans[ans['a_day'].between(val_ans_feature_start, val_ans_feature_end)]

logging.info("train ans feature %s, start %s end %s",
             train_ans_feature.shape,
             train_ans_feature['a_day'].min(),
             train_ans_feature['a_day'].max())

logging.info("val ans feature %s, start %s end %s",
             val_ans_feature.shape,
             val_ans_feature['a_day'].min(),
             val_ans_feature['a_day'].max())

# Numeric answer columns that extract_feature1 aggregates per member and per question.
fea_cols = [
    'is_good', 'is_rec', 'is_dest', 'has_img', 'has_video', 'word_count',
    'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
    'reci_xxx', 'reci_no_help', 'reci_dis', 'diff_qa_days',
]


[2019-12-20 10:27:49,440] INFO in <ipython-input-17-aa15f72a2344>: train ans feature (3700178, 23), start 3810 end 3860
[2019-12-20 10:27:49,450] INFO in <ipython-input-17-aa15f72a2344>: val ans feature (3992334, 23), start 3817 end 3867


In [18]:
def extract_feature1(target, label_feature, ans_feature, cols=None):
    """Left-join historical invitation and answer statistics onto ``target``.

    Parameters
    ----------
    target : DataFrame
        Rows to enrich; must contain ``qid`` and ``uid``.
    label_feature : DataFrame
        Historical invitations with ``qid``, ``uid`` and ``label`` columns.
    ans_feature : DataFrame
        Historical answers with ``aid``, ``qid``, ``uid`` and every column
        named in ``cols``.
    cols : sequence of str, optional
        Answer columns to aggregate. Defaults to the module-level ``fea_cols``
        list, which is what the function used before this parameter existed.

    Returns
    -------
    DataFrame
        A new frame with the columns of ``target`` followed by, in order:
        ``q_inv_*`` / ``u_inv_*`` (mean, sum, std, count of ``label`` per
        question / member), ``q_ans_count`` / ``u_ans_count``, and for each
        column in ``cols`` the ``u_<col>_{sum,max,mean}`` then
        ``q_<col>_{sum,max,mean}`` aggregates. Keys without history get NaN
        because every join is a left join.
    """
    if cols is None:
        cols = fea_cols  # notebook-level default list

    # Invitation label statistics: question side first, then member side.
    inv_stats = ['mean', 'sum', 'std', 'count']
    for key, prefix in (('qid', 'q'), ('uid', 'u')):
        t1 = label_feature.groupby(key)['label'].agg(inv_stats).reset_index()
        t1.columns = [key] + [f'{prefix}_inv_{stat}' for stat in inv_stats]
        target = pd.merge(target, t1, on=key, how='left')

    # Build each answer grouping once and reuse it for every aggregate below
    # instead of re-grouping the same frame on every loop pass.
    ans_by = {'qid': ans_feature.groupby('qid'), 'uid': ans_feature.groupby('uid')}

    # Number of historical answers per question / per member.
    for key, prefix in (('qid', 'q'), ('uid', 'u')):
        t1 = ans_by[key]['aid'].count().reset_index()
        t1.columns = [key, f'{prefix}_ans_count']
        target = pd.merge(target, t1, on=key, how='left')

    # Per-column aggregates; member side is merged before question side so the
    # resulting column order matches the original implementation.
    ans_stats = ['sum', 'max', 'mean']
    for col in cols:
        for key, prefix in (('uid', 'u'), ('qid', 'q')):
            t1 = ans_by[key][col].agg(ans_stats).reset_index()
            t1.columns = [key] + [f'{prefix}_{col}_{stat}' for stat in ans_stats]
            target = pd.merge(target, t1, on=key, how='left')
        logging.info("extract %s", col)
    return target

In [19]:
# Enrich the training samples with statistics computed from the training history windows.
train_label = extract_feature1(train_label, train_label_feature, train_ans_feature)

[2019-12-20 10:28:30,976] INFO in <ipython-input-18-0fafb00ac390>: extract is_good
[2019-12-20 10:28:44,078] INFO in <ipython-input-18-0fafb00ac390>: extract is_rec
[2019-12-20 10:29:00,731] INFO in <ipython-input-18-0fafb00ac390>: extract is_dest
[2019-12-20 10:29:14,201] INFO in <ipython-input-18-0fafb00ac390>: extract has_img
[2019-12-20 10:29:28,085] INFO in <ipython-input-18-0fafb00ac390>: extract has_video
[2019-12-20 10:29:42,088] INFO in <ipython-input-18-0fafb00ac390>: extract word_count
[2019-12-20 10:29:56,200] INFO in <ipython-input-18-0fafb00ac390>: extract reci_cheer
[2019-12-20 10:30:09,811] INFO in <ipython-input-18-0fafb00ac390>: extract reci_uncheer
[2019-12-20 10:30:23,381] INFO in <ipython-input-18-0fafb00ac390>: extract reci_comment
[2019-12-20 10:30:37,001] INFO in <ipython-input-18-0fafb00ac390>: extract reci_mark
[2019-12-20 10:30:50,784] INFO in <ipython-input-18-0fafb00ac390>: extract reci_tks
[2019-12-20 10:31:05,762] INFO in <ipython-input-18-0fafb00ac390>: 

In [20]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.set_option('display.max_columns', None)

In [21]:
# Inspect the first enriched training samples.
train_label.head()

Unnamed: 0,qid,uid,label,day,hour,q_inv_mean,q_inv_sum,q_inv_std,q_inv_count,u_inv_mean,u_inv_sum,u_inv_std,u_inv_count,q_ans_count,u_ans_count,u_is_good_sum,u_is_good_max,u_is_good_mean,q_is_good_sum,q_is_good_max,q_is_good_mean,u_is_rec_sum,u_is_rec_max,u_is_rec_mean,q_is_rec_sum,q_is_rec_max,q_is_rec_mean,u_is_dest_sum,u_is_dest_max,u_is_dest_mean,q_is_dest_sum,q_is_dest_max,q_is_dest_mean,u_has_img_sum,u_has_img_max,u_has_img_mean,q_has_img_sum,q_has_img_max,q_has_img_mean,u_has_video_sum,u_has_video_max,u_has_video_mean,q_has_video_sum,q_has_video_max,q_has_video_mean,u_word_count_sum,u_word_count_max,u_word_count_mean,q_word_count_sum,q_word_count_max,q_word_count_mean,u_reci_cheer_sum,u_reci_cheer_max,u_reci_cheer_mean,q_reci_cheer_sum,q_reci_cheer_max,q_reci_cheer_mean,u_reci_uncheer_sum,u_reci_uncheer_max,u_reci_uncheer_mean,q_reci_uncheer_sum,q_reci_uncheer_max,q_reci_uncheer_mean,u_reci_comment_sum,u_reci_comment_max,u_reci_comment_mean,q_reci_comment_sum,q_reci_comment_max,q_reci_comment_mean,u_reci_mark_sum,u_reci_mark_max,u_reci_mark_mean,q_reci_mark_sum,q_reci_mark_max,q_reci_mark_mean,u_reci_tks_sum,u_reci_tks_max,u_reci_tks_mean,q_reci_tks_sum,q_reci_tks_max,q_reci_tks_mean,u_reci_xxx_sum,u_reci_xxx_max,u_reci_xxx_mean,q_reci_xxx_sum,q_reci_xxx_max,q_reci_xxx_mean,u_reci_no_help_sum,u_reci_no_help_max,u_reci_no_help_mean,q_reci_no_help_sum,q_reci_no_help_max,q_reci_no_help_mean,u_reci_dis_sum,u_reci_dis_max,u_reci_dis_mean,q_reci_dis_sum,q_reci_dis_max,q_reci_dis_mean,u_diff_qa_days_sum,u_diff_qa_days_max,u_diff_qa_days_mean,q_diff_qa_days_sum,q_diff_qa_days_max,q_diff_qa_days_mean
0,Q2166419046,M401693808,0,3865,22,,,,,0.0,0.0,0.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Q604029601,M2317670257,0,3862,15,,,,,0.090909,1.0,0.301511,11.0,,2.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,2.0,1.0,1.0,,,,0.0,0.0,0.0,,,,274.0,139.0,137.0,,,,1.0,1.0,0.5,,,,0.0,0.0,0.0,,,,2.0,2.0,1.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,2.0,2.0,1.0,,,
2,Q2443223942,M3544409350,0,3867,4,0.375,57.0,0.485723,152.0,0.0,0.0,0.0,2.0,32.0,,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,2.0,1.0,0.0625,,,,0.0,0.0,0.0,,,,1348.0,136.0,42.125,,,,12.0,9.0,0.375,,,,0.0,0.0,0.0,,,,2.0,2.0,0.0625,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,305.0,13.0,9.53125
3,Q795459266,M2818659842,0,3861,20,0.166667,1.0,0.408248,6.0,0.285714,2.0,0.48795,7.0,3.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.5,1.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,2602.0,1210.0,433.666667,352.0,332.0,117.333333,6.0,2.0,1.0,4.0,3.0,1.333333,0.0,0.0,0.0,0.0,0.0,0.0,9.0,3.0,1.5,5.0,4.0,1.666667,2.0,1.0,0.333333,0.0,0.0,0.0,1.0,1.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2260.0,1391.0,376.666667,667.0,234.0,222.333333
4,Q110462128,M848334644,1,3862,8,,,,,0.634146,26.0,0.487652,41.0,,56.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,3721.0,244.0,66.446429,,,,7.0,2.0,0.125,,,,1.0,1.0,0.017857,,,,18.0,3.0,0.321429,,,,0.0,0.0,0.0,,,,6.0,1.0,0.107143,,,,1.0,1.0,0.017857,,,,0.0,0.0,0.0,,,,1.0,1.0,0.017857,,,,2156.0,1426.0,38.5,,,


In [22]:
# Enrich the evaluation samples with statistics computed from the validation history windows.
test = extract_feature1(test, val_label_feature, val_ans_feature)

[2019-12-20 10:32:31,089] INFO in <ipython-input-18-0fafb00ac390>: extract is_good
[2019-12-20 10:32:43,332] INFO in <ipython-input-18-0fafb00ac390>: extract is_rec
[2019-12-20 10:32:55,468] INFO in <ipython-input-18-0fafb00ac390>: extract is_dest
[2019-12-20 10:33:07,726] INFO in <ipython-input-18-0fafb00ac390>: extract has_img
[2019-12-20 10:33:20,031] INFO in <ipython-input-18-0fafb00ac390>: extract has_video
[2019-12-20 10:33:32,548] INFO in <ipython-input-18-0fafb00ac390>: extract word_count
[2019-12-20 10:33:44,994] INFO in <ipython-input-18-0fafb00ac390>: extract reci_cheer
[2019-12-20 10:33:57,500] INFO in <ipython-input-18-0fafb00ac390>: extract reci_uncheer
[2019-12-20 10:34:10,389] INFO in <ipython-input-18-0fafb00ac390>: extract reci_comment
[2019-12-20 10:34:22,530] INFO in <ipython-input-18-0fafb00ac390>: extract reci_mark
[2019-12-20 10:34:35,204] INFO in <ipython-input-18-0fafb00ac390>: extract reci_tks
[2019-12-20 10:34:48,117] INFO in <ipython-input-18-0fafb00ac390>: 

In [23]:
# Load the member table (tab-separated, no header row).
user = pd.read_csv('data/member_info_0926.txt', sep='\t', header=None)
user.columns = ['uid', 'gender', 'creat_keyword', 'level', 'hot', 'reg_type', 'reg_plat', 'freq', 'uf_b1', 'uf_b2',
                'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'score', 'follow_topic',
                'inter_topic']

# The two topic-list columns are not used in this notebook.
user = user.drop(columns=['follow_topic', 'inter_topic'])
logging.info("user %s", user.shape)

user.head()

[2019-12-20 10:35:37,135] INFO in <ipython-input-23-f14cdc5dd49a>: user (1931654, 19)


Unnamed: 0,uid,gender,creat_keyword,level,hot,reg_type,reg_plat,freq,uf_b1,uf_b2,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score
0,M1934753188,male,-1,0.0,0.0,unknown,unknown,monthly,0,1,0,1,0,MD470265,BR470265,PV929066,CT929066,PF470265,764
1,M595924114,male,-1,0.0,0.0,unknown,unknown,daily,0,0,0,1,1,MD195122,BR596936,PV002320,CT840234,PF470265,671
2,M1473482940,female,-1,0.0,0.0,unknown,unknown,weekly,0,1,0,1,0,MD116493,BR641329,PV170953,CT470265,PF470265,454
3,M578477092,male,-1,0.0,0.0,unknown,unknown,daily,1,1,0,1,0,MD889589,BR803759,PV545833,CT545833,PF470265,588
4,M1088794709,male,-1,0.0,0.0,unknown,unknown,weekly,0,1,0,0,0,MD825760,BR641329,PV071037,CT470265,PF470265,361


In [25]:
# Remove member columns that hold a single distinct value: they cannot help
# the model tell members apart.
unq = user.nunique()
logging.info("user unq %s", unq)

constant_cols = list(unq.index[unq.eq(1)])
user = user.drop(columns=constant_cols)
for x in constant_cols:
    logging.info('del unq==1 %s', x)

user.head()

[2019-12-20 10:35:44,902] INFO in <ipython-input-25-88c7991b9107>: user unq uid       1931654
gender          3
freq            5
uf_b1           2
uf_b2           2
uf_b3           2
uf_b4           2
uf_b5           2
uf_c1        2561
uf_c2         291
uf_c3         428
uf_c4        1556
uf_c5           2
score         732
dtype: int64


Unnamed: 0,uid,gender,freq,uf_b1,uf_b2,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score
0,M1934753188,male,monthly,0,1,0,1,0,MD470265,BR470265,PV929066,CT929066,PF470265,764
1,M595924114,male,daily,0,0,0,1,1,MD195122,BR596936,PV002320,CT840234,PF470265,671
2,M1473482940,female,weekly,0,1,0,1,0,MD116493,BR641329,PV170953,CT470265,PF470265,454
3,M578477092,male,daily,1,1,0,1,0,MD889589,BR803759,PV545833,CT545833,PF470265,588
4,M1088794709,male,weekly,0,1,0,0,0,MD825760,BR641329,PV071037,CT470265,PF470265,361


In [26]:
# Label-encode every string-typed member column except the member id.
t = user.dtypes
skipped = ('follow_topic', 'inter_topic', 'uid')
cats = [name for name, kind in t.items() if kind == 'object' and name not in skipped]
logging.info("user cat %s", cats)

for d in cats:
    lb = LabelEncoder()  # a fresh encoder per column
    user[d] = lb.fit_transform(user[d])
    logging.info('encode %s', d)

user.head()

[2019-12-20 10:35:44,926] INFO in <ipython-input-26-aeb6ff1de9f0>: user cat ['gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
[2019-12-20 10:35:45,653] INFO in <ipython-input-26-aeb6ff1de9f0>: encode gender
[2019-12-20 10:35:46,379] INFO in <ipython-input-26-aeb6ff1de9f0>: encode freq
[2019-12-20 10:35:47,044] INFO in <ipython-input-26-aeb6ff1de9f0>: encode uf_c1
[2019-12-20 10:35:47,656] INFO in <ipython-input-26-aeb6ff1de9f0>: encode uf_c2
[2019-12-20 10:35:48,225] INFO in <ipython-input-26-aeb6ff1de9f0>: encode uf_c3
[2019-12-20 10:35:48,736] INFO in <ipython-input-26-aeb6ff1de9f0>: encode uf_c4
[2019-12-20 10:35:49,207] INFO in <ipython-input-26-aeb6ff1de9f0>: encode uf_c5


Unnamed: 0,uid,gender,freq,uf_b1,uf_b2,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score
0,M1934753188,1,1,0,1,0,1,0,1190,130,396,1438,1,764
1,M595924114,1,0,0,0,0,1,1,486,170,0,1311,1,671
2,M1473482940,0,4,0,1,0,1,0,294,190,73,758,1,454
3,M578477092,1,0,1,1,0,1,0,2282,232,237,864,1,588
4,M1088794709,1,4,0,1,0,0,0,2113,190,32,758,1,361


In [29]:
# Encode question ids as integers. The encoder is fitted on the ids of both the
# training samples and the evaluation samples so either frame can be transformed.
q_lb = LabelEncoder()
q_lb.fit(pd.concat([train_label['qid'], test['qid']]).astype(str))
for frame in (train_label, test):
    frame['qid_enc'] = q_lb.transform(frame['qid'])

# NOTE(review): this displays `train`, which this cell does not modify;
# `train_label` may have been the intended frame. Confirm.
train.head()

Unnamed: 0,qid,uid,label,day,hour
0,Q2166419046,M401693808,0,3865,22
1,Q1550017551,M3392373099,0,3844,11
2,Q604029601,M2317670257,0,3862,15
3,Q2350061229,M1618461867,0,3849,11
4,Q2443223942,M3544409350,0,3867,4


In [30]:
# Encode member ids as integers with an encoder fitted on the member table.
u_lb = LabelEncoder().fit(user['uid'])
for frame in (train_label, test):
    frame['uid_enc'] = u_lb.transform(frame['uid'])

test.head()

Unnamed: 0,qid,uid,day,hour,q_inv_mean,q_inv_sum,q_inv_std,q_inv_count,u_inv_mean,u_inv_sum,u_inv_std,u_inv_count,q_ans_count,u_ans_count,u_is_good_sum,u_is_good_max,u_is_good_mean,q_is_good_sum,q_is_good_max,q_is_good_mean,u_is_rec_sum,u_is_rec_max,u_is_rec_mean,q_is_rec_sum,q_is_rec_max,q_is_rec_mean,u_is_dest_sum,u_is_dest_max,u_is_dest_mean,q_is_dest_sum,q_is_dest_max,q_is_dest_mean,u_has_img_sum,u_has_img_max,u_has_img_mean,q_has_img_sum,q_has_img_max,q_has_img_mean,u_has_video_sum,u_has_video_max,u_has_video_mean,q_has_video_sum,q_has_video_max,q_has_video_mean,u_word_count_sum,u_word_count_max,u_word_count_mean,q_word_count_sum,q_word_count_max,q_word_count_mean,u_reci_cheer_sum,u_reci_cheer_max,u_reci_cheer_mean,q_reci_cheer_sum,q_reci_cheer_max,q_reci_cheer_mean,u_reci_uncheer_sum,u_reci_uncheer_max,u_reci_uncheer_mean,q_reci_uncheer_sum,q_reci_uncheer_max,q_reci_uncheer_mean,u_reci_comment_sum,u_reci_comment_max,u_reci_comment_mean,q_reci_comment_sum,q_reci_comment_max,q_reci_comment_mean,u_reci_mark_sum,u_reci_mark_max,u_reci_mark_mean,q_reci_mark_sum,q_reci_mark_max,q_reci_mark_mean,u_reci_tks_sum,u_reci_tks_max,u_reci_tks_mean,q_reci_tks_sum,q_reci_tks_max,q_reci_tks_mean,u_reci_xxx_sum,u_reci_xxx_max,u_reci_xxx_mean,q_reci_xxx_sum,q_reci_xxx_max,q_reci_xxx_mean,u_reci_no_help_sum,u_reci_no_help_max,u_reci_no_help_mean,q_reci_no_help_sum,q_reci_no_help_max,q_reci_no_help_mean,u_reci_dis_sum,u_reci_dis_max,u_reci_dis_mean,q_reci_dis_sum,q_reci_dis_max,q_reci_dis_mean,u_diff_qa_days_sum,u_diff_qa_days_max,u_diff_qa_days_mean,q_diff_qa_days_sum,q_diff_qa_days_max,q_diff_qa_days_mean,qid_enc,uid_enc
0,Q3273481096,M1267743167,3871,6,,,,,0.347826,16.0,0.481543,46.0,,43.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,6124.0,1091.0,142.418605,,,,40.0,24.0,0.930233,,,,4.0,3.0,0.093023,,,,30.0,11.0,0.697674,,,,11.0,10.0,0.255814,,,,11.0,4.0,0.255814,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,4.0,3.0,0.093023,,,,4490.0,2566.0,104.418605,,,,300178,133504
1,Q4224184733,M2715893043,3871,23,,,,,0.095238,2.0,0.300793,21.0,,3.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,3.0,1.0,1.0,,,,0.0,0.0,0.0,,,,1298.0,609.0,432.666667,,,,2.0,1.0,0.666667,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,2.0,2.0,0.666667,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,176.0,103.0,58.666667,,,,425704,857246
2,Q1832714071,M2244950365,3874,15,,,,,0.142857,1.0,0.377964,7.0,,2.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,172.0,147.0,86.0,,,,2.0,2.0,1.0,,,,0.0,0.0,0.0,,,,2.0,2.0,1.0,,,,0.0,0.0,0.0,,,,1.0,1.0,0.5,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,30.0,16.0,15.0,,,,110051,621709
3,Q3594972263,M2321407666,3872,10,0.62963,17.0,0.492103,27.0,,,,,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,13.0,13.0,2544.0,931.0,169.6,0.0,0.0,0.0,18.0,8.0,1.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.133333,0.0,0.0,0.0,3.0,2.0,0.2,0.0,0.0,0.0,8.0,2.0,0.533333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,21.0,21.0,64.0,11.0,4.266667,342701,659918
4,Q403456350,M1091084170,3870,9,,,,,0.2,1.0,0.447214,5.0,,2.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,74.0,47.0,37.0,,,,3.0,3.0,1.5,,,,0.0,0.0,0.0,,,,2.0,2.0,1.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,102.0,102.0,51.0,,,,400732,45358


In [31]:
# Attach the member attributes to both sample sets (left join on uid).
train_label = train_label.merge(user, on='uid', how='left')
test = test.merge(user, on='uid', how='left')
logging.info("train shape %s, test shape %s", train_label.shape, test.shape)

# Stack the training samples on top of the evaluation samples; sort=True orders
# the columns by name. The evaluation rows have no label value.
data = pd.concat([train_label, test], axis=0, sort=True)
del train_label, test

data.head()

[2019-12-20 10:39:51,650] INFO in <ipython-input-31-19dd6aad6754>: train shape (2593669, 120), test shape (1141718, 119)


Unnamed: 0,day,freq,gender,hour,label,q_ans_count,q_diff_qa_days_max,q_diff_qa_days_mean,q_diff_qa_days_sum,q_has_img_max,q_has_img_mean,q_has_img_sum,q_has_video_max,q_has_video_mean,q_has_video_sum,q_inv_count,q_inv_mean,q_inv_std,q_inv_sum,q_is_dest_max,q_is_dest_mean,q_is_dest_sum,q_is_good_max,q_is_good_mean,q_is_good_sum,q_is_rec_max,q_is_rec_mean,q_is_rec_sum,q_reci_cheer_max,q_reci_cheer_mean,q_reci_cheer_sum,q_reci_comment_max,q_reci_comment_mean,q_reci_comment_sum,q_reci_dis_max,q_reci_dis_mean,q_reci_dis_sum,q_reci_mark_max,q_reci_mark_mean,q_reci_mark_sum,q_reci_no_help_max,q_reci_no_help_mean,q_reci_no_help_sum,q_reci_tks_max,q_reci_tks_mean,q_reci_tks_sum,q_reci_uncheer_max,q_reci_uncheer_mean,q_reci_uncheer_sum,q_reci_xxx_max,q_reci_xxx_mean,q_reci_xxx_sum,q_word_count_max,q_word_count_mean,q_word_count_sum,qid,qid_enc,score,u_ans_count,u_diff_qa_days_max,u_diff_qa_days_mean,u_diff_qa_days_sum,u_has_img_max,u_has_img_mean,u_has_img_sum,u_has_video_max,u_has_video_mean,u_has_video_sum,u_inv_count,u_inv_mean,u_inv_std,u_inv_sum,u_is_dest_max,u_is_dest_mean,u_is_dest_sum,u_is_good_max,u_is_good_mean,u_is_good_sum,u_is_rec_max,u_is_rec_mean,u_is_rec_sum,u_reci_cheer_max,u_reci_cheer_mean,u_reci_cheer_sum,u_reci_comment_max,u_reci_comment_mean,u_reci_comment_sum,u_reci_dis_max,u_reci_dis_mean,u_reci_dis_sum,u_reci_mark_max,u_reci_mark_mean,u_reci_mark_sum,u_reci_no_help_max,u_reci_no_help_mean,u_reci_no_help_sum,u_reci_tks_max,u_reci_tks_mean,u_reci_tks_sum,u_reci_uncheer_max,u_reci_uncheer_mean,u_reci_uncheer_sum,u_reci_xxx_max,u_reci_xxx_mean,u_reci_xxx_sum,u_word_count_max,u_word_count_mean,u_word_count_sum,uf_b1,uf_b2,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,uid,uid_enc
0,3865,4,2,22,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Q2166419046,154133,297,,,,,,,,,,,2.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,1,0,0,0,2113,190,261,927,1,M401693808,1508098
1,3862,4,2,15,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Q604029601,458113,415,2.0,2.0,1.0,2.0,1.0,1.0,2.0,0.0,0.0,0.0,11.0,0.090909,0.301511,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,1.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,139.0,137.0,274.0,1,0,0,0,0,1519,229,0,506,1,M2317670257,657985
2,3867,1,2,4,0.0,32.0,13.0,9.53125,305.0,1.0,0.0625,2.0,0.0,0.0,0.0,152.0,0.375,0.485723,57.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.375,12.0,2.0,0.0625,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,136.0,42.125,1348.0,Q2443223942,190601,296,,,,,,,,,,,2.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0,0,0,0,551,226,188,815,1,M3544409350,1272353
3,3861,0,1,20,0.0,3.0,234.0,222.333333,667.0,1.0,0.333333,1.0,0.0,0.0,0.0,6.0,0.166667,0.408248,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.333333,4.0,4.0,1.666667,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,332.0,117.333333,352.0,Q795459266,483270,380,6.0,1391.0,376.666667,2260.0,1.0,0.5,3.0,0.0,0.0,0.0,7.0,0.285714,0.48795,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,6.0,3.0,1.5,9.0,0.0,0.0,0.0,1.0,0.333333,2.0,0.0,0.0,0.0,1.0,0.166667,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1210.0,433.666667,2602.0,1,0,0,0,0,1519,229,0,506,1,M2818659842,909154
4,3862,4,0,8,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Q110462128,13599,719,56.0,1426.0,38.5,2156.0,0.0,0.0,0.0,0.0,0.0,0.0,41.0,0.634146,0.487652,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.125,7.0,3.0,0.321429,18.0,1.0,0.017857,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.107143,6.0,1.0,0.017857,1.0,1.0,0.017857,1.0,244.0,66.446429,3721.0,1,0,0,0,0,2161,31,396,1438,1,M848334644,1856019


In [32]:
# Count (frequency) encoding over training + evaluation rows together.
# NOTE: this cell is not idempotent. Each run shifts the codes in `feat` by +1
# again, so re-create `data` before re-running it.
count_fea = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
for feat in count_fea:
    col_name = '{}_count'.format(feat)
    # First pass: occurrences of each value.
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    # Values seen fewer than twice share one "rare" code (-1); the +1 shift
    # then moves that bucket to 0.
    data.loc[data[col_name] < 2, feat] = -1
    data[feat] += 1
    # Second pass: recount after the rare values were merged.
    counts = data[feat].map(data[feat].value_counts().astype(int))
    lowest = counts.min()
    span = counts.max() - lowest
    # Min-max scale to [0, 1]. If every row has the same count the span is 0,
    # which would turn the column into NaN, so emit a constant 0.0 instead.
    data[col_name] = (counts - lowest) / span if span > 0 else 0.0

In [33]:
# Inspect the combined frame after count encoding.
data.head()

Unnamed: 0,day,freq,gender,hour,label,q_ans_count,q_diff_qa_days_max,q_diff_qa_days_mean,q_diff_qa_days_sum,q_has_img_max,q_has_img_mean,q_has_img_sum,q_has_video_max,q_has_video_mean,q_has_video_sum,q_inv_count,q_inv_mean,q_inv_std,q_inv_sum,q_is_dest_max,q_is_dest_mean,q_is_dest_sum,q_is_good_max,q_is_good_mean,q_is_good_sum,q_is_rec_max,q_is_rec_mean,q_is_rec_sum,q_reci_cheer_max,q_reci_cheer_mean,q_reci_cheer_sum,q_reci_comment_max,q_reci_comment_mean,q_reci_comment_sum,q_reci_dis_max,q_reci_dis_mean,q_reci_dis_sum,q_reci_mark_max,q_reci_mark_mean,q_reci_mark_sum,q_reci_no_help_max,q_reci_no_help_mean,q_reci_no_help_sum,q_reci_tks_max,q_reci_tks_mean,q_reci_tks_sum,q_reci_uncheer_max,q_reci_uncheer_mean,q_reci_uncheer_sum,q_reci_xxx_max,q_reci_xxx_mean,q_reci_xxx_sum,q_word_count_max,q_word_count_mean,q_word_count_sum,qid,qid_enc,score,u_ans_count,u_diff_qa_days_max,u_diff_qa_days_mean,u_diff_qa_days_sum,u_has_img_max,u_has_img_mean,u_has_img_sum,u_has_video_max,u_has_video_mean,u_has_video_sum,u_inv_count,u_inv_mean,u_inv_std,u_inv_sum,u_is_dest_max,u_is_dest_mean,u_is_dest_sum,u_is_good_max,u_is_good_mean,u_is_good_sum,u_is_rec_max,u_is_rec_mean,u_is_rec_sum,u_reci_cheer_max,u_reci_cheer_mean,u_reci_cheer_sum,u_reci_comment_max,u_reci_comment_mean,u_reci_comment_sum,u_reci_dis_max,u_reci_dis_mean,u_reci_dis_sum,u_reci_mark_max,u_reci_mark_mean,u_reci_mark_sum,u_reci_no_help_max,u_reci_no_help_mean,u_reci_no_help_sum,u_reci_tks_max,u_reci_tks_mean,u_reci_tks_sum,u_reci_uncheer_max,u_reci_uncheer_mean,u_reci_uncheer_sum,u_reci_xxx_max,u_reci_xxx_mean,u_reci_xxx_sum,u_word_count_max,u_word_count_mean,u_word_count_sum,uf_b1,uf_b2,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,uid,uid_enc,uid_enc_count,qid_enc_count,gender_count,freq_count,uf_c1_count,uf_c2_count,uf_c3_count,uf_c4_count,uf_c5_count
0,3865,5,3,22,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Q2166419046,154134,297,,,,,,,,,,,2.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,1,0,0,0,2114,191,262,928,2,M401693808,1508099,0.0,0.000369,1.0,0.960571,0.022114,0.37193,0.218308,0.062702,1.0
1,3862,5,3,15,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Q604029601,458114,415,2.0,2.0,1.0,2.0,1.0,1.0,2.0,0.0,0.0,0.0,11.0,0.090909,0.301511,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,1.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,139.0,137.0,274.0,1,0,0,0,0,1520,230,1,507,2,M2317670257,657986,4e-06,3.6e-05,1.0,0.960571,0.024451,0.111552,0.444739,0.002961,1.0
2,3867,2,3,4,0.0,32.0,13.0,9.53125,305.0,1.0,0.0625,2.0,0.0,0.0,0.0,152.0,0.375,0.485723,57.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.375,12.0,2.0,0.0625,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,136.0,42.125,1348.0,Q2443223942,190602,296,,,,,,,,,,,2.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0,0,0,0,552,227,189,816,2,M3544409350,1272354,2.2e-05,0.001441,1.0,0.290809,0.006221,0.034154,0.402727,0.044313,1.0
3,3861,1,2,20,0.0,3.0,234.0,222.333333,667.0,1.0,0.333333,1.0,0.0,0.0,0.0,6.0,0.166667,0.408248,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.333333,4.0,4.0,1.666667,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,332.0,117.333333,352.0,Q795459266,483271,380,6.0,1391.0,376.666667,2260.0,1.0,0.5,3.0,0.0,0.0,0.0,7.0,0.285714,0.48795,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,6.0,3.0,1.5,9.0,0.0,0.0,0.0,1.0,0.333333,2.0,0.0,0.0,0.0,1.0,0.166667,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1210.0,433.666667,2602.0,1,0,0,0,0,1520,230,1,507,2,M2818659842,909155,1.1e-05,1.8e-05,0.0,1.0,0.024451,0.111552,0.444739,0.002961,1.0
4,3862,5,1,8,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Q110462128,13600,719,56.0,1426.0,38.5,2156.0,0.0,0.0,0.0,0.0,0.0,0.0,41.0,0.634146,0.487652,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.125,7.0,3.0,0.321429,18.0,1.0,0.017857,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.107143,6.0,1.0,0.017857,1.0,1.0,0.017857,1.0,244.0,66.446429,3721.0,1,0,0,0,0,2162,32,397,1439,2,M848334644,1856020,4.3e-05,2.7e-05,0.008695,0.960571,0.005739,0.007515,0.438648,0.125989,1.0


In [34]:
# Shrink memory use: 64-bit integer / float columns become 32-bit ones.
t = data.dtypes
narrower = {'int64': 'int32', 'float64': 'float32'}
for wide, small in narrower.items():
    for x in t[t == wide].index:
        data[x] = data[x].astype(small)

# Day number modulo 7, a day-of-week style feature.
data['wk'] = data['day'].mod(7)


In [36]:
# Model inputs: every column except the target, the raw string ids and the raw
# time fields ('dt' is listed for safety; it was removed earlier).
excluded_cols = ['label', 'uid', 'qid', 'dt', 'day']
feature_cols = data.columns[~data.columns.isin(excluded_cols)].tolist()
logging.info("feature size %s", len(feature_cols))


[2019-12-20 11:33:38,986] INFO in <ipython-input-36-baeb035653b9>: feature size 126


In [None]:
# Re-derive the training-sample slice of `train`. The enriched `train_label`
# was deleted after it was concatenated into `data`; the slice is recreated
# here with the same filter so its length can be used to split `data`.
train_label = train[(train['day'] > train_label_feature_end)]

In [None]:
# `data` is the training samples stacked on top of the evaluation samples, so
# the first n_train rows are the labelled ones.
n_train = len(train_label)

# Guard the positional split: if any earlier merge changed the row count, the
# boundary would silently land in the wrong place. Training rows must all carry
# a label and evaluation rows must not.
assert data['label'].iloc[:n_train].notna().all(), "unlabelled rows inside the training slice"
assert data['label'].iloc[n_train:].isna().all(), "labelled rows inside the evaluation slice"

X_train_all = data.iloc[:n_train][feature_cols]
y_train_all = data.iloc[:n_train]['label']
test = data.iloc[n_train:]

logging.info("train shape %s, test shape %s", train_label.shape, test.shape)

In [None]:
import pickle

In [None]:
# Persist the assembled feature table (presumably so later sessions can skip
# feature extraction).
# NOTE(review): only unpickle this file from a trusted location; loading a
# pickle can execute arbitrary code.
os.makedirs('pkl', exist_ok=True)  # open() does not create missing directories
with open('pkl/data.pkl', 'wb') as file:
    pickle.dump(data, file)

In [None]:
# Log the shape of the training feature matrix and of the evaluation slice.
logging.info("train shape %s, test shape %s", X_train_all.shape, test.shape)

In [None]:
# Row / column count of the (un-enriched) training-sample slice.
train_label.shape

In [None]:
# 5-fold cross-validation with stratified sampling (StratifiedKFold).
# Only the first fold is taken, giving a single train / validation hold-out.
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

index, (train_idx, val_idx) = next(enumerate(fold.split(X=X_train_all, y=y_train_all)))

X_train = X_train_all.iloc[train_idx][feature_cols]
X_val = X_train_all.iloc[val_idx][feature_cols]
y_train = y_train_all.iloc[train_idx]
y_val = y_train_all.iloc[val_idx]

In [None]:

# Gradient-boosted tree classifier; hyper-parameters are gathered in one mapping.
# NOTE(review): `silent` and the fit-time `early_stopping_rounds` argument are
# believed to be rejected by newer LightGBM releases. Confirm against the
# installed version before upgrading.
lgb_params = dict(
    boosting_type='gbdt',
    num_leaves=64,
    learning_rate=0.01,
    n_estimators=2000,
    max_bin=425,
    subsample_for_bin=50000,
    objective='binary',
    min_split_gain=0,
    min_child_weight=5,
    min_child_samples=10,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=1,
    reg_alpha=3,
    reg_lambda=5,
    seed=1000,
    n_jobs=-1,
    silent=True,
)
model_lgb = LGBMClassifier(**lgb_params)

# Track log-loss and AUC on the hold-out fold; stop after 10 rounds without improvement.
model_lgb.fit(
    X_train,
    y_train,
    eval_metric=['logloss', 'auc'],
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=10,
)

In [None]:
# Re-read the raw evaluation file: the submission keeps its original three columns.
sub = pd.read_csv('data/invite_info_evaluate_2_0926.txt', header=None, sep='\t')
sub.columns = ['qid', 'uid', 'dt']
logging.info("test %s", sub.shape)


In [38]:
# Store column 1 of predict_proba (the second class) as the predicted score.
# NOTE(review): assumes `test` is still in the same row order as the raw
# evaluation file loaded into `sub`. Confirm no step above reorders rows.
sub['label'] = model_lgb.predict_proba(test[feature_cols])[:, 1]

In [39]:
# Inspect the first predictions.
sub.head()

Unnamed: 0,qid,uid,dt,label
0,Q3273481096,M1267743167,D3871-H6,0.282009
1,Q4224184733,M2715893043,D3871-H23,0.071644
2,Q1832714071,M2244950365,D3874-H15,0.029766
3,Q3594972263,M2321407666,D3872-H10,0.386641
4,Q403456350,M1091084170,D3870-H9,0.015012


In [40]:
# Write the submission as tab-separated rows with no header and no index column.
os.makedirs('result', exist_ok=True)  # to_csv does not create missing directories
sub.to_csv('result/result.txt', sep='\t', index=False, header=False)