In [1]:
import gc
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import joblib
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Data source: https://www.kaggle.com/its7171/riiid-cross-validation-files (download it from Kaggle).
columns = [
    'row_id', 'user_id', 'timestamp', 'content_id', "content_type_id", 'task_container_id',
    'answered_correctly', 'prior_question_elapsed_time', 'prior_question_had_explanation',
]
# Load the pre-split cross-validation pickles and keep only the columns listed above.
train_df = pd.read_pickle("D:/kaggle/input/riiid-test-answer-prediction/cv_data/cv1_train.pickle").loc[:, columns]
valid_df = pd.read_pickle("D:/kaggle/input/riiid-test-answer-prediction/cv_data/cv1_valid.pickle").loc[:, columns]
print("Train size:", train_df.shape,"Valid size:", valid_df.shape)

Train size: (98730332, 9) Valid size: (2500000, 9)


In [3]:
# 这个notebook直接运行会需要大约30多g的内存
# 如果内存不够建议运行以下代码，通过修改train_data_size来减少数据量，来适应机器
# train_data_size = 50_000_000
# train_df = train_df.iloc[:train_data_size,:]

# lectures.csv
merge lectures.csv文件和question.csv 生成数据

In [4]:
%%time
# Lookup tables for the lecture-related (time-ordered) features.
# The individual features are described where add_lectures_feats assembles its output frame.

lectures_df = pd.read_csv('D:/kaggle/input/riiid-test-answer-prediction/lectures.csv')
lectures_df = pd.get_dummies(
    # Remove the space from this one label so its dummy column is named type_of_solving_question.
    lectures_df.assign(type_of=lectures_df['type_of'].replace('solving question', 'solving_question')),
    columns=['part', 'type_of'],
).assign(content_type_id=1)

q_taglist_df = pd.read_csv("D:/kaggle/input/riiid-test-answer-prediction/questions.csv")[['tags']].astype(str)
q_taglist_df = (
    q_taglist_df
    .assign(tags_l=[x.split() for x in q_taglist_df.tags.values], content_type_id=0)
    .drop(columns="tags")
    .drop(index=10033)  # question 10033 has NaN tags
)

def add_lectures_feats(df, curr_dict):
    """Append running per-user lecture features to ``df``.

    Rows are walked in ascending ``timestamp`` order. For every user a small
    state dict (kept in ``curr_dict`` and mutated in place, so it can be
    carried from the train call into the valid call) records which lectures
    have been seen so far. A lecture row updates the state *before* the row's
    features are written, so a lecture row already counts itself.

    Relies on the module-level ``lectures_df`` and ``q_taglist_df`` tables.

    Added columns:
      curr_lecture_bool             1 once the user has watched any lecture
      part_{1..7}_cnt               lectures watched so far, per lecture part
      type_of_{concept,intention,solving_question,starter}_cnt
                                    lectures watched so far, per lecture type
      watched_tags_rate             share of the current question's tags that
                                    match the tag of a lecture already watched
      watched_tags_bool             1 when watched_tags_rate > 0

    Returns ``df`` left-merged (on ``row_id``) with those columns.
    """
    # Counter names in the same order as tuple positions 4..14 of the rows iterated below.
    counter_names = ["part_1_cnt", "part_2_cnt", "part_3_cnt", "part_4_cnt",
                     "part_5_cnt", "part_6_cnt", "part_7_cnt",
                     "type_of_concept_cnt", "type_of_intention_cnt",
                     "type_of_solving_question_cnt", "type_of_starter_cnt"]
    n_rows = len(df)

    new_df = df[["row_id", "user_id", "timestamp", "content_id", "content_type_id"]]
    new_df = new_df.merge(lectures_df, how="left", left_on = ["content_id","content_type_id"], right_on = ["lecture_id","content_type_id"])
    new_df = new_df.merge(q_taglist_df, how="left", left_on = ["content_id","content_type_id"], right_on = [q_taglist_df.index,"content_type_id"])
    new_df = new_df.sort_values(["timestamp"])
    new_df = new_df[['timestamp', 'user_id', 'content_type_id','tag','part_1','part_2','part_3','part_4','part_5','part_6','part_7',
                     'type_of_concept','type_of_intention','type_of_solving_question','type_of_starter','tags_l','row_id']]

    lecture_seen = np.zeros(n_rows, dtype="int8")
    counters = {name: np.zeros(n_rows, dtype="uint16") for name in counter_names}
    tag_rate = np.zeros(n_rows, dtype="float32")

    # Tuple layout: 0 timestamp, 1 user_id, 2 content_type_id, 3 tag, 4-10 part_1..part_7,
    # 11 type_of_concept, 12 type_of_intention, 13 type_of_solving_question, 14 type_of_starter,
    # 15 tags_l, 16 row_id
    for pos, rec in enumerate(tqdm(new_df.itertuples(index=False), total=new_df.shape[0])):
        user_id = rec[1]
        state = curr_dict.get(user_id)
        if state is None:
            # First row of this user: start from an all-zero history.
            state = {"lecture_bool": 0}
            for name in counter_names:
                state[name] = 0
            state["has_tags"] = set()
            curr_dict[user_id] = state

        if rec[2] == 1:
            # Lecture row: fold its one-hot part/type flags and its tag into the history.
            state["lecture_bool"] = 1
            for offset, name in enumerate(counter_names, start=4):
                state[name] += int(rec[offset])
            state["has_tags"].add(int(rec[3]))

        lecture_seen[pos] = state["lecture_bool"]
        for name in counter_names:
            counters[name][pos] = state[name]

        question_tags = rec[15]
        if type(question_tags) == list:
            watched = state["has_tags"]
            hits = sum(1 for tag in question_tags if int(tag) in watched)
            tag_rate[pos] = hits/len(question_tags)

    tag_seen = (tag_rate > 0).astype("int8")

    feature_columns = {"curr_lecture_bool": lecture_seen}
    feature_columns.update(counters)
    feature_columns["watched_tags_rate"] = tag_rate
    feature_columns["watched_tags_bool"] = tag_seen
    # The arrays are in sorted-row order, so they are keyed by the sorted row_id values.
    lectures_feats_df = pd.DataFrame(feature_columns).set_index(new_df["row_id"])

    df = df.merge(lectures_feats_df,how="left",left_on="row_id",right_index=True)
    return df

# One shared per-user lecture history: filled while processing train, then carried into valid.
curr_lectures_dict = {}
train_df = add_lectures_feats(train_df, curr_lectures_dict)
valid_df = add_lectures_feats(valid_df, curr_lectures_dict)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=98730332.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2500000.0), HTML(value='')))


Wall time: 13min 33s


# 剔除lectures行

## train.csv

In [5]:
# Keep question rows only (content_type_id == 0); lecture rows are no longer needed.
train_df = train_df.loc[train_df['content_type_id'] == 0].reset_index(drop=True)
valid_df = valid_df.loc[valid_df['content_type_id'] == 0].reset_index(drop=True)
print("Train_ques:", train_df.shape,"Valid_ques:", valid_df.shape)

Train_ques: (96817414, 23) Valid_ques: (2453886, 23)


In [6]:
# Per-content mean of prior_question_elapsed_time and of prior_question_had_explanation.
# Both aggregates are computed on train_df only, then left-joined onto train and valid by content_id.
content_elapsed_time_agg=train_df.groupby('content_id')['prior_question_elapsed_time'].agg(['mean']).astype('float32')
content_had_explanation_agg=train_df.groupby('content_id')['prior_question_had_explanation'].agg(['mean']).astype('float32')
content_elapsed_time_agg.columns = ["content_elapsed_time"]
content_had_explanation_agg.columns = ["content_had_explanation"]

train_df = train_df.merge(content_elapsed_time_agg, how="left", left_on="content_id", right_index=True)
valid_df = valid_df.merge(content_elapsed_time_agg, how="left", left_on="content_id", right_index=True)
train_df = train_df.merge(content_had_explanation_agg, how="left", left_on="content_id", right_index=True)
valid_df = valid_df.merge(content_had_explanation_agg, how="left", left_on="content_id", right_index=True)

In [7]:
%%time
# For every task_container_id: rows in train divided by the number of distinct users who
# touched it, i.e. how many times on average each of those users met this container.
task_user_df = train_df[['task_container_id', 'user_id']].groupby(['task_container_id']).agg(['count','nunique'])
task_user_df.columns = ['cnt',"unq"]
task_user_df["avg_task_seen"] = task_user_df["cnt"]/task_user_df["unq"]
# Running sum of that ratio in task_container_id (index) order; only this column is kept as a feature.
task_user_df['avg_task_seen_cumsum'] = task_user_df.avg_task_seen.cumsum()
task_user_df = task_user_df[["avg_task_seen_cumsum"]].astype(np.float32)

train_df = train_df.merge(task_user_df, how='left', left_on='task_container_id',right_index=True)
valid_df = valid_df.merge(task_user_df, how='left', left_on='task_container_id',right_index=True)

Wall time: 2min 12s


In [8]:
%%time
# Per-question accuracy and answer count, computed on train only.
content_answers_df = (
    train_df[['content_id','answered_correctly']]
    .groupby('content_id')
    .agg(["mean","count"])
)
content_answers_df.columns = ['content_mean_acc','content_cnt']
content_answers_df = content_answers_df.astype({'content_mean_acc': np.float32, 'content_cnt': "uint32"})

train_df = train_df.merge(content_answers_df, how='left', left_on='content_id', right_index=True)
valid_df = valid_df.merge(content_answers_df, how='left', left_on='content_id', right_index=True)

Wall time: 19.4 s


In [9]:
%%time
# Mean elapsed time per question, presumably split into correctly answered
# (corr_question_elapsed_time_mean) and incorrectly answered (incorr_question_elapsed_time_mean)
# attempts -- the pickle's contents are not visible here, verify against the file.
# The pickle file is shared on Baidu Netdisk.
# NOTE(review): pickle.load executes arbitrary code; only load this file from a trusted source.
with open("D:/kaggle/input/riiid-test-answer-prediction/question_elapsed_time_mean.pkl.data","rb") as f:
    question_elapsed_time_df = pickle.load(f)
train_df = train_df.merge(question_elapsed_time_df,on = "content_id", how = "left")
valid_df = valid_df.merge(question_elapsed_time_df,on = "content_id", how = "left")

# Questions missing from the lookup get -1 as a sentinel; both columns are stored as float32.
for _frame in (train_df, valid_df):
    for _col in ("corr_question_elapsed_time_mean", "incorr_question_elapsed_time_mean"):
        _frame[_col] = _frame[_col].fillna(-1).astype("float32")

Wall time: 28 s


In [10]:
%%time
# Per-question accuracy split by whether the user had seen the explanation of the previous
# question: mean(answered_correctly) for prior_question_had_explanation == False / True.
# Computed on train only and joined onto both frames.
# NOTE(review): only train_df's prior_question_had_explanation is filled here; valid_df keeps
# its missing values in that column -- confirm that this is handled elsewhere.
# Assign the filled column back instead of calling fillna(inplace=True) on df[col]: the chained
# in-place form does nothing under pandas Copy-on-Write.
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].fillna(False)
content_explation_agg=train_df[["content_id","prior_question_had_explanation",'answered_correctly']].groupby(
    ["content_id","prior_question_had_explanation"])['answered_correctly'].agg(['mean'])

# One row per content_id, one column per flag value (False first, then True).
content_explation_agg = content_explation_agg.unstack()
content_explation_agg = content_explation_agg.reset_index()
content_explation_agg.columns = ['content_id', 'content_explation_false_mean','content_explation_true_mean']
content_explation_agg['content_id'] = content_explation_agg['content_id'].astype('int16')
content_explation_agg['content_explation_false_mean'] = content_explation_agg['content_explation_false_mean'].astype('float32')
content_explation_agg['content_explation_true_mean'] = content_explation_agg['content_explation_true_mean'].astype('float32')

train_df = train_df.merge(content_explation_agg,how="left",left_on="content_id",right_on="content_id")
valid_df = valid_df.merge(content_explation_agg,how="left",left_on="content_id",right_on="content_id")
# A (question, flag) combination never seen in train gets 0.
train_df["content_explation_false_mean"] = train_df["content_explation_false_mean"].fillna(0)
train_df["content_explation_true_mean"] = train_df["content_explation_true_mean"].fillna(0)
valid_df["content_explation_false_mean"] = valid_df["content_explation_false_mean"].fillna(0)
valid_df["content_explation_true_mean"] = valid_df["content_explation_true_mean"].fillna(0)

Wall time: 48.9 s


In [11]:
# Time elapsed between the current question and the same user's previous question (lagtime),
# the one before that (lagtime2) and the one before that (lagtime3).
# The per-user "latest timestamps" collected along the way seed the validation loop that follows.
# Missing values are filled by assigning the result back rather than with
# df[col].fillna(..., inplace=True), which does nothing under pandas Copy-on-Write.
# section1
max_timestamp_u = train_df[['user_id','timestamp']].groupby(['user_id']).agg(['max']).reset_index()
max_timestamp_u.columns = ['user_id', 'max_time_stamp']
max_timestamp_u.user_id=max_timestamp_u.user_id.astype('int32')

# 'lagtime' temporarily holds the previous timestamp; its per-user max is the second-latest timestamp.
train_df['lagtime'] = train_df.groupby('user_id')['timestamp'].shift()
max_timestamp_u2 = train_df[['user_id','lagtime']].groupby(['user_id']).agg(['max']).reset_index()
max_timestamp_u2.columns = ['user_id', 'max_time_stamp2']
max_timestamp_u2.user_id=max_timestamp_u2.user_id.astype('int32')

train_df['lagtime']=train_df['timestamp']-train_df['lagtime']
lagtime_mean=train_df['lagtime'].mean()
# A user's first row has no predecessor: use the global mean gap.
train_df['lagtime'] = train_df['lagtime'].fillna(lagtime_mean)


# section2
train_df['lagtime2'] = train_df.groupby('user_id')['timestamp'].shift(2)
max_timestamp_u3 = train_df[['user_id','lagtime2']].groupby(['user_id']).agg(['max']).reset_index()
max_timestamp_u3.columns = ['user_id', 'max_time_stamp3']
max_timestamp_u3.user_id=max_timestamp_u3.user_id.astype('int32')
train_df['lagtime2']=train_df['timestamp']-train_df['lagtime2']
lagtime_mean2=train_df['lagtime2'].mean()
# Unlike lagtime, rows without a second predecessor get 0 (lagtime_mean2 is not used in this cell).
train_df['lagtime2'] = train_df['lagtime2'].fillna(0)

# section3
train_df['lagtime3'] = train_df.groupby('user_id')['timestamp'].shift(3)

train_df['lagtime3']=train_df['timestamp']-train_df['lagtime3']
lagtime_mean3=train_df['lagtime3'].mean()
train_df['lagtime3'] = train_df['lagtime3'].fillna(0)

# gen_dict
# Each dict has the shape {column_name: {user_id: timestamp}}.
max_timestamp_u_dict=max_timestamp_u.set_index('user_id').to_dict()
max_timestamp_u_dict2=max_timestamp_u2.set_index('user_id').to_dict()
max_timestamp_u_dict3=max_timestamp_u3.set_index('user_id').to_dict()
del max_timestamp_u
del max_timestamp_u2
del max_timestamp_u3

In [12]:
# Same lag features for the validation rows, continuing from the per-user latest / second-latest /
# third-latest timestamps left behind by train (the three max_timestamp_u_dict* dicts are updated
# row by row as validation rows are consumed).
# The buffers are float64: float32 has a 24-bit significand and cannot represent large
# millisecond gaps exactly, which would quantise the validation values while the train values
# stay exact. NaN (no earlier timestamp recorded in train) is still representable and is
# replaced by 0 below.
# NOTE(review): 0 is used as the "no earlier timestamp" marker for users first seen here, but 0
# may also be a genuine timestamp coming from train -- confirm this collision is acceptable.
lagtime2 = np.zeros(len(valid_df), dtype=np.float64)
lagtime3 = np.zeros(len(valid_df), dtype=np.float64)
for i, (user_id,
        content_type_id,
        timestamp,
        content_id,) in enumerate(zip(
    valid_df['user_id'].values,
    valid_df['content_type_id'].values,
    valid_df['timestamp'].values,
    valid_df['content_id'].values)):
    if content_type_id==0:
        if user_id in max_timestamp_u_dict['max_time_stamp'].keys():
            if(max_timestamp_u_dict2['max_time_stamp2'][user_id]==0):
                lagtime2[i]=0
                lagtime3[i]=0
            else:
                lagtime2[i]=timestamp-max_timestamp_u_dict2['max_time_stamp2'][user_id]
                if(max_timestamp_u_dict3['max_time_stamp3'][user_id]==0):
                    lagtime3[i]=0
                else:
                    lagtime3[i]=timestamp-max_timestamp_u_dict3['max_time_stamp3'][user_id]
                max_timestamp_u_dict3['max_time_stamp3'][user_id]=max_timestamp_u_dict2['max_time_stamp2'][user_id]
            # Shift the history window: latest -> second latest, current row -> latest.
            max_timestamp_u_dict2['max_time_stamp2'][user_id]=max_timestamp_u_dict['max_time_stamp'][user_id]
            max_timestamp_u_dict['max_time_stamp'][user_id]=timestamp
        else:
            # User not seen before: record this timestamp, mark the older two slots as empty (0).
            max_timestamp_u_dict['max_time_stamp'].update({user_id:timestamp})
            lagtime2[i]=0
            max_timestamp_u_dict2['max_time_stamp2'].update({user_id:0})
            lagtime3[i]=0
            max_timestamp_u_dict3['max_time_stamp3'].update({user_id:0})

valid_df["lagtime2"]=lagtime2
valid_df["lagtime3"]=lagtime3
# Assign back instead of fillna(inplace=True) on df[col], which does nothing under Copy-on-Write.
valid_df["lagtime2"] = valid_df["lagtime2"].fillna(0)
valid_df["lagtime3"] = valid_df["lagtime3"].fillna(0)
train_df["lagtime2"] = train_df["lagtime2"].astype("uint64")
train_df["lagtime3"] = train_df["lagtime3"].astype("uint64")
valid_df["lagtime2"] = valid_df["lagtime2"].astype("uint64")
valid_df["lagtime3"] = valid_df["lagtime3"].astype("uint64")

## questions.csv
merge question.csv文件 生成数据

In [13]:
%%time
# Combine part and bundle_id into one id: part * 100000 + bundle_id.
ques_df3 = pd.read_csv("D:/kaggle/input/riiid-test-answer-prediction/questions.csv")
ques_df3 = (
    (ques_df3['part']*100000+ques_df3['bundle_id'])
    .astype('int32')
    .to_frame('part_bundle_id')
)

# Joined through the row index of questions.csv, which is matched against content_id.
train_df = train_df.merge(ques_df3, how="left", left_on="content_id", right_index=True)
valid_df = valid_df.merge(ques_df3, how="left", left_on="content_id", right_index=True)

Wall time: 14 s


In [14]:
%%time
# Feature: the pooled accuracy of the tags attached to each question.
# How it is built:
#   1. every question has an overall (right, total) answer count in train;
#   2. every tag belongs to several questions, so summing those counts gives a (right, total)
#      pair per tag;
#   3. every question carries several tags, so pooling the pairs of its tags gives the
#      question's tag accuracy (done by get_tags_acc).
qdf = pd.read_csv("D:/kaggle/input/riiid-test-answer-prediction/questions.csv")[['tags']].astype(str)
qdf['tags_l'] = [x.split() for x in qdf.tags.values]
tags_set_list = [str(i) for i in list(range(188))]

ques_correct_df = train_df.groupby(["content_id"]).agg({"answered_correctly": ["count","sum"]})
ques_correct_df.columns = ["total", "right"]
ques_correct_df[['total', 'right']] = ques_correct_df[['total', 'right']].astype(int)
qdf = qdf.merge(ques_correct_df, left_index=True, right_index=True, how = "left")
# Questions that never occur in train come out of the left merge as NaN. Count them as
# 0 answers; otherwise a single NaN poisons the totals of every tag the question carries.
qdf[['total', 'right']] = qdf[['total', 'right']].fillna(0).astype("int64")

# tags_dict[tag] == [right, total], accumulated in a single pass over the questions.
tags_dict = {tag: [0, 0] for tag in tags_set_list}
for tags, right, total in zip(qdf["tags_l"].to_numpy(), qdf["right"].to_numpy(), qdf["total"].to_numpy()):
    # set(): a question contributes once per tag even if the tag were listed twice.
    for tag in set(tags):
        stats = tags_dict.get(tag)
        if stats is not None:  # ignores the "nan" placeholder and anything outside tags_set_list
            stats[0] += right
            stats[1] += total
            
def get_tags_acc(x, tags_stats=None, default_acc=0.65):
    """Return the pooled accuracy of a question's tags.

    Parameters
    ----------
    x : list of str
        Tag ids of one question; ``["nan"]`` marks a question without tags.
    tags_stats : dict, optional
        Maps tag -> ``[right, total]``. Defaults to the module-level ``tags_dict``.
    default_acc : float, optional
        Value returned when no accuracy can be computed.

    Returns
    -------
    float
        ``sum(right) / sum(total)`` over the tags in ``x``, or ``default_acc`` when the
        question has no tags or the pooled total is zero or NaN (nothing to divide by).

    Raises
    ------
    KeyError
        If a tag in ``x`` is missing from ``tags_stats``.
    """
    if ["nan"] == x:
        return default_acc
    if tags_stats is None:
        tags_stats = tags_dict
    right = 0; total = 0
    for tag in x:
        right += tags_stats[tag][0]
        total += tags_stats[tag][1]
    # "not total > 0" is also true for NaN, so both zero and undefined totals fall back.
    if not total > 0:
        return default_acc
    return right/total

# Collapse qdf to a single float32 Series named tags_acc (one value per question row).
qdf = qdf["tags_l"].apply(get_tags_acc).rename("tags_acc").astype(np.float32)

# Joined through the row index of questions.csv, which is matched against content_id.
train_df = train_df.merge(qdf,how="left",left_on="content_id",right_index=True)
valid_df = valid_df.merge(qdf,how="left",left_on="content_id",right_index=True)

Wall time: 35.7 s


In [15]:
%%time
# Mean accuracy of every (bundle_id, part) pair, computed on train and attached per question.
# This cell also adds the question's `part` column to both frames.
ques_columns = ["question_id", "part", "bundle_id"]
ques_df = pd.read_csv("D:/kaggle/input/riiid-test-answer-prediction/questions.csv")[ques_columns]
ques_df["part"] = ques_df["part"].astype("int8")

# Only content_id and the label are needed for the aggregate. Merging the full train frame
# would duplicate every feature column for all rows, and would produce part_x/part_y suffixes
# (and a KeyError below) once train_df already carries a `part` column.
part_bundle_df = train_df[['content_id', 'answered_correctly']].merge(ques_df, how="left",left_on='content_id', right_on='question_id')
part_bundle_df = part_bundle_df[['bundle_id','part','answered_correctly']].groupby(['bundle_id', 'part'],as_index=False).agg({"answered_correctly":["mean"]}).reset_index(drop=True)
part_bundle_df.columns = ["bundle_id", "part", 'part_bundle_acc']

ques_df = ques_df.merge(part_bundle_df,how="left", on=["part","bundle_id"])
ques_df = ques_df[["question_id","part","part_bundle_acc"]].set_index("question_id")
ques_df["part_bundle_acc"] = ques_df["part_bundle_acc"].astype(np.float32)

train_df = train_df.merge(ques_df, how='left', left_on='content_id',right_index=True)
valid_df = valid_df.merge(ques_df, how='left', left_on='content_id',right_index=True)

Wall time: 1min 4s


In [16]:
%%time
# Running accuracy of every (user_id, part) pair.
# Note: this is the first per-user accuracy feature. It is not computed as one global mean per
# user; each row only sees the answers that precede it (hence the shift below).

# Train data
# Final per-(user, part) totals over train; they seed the validation loop further down.
curr_up_dict_df = train_df.groupby(['user_id', 'part'])["answered_correctly"].agg(['count', 'sum']).astype('uint16')
# 'lag' is the previous answer within the same (user, part) group (NaN on the group's first row).
train_df['lag'] = train_df.groupby(['user_id', 'part'])["answered_correctly"].shift()
cum = train_df.groupby(['user_id', 'part'])['lag'].agg(['cumsum', 'cumcount'])
train_df["curr_user_part_count"] = cum['cumcount'].astype('uint16')
train_df["curr_user_part_sum"] = cum['cumsum']
# Rows without a defined ratio fall back to 0.68.
train_df['curr_user_part_acc'] = (train_df["curr_user_part_sum"] / train_df["curr_user_part_count"]).fillna(0.68).astype("float32")
train_df["curr_user_part_sum"] = train_df["curr_user_part_sum"].fillna(0).astype('uint16')
train_df.drop(columns=['lag'], inplace=True)

# Valid data and test data
# part_user_d maps (user_id, part) -> {'count': ..., 'sum': ...} and is updated row by row.
part_user_d = curr_up_dict_df.to_dict("index")
np_up_cnt = np.zeros((len(valid_df),2), dtype=np.uint16)
for idx, (user_id, answered_correctly, part) in enumerate(zip(valid_df['user_id'].values, valid_df['answered_correctly'].values, valid_df['part'].values)):
    if (user_id,part) in part_user_d:
        # Record the totals as they were before this row, then fold the row in.
        np_up_cnt[idx] = [part_user_d[(user_id,part)]["count"], part_user_d[(user_id,part)]["sum"]]
        part_user_d[(user_id,part)]["count"] += 1
        part_user_d[(user_id,part)]["sum"] += answered_correctly
    else:
        # First row of this pair: its features stay [0, 0]; start the totals from this answer.
        part_user_d[(user_id,part)] = {'count': 1, 'sum': 1} if answered_correctly == 1 else {'count': 1, 'sum': 0}

curr_user_part_df = pd.DataFrame(np_up_cnt,columns=["curr_user_part_count", "curr_user_part_sum"])
curr_user_part_df["curr_user_part_acc"] = (curr_user_part_df["curr_user_part_sum"] / curr_user_part_df["curr_user_part_count"]).fillna(0.68).astype(np.float32)
# Index-on-index join: relies on valid_df having the same 0..n-1 index as the freshly built frame.
valid_df = valid_df.merge(curr_user_part_df,how="left", left_index=True, right_index=True)

del cum
del curr_user_part_df
# Rebuild the totals table from the dict, now including the validation rows.
curr_up_dict_df = pd.DataFrame.from_dict(part_user_d,"index")

Wall time: 1min 15s


## lectures和question混合

In [None]:
%%time
# Feature: how many lectures of the question's own part the user had watched before answering it.
def same_part(df):
    """Add ``same_part_cnt``: for each row, the value of ``part_<part>_cnt``.

    ``df`` must hold a ``part`` column with values 1..7 and the seven columns
    ``part_1_cnt`` .. ``part_7_cnt``. The new column is written into ``df`` in place
    and ``df`` is returned.

    The lookup is done with one boolean mask per part instead of evaluating a string
    expression for every row, and the result is stored as uint16 (the dtype the
    ``part_N_cnt`` features are created with) so counts above 255 are not wrapped.

    Raises
    ------
    ValueError
        If any ``part`` value is outside 1..7.
    """
    part_values = df["part"].to_numpy()
    same_part_l = np.zeros(len(df), dtype="uint16")
    matched = 0
    for part_no in range(1, 8):
        in_part = part_values == part_no
        same_part_l[in_part] = df[f"part_{part_no}_cnt"].to_numpy()[in_part]
        matched += int(in_part.sum())
    if matched != len(df):
        raise ValueError("same_part: 'part' must be an integer between 1 and 7 on every row")
    df["same_part_cnt"] = same_part_l
    return df

# same_part writes the new column into the frame it receives and returns that frame.
train_df = same_part(train_df)
valid_df = same_part(valid_df)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=96817414.0), HTML(value='')))

## user loop

In [None]:
%%time
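# Build the per-user answering-history features (time-ordered); each feature is described where add_user_feats assembles its output.
# Of the three functions below, the first is for train/valid data and the other two are for test data.
# Why three functions?
# The competition serves test data through an API in batches: each batch carries the features of its own rows plus the labels of the previous batch.
# So when a test batch arrives, its features must be built from the history recorded so far for each user (second function),
# and that history can only be updated once the next batch delivers the labels (third function).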
def add_user_feats(df):
    '''
    Build the running per-user history features for train or valid data.

    Each row first reads the history recorded so far and then updates it, so a
    row never sees its own label. State lives in module-level objects:
    ``curr_u_dict`` (user_id -> state dict, which also maps content_id -> slot),
    the flat per-(user, question) arrays ``np_cor_cnt`` / ``np_all_cnt`` /
    ``np_uctdiff_cnt``, and ``idx`` (next free slot in those arrays).

    Added columns:
      curr_user_correct_cnt        correct answers by the user so far
      curr_user_answer_cnt         answers by the user so far
      curr_uq_correct_cnt          earlier correct answers to this question (capped at 4)
      curr_uq_answer_cnt           earlier answers to this question (capped at 4)
      curr_user_time_diff          timestamp minus the user's previous timestamp
      curr_user_time_diff_mean     sum of the user's earlier timestamps / their count
      curr_user_elapsed_time_diff  prior_question_elapsed_time minus its previous value
      curr_uq_time_diff            timestamp minus the last time this question was answered
      curr_uq_acc, curr_user_acc   the matching correct/answer ratios (0.680 when undefined)

    input: train_data or valid_data
    return: the same frame with the columns above appended
    '''
    global idx
    n_rows = len(df)
    ucc = np.zeros(n_rows, dtype=np.uint16)
    uac = np.zeros(n_rows, dtype=np.uint16)
    uqcor = np.zeros(n_rows, dtype=np.uint8)
    uqcnt = np.zeros(n_rows, dtype=np.uint8)
    utdiff = np.zeros(n_rows, dtype=np.uint64)
    utdiff_mean = np.zeros(n_rows, dtype=np.uint64)
    uelapdiff = np.zeros(n_rows, dtype=np.float32)
    uq_timediff = np.zeros(n_rows, dtype=np.uint64)

    history_cols = ['user_id','content_id','answered_correctly',
                    'timestamp','prior_question_elapsed_time']
    for cnt,row in enumerate(tqdm(df[history_cols].itertuples(index=False),total=df.shape[0])):
        user_id, content_id, correct, ts, elapsed = row
        state = curr_u_dict.get(user_id)
        if state is None:
            # New user: every feature keeps its initial 0; start the history from this row.
            state = {"ucc": correct, "uac": 1, "uts": ts, "utsdiff": [1, ts], "uelapdiff": elapsed}
            curr_u_dict[user_id] = state
            slot = None
        else:
            # Read the features from the history ...
            ucc[cnt] = state["ucc"]
            uac[cnt] = state["uac"]
            utdiff[cnt] = ts - state["uts"]
            utdiff_mean[cnt] = state["utsdiff"][1] / state["utsdiff"][0]
            uelapdiff[cnt] = elapsed - state["uelapdiff"]
            # ... then fold the current row into it.
            state["uts"] = ts
            state["ucc"] += correct
            state["uac"] += 1
            state["utsdiff"][0] += 1
            state["utsdiff"][1] += ts
            state["uelapdiff"] = elapsed
            slot = state.get(content_id)

        if slot is None:
            # First time this user meets this question: allocate the next free slot.
            state[content_id] = idx
            np_uctdiff_cnt[idx] = ts
            np_cor_cnt[idx] += correct
            np_all_cnt[idx] += 1
            idx += 1
        else:
            uq_timediff[cnt] = ts - np_uctdiff_cnt[slot]
            uqcor[cnt] = np_cor_cnt[slot]
            uqcnt[cnt] = np_all_cnt[slot]
            np_uctdiff_cnt[slot] = ts
            np_cor_cnt[slot] += correct
            np_all_cnt[slot] += 1

    # Give the feature frame df's own index: pd.concat(axis=1) aligns on index labels, so a
    # default 0..n-1 index would misalign (and NaN-pad) any df whose index is not 0..n-1.
    user_feats_df = pd.DataFrame({'curr_user_correct_cnt':ucc,
                                  'curr_user_answer_cnt':uac,
                                  'curr_uq_correct_cnt':uqcor,
                                  'curr_uq_answer_cnt':uqcnt,
                                  'curr_user_time_diff':utdiff,
                                  'curr_user_time_diff_mean':utdiff_mean,
                                  'curr_user_elapsed_time_diff':uelapdiff,
                                  'curr_uq_time_diff':uq_timediff
                                 }, index=df.index)
    # Results are assigned back instead of using fillna(inplace=True) on a selected column,
    # which does nothing under pandas Copy-on-Write.
    uq_acc = user_feats_df['curr_uq_correct_cnt'] / user_feats_df['curr_uq_answer_cnt']
    user_feats_df['curr_uq_acc'] = uq_acc.fillna(0.680).astype(np.float32)
    # The ratio above uses the uncapped counts; the count features themselves are capped at 4.
    user_feats_df['curr_uq_correct_cnt'] = user_feats_df['curr_uq_correct_cnt'].where(user_feats_df['curr_uq_correct_cnt'] <= 4, 4)
    user_feats_df['curr_uq_answer_cnt'] = user_feats_df['curr_uq_answer_cnt'].where(user_feats_df['curr_uq_answer_cnt'] <= 4, 4)
    user_acc = user_feats_df['curr_user_correct_cnt'] / user_feats_df['curr_user_answer_cnt']
    user_feats_df['curr_user_acc'] = user_acc.fillna(0.680).astype(np.float32)
    user_feats_df['curr_user_elapsed_time_diff'] = user_feats_df['curr_user_elapsed_time_diff'].fillna(0)
    df = pd.concat([df, user_feats_df], axis=1)
    return df



def add_user_feats_without_update(df):
    '''
    Build the per-user history features for test data without touching the history.

    Reads the module-level ``curr_u_dict`` and the flat ``np_cor_cnt`` /
    ``np_all_cnt`` / ``np_uctdiff_cnt`` arrays and produces the same columns as
    ``add_user_feats``; nothing is written back, because the labels of these
    rows are not known yet.

    input: test_data
    return: test_data with the feature columns appended
    '''
    n_rows = len(df)
    ucc = np.zeros(n_rows, dtype=np.uint16)
    uac = np.zeros(n_rows, dtype=np.uint16)
    uqcor = np.zeros(n_rows, dtype=np.uint8)
    uqcnt = np.zeros(n_rows, dtype=np.uint8)
    utdiff = np.zeros(n_rows, dtype=np.uint64)
    utdiff_mean = np.zeros(n_rows, dtype=np.uint64)
    uelapdiff = np.zeros(n_rows, dtype=np.float32)
    uq_timediff = np.zeros(n_rows, dtype=np.uint64)
    for cnt,row in enumerate(df[['user_id', 'content_id','timestamp','prior_question_elapsed_time']].itertuples(index=False)):
        state = curr_u_dict.get(row[0])
        if state is None:
            # Unknown user: every feature keeps its initial 0.
            continue
        ucc[cnt] = state["ucc"]
        uac[cnt] = state["uac"]
        utdiff[cnt] = row[2] - state["uts"]
        utdiff_mean[cnt] = state["utsdiff"][1] / state["utsdiff"][0]
        uelapdiff[cnt] = row[3] - state["uelapdiff"]
        slot = state.get(row[1])
        if slot is not None:
            # The user has answered this question before.
            uq_timediff[cnt] = row[2] - np_uctdiff_cnt[slot]
            uqcor[cnt] = np_cor_cnt[slot]
            uqcnt[cnt] = np_all_cnt[slot]

    # Give the feature frame df's own index: pd.concat(axis=1) aligns on index labels, so a
    # default 0..n-1 index would misalign (and NaN-pad) any batch whose index is not 0..n-1.
    user_feats_df = pd.DataFrame({'curr_user_correct_cnt':ucc, 'curr_user_answer_cnt':uac,
                                  'curr_uq_correct_cnt':uqcor, 'curr_uq_answer_cnt':uqcnt,
                                  'curr_user_time_diff':utdiff, 'curr_user_time_diff_mean':utdiff_mean,
                                  'curr_user_elapsed_time_diff':uelapdiff, 'curr_uq_time_diff':uq_timediff
                                 }, index=df.index)
    # Results are assigned back instead of using fillna(inplace=True) on a selected column,
    # which does nothing under pandas Copy-on-Write.
    uq_acc = user_feats_df['curr_uq_correct_cnt'] / user_feats_df['curr_uq_answer_cnt']
    user_feats_df['curr_uq_acc'] = uq_acc.fillna(0.680).astype(np.float32)
    # The ratio above uses the uncapped counts; the count features themselves are capped at 4.
    user_feats_df['curr_uq_correct_cnt'] = user_feats_df['curr_uq_correct_cnt'].where(user_feats_df['curr_uq_correct_cnt'] <= 4, 4)
    user_feats_df['curr_uq_answer_cnt'] = user_feats_df['curr_uq_answer_cnt'].where(user_feats_df['curr_uq_answer_cnt'] <= 4, 4)
    user_acc = user_feats_df['curr_user_correct_cnt'] / user_feats_df['curr_user_answer_cnt']
    user_feats_df['curr_user_acc'] = user_acc.fillna(0.680).astype(np.float32)
    user_feats_df['curr_user_elapsed_time_diff'] = user_feats_df['curr_user_elapsed_time_diff'].fillna(0)
    df = pd.concat([df, user_feats_df], axis=1)
    return df



def update_user_feats(df):
    '''
    Fold answered rows into the running per-user state (used when building test data).

    Only question rows (content_type_id == 0) are consumed; every other row is
    ignored. For each question row the module-level structures are advanced:
      * curr_u_dict[user_id]: "ucc" (sum of answered_correctly), "uac" (number
        of rows), "uts" (latest timestamp), "utsdiff" ([number of rows, sum of
        timestamps]), "uelapdiff" (latest prior_question_elapsed_time), plus
        one content_id -> slot entry per question the user has been seen with.
      * np_uctdiff_cnt / np_cor_cnt / np_all_cnt at that slot: latest
        timestamp, summed answered_correctly and number of attempts.
      * idx: next unused slot; bumped whenever a new (user, question) pair is
        registered.

    input: frame with user_id, content_id, answered_correctly, timestamp,
           content_type_id and prior_question_elapsed_time columns
    return: None -- every effect is on the module-level state
    '''
    global idx
    state_columns = ['user_id', 'content_id', 'answered_correctly', 'timestamp',
                     'content_type_id', 'prior_question_elapsed_time']
    for user_id, content_id, correct, ts, content_type, elapsed in df[state_columns].values:
        if content_type != 0:
            continue

        user_state = curr_u_dict.get(user_id)
        if user_state is None:
            # First question row for this user: seed the scalar statistics.
            user_state = {"ucc": correct, "uac": 1, "uts": ts,
                          "utsdiff": [1, ts], "uelapdiff": elapsed}
            curr_u_dict[user_id] = user_state
        else:
            user_state["ucc"] += correct
            user_state["uac"] += 1
            user_state["uts"] = ts
            rows_and_ts_sum = user_state["utsdiff"]
            rows_and_ts_sum[0] += 1
            rows_and_ts_sum[1] += ts
            user_state["uelapdiff"] = elapsed

        # A (user, question) pair seen for the first time claims slot `idx`;
        # the counter itself only moves on once the slot has been written.
        first_attempt = content_id not in user_state
        if first_attempt:
            user_state[content_id] = idx
        slot = user_state[content_id]
        np_uctdiff_cnt[slot] = ts
        np_cor_cnt[slot] += correct
        np_all_cnt[slot] += 1
        if first_attempt:
            idx += 1
                
                
# Module-level state shared by add_user_feats / update_user_feats.
# `idx` is the next unused slot in the three flat arrays below; a slot is handed
# out per (user, question) pair and remembered in curr_u_dict[user_id][content_id].
idx = 0
# user_id -> dict holding the per-user running statistics plus the
# content_id -> slot entries.
curr_u_dict = {}
# Per-slot arrays with a fixed capacity of 90_000_000 slots:
#   np_cor_cnt     -- running sum of answered_correctly
#   np_all_cnt     -- running number of attempts
#   np_uctdiff_cnt -- timestamp of the latest attempt
# NOTE(review): the two counters are uint8, so they cannot represent more than
# 255 -- confirm no (user, question) pair is answered that often.
np_cor_cnt = np.zeros(90_000_000, dtype=np.uint8)
np_all_cnt = np.zeros(90_000_000, dtype=np.uint8)
np_uctdiff_cnt = np.zeros(90_000_000, dtype=np.uint64)
# train_df is processed before valid_df; presumably add_user_feats advances the
# state above as it goes, so valid_df sees the state left behind by train_df
# (its updating part is outside this view -- TODO confirm).
train_df = add_user_feats(train_df)
valid_df = add_user_feats(valid_df)
# Number of slots in use after both frames were processed.
print(f"idx:{idx}")

In [None]:
%%time
# Simple missing-value filling and outlier handling. If the features built above
# are understood, nothing here should be surprising.

# Missing "had explanation" flag -> 0, stored as int8.
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].fillna(0).astype(np.int8)
valid_df['prior_question_had_explanation'] = valid_df['prior_question_had_explanation'].fillna(0).astype(np.int8)

# Missing elapsed time -> mean of the non-null *training* values; the same
# training mean is used to fill valid_df.
prior_question_elapsed_time_mean = train_df.prior_question_elapsed_time.dropna().values.mean()
train_df['prior_question_elapsed_time'] = train_df.prior_question_elapsed_time.fillna(prior_question_elapsed_time_mean)
valid_df['prior_question_elapsed_time'] = valid_df.prior_question_elapsed_time.fillna(prior_question_elapsed_time_mean)

# Harmonic mean of the user accuracy and the content accuracy: 2ab / (a + b).
# NOTE(review): this uses the accuracies as they are *before* the clipping
# below, and a row where both accuracies are 0 divides 0 by 0 -- confirm both
# are intended.
train_df['hmean_acc'] = 2*((train_df['curr_user_acc']*train_df['content_mean_acc']) /(train_df['curr_user_acc']+train_df['content_mean_acc'])).astype(np.float32)
valid_df['hmean_acc'] = 2*((valid_df['curr_user_acc']*valid_df['content_mean_acc']) /(valid_df['curr_user_acc']+valid_df['content_mean_acc'])).astype(np.float32)

# Training frame only: rows whose content_cnt is below 3 get a fixed accuracy of
# 0.65; rows whose content_cnt is below 21 have content_mean_acc clamped into
# [0.2, 0.95]. The masks are applied in sequence, each on the result of the
# previous one.
# NOTE(review): valid_df is not clipped -- confirm this asymmetry is intentional.
train_df["content_mean_acc"] = train_df.content_mean_acc.mask((train_df["content_cnt"] < 3), 0.65)
train_df["content_mean_acc"] = train_df.content_mean_acc.mask((train_df["content_mean_acc"] < 0.2) & (train_df["content_cnt"] < 21), 0.2)
train_df["content_mean_acc"] = train_df.content_mean_acc.mask((train_df["content_mean_acc"] > 0.95) & (train_df["content_cnt"] < 21), 0.95)

# Same clamp for the user accuracy, applied when curr_user_answer_cnt is below 21
# (again on the training frame only).
train_df["curr_user_acc"] = train_df.curr_user_acc.mask((train_df["curr_user_acc"] < 0.2) & (train_df["curr_user_answer_cnt"] < 21), 0.2)
train_df["curr_user_acc"] = train_df.curr_user_acc.mask((train_df["curr_user_acc"] > 0.95) & (train_df["curr_user_answer_cnt"] < 21), 0.95)
print("Train:", train_df.shape,"Valid:", valid_df.shape)

In [None]:
# Tag inserted into the file name of every artifact this notebook writes to disk
# (saved lookup tables, state objects and the final train / valid frames).
SUFFIX = "_0818"

In [None]:
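# Optionally save a rough copy of train_df and valid_df. This is not essential (it was only used for debugging); the data actually used for training is saved further below.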
# train_df.to_pickle(f"D:/kaggle/input/riiid-test-answer-prediction/train_df{SUFFIX}_raw.pkl")
# valid_df.to_pickle(f"D:/kaggle/input/riiid-test-answer-prediction/valid_df{SUFFIX}_raw.pkl")

In [None]:
# All features that are fed to the model. Every name must exist as a column of
# both train_df and valid_df, because the frames are subset to exactly
# `features + [target]` afterwards.
features = [
    "part_bundle_id",
    
    'content_elapsed_time',
    'content_had_explanation',
    
    'lagtime2',
    'lagtime3',
    
    'content_explation_false_mean',
    'content_explation_true_mean',
    
    'curr_user_part_acc', 
    'curr_user_part_count', 
    'curr_user_part_sum',  
    'curr_uq_time_diff', 
    'curr_user_time_diff',
    'curr_user_time_diff_mean',
    'curr_user_elapsed_time_diff',

    'avg_task_seen_cumsum',
    'content_mean_acc',
    'content_cnt',
    'corr_question_elapsed_time_mean', 
    'incorr_question_elapsed_time_mean',
    
    "watched_tags_rate",
    "watched_tags_bool",
    'tags_acc',
    'part',
    'part_bundle_acc', 
    
    'part_1_cnt', 'part_2_cnt', 'part_3_cnt', 'part_4_cnt', 'part_5_cnt', 'part_6_cnt', 'part_7_cnt', 
    'type_of_concept_cnt', 'type_of_intention_cnt', 'type_of_solving_question_cnt', 'type_of_starter_cnt', 
    "same_part_cnt",
    
    'curr_lecture_bool',
    # Running per-user and per-(user, question) statistics from add_user_feats,
    # plus the harmonic mean derived from them.
    'curr_user_correct_cnt', 
    'curr_user_answer_cnt',
    'curr_user_acc',
    'hmean_acc',
    'curr_uq_correct_cnt',
    'curr_uq_answer_cnt',
    'curr_uq_acc',
    'prior_question_elapsed_time',
    'prior_question_had_explanation', 
]

# Label column kept alongside the features.
target = 'answered_correctly'

In [None]:
# Keep only the model inputs and the label; every other column is dropped from
# both frames at this point.
train_df = train_df[features + [target]]
valid_df = valid_df[features + [target]]

In [None]:
# Sanity check of the final training frame: first rows, dtype / memory summary
# and the number of nulls per column.
display(train_df.head(10))
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only appended a stray "None" to the output.
train_df.info()
display(train_df.isna().sum())

In [None]:
# Sanity check of the final validation frame: first rows, dtype / memory summary
# and the number of nulls per column.
display(valid_df.head(10))
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only appended a stray "None" to the output.
valid_df.info()
display(valid_df.isna().sum())

### 保存数据 train_df valid_df

In [None]:
# Save every feature table / state object generated above. They are uploaded to
# Kaggle afterwards and merged in directly, so nothing has to be regenerated there.

def _dump_pickle(obj, name):
    """Pickle `obj` to '<data dir>/<name><SUFFIX>.pkl.data' and close the file.

    name: base file name; by convention the name of the variable being saved.
    """
    path = f"D:/kaggle/input/riiid-test-answer-prediction/{name}{SUFFIX}.pkl.data"
    # The context manager flushes and closes the handle even when pickling
    # fails part-way, instead of leaving an open file object to the garbage
    # collector.
    with open(path, "wb") as fh:
        pickle.dump(obj, fh)


# The lecture state dict is stored as CSV, one row per key of the dict.
tmp_dict1 = pd.DataFrame.from_dict(curr_lectures_dict, orient='index')
tmp_dict1.to_csv(f"D:/kaggle/input/riiid-test-answer-prediction/curr_lectures_dict{SUFFIX}.csv.data")

_dump_pickle(content_answers_df, "content_answers_df")
_dump_pickle(ques_df, "ques_df")
_dump_pickle(task_user_df, "task_user_df")
_dump_pickle(qdf, "qdf")
_dump_pickle(content_explation_agg, "content_explation_agg")
_dump_pickle(max_timestamp_u_dict, "max_timestamp_u_dict")
_dump_pickle(max_timestamp_u_dict2, "max_timestamp_u_dict2")
_dump_pickle(max_timestamp_u_dict3, "max_timestamp_u_dict3")

# Running per-user state and its per-(user, question) slot arrays.
_dump_pickle(curr_u_dict, "curr_u_dict")
_dump_pickle(np_cor_cnt, "np_cor_cnt")
_dump_pickle(np_all_cnt, "np_all_cnt")
_dump_pickle(np_uctdiff_cnt, "np_uctdiff_cnt")

_dump_pickle(ques_df3, "ques_df3")
_dump_pickle(content_elapsed_time_agg, "content_elapsed_time_agg")
_dump_pickle(content_had_explanation_agg, "content_had_explanation_agg")

_dump_pickle(curr_up_dict_df, "curr_up_dict_df")

In [None]:
# Save the data that is actually used for training (train and validation frames).
train_df.to_pickle(f"D:/kaggle/input/riiid-test-answer-prediction/train_df{SUFFIX}.pkl")
valid_df.to_pickle(f"D:/kaggle/input/riiid-test-answer-prediction/valid_df{SUFFIX}.pkl")