In [None]:
import gc
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import joblib
from tqdm.notebook import tqdm

import psutil
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import os
import riiideducation

# LGBM

## 以下是一些生成特征的函数

In [None]:
# In the competition the test data arrives through an API, batch by batch: each
# batch carries the features of its own rows plus the labels of the PREVIOUS batch.
# Features therefore have to be built from what has been recorded about the user
# so far (add_user_feats_without_update), and the records can only be refreshed
# when the next batch delivers the labels (update_user_feats).
def add_user_feats_without_update(df):
    """Append per-user and per-(user, question) history features to ``df``.

    Only reads the module-level state (``curr_u_dict``, ``np_cor_cnt``,
    ``np_all_cnt``, ``np_uctdiff_cnt``); nothing is modified here.

    Args:
        df: frame with ``user_id``, ``content_id``, ``timestamp`` and
            ``prior_question_elapsed_time`` columns.

    Returns:
        A new frame: ``df`` with the feature columns appended. Rows keep the
        index of ``df``.
    """
    n_rows = len(df)
    ucc = np.zeros(n_rows, dtype=np.uint16)
    uac = np.zeros(n_rows, dtype=np.uint16)
    uqcor = np.zeros(n_rows, dtype=np.uint8)
    uqcnt = np.zeros(n_rows, dtype=np.uint8)
    utdiff = np.zeros(n_rows, dtype=np.uint64)
    utdiff_mean = np.zeros(n_rows, dtype=np.uint64)
    uelapdiff = np.zeros(n_rows, dtype=np.float32)
    uq_timediff = np.zeros(n_rows, dtype=np.uint64)

    history_cols = ['user_id', 'content_id', 'timestamp', 'prior_question_elapsed_time']
    for cnt, row in enumerate(df[history_cols].itertuples(index=False)):
        user_state = curr_u_dict.get(row[0])
        if user_state is None:
            # Unknown user: every feature keeps its zero initialisation.
            continue
        ucc[cnt] = user_state["ucc"]
        uac[cnt] = user_state["uac"]
        utdiff[cnt] = row[2] - user_state["uts"]
        utdiff_mean[cnt] = user_state["utsdiff"][1] / user_state["utsdiff"][0]
        uelapdiff[cnt] = row[3] - user_state["uelapdiff"]
        # The user's dict also maps content_id -> slot in the shared np_* arrays.
        tmp_idx = user_state.get(row[1])
        if tmp_idx is not None:
            uq_timediff[cnt] = row[2] - np_uctdiff_cnt[tmp_idx]
            uqcor[cnt] = np_cor_cnt[tmp_idx]
            uqcnt[cnt] = np_all_cnt[tmp_idx]

    user_feats_df = pd.DataFrame(
        {
            'curr_user_correct_cnt': ucc,              # stored "ucc": correct answers recorded for the user
            'curr_user_answer_cnt': uac,               # stored "uac": answers recorded for the user
            'curr_uq_correct_cnt': uqcor,              # correct answers recorded for this (user, question)
            'curr_uq_answer_cnt': uqcnt,               # answers recorded for this (user, question)
            'curr_user_time_diff': utdiff,             # timestamp minus the stored "uts" of the user
            'curr_user_time_diff_mean': utdiff_mean,   # stored utsdiff[1] / utsdiff[0]
            'curr_user_elapsed_time_diff': uelapdiff,  # prior_question_elapsed_time minus the stored "uelapdiff"
            'curr_uq_time_diff': uq_timediff,          # timestamp minus the stored timestamp of this (user, question)
        },
        # Share df's index so the column-wise concat below pairs rows by position
        # even when df does not carry a 0..n-1 index.
        index=df.index,
    )

    # Accuracies are computed before the counts are capped; 0/0 falls back to 0.680.
    # Results are assigned back instead of using chained inplace fillna, which is
    # a no-op under pandas Copy-on-Write.
    uq_acc = user_feats_df['curr_uq_correct_cnt'] / user_feats_df['curr_uq_answer_cnt']
    user_feats_df['curr_uq_acc'] = uq_acc.fillna(0.680).astype(np.float32)
    user_feats_df['curr_uq_correct_cnt'] = user_feats_df['curr_uq_correct_cnt'].where(user_feats_df['curr_uq_correct_cnt'] <= 4, 4)
    user_feats_df['curr_uq_answer_cnt'] = user_feats_df['curr_uq_answer_cnt'].where(user_feats_df['curr_uq_answer_cnt'] <= 4, 4)
    user_acc = user_feats_df['curr_user_correct_cnt'] / user_feats_df['curr_user_answer_cnt']
    user_feats_df['curr_user_acc'] = user_acc.fillna(0.680).astype(np.float32)
    # Only the float32 column can hold NaN (missing prior_question_elapsed_time);
    # the uint64 columns never contain NaN, so they need no fill.
    user_feats_df['curr_user_elapsed_time_diff'] = user_feats_df['curr_user_elapsed_time_diff'].fillna(0)
    return pd.concat([df, user_feats_df], axis=1)


def update_user_feats(df):
    """Fold a labelled batch into the per-user history kept in module-level state.

    Rows whose ``content_type_id`` is not 0 are ignored. For every remaining row
    the user's entry in ``curr_u_dict`` is created or refreshed, and the slot of
    the (user, question) pair in ``np_uctdiff_cnt`` / ``np_cor_cnt`` /
    ``np_all_cnt`` is updated. A pair seen for the first time is given slot
    ``idx``, after which the global ``idx`` is advanced by one.

    Args:
        df: frame with ``user_id``, ``content_id``, ``answered_correctly``,
            ``timestamp``, ``content_type_id`` and ``prior_question_elapsed_time``.
    """
    global idx
    columns = ['user_id', 'content_id', 'answered_correctly', 'timestamp',
               'content_type_id', 'prior_question_elapsed_time']
    for user_id, content_id, correct, timestamp, content_type, elapsed in df[columns].values:
        if content_type != 0:
            continue

        state = curr_u_dict.get(user_id)
        if state is None:
            # First labelled question of this user: start a fresh record.
            state = {
                "ucc": correct,
                "uac": 1,
                "uts": timestamp,
                "utsdiff": [1, timestamp],
                "uelapdiff": elapsed,
            }
            curr_u_dict[user_id] = state
        else:
            state["ucc"] += correct
            state["uac"] += 1
            state["uts"] = timestamp
            state["utsdiff"][0] += 1
            state["utsdiff"][1] += timestamp
            state["uelapdiff"] = elapsed

        slot = state.get(content_id)
        first_time = slot is None
        if first_time:
            # Reserve the next free slot of the shared arrays for this pair.
            slot = idx
            state[content_id] = slot

        np_uctdiff_cnt[slot] = timestamp
        np_cor_cnt[slot] += correct
        np_all_cnt[slot] += 1

        if first_time:
            idx += 1

In [None]:
# Lecture metadata: one-hot encode `part` and `type_of`, and tag every row as a
# lecture (content_type_id == 1) so it can be joined on (content_id, content_type_id).
lectures_df = (
    pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')
    .assign(type_of=lambda frame: frame['type_of'].replace('solving question', 'solving_question'))
    .pipe(pd.get_dummies, columns=['part', 'type_of'])
    .assign(content_type_id=1)
)

# Question tags: `tags_l` holds the whitespace-split tag strings of each question,
# and every row is tagged as a question (content_type_id == 0).
q_taglist_df = (
    pd.read_csv("/kaggle/input/riiid-test-answer-prediction/questions.csv")[['tags']]
    .astype(str)
    .assign(
        tags_l=lambda frame: [tags.split() for tags in frame.tags.values],
        content_type_id=0,
    )
    .drop(columns="tags")
    .drop(index=10033)  # row 10033 is dropped; the original note marks its tags as NaN
)

def add_lectures_feats(df, curr_dict):
    """Append lecture-history features to ``df`` and update ``curr_dict`` on the fly.

    Rows are processed in ``timestamp`` order. A lecture row
    (``content_type_id == 1``) increments the user's per-part / per-type lecture
    counters and records the lecture's tag; every row (lecture or question) then
    receives the user's counters as they stand after that update.

    Args:
        df: frame with ``row_id``, ``user_id``, ``timestamp``, ``content_id`` and
            ``content_type_id`` columns.
        curr_dict: ``user_id -> state`` mapping, mutated in place. A state holds
            ``lecture_bool``, eleven ``*_cnt`` counters and the ``has_tags`` set.

    Returns:
        ``df`` left-merged on ``row_id`` with the lecture feature columns.
    """
    # One-hot lecture columns of lectures_df; the matching per-user counter is "<column>_cnt".
    lecture_cols = ['part_1', 'part_2', 'part_3', 'part_4', 'part_5', 'part_6', 'part_7',
                    'type_of_concept', 'type_of_intention', 'type_of_solving_question', 'type_of_starter']
    counter_keys = [col + "_cnt" for col in lecture_cols]

    new_df = df[["row_id", "user_id", "timestamp", "content_id", "content_type_id"]]
    new_df = new_df.merge(lectures_df, how="left", left_on = ["content_id","content_type_id"], right_on = ["lecture_id","content_type_id"])
    new_df = new_df.merge(q_taglist_df, how="left", left_on = ["content_id","content_type_id"], right_on = [q_taglist_df.index,"content_type_id"])
    new_df = new_df.sort_values(["timestamp"])
    # Positional layout relied on below:
    # 0 timestamp, 1 user_id, 2 content_type_id, 3 tag, 4..14 lecture_cols, 15 tags_l, 16 row_id
    new_df = new_df[['timestamp', 'user_id', 'content_type_id', 'tag'] + lecture_cols + ['tags_l', 'row_id']]

    n_rows = len(df)
    ulc_lb = np.zeros(n_rows, dtype="int8")
    counters = np.zeros((n_rows, len(counter_keys)), dtype="uint16")
    has_tags_l = np.zeros(n_rows, dtype="float32")

    for cnt, row in enumerate(new_df.itertuples(index=False)):
        state = curr_dict.get(row[1])
        if state is None:
            # Unknown user: start with no lectures watched.
            state = {"lecture_bool": 0}
            state.update((key, 0) for key in counter_keys)
            state["has_tags"] = set()
            curr_dict[row[1]] = state

        if row[2] == 1:
            state["lecture_bool"] = 1
            for key, flag in zip(counter_keys, row[4:15]):
                state[key] += int(flag)
            state["has_tags"].add(int(row[3]))

        ulc_lb[cnt] = state["lecture_bool"]
        counters[cnt] = [state[key] for key in counter_keys]

        # tags_l is a list only for rows that matched a question in q_taglist_df.
        if isinstance(row[15], list):
            watched = state["has_tags"]
            tags_has = sum(1 for tag in row[15] if int(tag) in watched)
            has_tags_l[cnt] = tags_has / len(row[15])

    feats = {"curr_lecture_bool": ulc_lb}  # 1 once a lecture row has been recorded for the user
    for col_pos, key in enumerate(counter_keys):
        feats[key] = counters[:, col_pos]  # lectures recorded for the user in that part / of that type
    feats["watched_tags_rate"] = has_tags_l  # share of the question's tags found in the user's lecture tags
    feats["watched_tags_bool"] = (has_tags_l > 0).astype("int8")  # 1 when at least one tag matched

    # The arrays follow the timestamp-sorted order of new_df, so they are
    # re-keyed by row_id before being merged back.
    lectures_feats_df = pd.DataFrame(feats).set_index(new_df["row_id"])

    df = df.merge(lectures_feats_df,how="left",left_on="row_id",right_index=True)
    return df


def to_letures_dict(df):
    """Convert a per-user lecture-state frame into ``{index_value: {column: value}}``.

    The ``has_tags`` column holds the textual form of a set (e.g. ``"{1, 5}"`` or
    ``"set()"``) and is parsed back into a real ``set``.

    Args:
        df: frame whose index values become the keys of the result.

    Returns:
        dict mapping each index value to a dict of that row's column values.

    Raises:
        ValueError / SyntaxError: if a ``has_tags`` cell is not a plain literal.
    """
    import ast

    def _parse_tag_set(text):
        # ast.literal_eval only accepts literals, so text read from the file can
        # never execute code (the previous implementation used eval()).
        text = text.strip()
        if text == "set()":
            return set()
        return set(ast.literal_eval(text))

    columns = list(df.columns)
    da = {}
    for t in tqdm(df.itertuples(name=None), total=df.shape[0]):
        # t[0] is the index value, t[1:] are the column values in order.
        sub_dict = dict(zip(columns, t[1:]))
        if "has_tags" in sub_dict:
            sub_dict["has_tags"] = _parse_tag_set(sub_dict["has_tags"])
        da[t[0]] = sub_dict
    return da

# Stored per-user lecture state, read with its first column as the index and
# converted straight into the nested-dict form that add_lectures_feats mutates.
curr_lectures_dict = to_letures_dict(
    pd.read_csv("../input/merge-data1615/curr_lectures_dict_1615.csv.data", index_col=0)
)

In [None]:
# For each row, pick the lecture counter that matches the row's own `part`.
def same_part(df):
    """Add ``same_part_cnt``: the value of ``part_<part>_cnt`` for each row's ``part``.

    Args:
        df: frame with a ``part`` column and the ``part_<k>_cnt`` columns for
            every part value that occurs. Modified in place.

    Returns:
        The same ``df`` object with the uint16 ``same_part_cnt`` column added.
    """
    same_part_l = np.zeros(len(df), dtype="uint16")
    for pos, row in enumerate(df.itertuples()):
        # Plain attribute lookup on the row tuple; no eval() of generated source.
        same_part_l[pos] = getattr(row, f"part_{int(row.part)}_cnt")
    df["same_part_cnt"] = same_part_l
    return df

In [None]:
# Accuracy per (user_id, part) pair. Split into a read-only function and an update
# function because test labels only arrive with the following batch.
def add_up_without_update(df):
    """Append the recorded answer count, correct count and accuracy of each (user, part).

    Reads ``part_user_d`` (keyed by ``(user_id, part)``) without modifying it.

    Args:
        df: frame with ``row_id``, ``user_id`` and ``part`` columns.

    Returns:
        ``df`` left-merged on ``row_id`` with ``curr_user_part_count``,
        ``curr_user_part_sum`` and ``curr_user_part_acc``. The accuracy is 0.68
        where the count is zero.
    """
    # Sized from the argument itself, not from a notebook-level frame.
    np_up_cnt = np.zeros((len(df), 2), dtype=np.uint16)
    for cnt, key in enumerate(df[['user_id', 'part']].itertuples(index=False, name=None)):
        stats = part_user_d.get(key)
        if stats is not None:
            np_up_cnt[cnt] = [stats["count"], stats["sum"]]
    curr_user_part_df = pd.DataFrame(np_up_cnt,columns=["curr_user_part_count", "curr_user_part_sum"],index=df.row_id)
    curr_user_part_df["curr_user_part_acc"] = (curr_user_part_df["curr_user_part_sum"] / curr_user_part_df["curr_user_part_count"]).fillna(0.68).astype(np.float32)
    df = df.merge(curr_user_part_df,how="left", left_on="row_id", right_index=True)
    return df

def update_up(df):
    """Fold a labelled batch into ``part_user_d``, the per-(user_id, part) tallies.

    Args:
        df: frame with ``user_id``, ``part`` and ``answered_correctly`` columns.
    """
    for user_id, part, correct in df[['user_id','part','answered_correctly']].values:
        key = (user_id, part)
        stats = part_user_d.get(key)
        if stats is None:
            # First answer of this user in this part.
            part_user_d[key] = {'count': 1, 'sum': 1 if correct == 1 else 0}
            continue
        stats["count"] += 1
        stats["sum"] += correct

In [None]:
# lagtime  : time since the user's previous question
# lagtime2 : time since the question before that (two back)
# lagtime3 : time since the question three back
def lagtime_for_test(df):
    """Add ``lagtime``, ``lagtime2`` and ``lagtime3`` and roll the stored timestamps forward.

    Reads and updates the module-level ``max_timestamp_u_dict``,
    ``max_timestamp_u_dict2`` and ``max_timestamp_u_dict3`` (assumed to be plain
    nested dicts keyed by ``user_id`` — confirm against the pickles). Rows whose
    ``content_type_id`` is not 0 keep a lag of 0 and leave the stored timestamps
    untouched. A stored value of 0 in the second / third dict is treated as
    "not available yet".

    Args:
        df: frame with ``user_id``, ``content_type_id`` and ``timestamp`` columns.
            Modified in place.

    Returns:
        The same ``df`` object with three uint64 lag columns added.
    """
    lagtime_mean = 0
    lagtime_mean2 = 0
    lagtime_mean3 = 0
    # NOTE(review): float32 buffers round very large lags; kept unchanged so the
    # features stay identical to what this notebook produced before.
    lagtime = np.zeros(len(df), dtype=np.float32)
    lagtime2 = np.zeros(len(df), dtype=np.float32)
    lagtime3 = np.zeros(len(df), dtype=np.float32)

    last_ts = max_timestamp_u_dict['max_time_stamp']
    last_ts2 = max_timestamp_u_dict2['max_time_stamp2']
    last_ts3 = max_timestamp_u_dict3['max_time_stamp3']

    rows = zip(df['user_id'].values, df['content_type_id'].values, df['timestamp'].values)
    for i, (user_id, content_type_id, timestamp) in enumerate(rows):
        if content_type_id != 0:
            continue

        if user_id not in last_ts:
            # First question seen for this user: no history for any lag.
            lagtime[i] = lagtime_mean
            lagtime2[i] = lagtime_mean2
            lagtime3[i] = lagtime_mean3
            last_ts[user_id] = timestamp
            last_ts2[user_id] = lagtime_mean2
            last_ts3[user_id] = lagtime_mean3
            continue

        lagtime[i] = timestamp - last_ts[user_id]
        if last_ts2[user_id] == lagtime_mean2:
            lagtime2[i] = lagtime_mean2
            lagtime3[i] = lagtime_mean3
        else:
            lagtime2[i] = timestamp - last_ts2[user_id]
            if last_ts3[user_id] == lagtime_mean3:
                lagtime3[i] = lagtime_mean3
            else:
                lagtime3[i] = timestamp - last_ts3[user_id]
            # The third slot only advances once the second one holds a real value.
            last_ts3[user_id] = last_ts2[user_id]
        last_ts2[user_id] = last_ts[user_id]
        last_ts[user_id] = timestamp

    # Assign the filled/cast result back rather than calling fillna(inplace=True)
    # on a column selection, which does not modify df under pandas Copy-on-Write.
    df["lagtime"] = lagtime
    df["lagtime2"] = lagtime2
    df["lagtime3"] = lagtime3
    df["lagtime"] = df["lagtime"].fillna(lagtime_mean).astype("uint64")
    df["lagtime2"] = df["lagtime2"].fillna(lagtime_mean2).astype("uint64")
    df["lagtime3"] = df["lagtime3"].fillna(lagtime_mean3).astype("uint64")
    return df

## 导入特征和模型

In [None]:
# Load every feature artefact that was dumped by the data-generation step.
def _load_pickle(path):
    """Unpickle ``path`` and close the file handle afterwards."""
    # NOTE(review): unpickling can execute arbitrary code; only load files that
    # were produced by your own data-generation run.
    with open(path, "rb") as fh:
        return pickle.load(fh)


content_answers_df = _load_pickle("../input/merge-data1615/content_answers_df_1615.pkl.data")
ques_df = _load_pickle("../input/merge-data1615/ques_df_1615.pkl.data")
task_user_df = _load_pickle("../input/merge-data1615/task_user_df_1615.pkl.data")
qdf = _load_pickle("../input/merge-data1615/qdf_1615.pkl.data")
question_elapsed_time_df = _load_pickle("../input/merge-data1615/question_elapsed_time_mean.pkl.data")
content_explation_agg = _load_pickle("../input/merge-data1615/content_explation_agg_2015.pkl.data")
max_timestamp_u_dict = _load_pickle("../input/merge-data1615/max_timestamp_u_dict_2015.pkl.data")
max_timestamp_u_dict2 = _load_pickle("../input/merge-data1615/max_timestamp_u_dict2_2015.pkl.data")
max_timestamp_u_dict3 = _load_pickle("../input/merge-data1615/max_timestamp_u_dict3_2015.pkl.data")

# State read by add_user_feats_without_update and written by update_user_feats.
curr_u_dict = _load_pickle("../input/merge-data1615/curr_u_dict_1615.pkl.data")
np_cor_cnt = _load_pickle("../input/merge-data1615/np_cor_cnt_1615.pkl.data")
np_all_cnt = _load_pickle("../input/merge-data1615/np_all_cnt_1615.pkl.data")
np_uctdiff_cnt = _load_pickle("../input/merge-data1615/np_uctdiff_cnt_1615.pkl.data")

ques_df3 = _load_pickle("../input/merge-data1615/ques_df3_2015.pkl.data")
content_elapsed_time_agg = _load_pickle("../input/merge-data1615/content_elapsed_time_agg_2015.pkl.data")
content_had_explanation_agg = _load_pickle("../input/merge-data1615/content_had_explanation_agg_2015.pkl.data")

# State read by add_up_without_update and written by update_up; the frame's
# index values become the dict keys.
curr_up_dict_df = _load_pickle("../input/merge-data1615/curr_up_dict_df_1615.pkl.data")
part_user_d = curr_up_dict_df.to_dict("index")

In [None]:
# Next free slot in np_cor_cnt / np_all_cnt / np_uctdiff_cnt; update_user_feats
# assigns it to each new (user, question) pair and then increments it.
idx = 86_867_031

In [None]:
lgb_model = joblib.load('../input/lgb-2113model/lgb_2113.model') # load the model produced by training

In [None]:
# Every feature column used by the model. The order is kept exactly as defined.
features = [
    'part_bundle_id',

    # per-content aggregates
    'content_elapsed_time',
    'content_had_explanation',

    # columns added by lagtime_for_test
    'lagtime2',
    'lagtime3',

    'content_explation_false_mean',
    'content_explation_true_mean',

    # columns added by add_up_without_update / add_user_feats_without_update
    'curr_user_part_acc',
    'curr_user_part_count',
    'curr_user_part_sum',
    'curr_uq_time_diff',
    'curr_user_time_diff',
    'curr_user_time_diff_mean',
    'curr_user_elapsed_time_diff',

    'avg_task_seen_cumsum',
    'content_mean_acc',
    'content_cnt',
    'corr_question_elapsed_time_mean',
    'incorr_question_elapsed_time_mean',

    # columns added by add_lectures_feats
    'watched_tags_rate',
    'watched_tags_bool',
    'tags_acc',
    'part',
    'part_bundle_acc',

    'part_1_cnt',
    'part_2_cnt',
    'part_3_cnt',
    'part_4_cnt',
    'part_5_cnt',
    'part_6_cnt',
    'part_7_cnt',
    'type_of_concept_cnt',
    'type_of_intention_cnt',
    'type_of_solving_question_cnt',
    'type_of_starter_cnt',
    'same_part_cnt',  # added by same_part

    'curr_lecture_bool',
    'curr_user_correct_cnt',
    'curr_user_answer_cnt',
    'curr_user_acc',
    'hmean_acc',
    'curr_uq_correct_cnt',
    'curr_uq_answer_cnt',
    'curr_uq_acc',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
]

# Name of the label column.
target = 'answered_correctly'

# SAINT

In [None]:
MAX_SEQ = 100  # default `max_seq` of SAINTModel and TestDataset
n_part = 7  # passed as `n_part` when SAINTModel is constructed
D_MODEL = 256  # passed as `embed_dim` when SAINTModel is constructed
N_LAYER = 2  # encoder and decoder layer count given to nn.Transformer
DROPOUT = 0.1  # dropout probability used by FFN, SAINTModel and its nn.Transformer

In [None]:
def feature_time_lag(df, time_dict):
    """Add ``time_lag``: the gap between a row and the user's previous task container.

    ``time_dict`` maps ``user_id -> (timestamp, task_container_id, lag_time)`` and
    is updated in place. Rows in the same task container as the stored entry
    reuse the stored lag; a row of a different container gets
    ``timestamp - stored timestamp`` and replaces the entry. A user not yet in
    ``time_dict`` gets lag 0 and is stored with a lag of -1.

    Args:
        df: frame with ``user_id``, ``timestamp`` and ``task_container_id``
            columns. Modified in place.
        time_dict: per-user state described above.

    Returns:
        The same ``df`` object with the int64 ``time_lag`` column added.
    """
    tt = np.zeros(len(df), dtype=np.int64)
    rows = df[['user_id','timestamp','task_container_id']].values
    for ind, (user_id, timestamp, task_id) in enumerate(rows):
        if user_id not in time_dict:
            time_dict[user_id] = (timestamp, task_id, -1)
            tt[ind] = 0
            continue

        previous = time_dict[user_id]
        if task_id - previous[1] == 0:
            # Same container as the stored entry: reuse its lag.
            tt[ind] = previous[2]
        else:
            tt[ind] = timestamp - previous[0]
            time_dict[user_id] = (timestamp, task_id, tt[ind])
    df["time_lag"] = tt
    return df

In [None]:
class FFN(nn.Module):
    """Two-layer feed-forward block: Linear -> ReLU -> Linear -> Dropout.

    Input and output widths both equal ``state_size``; the dropout probability
    comes from the module-level ``DROPOUT``.
    """

    def __init__(self, state_size=200):
        super().__init__()
        self.state_size = state_size

        # Attribute names are part of the checkpoint's state_dict keys.
        self.lr1 = nn.Linear(state_size, state_size)
        self.relu = nn.ReLU()
        self.lr2 = nn.Linear(state_size, state_size)
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, x):
        hidden = self.relu(self.lr1(x))
        return self.dropout(self.lr2(hidden))

def future_mask(seq_length):
    """Return a ``(seq_length, seq_length)`` bool tensor that is True strictly above the diagonal."""
    # np.tri marks the lower triangle including the diagonal; its negation is
    # exactly the strictly-upper triangle.
    strictly_upper = ~np.tri(seq_length, seq_length, k=0, dtype=bool)
    return torch.from_numpy(strictly_upper)


class SAINTModel(nn.Module):
    """Encoder-decoder ``nn.Transformer`` over summed per-position embeddings.

    Encoder input: question + part + position + explanation embeddings.
    Decoder input: position + response + elapsed-time + lag-time embeddings.
    Source, target and memory attention all use the same ``future_mask``.

    Args:
        n_skill: the exercise embedding has ``n_skill + 1`` rows.
        n_part: the category embedding has ``n_part + 1`` rows.
        max_seq: the position embedding has ``max_seq - 1`` rows, so sequences
            fed to ``forward`` may be at most that long.
        embed_dim: width of every embedding and of the transformer.
        elapsed_time_cat_flag: when truthy, elapsed time and lag time are
            bucketed and embedded; otherwise each goes through a bias-free
            ``nn.Linear(1, embed_dim)``.
    """

    def __init__(self, n_skill, n_part, max_seq=MAX_SEQ, embed_dim= 128, elapsed_time_cat_flag = True):
        super(SAINTModel, self).__init__()

        self.n_skill = n_skill
        self.embed_dim = embed_dim
        self.n_cat = n_part
        self.elapsed_time_cat_flag = elapsed_time_cat_flag

        # Attribute names below are the checkpoint's state_dict keys; do not rename.
        self.e_embedding = nn.Embedding(self.n_skill+1, embed_dim) ## exercise
        self.c_embedding = nn.Embedding(self.n_cat+1, embed_dim) ## category
        self.pos_embedding = nn.Embedding(max_seq-1, embed_dim) ## position
        self.res_embedding = nn.Embedding(2+1, embed_dim) ## response

        if self.elapsed_time_cat_flag:
            self.elapsed_time_embedding = nn.Embedding(300+1, embed_dim) ## elapsed time, capped at 300
            self.lag_embedding1 = nn.Embedding(300+1, embed_dim) ## lag in seconds, capped at 300
            self.lag_embedding2 = nn.Embedding(1440+1, embed_dim) ## lag in minutes, capped at 1440
            self.lag_embedding3 = nn.Embedding(365+1, embed_dim) ## lag in days, capped at 365
        else:
            self.elapsed_time_embedding = nn.Linear(1, embed_dim, bias=False) ## elapsed time
            self.lag_embedding = nn.Linear(1, embed_dim, bias=False) ## lag time

        self.exp_embedding = nn.Embedding(2+1, embed_dim) ## user had explain

        self.transformer = nn.Transformer(nhead=8, d_model = embed_dim, num_encoder_layers= N_LAYER, num_decoder_layers= N_LAYER, dropout = DROPOUT)

        self.dropout = nn.Dropout(DROPOUT)
        self.layer_normal = nn.LayerNorm(embed_dim)
        self.ffn = FFN(embed_dim)
        self.pred = nn.Linear(embed_dim, 1)

    @staticmethod
    def _capped_index(values, cap, device):
        """Replace entries that are not <= ``cap`` by ``cap`` and cast to long indices."""
        return torch.where(values.float() <= cap, values, torch.tensor(float(cap)).to(device)).long()

    def forward(self, question, part, response, elapsed_time, lag_time, exp):
        """Return one logit per sequence position, shape ``[batch, s_len]``.

        All inputs are ``[batch, s_len]`` tensors. ``elapsed_time`` and
        ``lag_time`` are divided by 1000 before bucketing in the categorical
        path (presumably milliseconds — confirm against the data pipeline).
        """
        device = question.device
        seq_len = question.size(1)

        ## embedding layer
        question = self.e_embedding(question)
        part = self.c_embedding(part)
        pos_id = torch.arange(seq_len).unsqueeze(0).to(device)
        pos_id = self.pos_embedding(pos_id)
        res = self.res_embedding(response)
        exp = self.exp_embedding(exp)

        if self.elapsed_time_cat_flag:
            ## elapsed time: /1000, rounded, capped at 300
            elapsed_time = torch.round(torch.true_divide(elapsed_time, 1000))
            elapsed_time = self.elapsed_time_embedding(self._capped_index(elapsed_time, 300, device))

            ## lag_time1: /1000, rounded, capped at 300
            lag_time = torch.round(torch.true_divide(lag_time, 1000))
            lag_time1 = self._capped_index(lag_time, 300, device)

            ## lag_time2: the uncapped value above /60, rounded, capped at 1440
            lag_time = torch.round(torch.true_divide(lag_time, 60))
            lag_time2 = self._capped_index(lag_time, 1440, device)

            ## lag_time3: the uncapped value above /1440, rounded, capped at 365
            lag_time = torch.round(torch.true_divide(lag_time, 1440))
            lag_time3 = self._capped_index(lag_time, 365, device)

            lag_time1 = self.lag_embedding1(lag_time1)
            lag_time2 = self.lag_embedding2(lag_time2)
            lag_time3 = self.lag_embedding3(lag_time3)

            dec = pos_id + res + elapsed_time + lag_time1 + lag_time2 + lag_time3
        else:
            # Continuous path: project each scalar with a Linear layer. The values
            # are cast to float for nn.Linear and reshaped with the actual
            # sequence length instead of a hard-coded MAX_SEQ-1.
            elapsed_time = self.elapsed_time_embedding(elapsed_time.float().reshape(-1, 1))
            elapsed_time = elapsed_time.view(-1, seq_len, self.embed_dim)

            lag_time = self.lag_embedding(lag_time.float().reshape(-1, 1))
            lag_time = lag_time.view(-1, seq_len, self.embed_dim)

            # This path has a single lag embedding (there is no lag_time1/2/3 here).
            dec = pos_id + res + elapsed_time + lag_time

        enc = question + part + pos_id + exp

        enc = enc.permute(1, 0, 2) # x: [bs, s_len, embed] => [s_len, bs, embed]
        dec = dec.permute(1, 0, 2)
        mask = future_mask(enc.size(0)).to(device)

        att_output = self.transformer(enc, dec, src_mask=mask, tgt_mask=mask, memory_mask = mask)
        att_output = self.layer_normal(att_output)
        att_output = att_output.permute(1, 0, 2) # att_output: [s_len, bs, embed] => [bs, s_len, embed]

        # Residual feed-forward block; the same LayerNorm instance is reused.
        x = self.ffn(att_output)
        x = self.layer_normal(x + att_output)
        x = self.pred(x)

        return x.squeeze(-1)

## Load Pretrained Models

In [None]:
n_skill = 13523  # passed as `n_skill` to SAINTModel, which sizes the exercise embedding
group = joblib.load("./group.pkl")
# Same dataset location as the questions.csv read earlier in this notebook
# (previously a machine-specific D:/ path).
questions_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')
time_dict = joblib.load("./time_dict.pkl")

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

saint_model = SAINTModel(n_skill, n_part, embed_dim= D_MODEL)

# map_location lets a checkpoint saved on a GPU load on a CPU-only host as well.
saint_model.load_state_dict(torch.load("./saint_plus_model.pt", map_location=device)) # use your own trained checkpoint

saint_model.to(device)
saint_model.eval()

In [None]:
class TestDataset(Dataset):
    """Builds SAINT inputs for every row of one inference batch.

    Each item combines the stored interaction history of the row's user
    (looked up in ``samples``) with the features of the row itself.

    Args:
        samples: mapping indexed by user_id (must expose ``.index``); each entry is a
            6-tuple of arrays ``(content_id, answered_correctly, part,
            prior_question_elapsed_time, time_lag, prior_question_had_explanation)``.
        test_df: DataFrame holding the rows to predict; it must provide the columns
            read in ``__getitem__``.
        n_skills: stored as ``self.n_skill``; not used inside this class.
        max_seq: length of the padded history buffer. Returned sequences have
            ``max_seq - 1`` steps.
    """

    def __init__(self, samples, test_df, n_skills, max_seq=MAX_SEQ):
        super(TestDataset, self).__init__()
        self.samples = samples
        self.user_ids = [x for x in test_df["user_id"].unique()]
        self.test_df = test_df
        self.n_skill = n_skills
        self.max_seq = max_seq

    def __len__(self):
        """One item per row of ``test_df``."""
        return self.test_df.shape[0]

    def _fit_history(self, values):
        """Return the last ``max_seq`` entries of ``values``, left-padded with zeros.

        Histories that already hold at least ``max_seq`` entries are sliced and keep
        their dtype; shorter ones are copied into the tail of an integer zero buffer.
        """
        seq_len = len(values)
        if seq_len >= self.max_seq:
            return values[-self.max_seq:]
        padded = np.zeros(self.max_seq, dtype=int)
        # Guard the empty case: padded[-0:] addresses the WHOLE buffer, so assigning
        # an empty array to it would raise a broadcast error.
        if seq_len > 0:
            padded[-seq_len:] = values
        return padded

    def __getitem__(self, index):
        """Return ``(exercise, part, response, elap, lag, pri_exp)`` for one row.

        Every returned array has ``max_seq - 1`` entries. The feature sequences end
        with the value of the current row, while ``response`` holds the stored
        answers only, so it trails the other sequences by one step.
        """
        test_info = self.test_df.iloc[index]

        user_id = test_info["user_id"]
        target_id = test_info["content_id"]
        part = test_info["part"]
        pri_quest_elap = test_info["prior_question_elapsed_time"]
        time_lag = test_info["time_lag"]
        pri_quest_exp = test_info["prior_question_had_explanation"]

        if user_id in self.samples.index:
            q_, qa_, p_, pri_elap_, lag_, pri_exp_ = self.samples[user_id]

            # Question ids, answers and explanation flags are shifted by +1 so that
            # 0 stays reserved for padding.
            q = self._fit_history(q_ + 1)
            res = self._fit_history(qa_ + 1)
            pri_exp = self._fit_history(pri_exp_ + 1)
            p = self._fit_history(p_)
            pri_elap = self._fit_history(pri_elap_)
            lag = self._fit_history(lag_)
        else:
            # Unknown user: the whole history is padding.
            q, res, p, pri_elap, lag, pri_exp = (
                np.zeros(self.max_seq, dtype=int) for _ in range(6)
            )

        # Drop the two oldest steps and append the current row: max_seq - 1 entries.
        exercise = np.append(q[2:], [target_id + 1])
        part = np.append(p[2:], [part])
        elap = np.append(pri_elap[2:], [pri_quest_elap])
        lag = np.append(lag[2:], [time_lag])
        pri_exp = np.append(pri_exp[2:], [pri_quest_exp + 1])

        # The answer to the current row is unknown, so only one step is dropped here.
        response = res[1:]

        return exercise, part, response, elap, lag, pri_exp

# inference

In [None]:
# Create the competition environment and the iterator that delivers the test batches.
# NOTE(review): the Kaggle time-series API presumably allows make_env() only once per
# kernel session -- confirm before re-running this cell.
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
# Features of the previous batch. Its labels only arrive together with the next batch,
# so the batch is kept around until then.
previous_test_df = None

for (test_df, sample_prediction_df) in iter_test:
    # Update the stored state with the previous batch; skipped when memory is nearly full.
    if previous_test_df is not None and psutil.virtual_memory().percent < 90:
        # Attach the labels of the previous batch to its features.
        # NOTE(review): eval() executes text received from the API; consider
        # ast.literal_eval if the field is always a plain list literal.
        previous_test_df['answered_correctly'] = eval(test_df['prior_group_answers_correct'].iloc[0])
        update_user_feats(previous_test_df)  # update the per-user feature records
        previous_test_df = previous_test_df[previous_test_df.content_type_id == False]  # keep question rows only

        update_up(previous_test_df)  # update the records per (user_id, part) pair

        previous_test_df = feature_time_lag(previous_test_df, time_dict)  # generate lag time

        history_cols = ['user_id', 'content_id', 'answered_correctly', 'part',
                        'prior_question_elapsed_time', 'time_lag', 'prior_question_had_explanation']
        prev_group = previous_test_df[history_cols].groupby('user_id').apply(lambda r: (
            r['content_id'].values,
            r['answered_correctly'].values,
            r['part'].values,
            r['prior_question_elapsed_time'].values,
            r['time_lag'].values,
            r['prior_question_had_explanation'].values))

        for prev_user_id in prev_group.index:
            new_history = prev_group[prev_user_id]
            if prev_user_id in group.index:
                # Known user: append the new interactions and keep the last MAX_SEQ steps.
                group[prev_user_id] = tuple(
                    np.append(old_values, new_values)[-MAX_SEQ:]
                    for old_values, new_values in zip(group[prev_user_id], new_history)
                )
            else:
                # New user: the history starts with this batch.
                group[prev_user_id] = tuple(new_history)

    test_df.prior_question_elapsed_time = test_df.prior_question_elapsed_time.fillna(0)  # fill nulls with 0
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype(int)  # fill nulls with False
    test_df = test_df.merge(questions_df[["question_id", "part"]], how="left", left_on='content_id', right_on='question_id')  # add the part feature

    previous_test_df = test_df.copy()  # kept until the next batch arrives with this batch's labels
    test_df = add_lectures_feats(test_df, curr_lectures_dict)  # lecture-related features
    test_df = test_df[test_df.content_type_id == False]  # keep question rows only

    test_df = add_user_feats_without_update(test_df)  # user-related features

    # Precomputed features: merged in directly.
    test_df = test_df.merge(content_answers_df, how='left', left_on='content_id', right_index=True)
    test_df = test_df.merge(ques_df, how='left', left_on='content_id', right_index=True)
    test_df = test_df.merge(task_user_df, how='left', left_on='task_container_id', right_index=True)
    test_df = test_df.merge(qdf, how="left", left_on="content_id", right_index=True)
    test_df = test_df.merge(ques_df3, how="left", left_on="content_id", right_index=True)
    test_df = test_df.merge(content_elapsed_time_agg, how="left", left_on="content_id", right_index=True)
    test_df = test_df.merge(content_had_explanation_agg, how="left", left_on="content_id", right_index=True)
    test_df = test_df.merge(question_elapsed_time_df, on="content_id", how="left")
    test_df.corr_question_elapsed_time_mean = test_df.corr_question_elapsed_time_mean.fillna(-1).astype("float32")
    test_df.incorr_question_elapsed_time_mean = test_df.incorr_question_elapsed_time_mean.fillna(-1).astype("float32")
    test_df = test_df.merge(content_explation_agg, how="left", left_on="content_id", right_on="content_id")
    # Assign the result back: a chained fillna(inplace=True) acts on a temporary Series
    # and does not update the frame under pandas copy-on-write.
    test_df["content_explation_false_mean"] = test_df["content_explation_false_mean"].fillna(0)
    test_df["content_explation_true_mean"] = test_df["content_explation_true_mean"].fillna(0)

    test_df = lagtime_for_test(test_df)  # time elapsed since the user's previous 1-3 answers
    test_df = add_up_without_update(test_df)  # accuracy per (user_id, part) pair
    test_df = same_part(test_df)  # lectures of the same part watched before answering a question of that part

    # Same simple null filling and outlier handling as in gen_data.
    test_df['hmean_acc'] = 2*((test_df['curr_user_acc']*test_df['content_mean_acc']) /(test_df['curr_user_acc']+test_df['content_mean_acc']))
    test_df["content_mean_acc"] = test_df.content_mean_acc.mask((test_df["content_cnt"] < 3), 0.65)
    test_df["content_mean_acc"] = test_df.content_mean_acc.mask((test_df["content_mean_acc"] < 0.2) & (test_df["content_cnt"] < 21), 0.2)
    test_df["content_mean_acc"] = test_df.content_mean_acc.mask((test_df["content_mean_acc"] > 0.95) & (test_df["content_cnt"] < 21), 0.95)
    test_df["curr_user_acc"] = test_df.curr_user_acc.mask((test_df["curr_user_acc"] < 0.2) & (test_df["curr_user_answer_cnt"] < 21), 0.2)
    test_df["curr_user_acc"] = test_df.curr_user_acc.mask((test_df["curr_user_acc"] > 0.95) & (test_df["curr_user_answer_cnt"] < 21), 0.95)

    test_df = feature_time_lag(test_df, time_dict)  # generate lag time

    # SAINT data pipeline
    test_dataset = TestDataset(group, test_df, n_skill)
    test_dataloader = DataLoader(test_dataset, batch_size=51200, shuffle=False)

    saint_outs = []

    for item in test_dataloader:
        exercise = item[0].to(device).long()
        part = item[1].to(device).long()
        response = item[2].to(device).long()
        elapsed_time = item[3].to(device).long()
        lag_time = item[4].to(device).long()
        pri_exp = item[5].to(device).long()

        with torch.no_grad():
            output = saint_model(exercise, part, response, elapsed_time, lag_time, pri_exp)
        # Only the last sequence position is the prediction for the current row.
        saint_outs.extend(torch.sigmoid(output)[:, -1].view(-1).detach().cpu().numpy())

    # Build the array from the list collected above (the original read an undefined `outs`).
    saint_outs = np.array(saint_outs, dtype=np.float64)  # SAINT output
    lgb_outs = lgb_model.predict(test_df[features])  # LightGBM output

    test_df['answered_correctly'] = saint_outs * 0.9 + lgb_outs * 0.1  # 9:1 blend of SAINT and LightGBM
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])