In [2]:
import numpy as np
import random
import pandas as pd
import joblib

In [3]:
from collections import defaultdict
import datatable as dt
import lightgbm as lgb
from matplotlib import pyplot as plt
import riiideducation
from sklearn.metrics import roc_auc_score
import gc

_ = np.seterr(divide='ignore', invalid='ignore')

# Preprocess

In [4]:
data_types_dict = {
    'timestamp': 'int64',
    'user_id': 'int32', 
    'content_id': 'int16', 
    'content_type_id':'int8', 
    'task_container_id': 'int16',
    #'user_answer': 'int8',
    'answered_correctly': 'int8', 
    'prior_question_elapsed_time': 'float32', 
    'prior_question_had_explanation': 'bool'
}
target = 'answered_correctly'

In [5]:
print('start read train data...')
train_df = dt.fread('../input/riiid-test-answer-prediction/train.csv', columns=set(data_types_dict.keys())).to_pandas()

start read train data...


In [6]:
# train_df=train_df.sample(frac=0.1).reset_index(drop=True)

In [7]:
print('start handle lecture data...')

start handle lecture data...


In [8]:
#reading in lecture df
lectures_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')

In [9]:
lectures_df['type_of'] = lectures_df['type_of'].replace('solving question', 'solving_question')

lectures_df = pd.get_dummies(lectures_df, columns=['part', 'type_of'])

part_lectures_columns = [column for column in lectures_df.columns if column.startswith('part')]

types_of_lectures_columns = [column for column in lectures_df.columns if column.startswith('type_of_')]

In [10]:
train_lectures = train_df[train_df.content_type_id == True].merge(lectures_df, left_on='content_id', right_on='lecture_id', how='left')

In [11]:
user_lecture_stats_part = train_lectures.groupby('user_id',as_index = False)[part_lectures_columns + types_of_lectures_columns].sum()

In [12]:
lecturedata_types_dict = {   
    'user_id': 'int32', 
    'part_1': 'int8',
    'part_2': 'int8',
    'part_3': 'int8',
    'part_4': 'int8',
    'part_5': 'int8',
    'part_6': 'int8',
    'part_7': 'int8',
    'type_of_concept': 'int8',
    'type_of_intention': 'int8',
    'type_of_solving_question': 'int8',
    'type_of_starter': 'int8'
}
user_lecture_stats_part = user_lecture_stats_part.astype(lecturedata_types_dict)

In [13]:
for column in user_lecture_stats_part.columns:
    #bool_column = column + '_boolean'
    if(column !='user_id'):
        user_lecture_stats_part[column] = (user_lecture_stats_part[column] > 0).astype('int8')

In [14]:
user_lecture_stats_part.dtypes

user_id                     int32
part_1                       int8
part_2                       int8
part_3                       int8
part_4                       int8
part_5                       int8
part_6                       int8
part_7                       int8
type_of_concept              int8
type_of_intention            int8
type_of_solving_question     int8
type_of_starter              int8
dtype: object

In [15]:
#clearing memory
del(train_lectures)
gc.collect()

80

In [16]:
user_lecture_agg = train_df.groupby('user_id')['content_type_id'].agg(['sum', 'count'])
user_lecture_agg=user_lecture_agg.astype('int16')

In [17]:

#1= if the event was the user watching a lecture.
cum = train_df.groupby('user_id')['content_type_id'].agg(['cumsum', 'cumcount'])
cum['cumcount']=cum['cumcount']+1
train_df['user_interaction_count'] = cum['cumcount'] 
train_df['user_interaction_timestamp_mean'] = train_df['timestamp']/cum['cumcount'] 
train_df['user_lecture_sum'] = cum['cumsum'] 
train_df['user_lecture_lv'] = cum['cumsum'] / cum['cumcount']


train_df.user_lecture_lv=train_df.user_lecture_lv.astype('float16')
train_df.user_lecture_sum=train_df.user_lecture_sum.astype('int16')
train_df.user_interaction_count=train_df.user_interaction_count.astype('int16')
train_df['user_interaction_timestamp_mean']=train_df['user_interaction_timestamp_mean']/(1000*3600)
train_df.user_interaction_timestamp_mean=train_df.user_interaction_timestamp_mean.astype('float32')


In [18]:
#pd.options.display.max_rows = 200

In [19]:
del cum
gc.collect()

97

In [20]:
print('start handle train_df...')

start handle train_df...


In [21]:
train_df['prior_question_had_explanation'].fillna(False, inplace=True)
train_df = train_df.astype(data_types_dict)
train_df = train_df[train_df[target] != -1].reset_index(drop=True)

In [22]:
content_explation_agg=train_df[["content_id","prior_question_had_explanation",target]].groupby(["content_id","prior_question_had_explanation"])[target].agg(['mean'])

In [23]:
content_explation_agg.dtypes

mean    float64
dtype: object

In [24]:
content_explation_agg=content_explation_agg.unstack()

content_explation_agg=content_explation_agg.reset_index()
content_explation_agg.columns = ['content_id', 'content_explation_false_mean','content_explation_true_mean']

In [25]:
content_explation_agg.content_id=content_explation_agg.content_id.astype('int16')
content_explation_agg.content_explation_false_mean=content_explation_agg.content_explation_false_mean.astype('float16')
content_explation_agg.content_explation_true_mean=content_explation_agg.content_explation_true_mean.astype('float16')

In [26]:
print('start handle attempt_no...')

start handle attempt_no...


In [27]:
train_df["attempt_no"] = 1
train_df.attempt_no=train_df.attempt_no.astype('int8')
#
attempt_no_agg=train_df.groupby(["user_id","content_id"])["attempt_no"].agg(['sum']).astype('int8')
#attempt_no_agg=attempt_no_agg.astype('int8')
train_df["attempt_no"] = train_df[["user_id","content_id",'attempt_no']].groupby(["user_id","content_id"])["attempt_no"].cumsum()

In [28]:
attempt_no_agg=attempt_no_agg[attempt_no_agg['sum'] >1]

In [29]:
print('start handle timestamp...')
prior_question_elapsed_time_mean=train_df['prior_question_elapsed_time'].mean()
train_df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean, inplace=True)

start handle timestamp...


In [30]:
max_timestamp_u = train_df[['user_id','timestamp']].groupby(['user_id']).agg(['max']).reset_index()
max_timestamp_u.columns = ['user_id', 'max_time_stamp']
max_timestamp_u.user_id=max_timestamp_u.user_id.astype('int32')

In [31]:
train_df['lagtime'] = train_df.groupby('user_id')['timestamp'].shift()

max_timestamp_u2 = train_df[['user_id','lagtime']].groupby(['user_id']).agg(['max']).reset_index()
max_timestamp_u2.columns = ['user_id', 'max_time_stamp2']
max_timestamp_u2.user_id=max_timestamp_u2.user_id.astype('int32')

In [32]:
train_df['lagtime']=train_df['timestamp']-train_df['lagtime']
lagtime_mean=train_df['lagtime'].mean()
train_df['lagtime'].fillna(lagtime_mean, inplace=True)

In [33]:
train_df['lagtime']=train_df['lagtime']/(1000*3600)
train_df.lagtime=train_df.lagtime.astype('float32')

In [34]:
train_df['lagtime2'] = train_df.groupby('user_id')['timestamp'].shift(2)

max_timestamp_u3 = train_df[['user_id','lagtime2']].groupby(['user_id']).agg(['max']).reset_index()
max_timestamp_u3.columns = ['user_id', 'max_time_stamp3']
max_timestamp_u3.user_id=max_timestamp_u3.user_id.astype('int32')

train_df['lagtime2']=train_df['timestamp']-train_df['lagtime2']
lagtime_mean2=train_df['lagtime2'].mean()
train_df['lagtime2'].fillna(lagtime_mean2, inplace=True)


In [35]:
train_df['lagtime2']=train_df['lagtime2']/(1000*3600)
train_df.lagtime2=train_df.lagtime2.astype('float32')

In [36]:

train_df['lagtime3'] = train_df.groupby('user_id')['timestamp'].shift(3)

train_df['lagtime3']=train_df['timestamp']-train_df['lagtime3']
lagtime_mean3=train_df['lagtime3'].mean()
train_df['lagtime3'].fillna(lagtime_mean3, inplace=True)
train_df['lagtime3']=train_df['lagtime3']/(1000*3600)
train_df.lagtime3=train_df.lagtime3.astype('float32')

In [37]:
train_df['timestamp']=train_df['timestamp']/(1000*3600)
#
train_df.timestamp=train_df.timestamp.astype('float16')

In [38]:
user_prior_question_elapsed_time = train_df[['user_id','prior_question_elapsed_time']].groupby(['user_id']).tail(1)
user_prior_question_elapsed_time.columns = ['user_id', 'prior_question_elapsed_time']

In [39]:
train_df['question_elapsed_time'] = train_df['prior_question_elapsed_time'].shift(-1)
train_df['question_elapsed_time'] = train_df['question_elapsed_time'].astype('float16')
train_df['delta_prior_question_elapsed_time'] = train_df.groupby('user_id')['prior_question_elapsed_time'].shift()
train_df['delta_prior_question_elapsed_time']=train_df['prior_question_elapsed_time']-train_df['delta_prior_question_elapsed_time']

In [40]:
content_elapsed_time_agg=train_df.loc[train_df[target] == 1]
content_elapsed_time_agg=content_elapsed_time_agg.groupby('content_id')['question_elapsed_time'].agg(['median','mean','max','min','skew','var'])
content_elapsed_time_agg=content_elapsed_time_agg.astype('float16')
content_had_explanation_agg=train_df.groupby('content_id')['prior_question_had_explanation'].agg(['mean'])
content_had_explanation_agg=content_had_explanation_agg.astype('float16')
del train_df['question_elapsed_time']
gc.collect()

133

In [41]:
delta_prior_question_elapsed_time_mean=train_df['delta_prior_question_elapsed_time'].mean()
train_df['delta_prior_question_elapsed_time'].fillna(delta_prior_question_elapsed_time_mean, inplace=True)
train_df.delta_prior_question_elapsed_time=train_df.delta_prior_question_elapsed_time.astype('int32')

In [42]:
train_df['lag'] = train_df.groupby('user_id')[target].shift()

cum = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])
##cum['cumcount']=cum['cumcount']+1
user_agg = train_df.groupby('user_id')['lag'].agg(['sum', 'count']).astype('int16')
cum['cumsum'].fillna(0, inplace=True)

train_df['user_correctness'] = cum['cumsum'] / cum['cumcount']
train_df['user_correct_count'] = cum['cumsum']
train_df['user_uncorrect_count'] = cum['cumcount']-cum['cumsum']
#train_df['user_answer_count'] = cum['cumcount']
train_df.drop(columns=['lag'], inplace=True)
train_df['user_correctness'].fillna(0.67, inplace=True)

train_df.user_correctness=train_df.user_correctness.astype('float16')
train_df.user_correct_count=train_df.user_correct_count.astype('int16')
train_df.user_uncorrect_count=train_df.user_uncorrect_count.astype('int16')
train_df['true_user_correctness'] = (train_df['user_correct_count']+1)/(train_df['user_correct_count']/train_df['user_correctness'] + 2) 
train_df.true_user_correctness=train_df.true_user_correctness.astype('float16')
#train_df.user_answer_count=train_df.user_answer_count.astype('int16')

In [43]:
del cum
gc.collect()

33

In [44]:
train_df.prior_question_had_explanation=train_df.prior_question_had_explanation.astype('int8')
explanation_agg = train_df.groupby('user_id')['prior_question_had_explanation'].agg(['sum', 'count'])
explanation_agg=explanation_agg.astype('int16')

In [45]:
cum = train_df.groupby('user_id')['prior_question_had_explanation'].agg(['cumsum', 'cumcount'])
cum['cumcount']=cum['cumcount']+1
train_df['explanation_mean'] = cum['cumsum'] / cum['cumcount']
train_df['explanation_true_count'] = cum['cumsum'] 
train_df['explanation_false_count'] =  cum['cumcount']-cum['cumsum']

train_df.explanation_mean=train_df.explanation_mean.astype('float16')
train_df.explanation_true_count=train_df.explanation_true_count.astype('int16')
train_df.explanation_false_count=train_df.explanation_false_count.astype('int16')

In [46]:
del cum
gc.collect()

73

In [47]:
content_agg = train_df.groupby('content_id')[target].agg(['sum', 'count','var','skew'])
task_container_agg = train_df.groupby('task_container_id')[target].agg(['sum', 'count','var'])
content_agg=content_agg.astype('float32')
task_container_agg=task_container_agg.astype('float32')

In [48]:
train_df['task_container_uncor_count'] = train_df['task_container_id'].map(task_container_agg['count']-task_container_agg['sum']).astype('int32')
train_df['task_container_cor_count'] = train_df['task_container_id'].map(task_container_agg['sum']).astype('int32')
train_df['task_container_std'] = train_df['task_container_id'].map(task_container_agg['var']).astype('float16')
train_df['task_container_correctness'] = train_df['task_container_id'].map(task_container_agg['sum'] / task_container_agg['count'])
train_df.task_container_correctness=train_df.task_container_correctness.astype('float16')

In [49]:
train_df.dtypes

timestamp                            float16
user_id                                int32
content_id                             int16
content_type_id                         int8
task_container_id                      int16
answered_correctly                      int8
prior_question_elapsed_time          float32
prior_question_had_explanation          int8
user_interaction_count                 int16
user_interaction_timestamp_mean      float32
user_lecture_sum                       int16
user_lecture_lv                      float16
attempt_no                              int8
lagtime                              float32
lagtime2                             float32
lagtime3                             float32
delta_prior_question_elapsed_time      int32
user_correctness                     float16
user_correct_count                     int16
user_uncorrect_count                   int16
true_user_correctness                float16
explanation_mean                     float16
explanatio

In [50]:
train_df.head()

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_interaction_count,user_interaction_timestamp_mean,...,user_correct_count,user_uncorrect_count,true_user_correctness,explanation_mean,explanation_true_count,explanation_false_count,task_container_uncor_count,task_container_cor_count,task_container_std,task_container_correctness
0,0.0,115,5692,0,1,1,13005.081055,0,1,0.0,...,0,0,0.5,0.0,0,1,187175,208704,0.249268,0.527344
1,0.015823,115,5716,0,2,1,37000.0,0,2,0.007909,...,1,0,0.666504,0.0,0,2,223198,172342,0.24585,0.435791
2,0.032867,115,128,0,0,1,55000.0,0,3,0.01096,...,2,0,0.75,0.0,0,3,126682,269233,0.217651,0.680176
3,0.036438,115,7860,0,3,1,19000.0,0,4,0.009109,...,3,0,0.799805,0.0,0,4,180574,214721,0.248169,0.542969
4,0.03833,115,7922,0,4,1,11000.0,0,5,0.007665,...,4,0,0.833496,0.0,0,5,361957,330079,0.249512,0.477051


In [51]:
print('start questions data...')

start questions data...


In [52]:
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv', 
    usecols=[0, 1,3,4],
    dtype={'question_id': 'int16','bundle_id': 'int16', 'part': 'int8','tags': 'str'}
)

In [53]:
que_fea_merge = pd.read_csv(
    '../input/offline-group-by-table/que_fea_merge.csv', 
    usecols=[0, 2,3,4,5,6,7,8,9],
    dtype={'content_id': 'int16','question_elapsed_time_mean': 'float16','question_had_explanation_mean': 'float16', 'question_correctly_q_count': 'int16','question_correctly_q_mean': 'float16','tags_lsi': 'float16', 'tag_acc_count': 'float16','tag_acc_max':'float16','tag_acc_min': 'float16'}
)
part_fea_merge = pd.read_csv(
    '../input/offline-group-by-table/part_fea_merge.csv', 
    usecols=[0, 1,2,3],
    dtype={'part': 'int8','part_elapsed_time_mean': 'float16','part_had_explanation_mean': 'float16','part_correctly_q_mean':'float16'}
)

In [54]:
questions_cmnts = pd.read_csv(
     '../input/riiid-sakt-base/question_cmnts (1).csv', 
     usecols=[1,2],
     dtype={'question_id': 'int16','community': 'int8'}
 )
questions_df = pd.merge(questions_df, questions_cmnts, on='question_id', how='left',right_index=True)#

In [55]:
questions_df['community'] = questions_df['community']
questions_df = pd.get_dummies(questions_df, columns=['community'])
part_questions_columns = [column for column in questions_df.columns if column.startswith('community')]

In [56]:
bundle_agg = questions_df.groupby('bundle_id')['question_id'].agg(['count'])

In [57]:
questions_df['content_sub_bundle'] = questions_df['bundle_id'].map(bundle_agg['count']).astype('int8')

In [58]:
questions_df['tags'].fillna('188', inplace=True)

In [59]:
def gettags(tags,num):
    tags_splits=tags.split(" ")
    result='' 
    for t in tags_splits:
        x=int(t)
        if(x<32*(num+1) and x>=32*num):#num 
            result=result+' '+t
    return result

In [60]:
from sklearn.preprocessing import LabelEncoder
for num in range(0,6):
    questions_df["tags"+str(num)] = questions_df["tags"].apply(lambda row: gettags(row,num))
    le = LabelEncoder()
    le.fit(np.unique(questions_df['tags'+str(num)].values))
    #questions_df[['tags'+str(num)]=
    questions_df['tags'+str(num)]=questions_df[['tags'+str(num)]].apply(le.transform)

In [61]:
questions_df_dict = {   
    'tags0': 'int8',
    'tags1': 'int8',
    'tags2': 'int8',
    'tags3': 'int8',
    'tags4': 'int8',
    'tags5': 'int8',
    #'tags6': 'int8',
    #'tags7': 'int8'
}
questions_df = questions_df.astype(questions_df_dict)

In [62]:
questions_df.drop(columns=['tags'], inplace=True)

In [63]:
questions_df['part_bundle_id']=questions_df['part']*100000+questions_df['bundle_id']
questions_df.part_bundle_id=questions_df.part_bundle_id.astype('int32')

In [64]:
# questions_cmnts = pd.read_csv(
#     '../input/2020-r3id-clustering-question-tags/question_cmnts.csv', 
#     usecols=[1,2],
#     dtype={'question_id': 'int16','community': 'int8'}
# )

In [65]:
# questions_df = pd.merge(questions_df, questions_cmnts, on='question_id', how='left',right_index=True)#

In [66]:
questions_df.rename(columns={'question_id':'content_id'}, inplace=True)

In [67]:
questions_df = pd.merge(questions_df, que_fea_merge, on='content_id', how='left',right_index=True)
questions_df = pd.merge(questions_df, part_fea_merge, on='part', how='left',right_index=True)

In [68]:
questions_df = pd.merge(questions_df, content_explation_agg, on='content_id', how='left',right_index=True)#
# questions_df.content_explation_false_mean=questions_df.content_explation_false_mean.astype('float16')
# questions_df.content_explation_true_mean=questions_df.content_explation_true_mean.astype('float16')

In [69]:
del content_explation_agg

In [70]:
questions_df['content_uncorrect_count'] = questions_df['content_id'].map(content_agg['count']-content_agg['sum']).astype('int32')
questions_df['content_correct_count'] = questions_df['content_id'].map(content_agg['sum']).astype('int32')
questions_df['content_count'] = questions_df['content_id'].map(content_agg['count'])
questions_df.content_count=questions_df.content_count.astype('int32')

questions_df['content_correctness'] = questions_df['content_id'].map(content_agg['sum'] / content_agg['count'])
questions_df.content_correctness=questions_df.content_correctness.astype('float16')

questions_df['true_content_correctness'] = (questions_df['content_correct_count']+1)/(questions_df['content_correct_count']/questions_df['content_correctness'] + 2) 
questions_df.true_content_correctness=questions_df.true_content_correctness.astype('float16')

questions_df['content_correctness_skew'] = questions_df['content_id'].map(content_agg['skew'])
questions_df.content_correctness_skew=questions_df.content_correctness_skew.astype('float16')

questions_df['content_correctness_std'] = questions_df['content_id'].map(content_agg['var'])
questions_df.content_correctness_std=questions_df.content_correctness_std.astype('float16')

In [71]:
questions_df['content_max_time'] = questions_df['content_id'].map(content_elapsed_time_agg['max'])
questions_df.content_max_time=questions_df.content_max_time.astype('float16')
questions_df['content_min_time'] = questions_df['content_id'].map(content_elapsed_time_agg['min'])
questions_df.content_min_time=questions_df.content_min_time.astype('float16')

questions_df['content_med_time'] = questions_df['content_id'].map(content_elapsed_time_agg['median'])
questions_df.content_med_time=questions_df.content_med_time.astype('float16')
questions_df['part_med_time'] = questions_df.groupby('part')['content_med_time'].agg(['mean'])
questions_df['part_med_time'] = questions_df['part_med_time'].astype('float16')

questions_df['content_time_std'] = questions_df['content_id'].map(content_elapsed_time_agg['var'])
questions_df.content_time_std=questions_df.content_time_std.astype('float16')
questions_df['part_time_std'] = questions_df.groupby('part')['content_time_std'].agg(['mean'])
questions_df['part_time_std'] = questions_df['part_time_std'].astype('float16')

questions_df['content_time_skew'] = questions_df['content_id'].map(content_elapsed_time_agg['skew'])
questions_df.content_time_skew=questions_df.content_time_skew.astype('float16')
questions_df['part_time_skew'] = questions_df.groupby('part')['content_time_skew'].agg(['mean'])
questions_df['part_time_skew'] = questions_df['part_time_skew'].astype('float16')

questions_df['content_mean_time'] = questions_df['content_id'].map(content_elapsed_time_agg['mean'])
questions_df['content_mean_time'] = questions_df['content_mean_time'].astype('float16')
questions_df['part_mean_time'] = questions_df.groupby('part')['content_mean_time'].agg(['mean'])
questions_df['part_mean_time'] = questions_df['part_mean_time'].astype('float16')

questions_df['mean_time_standard'] = questions_df['content_mean_time']/questions_df['part_mean_time']
questions_df['mean_time_standard'] = questions_df['mean_time_standard'].astype('float16')

questions_df['content_time_range'] = questions_df['content_max_time']-questions_df['content_min_time']
questions_df.content_time_range=questions_df.content_time_range.astype('float16')

questions_df['content_had_explanation_mean'] = questions_df['content_id'].map(content_had_explanation_agg['mean'])
questions_df.content_had_explanation_mean=questions_df.content_had_explanation_mean.astype('float16')

In [72]:
del content_elapsed_time_agg
del content_had_explanation_agg
gc.collect()

20

In [73]:
part_agg = questions_df.groupby('part')['content_correctness'].agg(['mean', 'var'])
questions_df['part_correctness_mean'] = questions_df['part'].map(part_agg['mean'])
questions_df['part_correctness_std'] = questions_df['part'].map(part_agg['var'])
questions_df.part_correctness_mean=questions_df.part_correctness_mean.astype('float16')
questions_df.part_correctness_std=questions_df.part_correctness_std.astype('float16')

In [74]:
part_agg = questions_df.groupby('part')['content_uncorrect_count'].agg(['sum'])
questions_df['part_uncor_count'] = questions_df['part'].map(part_agg['sum']).astype('int32')
#
part_agg = questions_df.groupby('part')['content_correct_count'].agg(['sum'])
questions_df['part_cor_count'] = questions_df['part'].map(part_agg['sum']).astype('int32')

In [75]:
bundle_agg = questions_df.groupby('bundle_id')['content_correctness'].agg(['mean'])
questions_df['bundle_correctness_mean'] = questions_df['bundle_id'].map(bundle_agg['mean'])
questions_df.bundle_correctness_mean=questions_df.bundle_correctness_mean.astype('float16')

In [76]:
tags1_agg = questions_df.groupby('tags0')['mean_time_standard'].agg(['skew'])
questions_df['tags0_mean_time'] = questions_df['tags0'].map(tags1_agg['skew'])
questions_df.tags0_mean_time=questions_df.tags0_mean_time.astype('float16')
tags1_agg = questions_df.groupby('tags0')['mean_time_standard'].agg(['var'])
questions_df['tags0_med_time'] = questions_df['tags0'].map(tags1_agg['var'])
questions_df.tags0_med_time=questions_df.tags0_med_time.astype('float16')

tags1_agg = questions_df.groupby('tags0')['content_correctness'].agg(['mean'])
questions_df['tags0_correctness_mean'] = questions_df['tags0'].map(tags1_agg['mean'])
questions_df.tags0_correctness_mean=questions_df.tags0_correctness_mean.astype('float16')
tags1_agg = questions_df.groupby('tags0')['content_correctness'].agg(['skew'])
questions_df['tags0_correctness_skew'] = questions_df['tags0'].map(tags1_agg['skew'])
questions_df.tags0_correctness_skew=questions_df.tags0_correctness_skew.astype('float16')
tags1_agg = questions_df.groupby('tags0')['content_correctness'].agg(['var'])
questions_df['tags0_correctness_var'] = questions_df['tags0'].map(tags1_agg['var'])
questions_df.tags0_correctness_var=questions_df.tags0_correctness_var.astype('float16')


tags1_agg = questions_df.groupby('tags1')['mean_time_standard'].agg(['skew'])
questions_df['tags1_mean_time'] = questions_df['tags1'].map(tags1_agg['skew'])
questions_df.tags1_mean_time=questions_df.tags1_mean_time.astype('float16')
tags1_agg = questions_df.groupby('tags1')['mean_time_standard'].agg(['var'])
questions_df['tags1_med_time'] = questions_df['tags1'].map(tags1_agg['var'])
questions_df.tags1_med_time=questions_df.tags1_med_time.astype('float16')

tags1_agg = questions_df.groupby('tags1')['content_correctness'].agg(['mean'])
questions_df['tags1_correctness_mean'] = questions_df['tags1'].map(tags1_agg['mean'])
questions_df.tags1_correctness_mean=questions_df.tags1_correctness_mean.astype('float16')
tags1_agg = questions_df.groupby('tags1')['content_correctness'].agg(['skew'])
questions_df['tags1_correctness_skew'] = questions_df['tags1'].map(tags1_agg['skew'])
questions_df.tags1_correctness_skew=questions_df.tags1_correctness_skew.astype('float16')
tags1_agg = questions_df.groupby('tags1')['content_correctness'].agg(['var'])
questions_df['tags1_correctness_var'] = questions_df['tags1'].map(tags1_agg['var'])
questions_df.tags1_correctness_var=questions_df.tags1_correctness_var.astype('float16')


tags1_agg = questions_df.groupby('tags2')['mean_time_standard'].agg(['skew'])
questions_df['tags2_mean_time'] = questions_df['tags2'].map(tags1_agg['skew'])
questions_df.tags2_mean_time=questions_df.tags2_mean_time.astype('float16')
tags1_agg = questions_df.groupby('tags2')['mean_time_standard'].agg(['var'])
questions_df['tags2_med_time'] = questions_df['tags2'].map(tags1_agg['var'])
questions_df.tags2_med_time=questions_df.tags2_med_time.astype('float16')

tags1_agg = questions_df.groupby('tags2')['content_correctness'].agg(['mean'])
questions_df['tags2_correctness_mean'] = questions_df['tags2'].map(tags1_agg['mean'])
questions_df.tags2_correctness_mean=questions_df.tags2_correctness_mean.astype('float16')
tags1_agg = questions_df.groupby('tags2')['content_correctness'].agg(['skew'])
questions_df['tags2_correctness_skew'] = questions_df['tags2'].map(tags1_agg['skew'])
questions_df.tags2_correctness_skew=questions_df.tags2_correctness_skew.astype('float16')
tags1_agg = questions_df.groupby('tags2')['content_correctness'].agg(['var'])
questions_df['tags2_correctness_var'] = questions_df['tags2'].map(tags1_agg['var'])
questions_df.tags2_correctness_var=questions_df.tags2_correctness_var.astype('float16')


tags1_agg = questions_df.groupby('tags3')['mean_time_standard'].agg(['skew'])
questions_df['tags3_mean_time'] = questions_df['tags3'].map(tags1_agg['skew'])
questions_df.tags3_mean_time=questions_df.tags3_mean_time.astype('float16')
tags1_agg = questions_df.groupby('tags3')['mean_time_standard'].agg(['var'])
questions_df['tags3_med_time'] = questions_df['tags3'].map(tags1_agg['var'])
questions_df.tags3_med_time=questions_df.tags3_med_time.astype('float16')

tags1_agg = questions_df.groupby('tags3')['content_correctness'].agg(['mean'])
questions_df['tags3_correctness_mean'] = questions_df['tags3'].map(tags1_agg['mean'])
questions_df.tags3_correctness_mean=questions_df.tags3_correctness_mean.astype('float16')
tags1_agg = questions_df.groupby('tags3')['content_correctness'].agg(['skew'])
questions_df['tags3_correctness_skew'] = questions_df['tags3'].map(tags1_agg['skew'])
questions_df.tags3_correctness_skew=questions_df.tags3_correctness_skew.astype('float16')
tags1_agg = questions_df.groupby('tags3')['content_correctness'].agg(['var'])
questions_df['tags3_correctness_var'] = questions_df['tags3'].map(tags1_agg['var'])
questions_df.tags3_correctness_var=questions_df.tags3_correctness_var.astype('float16')


tags1_agg = questions_df.groupby('tags4')['mean_time_standard'].agg(['skew'])
questions_df['tags4_mean_time'] = questions_df['tags4'].map(tags1_agg['skew'])
questions_df.tags4_mean_time=questions_df.tags4_mean_time.astype('float16')
tags1_agg = questions_df.groupby('tags4')['mean_time_standard'].agg(['var'])
questions_df['tags4_med_time'] = questions_df['tags4'].map(tags1_agg['var'])
questions_df.tags4_med_time=questions_df.tags4_med_time.astype('float16')

tags1_agg = questions_df.groupby('tags4')['content_correctness'].agg(['mean'])
questions_df['tags4_correctness_mean'] = questions_df['tags4'].map(tags1_agg['mean'])
questions_df.tags4_correctness_mean=questions_df.tags4_correctness_mean.astype('float16')
tags1_agg = questions_df.groupby('tags4')['content_correctness'].agg(['skew'])
questions_df['tags4_correctness_skew'] = questions_df['tags4'].map(tags1_agg['skew'])
questions_df.tags4_correctness_skew=questions_df.tags4_correctness_skew.astype('float16')
tags1_agg = questions_df.groupby('tags4')['content_correctness'].agg(['var'])
questions_df['tags4_correctness_var'] = questions_df['tags4'].map(tags1_agg['var'])
questions_df.tags4_correctness_var=questions_df.tags4_correctness_var.astype('float16')


tags1_agg = questions_df.groupby('tags5')['mean_time_standard'].agg(['skew'])
questions_df['tags5_mean_time'] = questions_df['tags5'].map(tags1_agg['skew'])
questions_df.tags5_mean_time=questions_df.tags5_mean_time.astype('float16')
tags1_agg = questions_df.groupby('tags5')['mean_time_standard'].agg(['var'])
questions_df['tags5_med_time'] = questions_df['tags5'].map(tags1_agg['var'])
questions_df.tags5_med_time=questions_df.tags5_med_time.astype('float16')

tags1_agg = questions_df.groupby('tags5')['content_correctness'].agg(['mean'])
questions_df['tags5_correctness_mean'] = questions_df['tags5'].map(tags1_agg['mean'])
questions_df.tags5_correctness_mean=questions_df.tags5_correctness_mean.astype('float16')
tags1_agg = questions_df.groupby('tags5')['content_correctness'].agg(['skew'])
questions_df['tags5_correctness_skew'] = questions_df['tags5'].map(tags1_agg['skew'])
questions_df.tags5_correctness_skew=questions_df.tags5_correctness_skew.astype('float16')
tags1_agg = questions_df.groupby('tags5')['content_correctness'].agg(['var'])
questions_df['tags5_correctness_var'] = questions_df['tags5'].map(tags1_agg['var'])
questions_df.tags5_correctness_var=questions_df.tags5_correctness_var.astype('float16')

In [77]:
questions_df.dtypes

content_id                  int16
bundle_id                   int16
part                         int8
community_0                 uint8
community_1                 uint8
                           ...   
tags5_mean_time           float16
tags5_med_time            float16
tags5_correctness_mean    float16
tags5_correctness_skew    float16
tags5_correctness_var     float16
Length: 84, dtype: object

In [78]:
del content_agg
del bundle_agg
del part_agg
del tags1_agg
gc.collect()

40

In [79]:
#pd.set_option("display.max_columns",500)

In [80]:
#questions_df.drop(columns=['tags4','tags5','tags6'], inplace=True)

In [81]:
len(train_df)

99271300

In [82]:
#train_df.drop(columns=['content_type_id'], inplace=True)

# Train

In [83]:
features_dict = {
    #'user_id',
    #'timestamp':'float16',#
    'user_interaction_count':'int16',
    'user_interaction_timestamp_mean':'float32',
    'lagtime':'float32',#
    'lagtime2':'float32',
    'lagtime3':'float32',
    #'lagtime_mean':'int32',
    #'content_id':'int16',
    #'task_container_id':'int16',
    'user_lecture_sum':'int16',#
    'user_lecture_lv':'float16',##
    'prior_question_elapsed_time':'float32',#
    #'delta_prior_question_elapsed_time':'int32',#
    'user_correctness':'float16',#
    'true_user_correctness':'float16',
    'user_uncorrect_count':'int16',#
    'user_correct_count':'int16',#
    'content_correctness':'float16',
    'true_content_correctness':'float16',
    'content_correctness_std':'float16',
    'content_count':'int32',
    'content_correct_count':'int32',
    'content_uncorrect_count':'int32',#
    'content_correctness_skew':'float16',
    #'content_time_std':'float16',
    #'part_time_std':'float16',
    #'content_time_skew':'float16',
    #'part_time_skew':'float16',
    'content_mean_time':'float16',
    'part_mean_time':'float16',
    'mean_time_standard':'float16',
    'content_med_time':'float16',
    'part_med_time':'float16',
    'content_time_range':'float16',
    #'content_elapsed_time_mean':'float16',
    'content_had_explanation_mean':'float16',
    'content_explation_false_mean':'float16',
    'content_explation_true_mean':'float16',
    'task_container_correctness':'float16',
    #'task_container_std':'float16',
    'task_container_cor_count':'int32',#
    'task_container_uncor_count':'int32',#
    'attempt_no':'int8',#
    #'part':'int8',
    'part_correctness_mean':'float16',
    'part_correctness_std':'float16',
    'part_uncor_count':'int32',
    'part_cor_count':'int32',
    #'tags0': 'int8',
    #'tags1': 'int8',
    #'tags2': 'int8',
    #'tags3': 'int8',
    #'tags4': 'int8',
    #'tags5': 'int8',
    #'tags6': 'int8',
    #'tags7': 'int8',
     #'tags0_correctness_mean':'float16',
     #'tags1_correctness_mean':'float16',
     #'tags2_correctness_mean':'float16',
     #'tags3_correctness_mean':'float16',
     #'tags4_correctness_mean':'float16',
     #'tags5_correctness_mean':'float16',
     'tags0_correctness_skew':'float16',
     'tags1_correctness_skew':'float16',
     'tags2_correctness_skew':'float16',
     'tags3_correctness_skew':'float16',
     'tags4_correctness_skew':'float16',
     'tags5_correctness_skew':'float16',
     #'tags0_correctness_var':'float16',
     #'tags1_correctness_var':'float16',
     #'tags2_correctness_var':'float16',
     #'tags3_correctness_var':'float16',
     #'tags4_correctness_var':'float16',
     #'tags5_correctness_var':'float16',
    'question_elapsed_time_mean': 'float16',
    'question_had_explanation_mean': 'float16',
    'question_correctly_q_count': 'int16',
    'question_correctly_q_mean': 'float16',
    'tags_lsi': 'float16', 
    'tag_acc_count': 'float16',	
    'tag_acc_max':'float16',
    'tag_acc_min': 'float16',
    'part_elapsed_time_mean': 'float16',
    'part_had_explanation_mean': 'float16',
    'part_correctly_q_mean':'float16',
#     'bundle_id':'int16',
#     'bundle_correctness_mean':'float16',
#     'bundle_uncor_count':'int32',
#     'bundle_cor_count':'int32',
    #'part_bundle_id':'int32',
    'content_sub_bundle':'int8',
    'prior_question_had_explanation':'int8',
    'explanation_mean':'float16', #
    #'explanation_var',#
    'explanation_false_count':'int16',#
    'explanation_true_count':'int16',#
    'community_0':'int8',
    'community_1':'int8',
    'community_2':'int8',
    'community_3':'int8',
    'community_4':'int8',
   # 'community':'int8',
     #'parts_1':'int8',
     #'parts_2':'int8',
     #'parts_3':'int8',
     #'parts_4':'int8',
     #'parts_5':'int8',
     #'parts_6':'int8',
     #'parts_7':'int8',
#     'type_of_concept',
#     'type_of_intention',
#     'type_of_solving_question',
#     'type_of_starter'
}
categorical_columns= [
    #'user_id',
    #'content_id',
    #'task_container_id',
    #'part',
   # 'community',
   #'tags0',
    #'tags1',
    #'tags2',
    #'tags3',
    #'tags4',
    #'tags5',
    #'tags6',
    #'tags7',
    #'bundle_id',
    #'part_bundle_id',
    'content_sub_bundle',
    'prior_question_had_explanation',
    'community_0',
    'community_1',
    'community_2',
    'community_3',
    'community_4', 
     #'parts_1',
     #'parts_2',
     #'parts_3',
     #'parts_4',
     #'parts_5',
     #'parts_6',
     #'parts_7',
#     'type_of_concept',
#     'type_of_intention',
#     'type_of_solving_question',
#     'type_of_starter'
]

features=list(features_dict.keys())


In [84]:
flag_lgbm=True
clfs = list()
params = {
'num_leaves': 200,
'max_bin':450,
# 'min_child_weight': 0.03454472573214212,
'feature_fraction': 0.52,
'bagging_fraction': 0.52,
#'min_data_in_leaf': 106,
# 'max_depth': -1,
'objective': 'binary',
'learning_rate': 0.05,
"boosting_type": "gbdt",
"metric": 'auc',
# "bagging_seed": 11,
# "verbosity": -1,
# 'reg_alpha': 0.3899927210061127,
# 'reg_lambda': 0.6485237330340494,
# 'random_state': 47
}
trains=list()
valids=list()
num=1
for i in range(0,num):
    
    #train_df=train_df.reset_index(drop=True)
    #train_df_clf=train_df.sample(n=1200*10000)
    train_df_clf=train_df[1300*10000:2*1300*10000]
    print('sample end')
    #train_df.drop(train_df_clf.index, inplace=True)
    #print('train_df drop end')
    
   
    del train_df
    
    
    
    users=train_df_clf['user_id'].drop_duplicates()#
    #
    users=users.sample(frac=0.08)
    users_df=pd.DataFrame()
    users_df['user_id']=users.values
   
   
    valid_df_newuser = pd.merge(train_df_clf, users_df, on=['user_id'], how='inner',right_index=True)
    del users_df
    del users
    gc.collect()
    #
    train_df_clf.drop(valid_df_newuser.index, inplace=True)
    print('pd.merge(train_df_clf, questions_df)')
    #-----------
    #train_df_clf=train_df_clf.sample(frac=0.2)
    #train_df_clf.drop(valid_df_newuser.index, inplace=True)
    train_df_clf = pd.merge(train_df_clf, questions_df, on='content_id', how='left',right_index=True)#
    valid_df_newuser = pd.merge(valid_df_newuser, questions_df, on='content_id', how='left',right_index=True)#
    
#     train_df_clf = pd.merge(train_df_clf, user_lecture_stats_part, on='user_id', how="left",right_index=True)
#     valid_df_newuser = pd.merge(valid_df_newuser, user_lecture_stats_part, on='user_id', how="left",right_index=True)
    print('valid_df')
    valid_df=train_df_clf.sample(frac=0.1)
    train_df_clf.drop(valid_df.index, inplace=True)
    
#     test_df=train_df_clf.sample(n=100*10000)
#     train_df_clf.drop(test_df.index, inplace=True)
   
    valid_df = valid_df.append(valid_df_newuser)
    del valid_df_newuser
    gc.collect()
    #

    trains.append(train_df_clf)
    valids.append(valid_df)
    print('train_df_clf length：',len(train_df_clf))
    print('valid_df length：',len(valid_df))
    #train_df=train_df.reset_index(drop=True)

sample end
pd.merge(train_df_clf, questions_df)
valid_df
train_df_clf length： 10764756
valid_df length： 2235244


In [85]:
#del train_df
del train_df_clf
del valid_df
gc.collect()

20

In [86]:
'''
for i in range(0,num):
      
#     tr_data = lgb.Dataset(trains[i][features], label=trains[i][target])
#     va_data = lgb.Dataset(valids[i][features], label=valids[i][target])

    #Don't use DF to create lightgbm dataset, rather use np array:
    X_train_np = trains[i][features].values.astype(np.float32)
    X_valid_np = valids[i][features].values.astype(np.float32)
    #features = train.columns
    tr_data = lgb.Dataset(X_train_np, label=trains[i][target], feature_name=list(features))
    va_data = lgb.Dataset(X_valid_np, label=valids[i][target], feature_name=list(features))
    

#     del train_df_clf
#     del valid_df
#     gc.collect()
    del trains
    del valids
    del X_train_np
    del X_valid_np
    gc.collect()

    model = lgb.train(
        params, 
        tr_data,
#         train_df[features],
#         train_df[target],
        num_boost_round=5000,
        #valid_sets=[(train_df[features],train_df[target]), (valid_df[features],valid_df[target])], 
        valid_sets=[tr_data, va_data],
        early_stopping_rounds=50,
        feature_name=features,
        categorical_feature=categorical_columns,
        verbose_eval=50
    )
    '''
del trains
del valids
gc.collect()
model = lgb.Booster(model_file='../input/riiid-sakt-base/New_features1.txt')

clfs.append(model)
    #print('test-auc:', roc_auc_score(test_df[target], model.predict(test_df[features])))
    #model.save_model(f'model.txt')


    #fig,ax = plt.subplots(figsize=(15,15))
    #lgb.plot_importance(model, ax=ax,importance_type='gain',max_num_features=50)
    #plt.show()

#del tr_data
#del va_data
#gc.collect()



# del test_df
# gc.collect()

In [87]:
#joblib.dump( model, 'without_contentIDMean_Median_Riiid LGBM bagging.pkl')

In [88]:
#model = joblib.load('../input/riiid-sakt-base/without_contentIDMean_Median_Riiid LGBM bagging.pkl')#

In [89]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


MAX_SEQ = 100

class FFN(nn.Module):
    def __init__(self, state_size=200):
        super(FFN, self).__init__()
        self.state_size = state_size

        self.lr1 = nn.Linear(state_size, state_size)
        self.relu = nn.ReLU()
        self.lr2 = nn.Linear(state_size, state_size)
        self.dropout = nn.Dropout(0.1)
    
    def forward(self, x):
        x = self.lr1(x)
        x = self.relu(x)
        x = self.lr2(x)
        return self.dropout(x)

def future_mask(seq_length):
    future_mask = np.triu(np.ones((seq_length, seq_length)), k=1).astype('bool')
    return torch.from_numpy(future_mask)


class SAKTModel(nn.Module):
    def __init__(self, n_skill, max_seq=MAX_SEQ, embed_dim=128):
        super(SAKTModel, self).__init__()
        self.n_skill = n_skill
        self.embed_dim = embed_dim

        self.embedding = nn.Embedding(2*n_skill+1, embed_dim)
        self.pos_embedding = nn.Embedding(max_seq-1, embed_dim)
        self.e_embedding = nn.Embedding(n_skill+1, embed_dim)

        self.multi_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=8, dropout=0.2)

        self.dropout = nn.Dropout(0.1)
        self.layer_normal = nn.LayerNorm(embed_dim) 

        self.ffn = FFN(embed_dim)
        self.pred = nn.Linear(embed_dim, 1)
    
    def forward(self, x, question_ids):
        device = x.device        
        x = self.embedding(x)
        pos_id = torch.arange(x.size(1)).unsqueeze(0).to(device)

        pos_x = self.pos_embedding(pos_id)
        x = x + pos_x

        e = self.e_embedding(question_ids)

        x = x.permute(1, 0, 2) # x: [bs, s_len, embed] => [s_len, bs, embed]
        e = e.permute(1, 0, 2)
        att_mask = future_mask(x.size(0)).to(device)
        att_output, att_weight = self.multi_att(e, x, x, attn_mask=att_mask)
        att_output = self.layer_normal(att_output + e)
        att_output = att_output.permute(1, 0, 2) # att_output: [s_len, bs, embed] => [bs, s_len, embed]

        x = self.ffn(att_output)
        x = self.layer_normal(x + att_output)
        x = self.pred(x)

        return x.squeeze(-1), att_weight
    
    
skills = joblib.load("/kaggle/input/riiid-sakt-model-dataset-public/skills.pkl.zip")
n_skill = len(skills)
group = joblib.load("/kaggle/input/riiid-sakt-model-dataset-public/group.pkl.zip")


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

nn_model = SAKTModel(n_skill, embed_dim=128)
try:
    nn_model.load_state_dict(torch.load("/kaggle/input/riiid-sakt-model-dataset-public/sakt_model.pt"))
except:
    nn_model.load_state_dict(torch.load("/kaggle/input/riiid-sakt-model-dataset-public/sakt_model.pt", map_location='cpu'))
nn_model.to(device)
nn_model.eval()

SAKTModel(
  (embedding): Embedding(27047, 128)
  (pos_embedding): Embedding(99, 128)
  (e_embedding): Embedding(13524, 128)
  (multi_att): MultiheadAttention(
    (out_proj): _LinearWithBias(in_features=128, out_features=128, bias=True)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (layer_normal): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (ffn): FFN(
    (lr1): Linear(in_features=128, out_features=128, bias=True)
    (relu): ReLU()
    (lr2): Linear(in_features=128, out_features=128, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (pred): Linear(in_features=128, out_features=1, bias=True)
)

# Inference

In [90]:
class TestDataset(Dataset):
    def __init__(self, samples, test_df, skills, max_seq=MAX_SEQ): 
        super(TestDataset, self).__init__()
        self.samples = samples
        self.user_ids = [x for x in test_df["user_id"].unique()]
        self.test_df = test_df
        self.skills = skills
        self.n_skill = len(skills)
        self.max_seq = max_seq

    def __len__(self):
        return self.test_df.shape[0]

    def __getitem__(self, index):
        test_info = self.test_df.iloc[index]

        user_id = test_info["user_id"]
        target_id = test_info["content_id"]

        q = np.zeros(self.max_seq, dtype=int)
        qa = np.zeros(self.max_seq, dtype=int)

        if user_id in self.samples.index:
            q_, qa_ = self.samples[user_id]
            
            seq_len = len(q_)

            if seq_len >= self.max_seq:
                q = q_[-self.max_seq:]
                qa = qa_[-self.max_seq:]
            else:
                q[-seq_len:] = q_
                qa[-seq_len:] = qa_          
        
        x = np.zeros(self.max_seq-1, dtype=int)
        x = q[1:].copy()
        x += (qa[1:] == 1) * self.n_skill
        
        questions = np.append(q[2:], [target_id])
        
        return x, questions

In [91]:
user_sum_dict = user_agg['sum'].astype('int16').to_dict(defaultdict(int))
user_count_dict = user_agg['count'].astype('int16').to_dict(defaultdict(int))
# user_sum_dict2 = user_agg2['sum'].astype('int16').to_dict(defaultdict(int))
# user_count_dict2 = user_agg2['count'].astype('int16').to_dict(defaultdict(int))

In [92]:
#user_var_dict = user_agg['var'].astype('float16').to_dict(defaultdict(int))

# content_sum_dict = content_agg['sum'].astype('int32').to_dict(defaultdict(int))
# content_count_dict = content_agg['count'].astype('int32').to_dict(defaultdict(int))

del user_agg
#del user_agg2
#del content_agg
gc.collect()

task_container_sum_dict = task_container_agg['sum'].astype('int32').to_dict(defaultdict(int))
task_container_count_dict = task_container_agg['count'].astype('int32').to_dict(defaultdict(int))
task_container_std_dict = task_container_agg['var'].astype('float16').to_dict(defaultdict(int))

explanation_sum_dict = explanation_agg['sum'].astype('int16').to_dict(defaultdict(int))
explanation_count_dict = explanation_agg['count'].astype('int16').to_dict(defaultdict(int))
#explanation_var_dict = explanation_agg['var'].astype('float16').to_dict(defaultdict(int))
del task_container_agg
del explanation_agg
gc.collect()

0

In [93]:
#task_container_std_dict

In [94]:
user_lecture_sum_dict = user_lecture_agg['sum'].astype('int16').to_dict(defaultdict(int))
user_lecture_count_dict = user_lecture_agg['count'].astype('int16').to_dict(defaultdict(int))

#lagtime_mean_dict = lagtime_agg['mean'].astype('int32').to_dict(defaultdict(int))
#del prior_question_elapsed_time_agg
del user_lecture_agg
#del lagtime_agg
gc.collect()

40

In [95]:
max_timestamp_u_dict=max_timestamp_u.set_index('user_id').to_dict()
max_timestamp_u_dict2=max_timestamp_u2.set_index('user_id').to_dict()
max_timestamp_u_dict3=max_timestamp_u3.set_index('user_id').to_dict()
user_prior_question_elapsed_time_dict=user_prior_question_elapsed_time.set_index('user_id').to_dict()
#del question_elapsed_time_agg
del max_timestamp_u
del max_timestamp_u2
del max_timestamp_u3
del user_prior_question_elapsed_time
gc.collect()

20

In [96]:
attempt_no_sum_dict = attempt_no_agg['sum'].to_dict(defaultdict(int))

del attempt_no_agg
gc.collect()

0

In [97]:
def get_max_attempt(user_id,content_id):
    k = (user_id,content_id)

    if k in attempt_no_sum_dict.keys():
        attempt_no_sum_dict[k]+=1
        return attempt_no_sum_dict[k]

    attempt_no_sum_dict[k] = 1
    return attempt_no_sum_dict[k]

In [98]:
# def get_max_attempt(user_id,content_id):
#     global  attempt_no_agg
#     #k = (user_id,content_id)
#     if(len(attempt_no_agg[(attempt_no_agg['user_id']==user_id) & (attempt_no_agg['content_id'] ==content_id)])==1):
#     #if k in attempt_no_sum_dict.keys():
#         x= attempt_no_agg.loc[(attempt_no_agg['user_id']==user_id) & (attempt_no_agg['content_id'] ==content_id),'sum'].values 
#         attempt_no_agg.loc[(attempt_no_agg['user_id']==user_id) & (attempt_no_agg['content_id'] ==content_id),'sum']=x+1
#         return x+1
      
#     attempt_no_agg = attempt_no_agg.append([{'user_id':user_id,'content_id':content_id,'sum':1}], ignore_index=True)
#     return 1

In [99]:
#model = lgb.Booster(model_file='../input/riiid-sakt-base/model.txt')
env = riiideducation.make_env()

In [100]:
iter_test = env.iter_test()
prior_test_df = None
prev_test_df1 = None

In [101]:
N=[0.4,0.6]

In [102]:
%%time

for (test_df, sample_prediction_df) in iter_test:
    test_df1=test_df.copy()
    if (prev_test_df1 is not None):
        prev_test_df1['answered_correctly'] = eval(test_df1['prior_group_answers_correct'].iloc[0])
        prev_test_df1 = prev_test_df1[prev_test_df1.content_type_id == False]
        
        prev_group = prev_test_df1[['user_id', 'content_id', 'answered_correctly']].groupby('user_id').apply(lambda r: (
            r['content_id'].values,
            r['answered_correctly'].values))
        for prev_user_id in prev_group.index:
            if prev_user_id in group.index:
                group[prev_user_id] = (
                    np.append(group[prev_user_id][0], prev_group[prev_user_id][0])[-MAX_SEQ:], 
                    np.append(group[prev_user_id][1], prev_group[prev_user_id][1])[-MAX_SEQ:]
                )
 
            else:
                group[prev_user_id] = (
                    prev_group[prev_user_id][0], 
                    prev_group[prev_user_id][1]
                )

    prev_test_df1 = test_df1.copy()
    
    test_df1 = test_df1[test_df1.content_type_id == False]
    test_dataset = TestDataset(group, test_df1, skills)
    test_dataloader = DataLoader(test_dataset, batch_size=51200, shuffle=False)
    
    outs = []

    for item in test_dataloader:
        x = item[0].to(device).long()
        target_id = item[1].to(device).long()
        with torch.no_grad():
            output, att_weight = nn_model(x, target_id)
        outs.extend(torch.sigmoid(output)[:, -1].view(-1).data.cpu().numpy())
        
    #test_df1['answered_correctly'] = outs
    
    
    
    
    ###for lgb
    if prior_test_df is not None:
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop=True)
        #prior_test_df = prior_test_df[prior_test_df[target] != -1]
        prior_test_df['prior_question_had_explanation'].fillna(False, inplace=True)       
        prior_test_df.prior_question_had_explanation=prior_test_df.prior_question_had_explanation.astype('int8')
    
        user_ids = prior_test_df['user_id'].values
        #content_ids = prior_test_df['content_id'].values
        #task_container_ids = prior_test_df['task_container_id'].values
        #prior_question_had_explanations = prior_test_df['prior_question_had_explanation'].values
        targets = prior_test_df[target].values        
        
        for user_id, answered_correctly in zip(user_ids,targets):
            user_sum_dict[user_id] += answered_correctly
            user_count_dict[user_id] += 1
#             user_sum_dict2[user_id] += answered_correctly
#             user_count_dict2[user_id] += 1   
            

    prior_test_df = test_df.copy() 
        
    
    question_len=len( test_df[test_df['content_type_id'] == 0])
    test_df['prior_question_had_explanation'].fillna(False, inplace=True)
    test_df.prior_question_had_explanation=test_df.prior_question_had_explanation.astype('int8')
    test_df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean, inplace=True)
    

    user_lecture_sum = np.zeros(question_len, dtype=np.int16)
    user_lecture_count = np.zeros(question_len, dtype=np.int16) 
    
    user_sum = np.zeros(question_len, dtype=np.int16)
    user_count = np.zeros(question_len, dtype=np.int16)
#     user_sum2 = np.zeros(question_len, dtype=np.int16)
#     user_count2 = np.zeros(question_len, dtype=np.int16)

#     user_sum_dict_test=defaultdict(int)
#     user_count_dict_test=defaultdict(int)

    task_container_sum = np.zeros(question_len, dtype=np.int32)
    task_container_count = np.zeros(question_len, dtype=np.int32)
    task_container_std = np.zeros(question_len, dtype=np.float16)

    explanation_sum = np.zeros(question_len, dtype=np.int32)
    explanation_count = np.zeros(question_len, dtype=np.int32)
    delta_prior_question_elapsed_time = np.zeros(question_len, dtype=np.int32)

    attempt_no_count = np.zeros(question_len, dtype=np.int16)
    lagtime = np.zeros(question_len, dtype=np.float32)
    lagtime2 = np.zeros(question_len, dtype=np.float32)
    lagtime3 = np.zeros(question_len, dtype=np.float32)
    #lagtime_means = np.zeros(question_len, dtype=np.int32)
    #
   
    i=0
    for j, (user_id,prior_question_had_explanation,content_type_id,prior_question_elapsed_time,timestamp, content_id,task_container_id) in enumerate(zip(test_df['user_id'].values,test_df['prior_question_had_explanation'].values,test_df['content_type_id'].values,test_df['prior_question_elapsed_time'].values,test_df['timestamp'].values, test_df['content_id'].values, test_df['task_container_id'].values)):
        
         #
        user_lecture_sum_dict[user_id] += content_type_id
        user_lecture_count_dict[user_id] += 1
        if(content_type_id==1):#
            x=1
#             if(len(user_lecture_stats_part[user_lecture_stats_part.user_id==user_id])==0):
#                 user_lecture_stats_part = user_lecture_stats_part.append([{'user_id':user_id}], ignore_index=True)
#                 user_lecture_stats_part.fillna(0, inplace=True)
#                 user_lecture_stats_part.loc[user_lecture_stats_part.user_id==user_id,part_lectures_columns + types_of_lectures_columns]+=lectures_df[lectures_df.lecture_id==content_id][part_lectures_columns + types_of_lectures_columns].values
#             else:
#                 user_lecture_stats_part.loc[user_lecture_stats_part.user_id==user_id,part_lectures_columns + types_of_lectures_columns]+=lectures_df[lectures_df.lecture_id==content_id][part_lectures_columns + types_of_lectures_columns].values
        if(content_type_id==0):#   
            user_lecture_sum[i] = user_lecture_sum_dict[user_id]
            user_lecture_count[i] = user_lecture_count_dict[user_id]
                
            user_sum[i] = user_sum_dict[user_id]
            user_count[i] = user_count_dict[user_id]
#             user_sum2[i] = user_sum_dict2[user_id]
#             user_count2[i] = user_count_dict2[user_id]
    #         content_sum[i] = content_sum_dict[content_id]
    #         content_count[i] = content_count_dict[content_id]
            task_container_sum[i] = task_container_sum_dict[task_container_id]
            task_container_count[i] = task_container_count_dict[task_container_id]
            task_container_std[i]=task_container_std_dict[task_container_id]

            explanation_sum_dict[user_id] += prior_question_had_explanation
            explanation_count_dict[user_id] += 1
            explanation_sum[i] = explanation_sum_dict[user_id]
            explanation_count[i] = explanation_count_dict[user_id]

            if user_id in max_timestamp_u_dict['max_time_stamp'].keys():
                lagtime[i]=timestamp-max_timestamp_u_dict['max_time_stamp'][user_id]
                if(max_timestamp_u_dict2['max_time_stamp2'][user_id]==lagtime_mean2):#
                    lagtime2[i]=lagtime_mean2
                    lagtime3[i]=lagtime_mean3
                    #max_timestamp_u_dict3['max_time_stamp3'].update({user_id:lagtime_mean3})
                else:
                    lagtime2[i]=timestamp-max_timestamp_u_dict2['max_time_stamp2'][user_id]
                    if(max_timestamp_u_dict3['max_time_stamp3'][user_id]==lagtime_mean3):
                        lagtime3[i]=lagtime_mean3 #
                    else:
                        lagtime3[i]=timestamp-max_timestamp_u_dict3['max_time_stamp3'][user_id]
                    
                    max_timestamp_u_dict3['max_time_stamp3'][user_id]=max_timestamp_u_dict2['max_time_stamp2'][user_id]
                        
                max_timestamp_u_dict2['max_time_stamp2'][user_id]=max_timestamp_u_dict['max_time_stamp'][user_id]
                max_timestamp_u_dict['max_time_stamp'][user_id]=timestamp
#                 lagtime_means[i]=(lagtime_mean_dict[user_id]+lagtime[i])/2
#                 lagtime_mean_dict[user_id]=lagtime_means[i]
            else:
                lagtime[i]=lagtime_mean
                max_timestamp_u_dict['max_time_stamp'].update({user_id:timestamp})
                lagtime2[i]=lagtime_mean2#
                max_timestamp_u_dict2['max_time_stamp2'].update({user_id:lagtime_mean2})
                lagtime3[i]=lagtime_mean3#
                max_timestamp_u_dict3['max_time_stamp3'].update({user_id:lagtime_mean3})
#                 lagtime_mean_dict.update({user_id:lagtime_mean})
#                 lagtime_means[i]=(lagtime_mean_dict[user_id]+lagtime[i])/2

            if user_id in user_prior_question_elapsed_time_dict['prior_question_elapsed_time'].keys():            
                delta_prior_question_elapsed_time[i]=prior_question_elapsed_time-user_prior_question_elapsed_time_dict['prior_question_elapsed_time'][user_id]
                user_prior_question_elapsed_time_dict['prior_question_elapsed_time'][user_id]=prior_question_elapsed_time
            else:           
                delta_prior_question_elapsed_time[i]=delta_prior_question_elapsed_time_mean    
                user_prior_question_elapsed_time_dict['prior_question_elapsed_time'].update({user_id:prior_question_elapsed_time})
            i=i+1 


        
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    #test_df = test_df[test_df['content_type_id'] == 0]
    #right_index=True
    #test_df = pd.merge(test_df, questions_df, on='content_id', how='left',right_index=True)    
    #test_df = pd.concat([test_df.reset_index(drop=True), questions_df.reindex(test_df['content_id'].values).reset_index(drop=True)], axis=1)
    test_df=test_df.merge(questions_df.loc[questions_df.index.isin(test_df['content_id'])],
                  how='left', on='content_id', right_index=True)
    
    #test_df = pd.merge(test_df, user_lecture_stats_part, on=['user_id'], how="left",right_index=True)
    #test_df = pd.concat([test_df.reset_index(drop=True), user_lecture_stats_part.reindex(test_df['user_id'].values).reset_index(drop=True)], axis=1)
#     test_df=test_df.merge(user_lecture_stats_part.loc[user_lecture_stats_part.index.isin(test_df['user_id'])],
#                   how='left', on='user_id', right_index=True)
 
    test_df['user_lecture_lv'] = user_lecture_sum / user_lecture_count
    test_df['user_lecture_sum'] = user_lecture_sum
    
    test_df['user_interaction_count'] = user_lecture_count
    test_df['user_interaction_timestamp_mean'] = test_df['timestamp']/user_lecture_count
    
    test_df['user_correctness'] = user_sum / user_count
    test_df['user_uncorrect_count'] =user_count-user_sum
    test_df['user_correct_count'] =user_sum
    test_df['true_user_correctness'] = (test_df['user_correct_count']+1)/(test_df['user_correct_count']/test_df['user_correctness'] + 2) 
    #test_df['user_answer_count'] =user_count
    
#     test_df['user_correctness2'] = user_sum2 / user_count2
#     test_df['user_uncorrect_count2'] =user_count2-user_sum2
#     test_df['user_correct_count2'] =user_sum2
    #test_df['user_answer_count2'] =user_count2
    
    #    
    test_df['task_container_correctness'] = task_container_sum / task_container_count
    test_df['task_container_cor_count'] = task_container_sum 
    test_df['task_container_uncor_count'] =task_container_count-task_container_sum 
    test_df['task_container_std'] = task_container_std 
    #test_df['content_task_mean'] = content_task_mean 
    
    test_df['explanation_mean'] = explanation_sum / explanation_count
    test_df['explanation_true_count'] = explanation_sum
    test_df['explanation_false_count'] = explanation_count-explanation_sum 
    
    #
    test_df['delta_prior_question_elapsed_time'] = delta_prior_question_elapsed_time 
    
  
 
    test_df["attempt_no"] = test_df[["user_id", "content_id"]].apply(lambda row: get_max_attempt(row["user_id"], row["content_id"]), axis=1)
    test_df["lagtime"]=lagtime
    test_df["lagtime2"]=lagtime2
    test_df["lagtime3"]=lagtime3
    #test_df["lagtime_mean"]=lagtime_means

    

    test_df['timestamp']=test_df['timestamp']/(1000*3600)
    test_df.timestamp=test_df.timestamp.astype('float16')
    test_df['lagtime']=test_df['lagtime']/(1000*3600)
    test_df.lagtime=test_df.lagtime.astype('float32')
    test_df['lagtime2']=test_df['lagtime2']/(1000*3600)
    test_df.lagtime2=test_df.lagtime2.astype('float32')
    test_df['lagtime3']=test_df['lagtime3']/(1000*3600)
    test_df.lagtime3=test_df.lagtime3.astype('float32')
    test_df['user_interaction_timestamp_mean']=test_df['user_interaction_timestamp_mean']/(1000*3600)
    test_df.user_interaction_timestamp_mean=test_df.user_interaction_timestamp_mean.astype('float32')
    
    test_df['user_correctness'].fillna(0.67, inplace=True)
    #test_df['user_correctness2'].fillna(0.67, inplace=True)
    #
    #test_df = test_df.astype(features_dict)

    sub_preds = np.zeros(test_df.shape[0])
    for i, model in enumerate(clfs, 1):
        test_preds  = model.predict(test_df[features])
        sub_preds += test_preds
    o2=sub_preds / len(clfs)
    test_df[target]=0.5*np.array(outs)+0.5*np.array(o2)
#     if(flag_lgbm):
#         test_df[target] = model.predict(test_df[features])
#     else:
#         test_df[target] = model.predict(test_df[features].values)
    env.predict(test_df[['row_id', target]])

CPU times: user 1.4 s, sys: 77.2 ms, total: 1.47 s
Wall time: 1.25 s


In [103]:
sub = pd.read_csv('./submission.csv')
sub

Unnamed: 0,row_id,answered_correctly
0,0,0.414378
1,1,0.862806
2,2,0.688597
3,3,0.841724
4,4,0.360560
...,...,...
99,104,0.393216
100,105,0.778837
101,106,0.662380
102,107,0.610120
