In [1]:
import numpy as np
import random
import pandas as pd
import joblib
import psutil
from collections import defaultdict
import datatable as dt
import lightgbm as lgb
from matplotlib import pyplot as plt
# import riiideducation
from sklearn.metrics import roc_auc_score
import gc

_ = np.seterr(divide='ignore', invalid='ignore')

# Preprocess

In [2]:
# Column -> dtype map for the train.csv columns used by this notebook.
# Its keys select which columns are read from disk, and the whole mapping is
# applied to train_df with astype() once missing values have been handled.
data_types_dict = {
    'timestamp': 'int64',
    'user_id': 'int32', 
    'content_id': 'int16', 
    'content_type_id':'int8', 
    'task_container_id': 'int16',
    #'user_answer': 'int8',
    'answered_correctly': 'int8', 
    'prior_question_elapsed_time': 'float32', 
    'prior_question_had_explanation': 'bool'
}
# Name of the label column predicted by the model.
target = 'answered_correctly'

In [3]:
print('start read train data...')
# Read only the columns named in data_types_dict with datatable, then convert to a pandas frame.
train_df = dt.fread('../input/riiid-test-answer-prediction/train.csv', columns=set(data_types_dict.keys())).to_pandas()

start read train data...


In [4]:
# train_df=train_df.sample(frac=0.1).reset_index(drop=True)

In [5]:
print('start handle lecture data...')

start handle lecture data...


In [6]:
#reading in lecture df
lectures_df = pd.read_csv('../input/riiid-test-answer-prediction/lectures.csv')

In [7]:
# Give the one multi-word lecture type an underscore so its dummy column name has no space.
lectures_df['type_of'] = lectures_df['type_of'].replace({'solving question': 'solving_question'})

# One-hot encode the lecture part and the lecture type.
lectures_df = pd.get_dummies(lectures_df, columns=['part', 'type_of'])

# Names of the generated indicator columns, in frame order.
part_lectures_columns = lectures_df.columns[lectures_df.columns.str.startswith('part')].tolist()
types_of_lectures_columns = lectures_df.columns[lectures_df.columns.str.startswith('type_of_')].tolist()

In [8]:
# Keep the rows whose content_type_id is truthy and attach the one-hot lecture
# metadata by matching content_id against lecture_id (left join).
train_lectures = train_df[train_df.content_type_id == True].merge(lectures_df, left_on='content_id', right_on='lecture_id', how='left')

In [9]:
# Per-user sums of the one-hot part / type_of indicator columns (user_id kept as a column).
user_lecture_stats_part = train_lectures.groupby('user_id',as_index = False)[part_lectures_columns + types_of_lectures_columns].sum()

In [10]:
# Target dtypes for the per-user lecture table.
lecturedata_types_dict = {
    'user_id': 'int32',
    'part_1': 'int8',
    'part_2': 'int8',
    'part_3': 'int8',
    'part_4': 'int8',
    'part_5': 'int8',
    'part_6': 'int8',
    'part_7': 'int8',
    'type_of_concept': 'int8',
    'type_of_intention': 'int8',
    'type_of_solving_question': 'int8',
    'type_of_starter': 'int8'
}
# Reduce every per-user sum to a "watched at least one" flag *before* narrowing
# to int8. Casting the raw sums first would wrap any count above 127 around to
# a negative number (or to zero), and a later "> 0" test would then report the
# user as never having watched such a lecture.
_lecture_flag_columns = [name for name in lecturedata_types_dict if name != 'user_id']
user_lecture_stats_part[_lecture_flag_columns] = user_lecture_stats_part[_lecture_flag_columns] > 0
user_lecture_stats_part = user_lecture_stats_part.astype(lecturedata_types_dict)

In [11]:
# Turn every column except the user_id key into a 0/1 flag: 1 when the stored
# value is positive, 0 otherwise.
for column in user_lecture_stats_part.columns:
    #bool_column = column + '_boolean'
    if(column !='user_id'):
        user_lecture_stats_part[column] = (user_lecture_stats_part[column] > 0).astype('int8')

In [12]:
user_lecture_stats_part.dtypes

user_id                     int32
part_1                       int8
part_2                       int8
part_3                       int8
part_4                       int8
part_5                       int8
part_6                       int8
part_7                       int8
type_of_concept              int8
type_of_intention            int8
type_of_solving_question     int8
type_of_starter              int8
dtype: object

In [13]:
#clearing memory
del(train_lectures)
gc.collect()

160

In [14]:
# Per-user totals over content_type_id: 'sum' adds the flag up, 'count' is the number of rows.
user_lecture_agg = (
    train_df.groupby('user_id')['content_type_id']
    .agg(['sum', 'count'])
    .astype('int16')
)

In [15]:

# content_type_id is 1 when the event was the user watching a lecture, so its
# running sum per user is the number of lectures seen so far.
cum = train_df.groupby('user_id')['content_type_id'].agg(['cumsum', 'cumcount'])
cum['cumcount'] += 1  # cumcount starts at 0; count the current row as well

# Number of events the user has produced up to and including this row.
train_df['user_interaction_count'] = cum['cumcount'].astype('int16')
# Timestamp divided by the event count, rescaled by 1/(1000*3600).
train_df['user_interaction_timestamp_mean'] = (
    train_df['timestamp'] / cum['cumcount'] / (1000*3600)
).astype('float32')
# Lectures seen so far, and their share of all events so far.
train_df['user_lecture_sum'] = cum['cumsum'].astype('int16')
train_df['user_lecture_lv'] = (cum['cumsum'] / cum['cumcount']).astype('float16')


In [16]:
#pd.options.display.max_rows = 200

In [17]:
del cum
gc.collect()

82

In [18]:
print('start handle train_df...')

start handle train_df...


In [19]:
# Missing explanation flags count as "no explanation". The result is assigned
# back to the column: fillna(inplace=True) on a column selection is chained
# assignment, which pandas deprecates and which leaves train_df unchanged under
# Copy-on-Write.
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].fillna(False)
train_df = train_df.astype(data_types_dict)
# Drop every row whose label is -1 and renumber the remaining rows from 0.
train_df = train_df[train_df[target] != -1].reset_index(drop=True)

In [20]:
# Mean of the target for every (content_id, prior_question_had_explanation) pair.
content_explation_agg = (
    train_df.groupby(["content_id", "prior_question_had_explanation"])[target]
    .agg(['mean'])
)

In [21]:
content_explation_agg.dtypes

mean    float64
dtype: object

In [22]:
# Move the explanation flag from the row index into columns, giving one row per
# content_id, then flatten the resulting column labels into plain names.
content_explation_agg = content_explation_agg.unstack().reset_index()
content_explation_agg.columns = [
    'content_id',
    'content_explation_false_mean',
    'content_explation_true_mean',
]

In [23]:
# Shrink the key and the two mean columns to compact dtypes.
content_explation_agg = content_explation_agg.astype({
    'content_id': 'int16',
    'content_explation_false_mean': 'float16',
    'content_explation_true_mean': 'float16',
})

In [24]:
print('start handle attempt_no...')

start handle attempt_no...


In [25]:
# Start with a constant 1 per row so that sums / running sums count rows.
train_df["attempt_no"] = 1
train_df.attempt_no=train_df.attempt_no.astype('int8')
# Total number of rows per (user_id, content_id) pair.
# NOTE(review): the int8 cast assumes no pair occurs more than 127 times — confirm.
attempt_no_agg=train_df.groupby(["user_id","content_id"])["attempt_no"].agg(['sum']).astype('int8')
#attempt_no_agg=attempt_no_agg.astype('int8')
# Replace the constant with the running count: 1 for the first time a user sees
# a content_id, 2 for the second, and so on.
train_df["attempt_no"] = train_df[["user_id","content_id",'attempt_no']].groupby(["user_id","content_id"])["attempt_no"].cumsum()

In [26]:
attempt_no_agg=attempt_no_agg[attempt_no_agg['sum'] >1]

In [27]:
print('start handle timestamp...')
# Replace missing elapsed times with the column mean. The filled column is
# assigned back instead of using fillna(inplace=True) on a column selection,
# which is chained assignment and has no effect under pandas Copy-on-Write.
prior_question_elapsed_time_mean=train_df['prior_question_elapsed_time'].mean()
train_df['prior_question_elapsed_time'] = train_df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)

start handle timestamp...


In [28]:
# Largest timestamp seen for each user, as a two-column frame.
max_timestamp_u = train_df.groupby('user_id')['timestamp'].max().reset_index()
max_timestamp_u.columns = ['user_id', 'max_time_stamp']
max_timestamp_u['user_id'] = max_timestamp_u['user_id'].astype('int32')

In [29]:
# Timestamp of the same user's previous row (NaN on each user's first row).
train_df['lagtime'] = train_df.groupby('user_id')['timestamp'].shift()

# Largest "previous row" timestamp per user.
max_timestamp_u2 = train_df.groupby('user_id')['lagtime'].max().reset_index()
max_timestamp_u2.columns = ['user_id', 'max_time_stamp2']
max_timestamp_u2['user_id'] = max_timestamp_u2['user_id'].astype('int32')

In [30]:
# Gap between this row and the user's previous row; rows with no previous row
# get the global mean gap. The filled column is assigned back rather than
# calling fillna(inplace=True) on a column selection (chained assignment, a
# no-op under pandas Copy-on-Write).
train_df['lagtime']=train_df['timestamp']-train_df['lagtime']
lagtime_mean=train_df['lagtime'].mean()
train_df['lagtime'] = train_df['lagtime'].fillna(lagtime_mean)

In [31]:
# Rescale the gap by 1/(1000*3600) and store it as float32.
train_df['lagtime'] = (train_df['lagtime'] / (1000*3600)).astype('float32')

In [32]:
# Timestamp of the same user's row two steps back (NaN on the first two rows).
train_df['lagtime2'] = train_df.groupby('user_id')['timestamp'].shift(2)

# Largest "two rows back" timestamp per user.
max_timestamp_u3 = train_df[['user_id','lagtime2']].groupby(['user_id']).agg(['max']).reset_index()
max_timestamp_u3.columns = ['user_id', 'max_time_stamp3']
max_timestamp_u3.user_id=max_timestamp_u3.user_id.astype('int32')

# Gap to the row two steps back; missing gaps get the global mean. Assigned
# back instead of fillna(inplace=True) on a column selection, which is chained
# assignment and does nothing under pandas Copy-on-Write.
train_df['lagtime2']=train_df['timestamp']-train_df['lagtime2']
lagtime_mean2=train_df['lagtime2'].mean()
train_df['lagtime2'] = train_df['lagtime2'].fillna(lagtime_mean2)


In [33]:
# Rescale the two-step gap by 1/(1000*3600) and store it as float32.
train_df['lagtime2'] = (train_df['lagtime2'] / (1000*3600)).astype('float32')

In [34]:

# Timestamp of the same user's row three steps back (NaN on the first three rows).
train_df['lagtime3'] = train_df.groupby('user_id')['timestamp'].shift(3)

# Gap to that row; missing gaps get the global mean. The filled column is
# assigned back instead of using fillna(inplace=True) on a column selection,
# which is chained assignment and has no effect under pandas Copy-on-Write.
train_df['lagtime3']=train_df['timestamp']-train_df['lagtime3']
lagtime_mean3=train_df['lagtime3'].mean()
train_df['lagtime3'] = train_df['lagtime3'].fillna(lagtime_mean3)
# Rescale by 1/(1000*3600) and store as float32.
train_df['lagtime3'] = (train_df['lagtime3']/(1000*3600)).astype('float32')

In [35]:
# Rescale the timestamp by 1/(1000*3600) — milliseconds to hours if the raw unit is ms (TODO confirm).
train_df['timestamp']=train_df['timestamp']/(1000*3600)
# NOTE(review): float16 carries roughly three significant decimal digits, so
# large values are rounded coarsely — confirm that is acceptable for later use.
train_df.timestamp=train_df.timestamp.astype('float16')

In [36]:
# Last row of each user, restricted to user_id and prior_question_elapsed_time.
user_prior_question_elapsed_time = train_df[['user_id','prior_question_elapsed_time']].groupby(['user_id']).tail(1)
# Restates the two column names explicitly (they are the same names selected above).
user_prior_question_elapsed_time.columns = ['user_id', 'prior_question_elapsed_time']

In [37]:
# Value of prior_question_elapsed_time on the user's *next* row, used as this
# row's own elapsed time. The shift is done per user: a plain Series.shift(-1)
# over the whole frame would give each user's last row the value from the next
# user's first row.
train_df['question_elapsed_time'] = train_df.groupby('user_id')['prior_question_elapsed_time'].shift(-1)
# float32 rather than float16: float16 cannot represent anything above 65504
# and turns larger values into inf.
# NOTE(review): elapsed times look like milliseconds, where values beyond 65504 are plausible — confirm the unit.
train_df['question_elapsed_time'] = train_df['question_elapsed_time'].astype('float32')
# Change in prior_question_elapsed_time relative to the user's previous row.
train_df['delta_prior_question_elapsed_time'] = train_df.groupby('user_id')['prior_question_elapsed_time'].shift()
train_df['delta_prior_question_elapsed_time']=train_df['prior_question_elapsed_time']-train_df['delta_prior_question_elapsed_time']

In [38]:
# Elapsed-time statistics per content_id, computed over correctly answered rows
# only. Only the two columns the aggregation needs are copied out of train_df,
# which keeps the temporary frame small.
content_elapsed_time_agg=train_df.loc[train_df[target] == 1, ['content_id', 'question_elapsed_time']]
content_elapsed_time_agg=content_elapsed_time_agg.groupby('content_id')['question_elapsed_time'].agg(['median','mean','max','min','skew','var'])
# float32 rather than float16: 'var' is in squared units and 'max' can be large,
# and float16 overflows to inf above 65504.
content_elapsed_time_agg=content_elapsed_time_agg.astype('float32')
# Share of rows per content_id whose prior question had an explanation.
content_had_explanation_agg=train_df.groupby('content_id')['prior_question_had_explanation'].agg(['mean'])
content_had_explanation_agg=content_had_explanation_agg.astype('float16')
# The row-level helper column is no longer needed.
del train_df['question_elapsed_time']
gc.collect()

40

In [39]:
# Fill the missing deltas (each user's first row) with the global mean, then
# store as int32. The filled column is assigned back: fillna(inplace=True) on a
# column selection is chained assignment, and if it silently does nothing the
# int32 cast below fails on the remaining NaN values.
delta_prior_question_elapsed_time_mean=train_df['delta_prior_question_elapsed_time'].mean()
train_df['delta_prior_question_elapsed_time'] = train_df['delta_prior_question_elapsed_time'].fillna(delta_prior_question_elapsed_time_mean)
train_df.delta_prior_question_elapsed_time=train_df.delta_prior_question_elapsed_time.astype('int32')

In [40]:
# Label of the same user's previous row, so every statistic below only uses
# answers given *before* the current row.
train_df['lag'] = train_df.groupby('user_id')[target].shift()

# cumsum = correct answers so far, cumcount = answers so far (0 on the first row).
cum = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])
user_agg = train_df.groupby('user_id')['lag'].agg(['sum', 'count']).astype('int16')
# Assign back rather than fillna(inplace=True) on a column selection, which is
# chained assignment and has no effect under pandas Copy-on-Write.
cum['cumsum'] = cum['cumsum'].fillna(0)

train_df['user_correctness'] = cum['cumsum'] / cum['cumcount']
train_df['user_correct_count'] = cum['cumsum']
train_df['user_uncorrect_count'] = cum['cumcount']-cum['cumsum']
#train_df['user_answer_count'] = cum['cumcount']
train_df.drop(columns=['lag'], inplace=True)
# A user's first row has no history (0/0); use a fixed prior of 0.67 there.
train_df['user_correctness'] = train_df['user_correctness'].fillna(0.67)

train_df.user_correctness=train_df.user_correctness.astype('float16')
train_df.user_correct_count=train_df.user_correct_count.astype('int16')
train_df.user_uncorrect_count=train_df.user_uncorrect_count.astype('int16')
# Smoothed accuracy (correct + 1) / (answers + 2), taken straight from the
# exact running counts. Rebuilding the answer count as correct / correctness
# gives 0/0 = NaN for every user who has answered but never correctly, and
# also inherits the float16 rounding of user_correctness.
train_df['true_user_correctness'] = (cum['cumsum'] + 1) / (cum['cumcount'] + 2)
train_df.true_user_correctness=train_df.true_user_correctness.astype('float16')
#train_df.user_answer_count=train_df.user_answer_count.astype('int16')

KeyboardInterrupt: 

In [None]:
del cum
gc.collect()

In [None]:
# Store the explanation flag as 0/1 so it can be summed.
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].astype('int8')
# Per-user total of the flag and number of rows.
explanation_agg = (
    train_df.groupby('user_id')['prior_question_had_explanation']
    .agg(['sum', 'count'])
    .astype('int16')
)

In [None]:
# Running per-user statistics of the explanation flag, current row included.
cum = train_df.groupby('user_id')['prior_question_had_explanation'].agg(['cumsum', 'cumcount'])
cum['cumcount'] += 1  # cumcount starts at 0; count the current row as well

# Share of rows so far with an explanation, then the two underlying counts.
train_df['explanation_mean'] = (cum['cumsum'] / cum['cumcount']).astype('float16')
train_df['explanation_true_count'] = cum['cumsum'].astype('int16')
train_df['explanation_false_count'] = (cum['cumcount'] - cum['cumsum']).astype('int16')

In [None]:
del cum
gc.collect()

In [None]:
# Label statistics per question and per task container, stored as float32.
content_agg = (
    train_df.groupby('content_id')[target]
    .agg(['sum', 'count','var','skew'])
    .astype('float32')
)
task_container_agg = (
    train_df.groupby('task_container_id')[target]
    .agg(['sum', 'count','var'])
    .astype('float32')
)

In [None]:
# Broadcast the task-container statistics onto every row via task_container_id.
train_df['task_container_uncor_count'] = (
    train_df['task_container_id']
    .map(task_container_agg['count'] - task_container_agg['sum'])
    .astype('int32')
)
train_df['task_container_cor_count'] = (
    train_df['task_container_id'].map(task_container_agg['sum']).astype('int32')
)
# Despite the "_std" name this column carries the variance ('var').
train_df['task_container_std'] = (
    train_df['task_container_id'].map(task_container_agg['var']).astype('float16')
)
train_df['task_container_correctness'] = (
    train_df['task_container_id']
    .map(task_container_agg['sum'] / task_container_agg['count'])
    .astype('float16')
)

In [None]:
print('start questions data...')

In [None]:
# Load four columns of questions.csv, selected by position; the dtype map names
# the columns it expects: question_id, bundle_id, part and tags.
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv', 
    usecols=[0,1,3,4],
    dtype={'question_id': 'int16','bundle_id': 'int16', 'part': 'int8','tags': 'str'}
)

In [None]:
# Precomputed per-question feature table (keyed by content_id according to the
# dtype map). Columns are picked by position; position 1 is skipped.
# NOTE(review): how this CSV was produced is not visible here — confirm its
# statistics do not include the rows used for evaluation.
que_fea_merge = pd.read_csv(
    '../input/offline-group-by-table/que_fea_merge.csv', 
    usecols=[0,2,3,4,5,6,7,8,9],
    dtype={'content_id': 'int16','question_elapsed_time_mean': 'float16','question_had_explanation_mean': 'float16', 'question_correctly_q_count': 'int16','question_correctly_q_mean': 'float16','tags_lsi': 'float16', 'tag_acc_count': 'float16','tag_acc_max':'float16','tag_acc_min': 'float16'}
)
# Precomputed per-part feature table (keyed by part according to the dtype map).
part_fea_merge = pd.read_csv(
    '../input/offline-group-by-table/part_fea_merge.csv', 
    usecols=[0,1,2,3],
    dtype={'part': 'int8','part_elapsed_time_mean': 'float16','part_had_explanation_mean': 'float16','part_correctly_q_mean':'float16'}
)

In [None]:
# Community label per question, loaded by column position.
questions_cmnts = pd.read_csv(
    '../input/riiid-sakt-base/question_cmnts (1).csv',
    usecols=[1,2],
    dtype={'question_id': 'int16','community': 'int8'}
)
# Join on the question_id column only. Passing on= together with
# right_index=True is contradictory: current pandas raises MergeError for it,
# and older releases compared the left key with the right frame's row index
# instead of its question_id column.
questions_df = pd.merge(questions_df, questions_cmnts, on='question_id', how='left')

In [None]:
# One-hot encode the community label and remember the generated column names.
questions_df = pd.get_dummies(questions_df, columns=['community'])
part_questions_columns = [name for name in questions_df.columns if name.startswith('community')]

In [None]:
# Number of questions in each bundle, as a one-column frame named 'count'.
bundle_agg = (
    questions_df.groupby('bundle_id')['question_id']
    .agg(['count'])
)

In [None]:
# Size of the bundle each question belongs to.
questions_df['content_sub_bundle'] = (
    questions_df['bundle_id'].map(bundle_agg['count']).astype('int8')
)

In [None]:
# Questions without tags get the placeholder tag '188'. The filled column is
# assigned back: fillna(inplace=True) on a column selection is chained
# assignment and leaves questions_df unchanged under pandas Copy-on-Write,
# after which splitting the tags string would fail on NaN.
questions_df['tags'] = questions_df['tags'].fillna('188')

In [None]:
def gettags(tags,num):
    """Return the tags of one 32-wide bucket from a space-separated tag string.

    Args:
        tags: tag ids separated by single spaces, e.g. "1 33 5".
        num: bucket index; bucket ``num`` covers ids in [32*num, 32*num + 32).

    Returns:
        The matching tags in their original order, each preceded by one space
        (so a non-empty result starts with a space); '' when nothing matches.

    Raises:
        ValueError: if a token cannot be parsed as an integer.
    """
    lower = 32*num
    upper = lower + 32
    return ''.join(' '+token for token in tags.split(" ") if lower <= int(token) < upper)

In [None]:
from sklearn.preprocessing import LabelEncoder
# For each of the six 32-wide tag buckets, extract the bucket's tags as a string
# and replace every distinct string with an integer label.
for num in range(0,6):
    bucket_column = "tags"+str(num)
    questions_df[bucket_column] = questions_df["tags"].apply(lambda row: gettags(row,num))
    le = LabelEncoder()
    # Learn the sorted distinct strings and encode the column in one step.
    questions_df[bucket_column] = le.fit_transform(questions_df[bucket_column].values)

In [None]:
# Compact dtype for the six label-encoded tag-bucket columns.
# NOTE(review): int8 only holds labels 0..127 — confirm no bucket produces more
# than 128 distinct tag combinations, otherwise labels wrap around.
questions_df_dict = {   
    'tags0': 'int8',
    'tags1': 'int8',
    'tags2': 'int8',
    'tags3': 'int8',
    'tags4': 'int8',
    'tags5': 'int8',
    #'tags6': 'int8',
    #'tags7': 'int8'
}
questions_df = questions_df.astype(questions_df_dict)

In [None]:
questions_df.drop(columns=['tags'], inplace=True)

In [None]:
# Combined key part*100000 + bundle_id. 'part' is widened to int32 before the
# multiplication: it is read as int8, and multiplying an int8 column by 100000
# relies on value-based type promotion, which NumPy 2 no longer performs (the
# constant does not fit in int8 and the operation raises OverflowError).
questions_df['part_bundle_id'] = (
    questions_df['part'].astype('int32')*100000 + questions_df['bundle_id']
).astype('int32')

In [None]:
# questions_cmnts = pd.read_csv(
#     '../input/2020-r3id-clustering-question-tags/question_cmnts.csv', 
#     usecols=[1,2],
#     dtype={'question_id': 'int16','community': 'int8'}
# )

In [None]:
# questions_df = pd.merge(questions_df, questions_cmnts, on='question_id', how='left',right_index=True)#

In [None]:
questions_df.rename(columns={'question_id':'content_id'}, inplace=True)

In [None]:
# Attach the precomputed per-question and per-part features by key column.
# on= must not be combined with right_index=True: current pandas raises
# MergeError, and older releases compared the left key with the right frame's
# row index. For 'part' that would line values 1..7 up against row positions
# rather than against the part column, attaching the wrong row's features.
questions_df = pd.merge(questions_df, que_fea_merge, on='content_id', how='left')
questions_df = pd.merge(questions_df, part_fea_merge, on='part', how='left')

In [None]:
# Attach the per-question "answered correctly with / without explanation" means
# by content_id. on= is used alone: combining it with right_index=True raises
# MergeError in current pandas and, in older releases, matched the key against
# the right frame's row index instead of its content_id column.
questions_df = pd.merge(questions_df, content_explation_agg, on='content_id', how='left')
# questions_df.content_explation_false_mean=questions_df.content_explation_false_mean.astype('float16')
# questions_df.content_explation_true_mean=questions_df.content_explation_true_mean.astype('float16')

In [None]:
del content_explation_agg

In [None]:
# Per-question answer counts taken from content_agg.
questions_df['content_uncorrect_count'] = questions_df['content_id'].map(content_agg['count']-content_agg['sum']).astype('int32')
questions_df['content_correct_count'] = questions_df['content_id'].map(content_agg['sum']).astype('int32')
questions_df['content_count'] = questions_df['content_id'].map(content_agg['count'])
questions_df.content_count=questions_df.content_count.astype('int32')

# Raw accuracy per question.
questions_df['content_correctness'] = questions_df['content_id'].map(content_agg['sum'] / content_agg['count'])
questions_df.content_correctness=questions_df.content_correctness.astype('float16')

# Smoothed accuracy (correct + 1) / (answers + 2), using the exact answer count.
# Rebuilding the count as correct / correctness yields 0/0 = NaN for a question
# nobody answered correctly and inherits the float16 rounding of the ratio.
questions_df['true_content_correctness'] = (questions_df['content_correct_count']+1)/(questions_df['content_count'] + 2)
questions_df.true_content_correctness=questions_df.true_content_correctness.astype('float16')

# Skewness of the label per question.
questions_df['content_correctness_skew'] = questions_df['content_id'].map(content_agg['skew'])
questions_df.content_correctness_skew=questions_df.content_correctness_skew.astype('float16')

# Despite the "_std" name this column carries the variance ('var').
questions_df['content_correctness_std'] = questions_df['content_id'].map(content_agg['var'])
questions_df.content_correctness_std=questions_df.content_correctness_std.astype('float16')

In [None]:
# Per-question elapsed-time statistics plus their per-part averages.
#
# Two fixes relative to a direct column assignment of the grouped result:
#  * groupby('part')[...].agg(['mean']) is indexed by part value, so assigning
#    it to a column aligns it with the *row index* and leaves almost every row
#    NaN. Each per-part mean is mapped through the 'part' column instead, the
#    same pattern used for the part correctness statistics.
#  * Time-valued columns are kept as float32. float16 overflows to inf above
#    65504, which large elapsed times and especially their variance exceed.
#    NOTE(review): assumes elapsed times are in milliseconds — confirm.
questions_df['content_max_time'] = questions_df['content_id'].map(content_elapsed_time_agg['max']).astype('float32')
questions_df['content_min_time'] = questions_df['content_id'].map(content_elapsed_time_agg['min']).astype('float32')

questions_df['content_med_time'] = questions_df['content_id'].map(content_elapsed_time_agg['median']).astype('float32')
questions_df['part_med_time'] = questions_df['part'].map(questions_df.groupby('part')['content_med_time'].mean()).astype('float32')

# Despite the "_std" names these two columns carry the variance ('var').
questions_df['content_time_std'] = questions_df['content_id'].map(content_elapsed_time_agg['var']).astype('float32')
questions_df['part_time_std'] = questions_df['part'].map(questions_df.groupby('part')['content_time_std'].mean()).astype('float32')

questions_df['content_time_skew'] = questions_df['content_id'].map(content_elapsed_time_agg['skew']).astype('float16')
questions_df['part_time_skew'] = questions_df['part'].map(questions_df.groupby('part')['content_time_skew'].mean()).astype('float16')

questions_df['content_mean_time'] = questions_df['content_id'].map(content_elapsed_time_agg['mean']).astype('float32')
questions_df['part_mean_time'] = questions_df['part'].map(questions_df.groupby('part')['content_mean_time'].mean()).astype('float32')

# Question mean time relative to the average of its part (a ratio, so float16 is enough).
questions_df['mean_time_standard'] = (questions_df['content_mean_time']/questions_df['part_mean_time']).astype('float16')

# Spread between the slowest and the fastest elapsed time per question.
questions_df['content_time_range'] = (questions_df['content_max_time']-questions_df['content_min_time']).astype('float32')

# Share of rows per question whose prior question had an explanation.
questions_df['content_had_explanation_mean'] = questions_df['content_id'].map(content_had_explanation_agg['mean']).astype('float16')

In [None]:
del content_elapsed_time_agg
del content_had_explanation_agg
gc.collect()

In [None]:
# Mean and variance of question accuracy within each part, broadcast back to
# the questions. The "_std" column carries the variance ('var').
part_agg = questions_df.groupby('part')['content_correctness'].agg(['mean', 'var'])
questions_df['part_correctness_mean'] = questions_df['part'].map(part_agg['mean']).astype('float16')
questions_df['part_correctness_std'] = questions_df['part'].map(part_agg['var']).astype('float16')

In [None]:
# Total incorrect answers per part, broadcast back to the questions.
part_agg = questions_df.groupby('part')['content_uncorrect_count'].agg(['sum'])
questions_df['part_uncor_count'] = (
    questions_df['part'].map(part_agg['sum']).astype('int32')
)
# Total correct answers per part, broadcast back to the questions.
part_agg = questions_df.groupby('part')['content_correct_count'].agg(['sum'])
questions_df['part_cor_count'] = (
    questions_df['part'].map(part_agg['sum']).astype('int32')
)

In [None]:
# Mean question accuracy within each bundle, broadcast back to the questions.
bundle_agg = questions_df.groupby('bundle_id')['content_correctness'].agg(['mean'])
questions_df['bundle_correctness_mean'] = (
    questions_df['bundle_id'].map(bundle_agg['mean']).astype('float16')
)

In [None]:
# For every tag bucket column (tags0 .. tags5) derive five float16 features by
# grouping the questions on the bucket label and mapping the group statistic
# back onto each question.
# Note the naming: "<bucket>_mean_time" holds the *skew* and "<bucket>_med_time"
# the *variance* of mean_time_standard; the three correctness features are
# named after the statistic they hold.
for tag_bucket in ['tags0', 'tags1', 'tags2', 'tags3', 'tags4', 'tags5']:
    for source_column, statistic, suffix in (
        ('mean_time_standard', 'skew', '_mean_time'),
        ('mean_time_standard', 'var', '_med_time'),
        ('content_correctness', 'mean', '_correctness_mean'),
        ('content_correctness', 'skew', '_correctness_skew'),
        ('content_correctness', 'var', '_correctness_var'),
    ):
        # tags1_agg is reused as the scratch aggregate for every bucket and is
        # still defined after the loop.
        tags1_agg = questions_df.groupby(tag_bucket)[source_column].agg([statistic])
        questions_df[tag_bucket + suffix] = (
            questions_df[tag_bucket].map(tags1_agg[statistic]).astype('float16')
        )

In [None]:
questions_df.dtypes

In [None]:
del content_agg
del bundle_agg
del part_agg
del tags1_agg
gc.collect()

In [None]:
#pd.set_option("display.max_columns",500)

In [None]:
#questions_df.drop(columns=['tags4','tags5','tags6'], inplace=True)

In [None]:
len(train_df)

In [None]:
#train_df.drop(columns=['content_type_id'], inplace=True)

# Train

In [None]:
features_dict = {
    #'user_id',
    #'timestamp':'float16',#
    'user_interaction_count':'int16',
    'user_interaction_timestamp_mean':'float32',
    'lagtime':'float32',#
    'lagtime2':'float32',
    'lagtime3':'float32',
    #'lagtime_mean':'int32',
    #'content_id':'int16',
    #'task_container_id':'int16',
    'user_lecture_sum':'int16',#
    'user_lecture_lv':'float16',##
    'prior_question_elapsed_time':'float32',#
    #'delta_prior_question_elapsed_time':'int32',#
    'user_correctness':'float16',#
    'true_user_correctness':'float16',
    'user_uncorrect_count':'int16',#
    'user_correct_count':'int16',#
    'content_correctness':'float16',
    'true_content_correctness':'float16',
    'content_correctness_std':'float16',
    'content_count':'int32',
    'content_correct_count':'int32',
    'content_uncorrect_count':'int32',#
    'content_correctness_skew':'float16',
    #'content_time_std':'float16',
    #'part_time_std':'float16',
    #'content_time_skew':'float16',
    #'part_time_skew':'float16',
    'content_mean_time':'float16',
    'part_mean_time':'float16',
    'mean_time_standard':'float16',
    'content_med_time':'float16',
    'part_med_time':'float16',
    'content_time_range':'float16',
    #'content_elapsed_time_mean':'float16',
    'content_had_explanation_mean':'float16',
    'content_explation_false_mean':'float16',
    'content_explation_true_mean':'float16',
    'task_container_correctness':'float16',
    #'task_container_std':'float16',
    'task_container_cor_count':'int32',#
    'task_container_uncor_count':'int32',#
    'attempt_no':'int8',#
    #'part':'int8',
    'part_correctness_mean':'float16',
    'part_correctness_std':'float16',
    'part_uncor_count':'int32',
    'part_cor_count':'int32',
    #'tags0': 'int8',
    #'tags1': 'int8',
    #'tags2': 'int8',
    #'tags3': 'int8',
    #'tags4': 'int8',
    #'tags5': 'int8',
    #'tags6': 'int8',
    #'tags7': 'int8',
     #'tags0_correctness_mean':'float16',
     #'tags1_correctness_mean':'float16',
     #'tags2_correctness_mean':'float16',
     #'tags3_correctness_mean':'float16',
     #'tags4_correctness_mean':'float16',
     #'tags5_correctness_mean':'float16',
     'tags0_correctness_skew':'float16',
     'tags1_correctness_skew':'float16',
     'tags2_correctness_skew':'float16',
     'tags3_correctness_skew':'float16',
     'tags4_correctness_skew':'float16',
     'tags5_correctness_skew':'float16',
     #'tags0_correctness_var':'float16',
     #'tags1_correctness_var':'float16',
     #'tags2_correctness_var':'float16',
     #'tags3_correctness_var':'float16',
     #'tags4_correctness_var':'float16',
     #'tags5_correctness_var':'float16',
    'question_elapsed_time_mean': 'float16',
    'question_had_explanation_mean': 'float16',
    'question_correctly_q_count': 'int16',
    'question_correctly_q_mean': 'float16',
    'tags_lsi': 'float16', 
    'tag_acc_count': 'float16',	
    'tag_acc_max':'float16',
    'tag_acc_min': 'float16',
    'part_elapsed_time_mean': 'float16',
    'part_had_explanation_mean': 'float16',
    'part_correctly_q_mean':'float16',
#     'bundle_id':'int16',
#     'bundle_correctness_mean':'float16',
#     'bundle_uncor_count':'int32',
#     'bundle_cor_count':'int32',
    #'part_bundle_id':'int32',
    'content_sub_bundle':'int8',
    'prior_question_had_explanation':'int8',
    'explanation_mean':'float16', #
    #'explanation_var',#
    'explanation_false_count':'int16',#
    'explanation_true_count':'int16',#
    'community_0':'int8',
    'community_1':'int8',
    'community_2':'int8',
    'community_3':'int8',
    'community_4':'int8',
   # 'community':'int8',
     #'parts_1':'int8',
     #'parts_2':'int8',
     #'parts_3':'int8',
     #'parts_4':'int8',
     #'parts_5':'int8',
     #'parts_6':'int8',
     #'parts_7':'int8',
#     'type_of_concept',
#     'type_of_intention',
#     'type_of_solving_question',
#     'type_of_starter'
}
categorical_columns= [
    #'user_id',
    #'content_id',
    #'task_container_id',
    #'part',
   # 'community',
   #'tags0',
    #'tags1',
    #'tags2',
    #'tags3',
    #'tags4',
    #'tags5',
    #'tags6',
    #'tags7',
    #'bundle_id',
    #'part_bundle_id',
    'content_sub_bundle',
    'prior_question_had_explanation',
    'community_0',
    'community_1',
    'community_2',
    'community_3',
    'community_4', 
     #'parts_1',
     #'parts_2',
     #'parts_3',
     #'parts_4',
     #'parts_5',
     #'parts_6',
     #'parts_7',
#     'type_of_concept',
#     'type_of_intention',
#     'type_of_solving_question',
#     'type_of_starter'
]

features=list(features_dict.keys())


In [None]:
flag_lgbm=True
# Collects the trained boosters.
clfs = list()
# LightGBM training parameters: binary objective, AUC metric, gbdt boosting.
# NOTE(review): no seed is set (the random_state / bagging_seed entries are
# commented out), so feature subsampling is not pinned — confirm this is intended.
# NOTE(review): bagging_fraction is set without a bagging_freq entry — check
# whether row bagging is actually in effect.
params = {
'num_leaves': 200,
'max_bin':450,
# 'min_child_weight': 0.03454472573214212,
'feature_fraction': 0.52,
'bagging_fraction': 0.52,
#'min_data_in_leaf': 106,
# 'max_depth': -1,
'objective': 'binary',
'learning_rate': 0.05,
"boosting_type": "gbdt",
"metric": 'auc',
# "bagging_seed": 11,
# "verbosity": -1,
# 'reg_alpha': 0.3899927210061127,
# 'reg_lambda': 0.6485237330340494,
# 'random_state': 2021
'force_col_wise':'true'
}

In [None]:
# Model features that already live in train_df (the remaining ones arrive with
# the questions merge), followed by the label and the merge key.
feature_before_merge_question = list(set(train_df.columns)&set(features)) + [target, 'content_id']
# print(feature_before_merge_question)

In [None]:
print(psutil.virtual_memory().percent)

In [None]:
# train_df_clf=train_df[feature_before_merge_question]
# Train on a 20% row sample. random_state pins the sample so that a rerun
# trains on the same rows.
train_df_clf=train_df[feature_before_merge_question].sample(frac=0.2, random_state=2021)
del train_df
gc.collect()
print('del train_df')
# Attach the per-question features by content_id. on= is used alone: combined
# with right_index=True it raises MergeError in current pandas, and in older
# releases the key was matched against questions_df's row index instead of its
# content_id column.
train_df_clf = pd.merge(train_df_clf, questions_df, on='content_id', how='left')
print('merge completed')
print('train_df_clf length：',len(train_df_clf))
del questions_df
# Build the LightGBM dataset from the feature columns, then drop the frame.
tr_data = lgb.Dataset(train_df_clf[features], label=train_df_clf[target], feature_name=list(features))
del train_df_clf
print(len(features))
print(psutil.virtual_memory().percent)
gc.collect()

In [None]:
del attempt_no_agg
del user_prior_question_elapsed_time
del max_timestamp_u
del max_timestamp_u2
del max_timestamp_u3
del explanation_agg
del user_agg
del user_lecture_agg
del user_lecture_stats_part
del task_container_agg
del lectures_df
del column
del data_types_dict
del delta_prior_question_elapsed_time_mean
del feature_before_merge_question
del features_dict
del lagtime_mean
del lagtime_mean2
del lagtime_mean3
del le
del num
del part_lectures_columns
del prior_question_elapsed_time_mean
del questions_df_dict
del types_of_lectures_columns
gc.collect()

In [None]:
print(psutil.virtual_memory().percent)

In [None]:
# Train the booster for up to 1300 rounds, logging every 50 rounds.
# NOTE(review): valid_sets contains only the training Dataset, so the metric
# watched by early stopping is the training AUC — a held-out validation set is
# probably intended here.
# NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of
# older LightGBM releases; confirm the installed version still accepts them.
model = lgb.train(
    params,
    tr_data,
    num_boost_round=1300,
    valid_sets=[tr_data],
    early_stopping_rounds=50,
    feature_name=features,
    categorical_feature=categorical_columns,
    verbose_eval=50
)

clfs.append(model)

# Plot the 50 most important features ranked by gain.
fig,ax = plt.subplots(figsize=(15,15))
lgb.plot_importance(model, ax=ax,importance_type='gain',max_num_features=50)
plt.show()

# The training Dataset is no longer needed once the model is fitted.
del tr_data
gc.collect()

In [None]:
joblib.dump(model, '../data/bagging4.2.pkl')
print('Model saved')