In [1]:
import numpy as np
import random
import pandas as pd
import joblib
from collections import defaultdict
import lightgbm as lgb
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score
import gc

_ = np.seterr(divide='ignore', invalid='ignore')

# Preprocess

In [2]:
data_types_dict = {
    'timestamp': 'int64',
    'user_id': 'int32', 
    'content_id': 'int16', 
    'content_type_id':'int8', 
    'task_container_id': 'int16',
    #'user_answer': 'int8',
    'answered_correctly': 'int8', 
    'prior_question_elapsed_time': 'float32', 
    'prior_question_had_explanation': 'boolean'
}
target = 'answered_correctly'

In [3]:
train_df = pd.read_pickle('../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip')[data_types_dict.keys()]
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].astype('boolean')

In [4]:
print('start handle attempt_no...')

start handle attempt_no...


In [5]:
train_df["attempt_no"] = 1
train_df.attempt_no=train_df.attempt_no.astype('int8')

attempt_no_agg=train_df.groupby(["user_id","content_id"])["attempt_no"].agg(['sum']).astype('int8')

train_df["attempt_no"] = train_df[["user_id","content_id",'attempt_no']].groupby(["user_id","content_id"])["attempt_no"].cumsum()

attempt_no_agg=attempt_no_agg[attempt_no_agg['sum'] >1]

In [6]:
print('start handle lecture data...')

start handle lecture data...


In [7]:
#reading in lecture df
lectures_df = pd.read_csv('../input/riiid-test-answer-prediction/lectures.csv')

In [8]:
lectures_df['type_of'] = lectures_df['type_of'].replace('solving question', 'solving_question')
lectures_df = pd.get_dummies(lectures_df, columns=['part', 'type_of'])
part_lectures_columns = [column for column in lectures_df.columns if column.startswith('part')]
types_of_lectures_columns = [column for column in lectures_df.columns if column.startswith('type_of_')]

In [9]:
train_lectures = train_df[train_df.content_type_id == True].merge(lectures_df, left_on='content_id', right_on='lecture_id', how='left')

In [10]:
user_lecture_stats_part = train_lectures.groupby('user_id',as_index = False)[part_lectures_columns + types_of_lectures_columns].sum()

In [11]:
lecturedata_types_dict = {   
    'user_id': 'int32', 
    'part_1': 'int8',
    'part_2': 'int8',
    'part_3': 'int8',
    'part_4': 'int8',
    'part_5': 'int8',
    'part_6': 'int8',
    'part_7': 'int8',
    'type_of_concept': 'int8',
    'type_of_intention': 'int8',
    'type_of_solving_question': 'int8',
    'type_of_starter': 'int8'
}
user_lecture_stats_part = user_lecture_stats_part.astype(lecturedata_types_dict)

In [12]:
# 设定全部变成1
for column in user_lecture_stats_part.columns:
    #bool_column = column + '_boolean'
    if(column !='user_id'):
        user_lecture_stats_part[column] = (user_lecture_stats_part[column] > 0).astype('int8')

In [13]:
del(train_lectures)
gc.collect()

80

In [14]:
user_lecture_agg = train_df.groupby('user_id')['content_type_id'].agg(['sum', 'count'])
user_lecture_agg=user_lecture_agg.astype('int16')

In [15]:
#1= if the event was the user watching a lecture.
cum = train_df.groupby('user_id')['content_type_id'].agg(['cumsum', 'cumcount'])
cum['cumcount']=cum['cumcount']+1
train_df['user_interaction_count'] = cum['cumcount'] 
train_df['user_interaction_timestamp_mean'] = train_df['timestamp']/cum['cumcount'] 
train_df['user_lecture_sum'] = cum['cumsum'] 
train_df['user_lecture_lv'] = cum['cumsum'] / cum['cumcount']


train_df.user_lecture_lv=train_df.user_lecture_lv.astype('float16')
train_df.user_lecture_sum=train_df.user_lecture_sum.astype('int16')
train_df.user_interaction_count=train_df.user_interaction_count.astype('int16')
train_df['user_interaction_timestamp_mean']=train_df['user_interaction_timestamp_mean']/(1000*3600)
train_df.user_interaction_timestamp_mean=train_df.user_interaction_timestamp_mean.astype('float32')

In [16]:
del cum
gc.collect()

42

In [17]:
print('start handle train_df...')

start handle train_df...


In [18]:
train_df['prior_question_had_explanation'].fillna(False, inplace=True)
train_df = train_df[train_df[target] != -1].reset_index(drop=True)

In [19]:
content_explation_agg=train_df[["content_id","prior_question_had_explanation",target]].groupby(["content_id","prior_question_had_explanation"])[target].agg(['mean'])

In [20]:
content_explation_agg=content_explation_agg.unstack()

content_explation_agg=content_explation_agg.reset_index()
content_explation_agg.columns = ['content_id', 'content_explation_false_mean','content_explation_true_mean']

In [21]:
content_explation_agg.content_id=content_explation_agg.content_id.astype('int16')
content_explation_agg.content_explation_false_mean=content_explation_agg.content_explation_false_mean.astype('float16')
content_explation_agg.content_explation_true_mean=content_explation_agg.content_explation_true_mean.astype('float16')

In [22]:
print('start handle timestamp...')
prior_question_elapsed_time_mean=train_df['prior_question_elapsed_time'].mean()
train_df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean, inplace=True)

start handle timestamp...


In [23]:
max_timestamp_u = train_df[['user_id','timestamp']].groupby(['user_id']).agg(['max']).reset_index()
max_timestamp_u.columns = ['user_id', 'max_time_stamp']
max_timestamp_u.user_id=max_timestamp_u.user_id.astype('int32')

In [24]:

train_df['lagtime'] = train_df.groupby('user_id')['timestamp'].shift()

max_timestamp_u2 = train_df[['user_id','lagtime']].groupby(['user_id']).agg(['max']).reset_index()
max_timestamp_u2.columns = ['user_id', 'max_time_stamp2']
max_timestamp_u2.user_id=max_timestamp_u2.user_id.astype('int32')

In [25]:
train_df['lagtime']=train_df['timestamp']-train_df['lagtime']
lagtime_mean=train_df['lagtime'].mean()
train_df['lagtime'].fillna(lagtime_mean, inplace=True)

In [26]:
train_df['lagtime']=train_df['lagtime']/(1000*3600)
train_df.lagtime=train_df.lagtime.astype('float32')

In [27]:

train_df['lagtime2'] = train_df.groupby('user_id')['timestamp'].shift(2)

max_timestamp_u3 = train_df[['user_id','lagtime2']].groupby(['user_id']).agg(['max']).reset_index()
max_timestamp_u3.columns = ['user_id', 'max_time_stamp3']
max_timestamp_u3.user_id=max_timestamp_u3.user_id.astype('int32')

train_df['lagtime2']=train_df['timestamp']-train_df['lagtime2']
lagtime_mean2=train_df['lagtime2'].mean()
train_df['lagtime2'].fillna(lagtime_mean2, inplace=True)


In [28]:
train_df['lagtime2']=train_df['lagtime2']/(1000*3600)
train_df.lagtime2=train_df.lagtime2.astype('float32')

In [29]:

train_df['lagtime3'] = train_df.groupby('user_id')['timestamp'].shift(3)

train_df['lagtime3']=train_df['timestamp']-train_df['lagtime3']
lagtime_mean3=train_df['lagtime3'].mean()
train_df['lagtime3'].fillna(lagtime_mean3, inplace=True)
train_df['lagtime3']=train_df['lagtime3']/(1000*3600)
train_df.lagtime3=train_df.lagtime3.astype('float32')

In [30]:
train_df['timestamp']=train_df['timestamp']/(1000*3600)
#
train_df.timestamp=train_df.timestamp.astype('float16')

In [31]:
user_prior_question_elapsed_time = train_df[['user_id','prior_question_elapsed_time']].groupby(['user_id']).tail(1)
user_prior_question_elapsed_time.columns = ['user_id', 'prior_question_elapsed_time']

In [32]:

train_df['delta_prior_question_elapsed_time'] = train_df.groupby('user_id')['prior_question_elapsed_time'].shift()
train_df['delta_prior_question_elapsed_time']=train_df['prior_question_elapsed_time']-train_df['delta_prior_question_elapsed_time']

In [33]:
delta_prior_question_elapsed_time_mean=train_df['delta_prior_question_elapsed_time'].mean()
train_df['delta_prior_question_elapsed_time'].fillna(delta_prior_question_elapsed_time_mean, inplace=True)
train_df.delta_prior_question_elapsed_time=train_df.delta_prior_question_elapsed_time.astype('int32')

In [34]:

train_df['lag'] = train_df.groupby('user_id')[target].shift()

cum = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])
##cum['cumcount']=cum['cumcount']+1
user_agg = train_df.groupby('user_id')['lag'].agg(['sum', 'count']).astype('int16')
cum['cumsum'].fillna(0, inplace=True)

train_df['user_correctness'] = cum['cumsum'] / cum['cumcount']
train_df['user_correct_count'] = cum['cumsum']
train_df['user_uncorrect_count'] = cum['cumcount']-cum['cumsum']
#train_df['user_answer_count'] = cum['cumcount']
train_df.drop(columns=['lag'], inplace=True)
train_df['user_correctness'].fillna(0.67, inplace=True)
train_df.user_correctness=train_df.user_correctness.astype('float16')
train_df.user_correct_count=train_df.user_correct_count.astype('int16')
train_df.user_uncorrect_count=train_df.user_uncorrect_count.astype('int16')
#train_df.user_answer_count=train_df.user_answer_count.astype('int16')

In [35]:
del cum
gc.collect()

40

In [36]:
# train_df['lag'] = train_df.groupby('user_id')[target].shift(2)
# cum = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])
# ##cum['cumcount']=cum['cumcount']+1
# user_agg2 = train_df.groupby('user_id')['lag'].agg(['sum', 'count']).astype('int16')
# cum['cumsum'].fillna(0, inplace=True)

# train_df['user_correctness2'] = cum['cumsum'] / cum['cumcount']
# train_df['user_correct_count2'] = cum['cumsum']
# train_df['user_uncorrect_count2'] = cum['cumcount']-cum['cumsum']
# #train_df['user_answer_count2'] = cum['cumcount']
# train_df.drop(columns=['lag'], inplace=True)
# train_df['user_correctness2'].fillna(0.67, inplace=True)
# train_df.user_correctness2=train_df.user_correctness2.astype('float16')
# train_df.user_correct_count2=train_df.user_correct_count2.astype('int16')
# train_df.user_uncorrect_count2=train_df.user_uncorrect_count2.astype('int16')
# #train_df.user_answer_count2=train_df.user_answer_count2.astype('int16')
# del cum
# gc.collect()

In [37]:
train_df.prior_question_had_explanation=train_df.prior_question_had_explanation.astype('int8')
explanation_agg = train_df.groupby('user_id')['prior_question_had_explanation'].agg(['sum', 'count'])
explanation_agg=explanation_agg.astype('int16')
# explanation_agg.sum=explanation_agg.sum.astype('int16')
# explanation_agg.count=explanation_agg.count.astype('int16')
#explanation_agg.var=explanation_agg.var.astype('float16')

In [38]:


#train_df['lag'] = train_df.groupby('user_id')['prior_question_had_explanation'].shift()

cum = train_df.groupby('user_id')['prior_question_had_explanation'].agg(['cumsum', 'cumcount'])
cum['cumcount']=cum['cumcount']+1
train_df['explanation_mean'] = cum['cumsum'] / cum['cumcount']
train_df['explanation_true_count'] = cum['cumsum'] 
train_df['explanation_false_count'] =  cum['cumcount']-cum['cumsum']
#train_df.drop(columns=['lag'], inplace=True)

train_df.explanation_mean=train_df.explanation_mean.astype('float16')
train_df.explanation_true_count=train_df.explanation_true_count.astype('int16')
train_df.explanation_false_count=train_df.explanation_false_count.astype('int16')

In [39]:
del cum
gc.collect()

40

In [40]:
content_agg = train_df.groupby('content_id')[target].agg(['sum', 'count','var'])
task_container_agg = train_df.groupby('task_container_id')[target].agg(['sum', 'count','var'])
content_agg=content_agg.astype('float32')
task_container_agg=task_container_agg.astype('float32')

In [41]:
#
train_df['task_container_uncor_count'] = train_df['task_container_id'].map(task_container_agg['count']-task_container_agg['sum']).astype('int32')
train_df['task_container_cor_count'] = train_df['task_container_id'].map(task_container_agg['sum']).astype('int32')
train_df['task_container_std'] = train_df['task_container_id'].map(task_container_agg['var']).astype('float16')
train_df['task_container_correctness'] = train_df['task_container_id'].map(task_container_agg['sum'] / task_container_agg['count'])
train_df.task_container_correctness=train_df.task_container_correctness.astype('float16')

In [42]:
content_elapsed_time_agg=train_df.groupby('content_id')['prior_question_elapsed_time'].agg(['mean'])
content_had_explanation_agg=train_df.groupby('content_id')['prior_question_had_explanation'].agg(['mean'])

In [43]:
print('start questions data...')

start questions data...


In [44]:
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv', 
    usecols=[0, 1,3,4],
    dtype={'question_id': 'int16','bundle_id': 'int16', 'part': 'int8','tags': 'str'}
)

In [45]:
bundle_agg = questions_df.groupby('bundle_id')['question_id'].agg(['count'])

In [46]:
questions_df['content_sub_bundle'] = questions_df['bundle_id'].map(bundle_agg['count']).astype('int8')

In [47]:

questions_df['tags'].fillna('188', inplace=True)

In [48]:
def gettags(tags,num):
    tags_splits=tags.split(" ")
    result='' 
    for t in tags_splits:
        x=int(t)
        if(x<32*(num+1) and x>=32*num):#num 
            result=result+' '+t
    return result

In [49]:
from sklearn.preprocessing import LabelEncoder
for num in range(0,6):
    questions_df["tags"+str(num)] = questions_df["tags"].apply(lambda row: gettags(row,num))
    le = LabelEncoder()
    le.fit(np.unique(questions_df['tags'+str(num)].values))
    #questions_df[['tags'+str(num)]=
    questions_df['tags'+str(num)]=questions_df[['tags'+str(num)]].apply(le.transform)

In [50]:
questions_df_dict = {   
    'tags0': 'int8',
    'tags1': 'int8',
    'tags2': 'int8',
    'tags3': 'int8',
    'tags4': 'int8',
    'tags5': 'int8',
    #'tags6': 'int8',
    #'tags7': 'int8'
}
questions_df = questions_df.astype(questions_df_dict)

In [51]:
questions_df.drop(columns=['tags'], inplace=True)

In [52]:

questions_df['part_bundle_id']=questions_df['part']*100000+questions_df['bundle_id']
questions_df.part_bundle_id=questions_df.part_bundle_id.astype('int32')
# tag = questions_df["tags"].str.split(" ", n = 10, expand = True)
# tag.columns = ['tags1','tags2','tags3','tags4','tags5','tags6']
# #

# tag.fillna(0, inplace=True)
# tag = tag.astype('int16')
# questions_df =  pd.concat([questions_df,tag],axis=1).drop(['tags'],axis=1)

In [53]:
# questions_cmnts = pd.read_csv(
#     '../input/2020-r3id-clustering-question-tags/question_cmnts.csv', 
#     usecols=[1,2],
#     dtype={'question_id': 'int16','community': 'int8'}
# )

In [54]:
# questions_df = pd.merge(questions_df, questions_cmnts, on='question_id', how='left',right_index=True)#

In [55]:
questions_df.rename(columns={'question_id':'content_id'}, inplace=True)

In [56]:
questions_df = pd.merge(questions_df, content_explation_agg, on='content_id', how='left',right_index=True)#
# questions_df.content_explation_false_mean=questions_df.content_explation_false_mean.astype('float16')
# questions_df.content_explation_true_mean=questions_df.content_explation_true_mean.astype('float16')

In [57]:
del content_explation_agg

In [58]:
questions_df['content_correctness'] = questions_df['content_id'].map(content_agg['sum'] / content_agg['count'])
questions_df.content_correctness=questions_df.content_correctness.astype('float16')
questions_df['content_correctness_std'] = questions_df['content_id'].map(content_agg['var'])
questions_df.content_correctness_std=questions_df.content_correctness_std.astype('float16')
questions_df['content_uncorrect_count'] = questions_df['content_id'].map(content_agg['count']-content_agg['sum']).astype('int32')
questions_df['content_correct_count'] = questions_df['content_id'].map(content_agg['sum']).astype('int32')

In [59]:
questions_df['content_elapsed_time_mean'] = questions_df['content_id'].map(content_elapsed_time_agg['mean'])
questions_df.content_elapsed_time_mean=questions_df.content_elapsed_time_mean.astype('float16')
questions_df['content_had_explanation_mean'] = questions_df['content_id'].map(content_had_explanation_agg['mean'])
questions_df.content_had_explanation_mean=questions_df.content_had_explanation_mean.astype('float16')

In [60]:
del content_elapsed_time_agg
del content_had_explanation_agg
gc.collect()

80

In [61]:
part_agg = questions_df.groupby('part')['content_correctness'].agg(['mean', 'var'])
questions_df['part_correctness_mean'] = questions_df['part'].map(part_agg['mean'])
questions_df['part_correctness_std'] = questions_df['part'].map(part_agg['var'])
questions_df.part_correctness_mean=questions_df.part_correctness_mean.astype('float16')
questions_df.part_correctness_std=questions_df.part_correctness_std.astype('float16')

In [62]:
part_agg = questions_df.groupby('part')['content_uncorrect_count'].agg(['sum'])
questions_df['part_uncor_count'] = questions_df['part'].map(part_agg['sum']).astype('int32')
#
part_agg = questions_df.groupby('part')['content_correct_count'].agg(['sum'])
questions_df['part_cor_count'] = questions_df['part'].map(part_agg['sum']).astype('int32')

In [63]:
bundle_agg = questions_df.groupby('bundle_id')['content_correctness'].agg(['mean'])
questions_df['bundle_correctness_mean'] = questions_df['bundle_id'].map(bundle_agg['mean'])
questions_df.bundle_correctness_mean=questions_df.bundle_correctness_mean.astype('float16')

# bundle_agg = questions_df.groupby('bundle_id')['content_uncorrect_count'].agg(['sum'])
# questions_df['bundle_uncor_count'] = questions_df['bundle_id'].map(bundle_agg['sum']).astype('int32')
# #
# bundle_agg = questions_df.groupby('bundle_id')['content_correct_count'].agg(['sum'])
# questions_df['bundle_cor_count'] = questions_df['bundle_id'].map(bundle_agg['sum']).astype('int32')

In [64]:
#questions_df.loc[questions_df['content_sub_bundle'] !=0, 'content_sub_bundle']=1 #修改列"content_sub_bundle"的值，推荐使用.loc

In [65]:
# tags1_agg = questions_df.groupby('tags0')['content_correctness'].agg(['mean', 'var'])
# questions_df['tags0_correctness_mean'] = questions_df['tags0'].map(tags1_agg['mean'])
# questions_df.tags0_correctness_mean=questions_df.tags0_correctness_mean.astype('float16')

# tags1_agg = questions_df.groupby('tags1')['content_correctness'].agg(['mean', 'var'])
# questions_df['tags1_correctness_mean'] = questions_df['tags1'].map(tags1_agg['mean'])
# questions_df.tags1_correctness_mean=questions_df.tags1_correctness_mean.astype('float16')

# tags1_agg = questions_df.groupby('tags2')['content_correctness'].agg(['mean', 'var'])
# questions_df['tags2_correctness_mean'] = questions_df['tags2'].map(tags1_agg['mean'])
# questions_df.tags2_correctness_mean=questions_df.tags2_correctness_mean.astype('float16')

# tags1_agg = questions_df.groupby('tags4')['content_correctness'].agg(['mean', 'var'])
# questions_df['tags4_correctness_mean'] = questions_df['tags4'].map(tags1_agg['mean'])
# questions_df.tags4_correctness_mean=questions_df.tags4_correctness_mean.astype('float16')

# questions_df['tags1_correctness_std'] = questions_df['tags1'].map(tags1_agg['var'])

# questions_df.tags1_correctness_std=questions_df.tags1_correctness_std.astype('float16')
# tags1_agg = questions_df.groupby('tags1')['content_uncorrect_count'].agg(['sum'])
# questions_df['tags1_uncor_count'] = questions_df['tags1'].map(tags1_agg['sum']).astype('int32')
# #
# tags1_agg = questions_df.groupby('tags1')['content_correct_count'].agg(['sum'])
# questions_df['tags1_cor_count'] = questions_df['tags1'].map(tags1_agg['sum']).astype('int32')

In [66]:
del content_agg
del bundle_agg
del part_agg
#del tags1_agg
gc.collect()

122

In [67]:
#pd.set_option("display.max_columns",500)

In [68]:
#questions_df.drop(columns=['tags4','tags5','tags6'], inplace=True)

In [69]:
len(train_df)

99271300

In [70]:
#train_df.drop(columns=['content_type_id'], inplace=True)

# Train

In [71]:
features_dict = {
    #'user_id',
    'timestamp':'float16',#
    'user_interaction_count':'int16',
    'user_interaction_timestamp_mean':'float32',
    'lagtime':'float32',#
    'lagtime2':'float32',
    'lagtime3':'float32',
    #'lagtime_mean':'int32',
    'content_id':'int16',
    'task_container_id':'int16',
    'user_lecture_sum':'int16',#
    'user_lecture_lv':'float16',##
    'prior_question_elapsed_time':'float32',#
#     'delta_prior_question_elapsed_time':'int32',#
    'user_correctness':'float16',#
    'user_uncorrect_count':'int16',#
    'user_correct_count':'int16',#
    #'content_correctness':'float16',
    'content_correctness_std':'float16',
    'content_correct_count':'int32',
    'content_uncorrect_count':'int32',#
    'content_elapsed_time_mean':'float16',
    'content_had_explanation_mean':'float16',
    'content_explation_false_mean':'float16',
    'content_explation_true_mean':'float16',
#     'task_container_correctness':'float16',
#     'task_container_std':'float16',
    'task_container_cor_count':'int32',#
#     'task_container_uncor_count':'int32',#
    'attempt_no':'int8',#
    'part':'int8',
    'part_correctness_mean':'float16',
    'part_correctness_std':'float16',
    'part_uncor_count':'int32',
#     'part_cor_count':'int32',
#     'tags0': 'int8',
#     'tags1': 'int8',
    'tags2': 'int8',
#     'tags3': 'int8',
#     'tags4': 'int8',
#     'tags5': 'int8',
   # 'tags6': 'int8',
   # 'tags7': 'int8',
#     'tags0_correctness_mean':'float16',
#     'tags1_correctness_mean':'float16',
#     'tags2_correctness_mean':'float16',
#     'tags4_correctness_mean':'float16',
#     'bundle_id':'int16',
#     'bundle_correctness_mean':'float16',
#     'bundle_uncor_count':'int32',
#     'bundle_cor_count':'int32',
    'part_bundle_id':'int32',
    'content_sub_bundle':'int8',
    'prior_question_had_explanation':'int8',
    'explanation_mean':'float16', #
    #'explanation_var',#
    'explanation_false_count':'int16',#
#     'explanation_true_count':'int16',#
   # 'community':'int8',
#     'part_1',
#     'part_2',
#     'part_3',
#     'part_4',
#     'part_5',
#     'part_6',
#     'part_7',
#     'type_of_concept',
#     'type_of_intention',
#     'type_of_solving_question',
#     'type_of_starter'
}
categorical_columns= [
    #'user_id',
    'content_id',
    'task_container_id',
    'part',
   # 'community',
    'tags0',
    'tags1',
    'tags2',
    'tags3',
    'tags4',
    'tags5',
    #'tags6',
    #'tags7',
    #'bundle_id',
    'part_bundle_id',
    'content_sub_bundle',
    'prior_question_had_explanation', 
#     'part_1',
#     'part_2',
#     'part_3',
#     'part_4',
#     'part_5',
#     'part_6',
#     'part_7',
#     'type_of_concept',
#     'type_of_intention',
#     'type_of_solving_question',
#     'type_of_starter'
]

features=list(features_dict.keys())

In [72]:
flag_lgbm=True
clfs = list()
params = {
'num_leaves': 200,
'max_bin':450,
# 'min_child_weight': 0.03454472573214212,
'feature_fraction': 0.52,
'bagging_fraction': 0.52,
#'min_data_in_leaf': 106,
# 'max_depth': -1,
'objective': 'binary',
'learning_rate': 0.05,
"boosting_type": "gbdt",
"metric": 'auc',
# "bagging_seed": 11,
# "verbosity": -1,
# 'reg_alpha': 0.3899927210061127,
# 'reg_lambda': 0.6485237330340494,
# 'random_state': 2021
}

In [73]:
feature_before_merge_question = list(set(train_df.columns)&set(features))
feature_before_merge_question.append(target)
# print(feature_before_merge_question)

In [74]:
# train_df_clf=train_df[feature_before_merge_question]
train_df_clf=train_df[feature_before_merge_question].sample(frac=0.8)
del train_df
gc.collect()
train_df_clf = pd.merge(train_df_clf, questions_df, on='content_id', how='left',right_index=True)#
print('train_df_clf length：',len(train_df_clf))
del questions_df
tr_data = lgb.Dataset(train_df_clf[features], label=train_df_clf[target], feature_name=list(features))
del train_df_clf
gc.collect()

train_df_clf length： 79417040


0

In [75]:
del attempt_no_agg
del user_prior_question_elapsed_time
del max_timestamp_u
del max_timestamp_u2
del max_timestamp_u3
del explanation_agg
del user_agg
del user_lecture_agg
del user_lecture_stats_part
del task_container_agg
del lectures_df
gc.collect()

40

In [None]:
model = lgb.train(
    params, 
    tr_data,
    num_boost_round=400,
    valid_sets=[tr_data],
    early_stopping_rounds=50,
    feature_name=features,
    categorical_feature=categorical_columns,
    verbose_eval=50
)

# clfs.append(model)

fig,ax = plt.subplots(figsize=(15,15))
lgb.plot_importance(model, ax=ax,importance_type='gain',max_num_features=50)
plt.show()

del tr_data
gc.collect()

New categorical_feature is ['content_id', 'content_sub_bundle', 'part', 'part_bundle_id', 'prior_question_had_explanation', 'tags0', 'tags1', 'tags2', 'tags3', 'tags4', 'tags5', 'task_container_id']


In [None]:
joblib.dump(model, '../data/bagging2.pkl')
print('Model saved')