In [None]:
import gc
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import pickle
import lightgbm as lgb
import joblib
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the train / validation frames written at the end of the notebook
# "1.Riiid LGBM 训练数据生成.ipynb" (the training-data generation notebook).
# Note: read_pickle unpickles the file, so only load files you produced yourself.
train_df, valid_df = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        "D:/kaggle/input/riiid-test-answer-prediction/train_df_3115.pkl",
        "D:/kaggle/input/riiid-test-answer-prediction/valid_df_3115.pkl",
    )
)
print("Train:", train_df.shape, "Valid:", valid_df.shape)

In [None]:
# All feature columns used by the model.
# The order is preserved exactly: this list selects (and therefore orders) the
# columns of the frames passed to LightGBM for both training and prediction.
features = [
    "part_bundle_id",

    "content_elapsed_time",
    "content_had_explanation",

    "lagtime2",
    "lagtime3",

    "content_explation_false_mean",
    "content_explation_true_mean",

    "curr_user_part_acc",
    "curr_user_part_count",
    "curr_user_part_sum",
    "curr_uq_time_diff",
    "curr_user_time_diff",
    "curr_user_time_diff_mean",
    "curr_user_elapsed_time_diff",

    "avg_task_seen_cumsum",
    "content_mean_acc",
    "content_cnt",
    "corr_question_elapsed_time_mean",
    "incorr_question_elapsed_time_mean",

    "watched_tags_rate",
    "watched_tags_bool",
    "tags_acc",
    "part",
    "part_bundle_acc",

    "part_1_cnt",
    "part_2_cnt",
    "part_3_cnt",
    "part_4_cnt",
    "part_5_cnt",
    "part_6_cnt",
    "part_7_cnt",
    "type_of_concept_cnt",
    "type_of_intention_cnt",
    "type_of_solving_question_cnt",
    "type_of_starter_cnt",
    "same_part_cnt",

    "curr_lecture_bool",
    "curr_user_correct_cnt",
    "curr_user_answer_cnt",
    "curr_user_acc",
    "hmean_acc",
    "curr_uq_correct_cnt",
    "curr_uq_answer_cnt",
    "curr_uq_acc",
    "prior_question_elapsed_time",
    "prior_question_had_explanation",
]

# Name of the label column.
target = "answered_correctly"

In [None]:
# Training also needs 30 GB+ of RAM. If memory is short, change TEST_SIZE to
# fit the machine. Note: the LARGER TEST_SIZE is, the LESS memory is needed,
# because the "test" half of the split is discarded right away and only the
# remaining rows are used for training.
TEST_SIZE = 0.5
kept_rows, discarded_rows = train_test_split(
    train_df,
    test_size=TEST_SIZE,
    random_state=666,
)
del discarded_rows, train_df
_ = gc.collect()

# Wrap the kept training rows and the validation frame as LightGBM datasets.
lgb_train = lgb.Dataset(data=kept_rows[features], label=kept_rows[target])
lgb_valid = lgb.Dataset(data=valid_df[features], label=valid_df[target])

# The sub-sampled frame is no longer referenced by name after this point.
del kept_rows
_ = gc.collect()

In [None]:
%%time
# Train directly with LightGBM's default hyperparameters (only the objective is
# set); per the original author, tuning matters little in this competition.
#
# Periodic logging and early stopping are configured through callbacks. The
# `verbose_eval=` / `early_stopping_rounds=` keyword arguments of `lgb.train`
# were deprecated in LightGBM 3.3 and removed in 4.0, where passing them raises
# TypeError. `log_evaluation` only exists from 3.3 on, so fall back to its
# older name `print_evaluation` on earlier releases.
log_evaluation = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation

model = lgb.train(
    {"objective": "binary"},
    lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_train, lgb_valid],
    callbacks=[
        log_evaluation(100),     # print evaluation results every 100 iterations
        lgb.early_stopping(20),  # stop once the validation metric stalls for 20 rounds
    ],
)

# Score the validation frame with the trained model and show feature importances.
print('auc:', roc_auc_score(valid_df[target], model.predict(valid_df[features])))
_ = lgb.plot_importance(model, figsize=(10, 9))

In [None]:
# Persist the trained model; the file is later uploaded to Kaggle for inference.
# NOTE(review): the file name says "nouq" while the feature list contains
# curr_uq_* columns -- confirm the name still describes this model.
joblib.dump(
    value=model,
    filename='D:/kaggle/input/riiid-test-answer-prediction/lgb_nouq_0501.model',
)