In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from multiprocessing import cpu_count
from tqdm.notebook import tqdm
import gc
import os
import sys
import joblib

In [None]:
%%time
# Only these columns are kept from the unpickled training table.
cols_to_load = [
    'user_id',
    'answered_correctly',
    'content_id',
    'prior_question_had_explanation',
    'prior_question_elapsed_time',
]

# Column dtypes applied when reading the two cached feature tables below.
# Keys that a given CSV does not contain are simply unused for that file.
dtype = {
    # per-user columns
    'num_user_answered_questions': np.int16,
    'num_user_correctly_questions': np.int16,
    'mean_user_accuracy': np.float32,
    'watches_lecture': 'boolean',
    'mean_user_question_elapsed_time': np.float32,
    'mean_user_question_had_explanation': np.float32,
    # per-content columns
    'part': np.int8,
    'content_questions': np.int32,
    'mean_content_accuracy': np.float32,
    'mean_prior_question_had_explanation': np.float32,
    'mean_prior_question_elapsed_time': np.float32,
}

# Interaction log: unpickle the whole table, then keep the selected columns.
FEATURES = pd.read_pickle('../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip')[cols_to_load]
# Nullable boolean dtype so missing values stay missing instead of forcing an object column.
FEATURES['prior_question_had_explanation'] = FEATURES['prior_question_had_explanation'].astype('boolean')

# Competition metadata tables.
questions = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')
lectures = pd.read_csv('../input/riiid-test-answer-prediction/lectures.csv')

# Previously exported feature tables; the first CSV column is used as the index.
# NOTE(review): both names are reassigned by the feature-building cells further down
# (which also write these same CSV paths), so on a first run these files may not
# exist yet — confirm whether these reads are meant as a cache shortcut.
user_df = pd.read_csv('../input/data_io/user_df.csv', index_col=0, dtype=dtype)
content_df = pd.read_csv('../input/data_io/content_df.csv', index_col=0, dtype=dtype)

In [None]:
%%time
# Adding user features: one row per user, written to user_df.csv.
#
# Rows with answered_correctly == -1 are the ones this notebook labels as lectures;
# every answer statistic below is computed on the remaining (question) rows only.
is_question = FEATURES['answered_correctly'] != -1
question_rows = FEATURES.loc[
    is_question,
    ['user_id', 'answered_correctly', 'prior_question_elapsed_time', 'prior_question_had_explanation'],
]


def _user_mean_of_next(rows, column):
    """Per-user mean of `column` after shifting it up one row *within each user*.

    The shift is grouped by user_id so the last row of a user becomes missing
    instead of borrowing the first row of whichever user happens to follow it
    in the frame (which a plain frame-wide shift(-1) would do).

    Parameters
    ----------
    rows : DataFrame containing 'user_id' and `column`.
    column : name of the column to shift and average.

    Returns
    -------
    Series indexed by user_id holding the mean of the shifted values.
    """
    shifted = rows.groupby('user_id')[column].shift(-1)
    return shifted.groupby(rows['user_id']).mean()


# Answer counts and accuracy per user.
user_df = (
    question_rows.groupby('user_id')['answered_correctly']
    .agg(['count', 'sum', 'mean'])
    .reset_index()
)
user_df.columns = ['user_id', 'num_user_answered_questions', 'num_user_correctly_questions', 'mean_user_accuracy']
# NOTE(review): int16 silently wraps above 32767 answers per user — confirm the data stays below that.
user_df = user_df.astype({
    'num_user_answered_questions': 'int16',
    'num_user_correctly_questions': 'int16',
    'mean_user_accuracy': 'float32',
})

# A user "watches lectures" when at least one of their rows is a lecture row.
# Membership test instead of unstacking answered_correctly: it does not depend on
# all three values (-1, 0, 1) being present in the data.
lecture_user_ids = FEATURES.loc[~is_question, 'user_id'].unique()
user_df['watches_lecture'] = user_df['user_id'].isin(lecture_user_ids).astype('boolean')

# The prior_* columns are shifted up by one row before averaging, presumably so each
# row carries the value recorded for that question rather than the previous one.
mean_elapsed = _user_mean_of_next(question_rows, 'prior_question_elapsed_time')
user_df = user_df.merge(
    mean_elapsed.to_frame('mean_user_question_elapsed_time'), on='user_id', how='left'
)

mean_explained = _user_mean_of_next(question_rows, 'prior_question_had_explanation').astype('float32')
user_df = user_df.merge(
    mean_explained.to_frame('mean_user_question_had_explanation'), on='user_id', how='left'
)

# Release the large intermediates before the next cell.
del question_rows, lecture_user_ids, mean_elapsed, mean_explained
gc.collect()

user_df.to_csv('../input/data_io/user_df.csv')
# Last expression of the cell so the preview is actually displayed.
user_df.head()

In [None]:
%%time
# Content (question) features: one row per content_id, written to content_df.csv.
#
# Rows with answered_correctly == -1 carry no correctness label and are excluded.
is_question = FEATURES['answered_correctly'] != -1

# How often each question was answered, and how often correctly.
content_df = (
    FEATURES.loc[is_question, ['content_id', 'answered_correctly']]
    .groupby('content_id')['answered_correctly']
    .agg(['count', 'mean'])
    .reset_index()
)
content_df.columns = ['content_id', 'content_questions', 'mean_content_accuracy']

# Attach the statistics to the question metadata, keyed by content_id.
questions = (
    questions.rename(columns={'question_id': 'content_id'})
    .merge(content_df, on='content_id', how='left')
    .loc[:, ['content_id', 'part', 'content_questions', 'mean_content_accuracy']]
    # A question with no answered rows has no count after the left merge;
    # record it as 0 so the integer cast below cannot fail on NaN.
    .fillna({'content_questions': 0})
    .astype({'part': 'int8', 'content_questions': 'int32', 'mean_content_accuracy': 'float32'})
)

del content_df
_ = gc.collect()

# From here on FEATURES holds question rows only.
FEATURES = FEATURES[is_question]

# Shift the prior_* columns up by one row within each user, presumably so each row
# carries the value recorded for that question rather than the previous one.
# Grouping by user_id keeps the last row of a user from borrowing the first row of
# the next user, which a frame-wide shift(-1) would do.
by_user = FEATURES.groupby('user_id')
FEATURES['prior_question_elapsed_time'] = by_user['prior_question_elapsed_time'].shift(-1)
# Missing -> 0, False -> 0, True -> 1. Assigned back explicitly: fillna(inplace=True)
# on a selected column is chained assignment and is not guaranteed to update the frame.
FEATURES['prior_question_had_explanation'] = (
    by_user['prior_question_had_explanation'].shift(-1).fillna(False).astype('int8')
)
del by_user

# Per-content means of the two shifted columns.
content_prior_means = (
    FEATURES.groupby('content_id')[['prior_question_had_explanation', 'prior_question_elapsed_time']]
    .mean()
    .astype('float32')
)
content_prior_means.columns = ['mean_prior_question_had_explanation', 'mean_prior_question_elapsed_time']
questions = questions.merge(content_prior_means, on='content_id', how='left')
del content_prior_means
_ = gc.collect()

# The enriched question table is the content feature table.
content_df = questions
del questions
gc.collect()

content_df.to_csv('../input/data_io/content_df.csv')
# Last expression of the cell so the preview is actually displayed
# (the original line referenced an undefined name and raised before the CSV was written).
content_df.head()