## imports

In [None]:
import numpy as np 
import pandas as pd
from sklearn.metrics import roc_auc_score
from collections import defaultdict
from tqdm.notebook import tqdm

## read data

In [None]:
def read_data(train_pickle='../input/riiid-cross-validation-files/cv1_train.pickle',
              valid_pickle='../input/riiid-cross-validation-files/cv1_valid.pickle'):
    """Load the train/validation splits, keeping only the columns used downstream.

    Parameters
    ----------
    train_pickle, valid_pickle : str or path-like
        Pickled DataFrames to read. The defaults are the cv1 split files this
        notebook was written against, so ``read_data()`` behaves as before.

    Returns
    -------
    (train, valid) : tuple of pandas.DataFrame
        Both frames restricted to ``cols_to_load``, in that column order.
    """
    # The notebook originally also listed 'row_id', 'task_container_id',
    # 'user_answer', 'max_time_stamp', 'rand_time_stamp' and
    # 'viretual_time_stamp' (sic) as columns of these pickles; none of them is
    # used later, so they are dropped right after loading.
    cols_to_load = ['timestamp', 'user_id', 'content_id', 'content_type_id', 'answered_correctly',
                    'prior_question_elapsed_time', 'prior_question_had_explanation']

    # NOTE(review): unpickling can execute arbitrary code -- only point this
    # at files from a trusted source.
    train = pd.read_pickle(train_pickle)[cols_to_load]
    valid = pd.read_pickle(valid_pickle)[cols_to_load]

    return train, valid

In [None]:
%%time
# Unpack the two splits returned by read_data and report their (rows, cols) shapes.
train, valid = read_data()
print(train.shape, valid.shape, sep='\n')

## feature engineering

In [None]:
def add_user_feats(df, user_sum_dict, user_count_dict):
    """Attach per-user history features built only from rows seen earlier.

    Walks ``df`` in row order. For every row the user's running totals are
    recorded *before* that row's own answer is added, so a row never sees its
    own label. Both dicts are updated in place and can be passed to a later
    call to continue the same running totals.

    Adds ``user_correct_sum``, ``user_count`` and ``user_accuracy`` (their
    ratio, NaN while a user has no history) to ``df`` and returns ``df``.
    """
    n_rows = len(df)
    prior_correct = np.zeros(n_rows, dtype=np.int32)
    prior_seen = np.zeros(n_rows, dtype=np.int32)

    id_label_pairs = df[['user_id', 'answered_correctly']].values
    for pos, (uid, label) in enumerate(tqdm(id_label_pairs)):
        # Snapshot the history first ...
        prior_correct[pos] = user_sum_dict[uid]
        prior_seen[pos] = user_count_dict[uid]
        # ... then fold the current answer into the running totals.
        user_sum_dict[uid] += label
        user_count_dict[uid] += 1

    df['user_correct_sum'] = prior_correct
    df['user_count'] = prior_seen
    df['user_accuracy'] = df['user_correct_sum'] / df['user_count']
    return df

In [None]:
def add_user_feats_without_update(df, user_sum_dict, user_count_dict):
    """Attach per-user history features by pure lookup, leaving the dicts untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``user_id`` column. The three feature columns are
        written onto this frame.
    user_sum_dict, user_count_dict : mapping
        Per-user totals of correct answers / answers seen. Any mapping with
        ``.get`` works (plain ``dict`` or ``defaultdict``).

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``user_correct_sum``, ``user_count`` and ``user_accuracy``
        added. Users missing from the dicts get 0 / 0 and therefore a NaN
        accuracy.
    """
    user_sum_arr = np.zeros(len(df), dtype=np.int32)
    user_count_arr = np.zeros(len(df), dtype=np.int32)

    for i, uid in enumerate(tqdm(df['user_id'].values)):
        # Use .get rather than indexing: indexing a defaultdict inserts a new
        # zero entry for every unseen user, which would grow the dicts even
        # though this function promises not to update them.
        user_sum_arr[i] = user_sum_dict.get(uid, 0)
        user_count_arr[i] = user_count_dict.get(uid, 0)

    df['user_correct_sum'] = user_sum_arr
    df['user_count'] = user_count_arr
    df['user_accuracy'] = df['user_correct_sum'] / df['user_count']
    return df
    

In [None]:
# Running per-user tallies (correct answers / answers seen); unseen users start at 0.
user_sum_dict, user_count_dict = defaultdict(int), defaultdict(int)

# Keep only rows whose content_type_id is False/0
# (presumably the question rows -- confirm against the dataset description).
train = train[train['content_type_id'] == False]
valid = valid[valid['content_type_id'] == False]

# Order matters: train is processed first and the same dicts are then passed
# for valid, so valid rows see the history accumulated over train.
train = add_user_feats(train, user_sum_dict, user_count_dict)
valid = add_user_feats(valid, user_sum_dict, user_count_dict)

In [None]:
# Per-content statistics computed from the training split only:
# how many times each content_id was answered and its mean correctness.
content_df = (
    train.groupby('content_id')['answered_correctly']
         .agg(content_questions='count', content_mean='mean')
         .reset_index()
)

# Left joins keep every row; a content_id absent from train ends up with NaN stats.
train = train.merge(content_df, how="left", on="content_id")
valid = valid.merge(content_df, how="left", on="content_id")


In [None]:
# part: attach each row's `part` value from the questions metadata file.
question_file = '../input/riiid-test-answer-prediction/questions.csv'
questions_df = pd.read_csv(question_file)
# content_id on the left matches question_id on the right; the join also
# brings the question_id column along.
train = train.merge(questions_df[['question_id', 'part']], how='left',
                    left_on='content_id', right_on='question_id')
valid = valid.merge(questions_df[['question_id', 'part']], how='left',
                    left_on='content_id', right_on='question_id')

In [None]:
# Peek at two rows to confirm the user, content and part columns were added.
train.head(2)

In [None]:
# changing dtype to avoid lightgbm error
train['prior_question_had_explanation'] = train.prior_question_had_explanation.fillna(False).astype('int')
valid['prior_question_had_explanation'] = valid.prior_question_had_explanation.fillna(False).astype('int')

# Impute missing elapsed times with the mean of the training split;
# the same value is reused for valid.
time_mean = train.prior_question_elapsed_time.dropna().values.mean()
train['prior_question_elapsed_time'] = train.prior_question_elapsed_time.fillna(time_mean)
valid['prior_question_elapsed_time'] = valid.prior_question_elapsed_time.fillna(time_mean)

# user_accuracy is NaN wherever user_count is 0 (0/0); fall back to a fixed 0.65.
# Assign the result back instead of calling fillna(..., inplace=True) on
# df[col]: that chained in-place form acts on a temporary Series under pandas
# Copy-on-Write and would leave the frame unchanged.
valid['user_accuracy'] = valid['user_accuracy'].fillna(0.65)
train['user_accuracy'] = train['user_accuracy'].fillna(0.65)

In [None]:
# Show the mean elapsed time. The original read it from `X`, which is only
# created in the modelling section further down, so a fresh top-to-bottom run
# failed here with NameError; `train` holds the same column at this point.
time_mean = train.prior_question_elapsed_time.dropna().values.mean()
time_mean

In [None]:
# Model inputs followed by the target column, which is split off later.
FEATS = [
    'user_correct_sum',
    'user_count',
    'user_accuracy',
    'content_questions',
    'content_mean',
    'prior_question_had_explanation',
    'prior_question_elapsed_time',
    'part',
    'answered_correctly',
]

valid = valid[FEATS]

# Train on the last 10,000,000 rows (in frame order) only.
train = train.iloc[-10_000_000:][FEATS]

In [None]:
# Peek at two rows of the trimmed training frame (FEATS columns only).
train.head(2)

## modelling

In [None]:
# Training split: feature matrix and target.
X = train.drop(columns='answered_correctly')
y = train['answered_correctly']

print(X.shape, y.shape, sep='\n')

# Validation split, same layout.
X_val = valid.drop(columns='answered_correctly')
y_val = valid['answered_correctly']

print(X_val.shape, y_val.shape, sep='\n')

In [None]:
# X/y and X_val/y_val hold everything needed from here on; drop the source frames.
del train, valid

In [None]:
# Make sure the explanation flag is an integer column in both matrices
# (train/valid already received the same fill-and-cast before the split).
X['prior_question_had_explanation'] = X['prior_question_had_explanation'].fillna(False).astype(int)
X_val['prior_question_had_explanation'] = X_val['prior_question_had_explanation'].fillna(False).astype(int)
X.head(1)

In [None]:
import lightgbm as lgb

# Wrap the feature matrices and targets in LightGBM Dataset objects.
lgb_train = lgb.Dataset(X, y)
lgb_valid = lgb.Dataset(X_val, y_val)


# Train a binary classifier: at most 10000 boosting rounds, evaluation logged
# every 100 rounds, early stopping with a patience of 10 rounds.
# NOTE(review): only 'objective' is set, so every other parameter (including
# the evaluation metric that drives early stopping) is the library default --
# confirm that is intended, given the notebook reports ROC AUC afterwards.
# NOTE(review): the `verbose_eval` and `early_stopping_rounds` keywords were
# removed from lgb.train in LightGBM 4.0 in favour of callbacks -- check the
# installed version before re-running.
model = lgb.train(
                    {'objective': 'binary'}, 
                    lgb_train,
                    valid_sets=[lgb_train, lgb_valid],
                    verbose_eval=100,
                    num_boost_round=10000,
                    early_stopping_rounds=10
                )


In [None]:
# Score the validation split with ROC AUC.
# `auc` is just an alias for the roc_auc_score already imported in the imports cell.
from sklearn.metrics import roc_auc_score as auc
pred= model.predict(X_val)
print(auc(y_val, pred))

In [None]:
# hyperparam tuning for gbdt
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.metrics import roc_auc_score as auc

# params= {
#     'learning_rate': [0.05,0.1,0.15],
#     'n_estimators': [100, 200],
#     'max_depth': [3, 7]
# }


# for lr in params['learning_rate']:
#     for est in params['n_estimators']:
#         for d in params['max_depth']:
#             print('LR :', lr, 'Est :', est, 'D :', d)
#             gbdt = GradientBoostingClassifier(learning_rate= lr, n_estimators=est, max_depth=d)
#             gbdt.fit(X,y)
#             pred= gbdt.predict_proba(X_val)[:, 1]
#             print(auc(y_val, pred))
            


In [None]:
# from sklearn.ensemble import GradientBoostingClassifier
# gbdt = GradientBoostingClassifier()
# gbdt.fit(X,y)

## cross validation

In [None]:
# from sklearn.metrics import roc_auc_score as auc
# X_val= valid.drop('answered_correctly', axis=1)
# y_val= valid.loc[ :, 'answered_correctly']

# pred= gbdt.predict_proba(X_val)[:, 1]
# auc(y_val, pred)

## test

In [None]:
# Example test file, used below to smoke-test process_and_predict before the real submission loop.
test= pd.read_csv('../input/riiid-test-answer-prediction/example_test.csv')

In [None]:
# Peek at two rows of the example test file to see which columns it provides.
test.head(2)

In [None]:
# Feature columns fed to the model at inference time (no target column).
FEAT = [
    'user_correct_sum',
    'user_count',
    'user_accuracy',
    'content_questions',
    'content_mean',
    'prior_question_had_explanation',
    'prior_question_elapsed_time',
    'part',
]


In [None]:
def process_and_predict(test):
    """Build the model features for one batch of test rows and predict on them.

    Relies on notebook-level state: ``content_df``, ``questions_df``,
    ``user_sum_dict`` / ``user_count_dict``, ``FEAT`` and the trained ``model``.

    Parameters
    ----------
    test : pandas.DataFrame
        Raw test rows; must provide ``content_id``, ``user_id``,
        ``prior_question_had_explanation`` and ``prior_question_elapsed_time``.

    Returns
    -------
    pandas.DataFrame
        The merged test rows with an ``answered_correctly`` column holding the
        model output for every row.
    """
    test = test.merge(content_df, on="content_id", how="left")
    test = add_user_feats_without_update(test, user_sum_dict, user_count_dict)
    test = pd.merge(test, questions_df[['question_id', 'part']],
                    left_on='content_id', right_on='question_id', how='left')

    # Defaults for anything the joins could not provide (unseen content or
    # users) and for missing prior-question fields.
    fill_values = {
        'content_questions': 0,
        'content_mean': 0.65,
        'user_correct_sum': 0,
        'user_count': 0,
        'user_accuracy': 0.65,
        'part': 7,
        'prior_question_had_explanation': False,
        'prior_question_elapsed_time': 25000,
    }
    # One frame-level fillna replaces the former te[col].fillna(..., inplace=True)
    # calls: that chained in-place form acts on a temporary Series under pandas
    # Copy-on-Write, which would let NaNs reach the model.
    te = test.loc[:, FEAT].fillna(value=fill_values)
    te['prior_question_had_explanation'] = te['prior_question_had_explanation'].astype('int')

    pred = model.predict(te)
    test['answered_correctly'] = pred

    return test


In [None]:
# Smoke test: run the full inference path on the example file and eyeball the output.
t= process_and_predict(test)
t.head()

In [None]:
import riiideducation
env = riiideducation.make_env()

# Training data is in the competition dataset as usual

# Each iteration yields one batch of test rows plus a sample prediction frame
# (the latter is not used here).
# NOTE(review): process_and_predict only looks up user_sum_dict/user_count_dict,
# and nothing in this loop updates them, so the user features stay frozen at
# their values from the end of the validation pass.
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    test_results= process_and_predict(test_df)
    # Submit predictions only for rows with content_type_id == 0.
    env.predict(test_results.loc[test_results['content_type_id'] == 0, ['row_id', 'answered_correctly']])