# Import the Required Libraries

In [1]:
import os
import jo_wilder
import os
import gc
import pandas as pd
import pickle 
import numpy as np

from xgboost import XGBClassifier

In [2]:
model_folder = '/kaggle/input/model-may26'

# Maps 'question_<n>' -> the unpickled model object for that question.
models = {}

for file_name in os.listdir(model_folder):

    # Only files whose name contains 'question' are treated as model files.
    if 'question' not in file_name:
        continue

    # The question number is read from the second underscore-separated token
    # of the file name.
    x = int(file_name.split('_')[1])

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load artifacts from a trusted source.
    # A distinct name is used for the handle so the loop's file name is not
    # rebound to a file object.
    with open(os.path.join(model_folder, file_name), 'rb') as model_file:
        model_ = pickle.load(model_file)

    models[f'question_{x}'] = model_

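Note: the models above are loaded with `pickle`. See scikit-learn's notes on the security and maintainability limitations of pickle-based model persistence: https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations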


In [3]:
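# NOTE: commented-out variant of kaggle_prep (no clipping of time deltas, no page
# counts, no per-text-id times). It is not executed; the active definition is in
# the next cell.
# def kaggle_prep(test_data=None):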
#     "how to prep the data for the kaggle notebook"
    
#     level_dict = {
#         '0-4' : [1, 2, 3],
#         '5-12' : [4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
#         '13-22' : [14, 15, 16, 17, 18]
#     }

    
#     df_ = test_data.copy()
#         # getting elapsed diffs
#     df_['event_time_delta'] = (
#         df_
#         .groupby('session_id')['elapsed_time']
#         .transform(lambda x: x.diff().fillna(0))
#     )

#     # getting the time until the next event
#     df_['time_delta_til_next'] = (
#         df_
#         .groupby('session_id')['elapsed_time']
#         .transform(lambda x: abs(x.diff(-1)).fillna(0)))
    
    
#     # time delta means    
#     time_delta_mean = df_.groupby('session_id').agg(event_time_mean=('event_time_delta', 'mean'),
#                                                     event_time_std=('event_time_delta', 'std'),
#                                                     event_time_max=('event_time_delta', 'max'))
    
#     # hover duration stats
#     hover_duration = df_.groupby('session_id').agg(hover_duration_mean=('hover_duration', 'mean'),
#                                                         hover_duration_std=('hover_duration', 'std'),
#                                                         hover_duration_max=('hover_duration', 'max'))
    
#     # total time on each event
#     total_time_event = (
#         df_
#         .groupby(['session_id', 'event_name'])
#         .agg(total_time_event=('time_delta_til_next', 'sum'),
#              mean_time_event=('time_delta_til_next', 'mean'),
#              std_time_event=('time_delta_til_next', 'std'))
#         .unstack()
#     )
    
#     # total time on each level    
#     level_duration = (
#         df_
#         .groupby(['session_id', 'level'])['elapsed_time']
#         .apply(lambda x: x.max() - x.min())
#         .unstack()
#         .rename(columns=lambda x: f'time_on_level_{x}')
#     )
    
#     # unique text ids
#     nunique_text_ids = df_.groupby('session_id')['text_fqid'].nunique()
    
#     # unique fqids 
#     nunique_fqid = df_.groupby('session_id')['fqid'].nunique()
    
#     # how many events of each type occurred in the session
#     event_counts = df_.groupby('session_id')['event_name'].value_counts().unstack()
    
#     # getting session lengths
#     session_lengths = df_.groupby('session_id')['elapsed_time'].max().rename('total_length')
    
#     # total events in the session
#     session_events = df_.groupby('session_id')['session_id'].count()
    
#     # final df pre labels
#     df_features = (
#         pd.concat([total_time_event, nunique_text_ids, nunique_fqid, event_counts, 
#                    session_lengths, session_events, time_delta_mean, hover_duration,
#                    level_duration], axis=1)
#         .fillna(0)
#         .drop(columns='session_id')
#         .reset_index()
#     )
    
#     # fixing column names 
#     df_features.rename(columns=lambda x: '_'.join(x) if isinstance(x, tuple) else x, inplace=True)
    
#     return df_features.set_index('session_id')

In [4]:
def kaggle_prep(test_data=None, max_time_delta=103000):
    """Aggregate raw event rows into one feature row per ``session_id``.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Raw event rows. The columns read here are ``session_id``,
        ``elapsed_time``, ``hover_duration``, ``event_name``, ``text_fqid``,
        ``level``, ``page`` and ``fqid``. The frame is copied, not modified.
    max_time_delta : int or float, default 103000
        Upper clip applied to both per-event time deltas (same unit as
        ``elapsed_time``). Negative deltas are clipped to 0.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``session_id``, with flat string column names. Missing
        values (e.g. an event type or level a session never reached) are
        filled with 0.

    Raises
    ------
    ValueError
        If ``test_data`` is None.
    """
    if test_data is None:
        raise ValueError("kaggle_prep requires a DataFrame of raw events, got None")

    df_ = test_data.copy()

    # Per-session differences of elapsed_time. Both series are computed before
    # either is written back, so the groupby sees the untouched frame.
    elapsed_by_session = df_.groupby('session_id')['elapsed_time']
    # time since the previous event in the same session (first event -> 0)
    event_time_delta = elapsed_by_session.diff().fillna(0).clip(0, max_time_delta)
    # time until the next event in the same session (last event -> 0)
    time_delta_til_next = (
        elapsed_by_session.diff(-1).abs().fillna(0).clip(0, max_time_delta)
    )
    df_['event_time_delta'] = event_time_delta
    df_['time_delta_til_next'] = time_delta_til_next

    by_session = df_.groupby('session_id')
    by_session_level = df_.groupby(['session_id', 'level'])

    # time delta stats
    time_delta_mean = by_session.agg(event_time_mean=('event_time_delta', 'mean'),
                                     event_time_std=('event_time_delta', 'std'),
                                     event_time_max=('event_time_delta', 'max'))

    # hover duration stats
    hover_duration = by_session.agg(hover_duration_mean=('hover_duration', 'mean'),
                                    hover_duration_std=('hover_duration', 'std'),
                                    hover_duration_max=('hover_duration', 'max'))

    # total / mean / std time per event type; unstack() leaves tuple column
    # labels (statistic, event_name) that are flattened at the end
    total_time_event = (
        df_
        .groupby(['session_id', 'event_name'])
        .agg(total_time_event=('time_delta_til_next', 'sum'),
             mean_time_event=('time_delta_til_next', 'mean'),
             std_time_event=('time_delta_til_next', 'std'))
        .unstack()
    )

    # total time on each text_fqid
    total_time_text = (
        df_
        .groupby(['session_id', 'text_fqid'])['event_time_delta']
        .sum()
        .unstack()
        .rename(columns=lambda x: f"time_on_{x}")
    )

    # elapsed-time span (max - min) of each level
    level_elapsed = by_session_level['elapsed_time']
    level_duration = (
        (level_elapsed.max() - level_elapsed.min())
        .unstack()
        .rename(columns=lambda x: f'time_on_level_{x}')
    )

    # page counts
    page_counts = by_session['page'].value_counts().unstack().fillna(0)
    page_counts.columns = [f'page_{x}' for x in page_counts.columns]

    # NOTE(review): the original also computed per-level unique room_fqid
    # counts but never added them to the output, and concatenated a per-session
    # event count whose column was named 'session_id' and was dropped straight
    # away. Neither ever reached the returned frame, so both computations are
    # omitted here. Confirm against the training pipeline before re-adding.

    # unique text ids per level
    nunique_text_ids = (
        by_session_level['text_fqid']
        .nunique()
        .unstack()
        .rename(columns=lambda x: f'unique_text_fqid_level_{x}')
    )

    # unique fqids per level
    nunique_fqid = (
        by_session_level['fqid']
        .nunique()
        .unstack()
        .rename(columns=lambda x: f'unique_fqid_level_{x}')
    )

    # how many events of each type occurred in the session
    event_counts = by_session['event_name'].value_counts().unstack()

    # getting session lengths
    session_lengths = by_session['elapsed_time'].max().rename('total_length')

    # final feature frame; column order matches the original concat order
    df_features = pd.concat(
        [total_time_event, nunique_text_ids, nunique_fqid, event_counts,
         session_lengths, time_delta_mean, hover_duration,
         level_duration, page_counts, total_time_text],
        axis=1,
    ).fillna(0)

    # flatten tuple labels such as ('total_time_event', 'click') into
    # 'total_time_event_click'
    df_features.columns = [
        '_'.join(col) if isinstance(col, tuple) else col
        for col in df_features.columns
    ]

    return df_features.rename_axis(index='session_id')

In [5]:
# Create the environment object from the jo_wilder module and obtain its test
# iterator. The prediction loop below unpacks each item of `iter_test` as a
# (test, sample_submission) pair and sends predictions back via env.predict().
env = jo_wilder.make_env()
iter_test = env.iter_test()

In [6]:
# Half-open (start, stop) range of question numbers answered by each level group.
limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}

# Probability above which a question is predicted as answered correctly.
THRESHOLD = 0.63

for test, sample_submission in iter_test:

    level_group = test['level_group'].values[0]
    session_features = kaggle_prep(test_data=test)

    first_question, stop_question = limits[level_group]

    for question in range(first_question, stop_question):
        model_ = models[f'question_{question}']

        # Align the features to exactly what this model was fitted on. Columns
        # the batch did not produce are created and filled with 0. Aligning per
        # model avoids a KeyError when a model's feature list differs from that
        # of a single representative model for the level group.
        model_input = session_features.reindex(columns=model_.feature_name_, fill_value=0)

        # NOTE(review): only the first row's positive-class probability is
        # used, which presumes each batch holds a single session - confirm
        # against the competition API.
        proba = model_.predict_proba(model_input)[0, 1]
        prediction = int(proba > THRESHOLD)

        # Match this question's id exactly: the negative lookahead stops 'q1'
        # from also matching 'q10' ... 'q18'.
        mask = sample_submission.session_id.str.contains(rf'q{question}(?!\d)')
        sample_submission.loc[mask, 'correct'] = prediction

    env.predict(sample_submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
