In [1]:
import os
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV

from xgboost import XGBClassifier

In [2]:
# loading the train labels

labels_df = pd.read_csv('train_labels.csv')

labels_df['session'] = labels_df['session_id'].apply(lambda x: int(x.split('_')[0]))
labels_df['question'] = labels_df['session_id'].apply(lambda x: int(x.split('q')[1]))

labels_df = (
    labels_df
    .sort_values(by=['session', 'question'], ascending=[True, True])
    .reindex(columns=['session_id', 'session', 'question', 'correct'])
    .reset_index(drop=True)
    .drop(columns='session_id')
)

labels_df.head()

Unnamed: 0,session,question,correct
0,20090312431273200,1,1
1,20090312431273200,2,1
2,20090312431273200,3,1
3,20090312431273200,4,1
4,20090312431273200,5,1


In [88]:
# excluding cols to save memory
exclude_cols = ['index', 'fullscreen', 'hq', 'music', 'text']

# borrowing the dtypes dictionary from a featured notebook
dtypes = {
    'elapsed_time':np.int32,
    'event_name':'category',
    'name':'category',
    'level':np.uint8,
    'room_coor_x':np.float32,
    'room_coor_y':np.float32,
    'screen_coor_x':np.float32,
    'screen_coor_y':np.float32,
    'hover_duration':np.float32,
    'text': 'category',
    'fqid': 'category',
    'room_fqid':'category',
    'text_fqid':'category',
    'fullscreen': bool,
    'hq':bool,
    'music': bool,
    'level_group':'category'
}

# df = pd.read_csv('train.csv', usecols=lambda x: x not in exclude_cols, dtype=dtypes)

# df = (
#     df
#     .sort_values(by=['session_id', 'level', 'elapsed_time'], ascending=[True, True, True])
#     .reset_index(drop=True)
# )


# getting elapsed diffs
df['event_time_delta'] = (
    df
    .groupby('session_id')['elapsed_time']
    .transform(lambda x: x.diff().fillna(x.min()))
)

# getting the time until the next event
df['time_delta_til_next'] = (
    df
    .groupby('session_id')['elapsed_time']
    .transform(lambda x: abs(x.diff(-1)).fillna(abs(x.min())))
)

In [6]:
def get_last_4(x):
    return x.iloc[-4:]


# get last 4 fqid
fqid_df = df.groupby(['session_id', 'level_group'])['fqid'].apply(get_last_4).reset_index()
fqid_df['entry_number'] = fqid_df.groupby(['session_id', 'level_group']).cumcount() + 1

final_fqid = (
    fqid_df
    .pivot(index=['session_id', 'level_group'],
           columns='entry_number',
           values='fqid')
    .rename(columns={1:'fqid_1back', 2:'fqid_2back', 3:'fqid_3back', 4:'fqid_4back'})
)

final_fqid

Unnamed: 0_level_0,entry_number,fqid_1back,fqid_2back,fqid_3back,fqid_4back
session_id,level_group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20090312431273200,0-4,tunic.kohlcenter,tunic.capitol_0,chap1_finale,chap1_finale_c
20090312431273200,13-22,tomap,tunic.capitol_2,chap4_finale_c,chap4_finale_c
20090312431273200,5-12,tunic.capitol_1,chap2_finale_c,chap2_finale_c,chap2_finale_c
20090312433251036,0-4,,tunic.capitol_0,chap1_finale,chap1_finale_c
20090312433251036,13-22,tunic.library,tunic.capitol_2,chap4_finale_c,chap4_finale_c
20090312433251036,5-12,tunic.library,tunic.capitol_1,chap2_finale_c,chap2_finale_c
20090312455206810,0-4,tunic.kohlcenter,tunic.capitol_0,chap1_finale,chap1_finale_c
20090312455206810,13-22,tomap,tunic.capitol_2,chap4_finale_c,chap4_finale_c
20090312455206810,5-12,tunic.library,tunic.capitol_1,chap2_finale_c,chap2_finale_c
20090313091715820,0-4,chap1_finale,chap1_finale,chap1_finale,chap1_finale_c


In [123]:
def get_data_for_level(level_group=None, labels_df=labels_df):
    
    level_dict = {
        '0-4' : [1, 2, 3],
        '5-12' : [4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
        '13-22' : [14, 15, 16, 17, 18]
    }

    
    df_ = df[df['level_group'] == level_group].copy()

    
    # time delta means    
    time_delta_mean = df_.groupby('session_id').agg(event_time_mean=('event_time_delta', 'mean'),
                                                    event_time_std=('event_time_delta', 'std'),
                                                    event_time_max=('event_time_delta', 'max'))
    
    # total time on each event
    total_time_event = (
        df_
        .groupby(['session_id', 'event_name'])
        .agg(total_time=('time_delta_til_next', 'sum'))
        .unstack()
    )   
    
    # unique text ids
    nunique_text_ids = df_.groupby('session_id')['text_fqid'].nunique()
    
    # unique fqids 
    nunique_fqid = df_.groupby('session_id')['fqid'].nunique()
    
    # how many events of each type occurred in the session
    event_counts = df_.groupby('session_id')['event_name'].value_counts().unstack()
    
    # getting session lengths
    session_lengths = df_.groupby('session_id')['elapsed_time'].max().rename('total_length')
    
    # total events in the session
    session_events = df_.groupby('session_id')['session_id'].count()
    
    # getting labels
    session_labels = (
        labels_df
        .loc[labels_df.question.isin(level_dict[level_group])]
        .pivot(columns='question', values='correct', index='session')
    )
    
    # final df pre labels
    df_features = (
        pd.concat([total_time_event, nunique_text_ids, nunique_fqid, event_counts, session_lengths, session_events, time_delta_mean], axis=1)
    )
    
    # df with labels
    df_final = (
        pd.concat([df_features, session_labels], axis=1)
        .reset_index()
        .drop(columns=['session_id'])
        .rename(columns={'index' : 'session_id'})
    )
    
    df_final['year'] = df_final['session_id'].apply(lambda x: int(str(x)[:2]))
    df_final['month'] = df_final['session_id'].apply(lambda x: int(str(x)[2:4]))
    df_final['day'] = df_final['session_id'].apply(lambda x: int(str(x)[4:6]))
    df_final['hour'] = df_final['session_id'].apply(lambda x: int(str(x)[6:8]))

    # # creating a weekend indicator because weekends are different
    df_final['weekend'] = np.where(df_final['day'].isin([6,0]), 1, 0)
    
    return df_final

In [124]:
group1_data = get_data_for_level('5-12')

In [125]:
group1_data.shape

(23562, 44)

In [145]:
X = group1_data.drop(columns=[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 'year', 'checkpoint', 'session_id',
                              'hour', 'day', 'weekend', 'fqid', 'notification_click', 'month', 
                              'cutscene_click', 'observation_click']).copy()
y = group1_data[[4, 5, 6, 7, 8, 9, 10, 11, 12, 13]].values

param_grid = {
    'n_estimators' : [50, 100, 200]
}

model = XGBClassifier()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=55)

model.fit(X_train, y_train)

In [146]:
f1_score(y_test, model.predict(X_test), average='micro')

0.7961571622120952

In [147]:
scorer = make_scorer(f1_score, average='micro')

In [148]:
from sklearn.inspection import permutation_importance

p = permutation_importance(
    model,
    X_test, y_test,
    n_repeats=5,
    scoring=scorer, 
    random_state=44
)

imp_df = (
    pd.DataFrame(
        {'feature' : X_test.columns,
         'importance_mean' : p['importances_mean'],
         'importance_std' : p['importances_std']})
)

imp_df.sort_values(by='importance_mean', ascending=True)

Unnamed: 0,feature,importance_mean,importance_std
14,navigate_click,0.000351,0.000563
11,text_fqid,0.00104,0.000348
2,"(total_time, map_click)",0.00115,0.000512
13,map_hover,0.0012,0.000265
12,map_click,0.001212,0.000359
18,person_click,0.001369,0.000232
3,"(total_time, map_hover)",0.001452,0.000905
17,object_hover,0.001506,0.000799
8,"(total_time, object_hover)",0.001796,0.0004
15,notebook_click,0.001926,0.000117


In [140]:
imp_df.sort_values(by='importance_mean', ascending=False).head(30)

Unnamed: 0,feature,importance_mean,importance_std
0,"(total_time, checkpoint)",0.029089,0.001174
10,"(total_time, person_click)",0.01142,0.000358
18,object_click,0.004221,0.000372
23,event_time_mean,0.003459,0.000369
24,event_time_std,0.003359,0.00062
1,"(total_time, cutscene_click)",0.003085,0.000465
4,"(total_time, navigate_click)",0.002987,0.000288
22,total_length,0.002721,0.000151
8,"(total_time, object_hover)",0.002703,0.000365
25,event_time_max,0.002635,0.000391


In [149]:
val_probs = np.concatenate(model.predict_proba(X_test))
y_test_flat = np.concatenate(y_test)

best_threshold = None
best_f1 = 0.0

for threshold in np.arange(0.25, 0.45, 0.01):  # Adjust the range and step size as needed
    # Apply the threshold to get binary labels
    val_preds = (val_probs > threshold).astype(int)
    
    # Compute the F1 score
    f1 = f1_score(y_test_flat, val_preds)
    
    # Check if it's the best F1 score so far
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print("Best Threshold:", best_threshold)
print("Best F1 Score:", best_f1)

Best Threshold: 0.32000000000000006
Best F1 Score: 0.8100779572929614


In [159]:
feature_cols = [x for x in X_train.columns if x is not int]

In [160]:
models = {}

level_dict = {
    '0-4' : [1, 2, 3],
    '5-12' : [4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
    '13-22' : [14, 15, 16, 17, 18]
}

for group in ['0-4', '5-12', '13-22']:
    
    df_ = get_data_for_level(group)    
    
    X = df_[feature_cols].copy()
    y = df_[level_dict[group]].values
    
    print(X.shape)

    model = XGBClassifier()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=55)

    model.fit(X_train, y_train)
    
    print(f1_score(y_test.round(), model.predict(X_test), average='macro'))
    
    y_test_flat = np.concatenate(y_test)
    X_preds_flat = np.concatenate(model.predict(X_test))

    print("second f1:", f1_score(y_test_flat, X_preds_flat))
    
    val_probs = np.concatenate(model.predict_proba(X_test))

    best_threshold = None
    best_f1 = 0.0

    for threshold in np.arange(0.25, 0.75, 0.05):  # Adjust the range and step size as needed
        # Apply the threshold to get binary labels
        val_preds = (val_probs > threshold).astype(int)

        # Compute the F1 score
        f1 = f1_score(y_test_flat, val_preds)

        # Check if it's the best F1 score so far
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    print(best_threshold, best_f1)
    
    models[group] = model

(23562, 23)
0.9269773930390682
second f1: 0.9341702506877446
0.35 0.9385457864313965
(23562, 23)
0.7493463977510807
second f1: 0.7961571622120952
0.35 0.8098443596449857
(23562, 23)
0.8032483423248374
second f1: 0.8273037542662116
0.3 0.8381387235789042


In [163]:
with open('first_groupv3.pickle', 'wb') as file:
    pickle.dump(models['0-4'], file, protocol=4)
    
with open('second_groupv3.pickle', 'wb') as file:
    pickle.dump(models['5-12'], file, protocol=4)

with open('third_groupv3.pickle', 'wb') as file:
    pickle.dump(models['13-22'], file, protocol=4)

In [177]:
def prep_data(df=None):
    "how to prep the data for the kaggle notebook"
    
    level_dict = {
        '0-4' : [1, 2, 3],
        '5-12' : [4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
        '13-22' : [14, 15, 16, 17, 18]
    }

    
    df_ = df.copy()
    
    # time delta means    
    time_delta_mean = df_.groupby('session_id').agg(event_time_mean=('event_time_delta', 'mean'),
                                                    event_time_std=('event_time_delta', 'std'),
                                                    event_time_max=('event_time_delta', 'max'))
    
    # total time on each event
    total_time_event = (
        df_
        .groupby(['session_id', 'event_name'])
        .agg(total_time=('time_delta_til_next', 'sum'))
        .unstack()
    )   
    
    # unique text ids
    nunique_text_ids = df_.groupby('session_id')['text_fqid'].nunique()
    
    # unique fqids 
    nunique_fqid = df_.groupby('session_id')['fqid'].nunique()
    
    # how many events of each type occurred in the session
    event_counts = df_.groupby('session_id')['event_name'].value_counts().unstack()
    
    # getting session lengths
    session_lengths = df_.groupby('session_id')['elapsed_time'].max().rename('total_length')
    
    # total events in the session
    session_events = df_.groupby('session_id')['session_id'].count()
    
    # final df pre labels
    df_features = (
        pd.concat([total_time_event, nunique_text_ids, nunique_fqid, event_counts, session_lengths, session_events, time_delta_mean], axis=1)
        .drop(columns='session_id')
        .reset_index()
    )
    
    df_features['year'] = df_features['session_id'].apply(lambda x: int(str(x)[:2]))
    df_features['month'] = df_features['session_id'].apply(lambda x: int(str(x)[2:4]))
    df_features['day'] = df_features['session_id'].apply(lambda x: int(str(x)[4:6]))
    df_features['hour'] = df_features['session_id'].apply(lambda x: int(str(x)[6:8]))

    # # creating a weekend indicator because weekends are different
    df_features['weekend'] = np.where(df_features['day'].isin([6,0]), 1, 0)
    
    return df_features

In [178]:
prep_data(df=trial)

Unnamed: 0,session_id,"(total_time, checkpoint)","(total_time, cutscene_click)","(total_time, map_click)","(total_time, map_hover)","(total_time, navigate_click)","(total_time, notebook_click)","(total_time, notification_click)","(total_time, object_click)","(total_time, object_hover)",...,person_click,total_length,event_time_mean,event_time_std,event_time_max,year,month,day,hour,weekend
0,20090109393214576,0.0,47262.0,5416.0,2882.0,142250.0,9592.0,6635.0,13567.0,3515.0,...,21,267350,1909.642857,4826.829749,56857.0,20,9,1,9,0


In [181]:
test_df = pd.read_csv('test.csv')

cond = (test_df['session_id'] == 20090109393214576) & (test_df['level_group'] == '0-4')

trial = test_df[cond].copy()

# getting elapsed diffs
trial['event_time_delta'] = (
    trial
    .groupby('session_id')['elapsed_time']
    .transform(lambda x: x.diff().fillna(x.min()))
)

# getting the time until the next event
trial['time_delta_til_next'] = (
    trial
    .groupby('session_id')['elapsed_time']
    .transform(lambda x: abs(x.diff(-1)).fillna(abs(x.min())))
)


trial2 = prep_data(df=trial)

# Step 2: Define the list of columns to check against
columns = g1.columns

# Step 3: Identify the missing columns
missing_columns = list(set(columns) - set(trial2.columns))

# Step 4: Add the missing columns to the DataFrame and fill with zeros
missing_df = pd.DataFrame(0, columns=missing_columns, index=trial2.index)

trial2 = pd.concat([trial2, missing_df], axis=1)

model = models['0-4']

preds = model.predict(trial2[feature_cols])
flat_preds = np.concatenate(preds)

In [184]:
pd.Series(flat_preds, dtype=int)

0    1
1    1
2    1
dtype: int32

In [162]:
with open('feature_cols.pickle', 'wb') as file:
    pickle.dump(feature_cols, file, protocol=4)

In [53]:
sample_submission = pd.read_csv('sample_submission.csv')

sample_submission

Unnamed: 0,session_id,correct,session_level
0,20090109393214576_q1,0,0
1,20090312143683264_q1,0,3
2,20090312331414616_q1,0,6
3,20090109393214576_q2,0,0
4,20090312143683264_q2,0,3
5,20090312331414616_q2,0,6
6,20090109393214576_q3,0,0
7,20090312143683264_q3,0,3
8,20090312331414616_q3,0,6
9,20090109393214576_q4,0,1
