In [11]:
# Import libraries
import gc

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import f1_score
from sklearn.model_selection import GroupKFold
from xgboost import XGBClassifier, plot_importance

In [12]:
# Setup matplotlib
%matplotlib inline

In [15]:
# Paths to the input CSV files, relative to the notebook's working directory.
test_csv_path = "../data/test.csv"  # not read by any cell in this section
train_csv_path = "../data/train.csv"  # per-event rows; read piece by piece below
target_labels_csv = "../data/train_labels.csv"  # loaded into target_df below

In [16]:
# Read only the first CSV column (session_id) and count the rows that belong
# to each session. The result is indexed by session_id in sorted order.
tmp = (
    pd.read_csv(train_csv_path, usecols=[0])
    .groupby("session_id")["session_id"]
    .count()
)

In [17]:
# Split the sessions into `pieces` groups; `chunks` is the number of sessions
# per group, rounded up so that every session lands in some group.
pieces = 25
chunks = (len(tmp) + pieces - 1) // pieces

In [18]:
# reads[k] -> number of CSV rows that make up piece k
# skips[k] -> number of data rows that precede piece k (running total of reads)
reads = []
skips = [0]

for k in range(pieces):
    a = k * chunks
    # The last piece may be short: never slice past the final session.
    b = min((k + 1) * chunks, len(tmp))

    r = tmp.iloc[a:b].sum()
    reads.append(r)
    skips.append(skips[-1] + r)

print(f"pieces: {pieces} of sizes: {reads}")

pieces: 25 of sizes: [1063516, 1054835, 1100631, 1050250, 1048333, 1049923, 1059672, 1053593, 1060254, 1073360, 1041039, 1067514, 1075765, 1026458, 1064071, 1054090, 1026632, 1050203, 1023984, 1044632, 1055852, 1047718, 1040246, 1044565, 1019810]


In [19]:
# Load only the first piece (reads[0] rows) to inspect the schema. The
# vocabularies built in the following cells are derived from this sample.
train_df = pd.read_csv(train_csv_path, nrows=reads[0])
train_df.head()

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0,0,1,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
3,20090312431273200,3,1147,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
4,20090312431273200,4,1863,person_click,basic,0,,-412.991405,-159.314686,381.0,494.0,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4


In [20]:
# Labels: one row per "<session>_q<question>" id with a `correct` flag.
target_df = pd.read_csv(target_labels_csv)

In [21]:
# The numeric session is the part of the label id before the first underscore.
target_df["session"] = (
    target_df["session_id"].str.split("_").str[0].astype("int64")
)

In [22]:
# The question number is the last underscore-separated token with its leading
# "q" stripped (e.g. "..._q1" -> 1).
target_df["q"] = (
    target_df["session_id"].str.split("_").str[-1].str[1:].astype("int64")
)

In [23]:
# Both columns hold small integers, so store them as int8.
target_df = target_df.astype({"correct": "int8", "q": "int8"})

In [24]:
target_df.head()

Unnamed: 0,session_id,correct,session,q
0,20090312431273200_q1,1,20090312431273200,1
1,20090312433251036_q1,0,20090312433251036,1
2,20090312455206810_q1,1,20090312455206810,1
3,20090313091715820_q1,0,20090313091715820,1
4,20090313571836404_q1,1,20090313571836404,1


In [25]:
# Columns summarised per group by their number of distinct values
# (see the "nunique" aggregation in feature_engineer).
categorical_cols = [
    "event_name",
    "fqid",
    "room_fqid",
    "text",
    "text_fqid",
]

# Columns summarised per group with mean / std / sum / max / min
# (see feature_engineer).
numerical_cols = [
    "elapsed_time",
    "level",
    "page",
    "room_coor_x",
    "room_coor_y",
    "screen_coor_x",
    "screen_coor_y",
    "hover_duration",
]

In [26]:
# Distinct event names found in the first piece; feature_engineer builds one
# set of per-event features for each entry. Displayed together with its length.
event_list = train_df["event_name"].unique().tolist()
event_list, len(event_list)

(['cutscene_click',
  'person_click',
  'navigate_click',
  'observation_click',
  'notification_click',
  'object_click',
  'object_hover',
  'map_hover',
  'map_click',
  'checkpoint',
  'notebook_click'],
 11)

In [27]:
# Distinct values of the `text` column in the first piece (note: despite its
# name this list is built from `text`, not `name`). Not referenced by the other
# cells in this section; shown for inspection.
name_list = train_df["text"].unique().tolist()
name_list, len(name_list)

(['undefined',
  'Whatcha doing over there, Jo?',
  'Just talking to Teddy.',
  'I gotta run to my meeting!',
  'Can I come, Gramps?',
  'Sure thing, Jo. Grab your notebook and come upstairs!',
  'See you later, Teddy.',
  "I get to go to Gramps's meeting!",
  'Now where did I put my notebook?',
  '\\u00f0\\u0178\\u02dc\\u00b4',
  nan,
  'I love these photos of me and Teddy!',
  'Found it!',
  'Gramps is in trouble for losing papers?',
  "This can't be right!",
  'Gramps is a great historian!',
  "Hmm. Button's still not working.",
  "Let's get started. The Wisconsin Wonders exhibit opens tomorrow!",
  'Who wants to investigate the shirt artifact?',
  "Not Leopold here. He's been losing papers lately.",
  'Hey!',
  "It's true, they do keep going missing lately.",
  'See?',
  'Besides, I already figured out the shirt.',
  "It's a women's basketball jersey!",
  'That settles it.',
  'Wells, finish up your report.',
  "Leopold, why don't you help me set up in the Capitol?",
  'We need to 

In [28]:
# Distinct `fqid` values in the first piece (NaN included). Not referenced by
# the other cells in this section; shown for inspection.
fqid_list = train_df["fqid"].unique().tolist()
fqid_list, len(fqid_list)

(['intro',
  'gramps',
  'teddy',
  'photo',
  nan,
  'notebook',
  'retirement_letter',
  'tobasement',
  'janitor',
  'toentry',
  'groupconvo',
  'report',
  'boss',
  'wells',
  'directory',
  'tocollection',
  'cs',
  'tunic',
  'tunic.hub.slip',
  'tostacks',
  'outtolunch',
  'tocloset',
  'tomap',
  'tunic.historicalsociety',
  'tunic.kohlcenter',
  'plaque',
  'plaque.face.date',
  'togrampa',
  'tunic.capitol_0',
  'chap1_finale',
  'chap1_finale_c',
  'tocloset_dirty',
  'what_happened',
  'trigger_scarf',
  'trigger_coffee',
  'tunic.capitol_1',
  'tofrontdesk',
  'archivist',
  'magnify',
  'tunic.humanecology',
  'worker',
  'businesscards',
  'businesscards.card_0.next',
  'businesscards.card_1.next',
  'businesscards.card_bingo.next',
  'businesscards.card_bingo.bingo',
  'tohallway',
  'tunic.drycleaner',
  'logbook',
  'logbook.page.bingo',
  'tunic.library',
  'tomicrofiche',
  'reader',
  'reader.paper0.next',
  'reader.paper1.next',
  'reader.paper2.bingo',
  'well

In [29]:
# Distinct `room_fqid` values in the first piece; feature_engineer emits one
# per-room event count for each entry.
room_list = train_df["room_fqid"].unique().tolist()
room_list, len(room_list)

(['tunic.historicalsociety.closet',
  'tunic.historicalsociety.basement',
  'tunic.historicalsociety.entry',
  'tunic.historicalsociety.collection',
  'tunic.historicalsociety.stacks',
  'tunic.kohlcenter.halloffame',
  'tunic.capitol_0.hall',
  'tunic.historicalsociety.closet_dirty',
  'tunic.historicalsociety.frontdesk',
  'tunic.humanecology.frontdesk',
  'tunic.drycleaner.frontdesk',
  'tunic.library.frontdesk',
  'tunic.library.microfiche',
  'tunic.capitol_1.hall',
  'tunic.historicalsociety.cage',
  'tunic.historicalsociety.collection_flag',
  'tunic.wildlife.center',
  'tunic.flaghouse.entry',
  'tunic.capitol_2.hall'],
 19)

In [30]:
# Every engineered feature is aggregated per (session, level group) pair.
groupby_cols = ["session_id", "level_group"]

In [31]:
# Feature Engineering Function
def feature_engineer(
    train_df,
    *,
    num_cols=None,
    cat_cols=None,
    events=None,
    rooms=None,
    group_keys=None,
):
    """Aggregate raw event rows into one feature row per group.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Raw event rows. Must contain the grouping keys, every column named in
        ``num_cols`` / ``cat_cols``, and the columns ``event_name``,
        ``elapsed_time``, ``room_fqid``, ``fqid`` and ``text``. The frame is
        NOT modified.
    num_cols, cat_cols, events, rooms, group_keys : list, optional
        Overrides for the notebook-level ``numerical_cols``,
        ``categorical_cols``, ``event_list``, ``room_list`` and
        ``groupby_cols``. When omitted the notebook-level values are used, so
        ``feature_engineer(df)`` keeps working as before.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``session_id``; the remaining grouping keys come first as
        ordinary columns, followed by the features:

        * ``<col>_{mean,std,sum,max,min}`` for each numerical column
        * ``<col>_nunique`` for each categorical column
        * ``<event>_sum`` (number of such events) and
          ``<event>_elapsed_time_sum`` (``elapsed_time`` summed over the rows
          of that event only) for each event name
        * ``<room>_sum`` (number of events in that room) for each room
        * ``fqid_freq_encoded_*`` / ``text_freq_encoded_*``
          (mean, sum, max, min of the frequency-encoded value)

        Missing aggregates are filled with -1.
    """
    num_cols = numerical_cols if num_cols is None else num_cols
    cat_cols = categorical_cols if cat_cols is None else cat_cols
    events = event_list if events is None else events
    rooms = room_list if rooms is None else rooms
    group_keys = list(groupby_cols if group_keys is None else group_keys)

    grouped = train_df.groupby(group_keys)

    def _by_group(values, name):
        """Group a derived per-row Series by the keys without touching train_df."""
        return train_df[group_keys].assign(**{name: values}).groupby(group_keys)[name]

    # One frame/series per feature family, concatenated column-wise at the end.
    dfs = []

    numeric_aggs = ["mean", "std", "sum", "max", "min"]
    for c in num_cols:
        tmp = grouped[c].agg(numeric_aggs)
        tmp.columns = [f"{c}_{agg_name}" for agg_name in numeric_aggs]
        dfs.append(tmp)

    for c in cat_cols:
        dfs.append(grouped[c].nunique().rename(f"{c}_nunique"))

    for c in events:
        is_event = train_df["event_name"] == c
        # Mask elapsed_time to this event's rows. Summing the unmasked column
        # would give the same total for every event (duplicate features).
        per_event = train_df[group_keys].assign(
            **{
                f"{c}_sum": is_event.astype(np.int8),
                f"{c}_elapsed_time_sum": train_df["elapsed_time"].where(is_event, 0),
            }
        )
        dfs.append(per_event.groupby(group_keys).sum())

    for c in rooms:
        in_room = (train_df["room_fqid"] == c).astype(np.int8)
        dfs.append(_by_group(in_room, f"{c}_sum").sum())

    # Frequency encoding of fqid and text: each value is replaced by how often
    # it occurs in `train_df`, then summarised per group.
    # NOTE(review): counts come from the frame passed in, so when this is
    # called once per chunk the encoding differs between chunks; confirm that
    # is intended or pass in globally computed counts.
    for col in ("fqid", "text"):
        name = f"{col}_freq_encoded"
        encoded = train_df[col].map(train_df[col].value_counts())
        tmp = _by_group(encoded, name).agg(["mean", "sum", "max", "min"])
        tmp.columns = [f"{name}_{agg_name}" for agg_name in tmp.columns]
        dfs.append(tmp)

    df = pd.concat(dfs, axis=1).fillna(-1)
    df = df.reset_index().set_index("session_id")

    _ = gc.collect()
    return df

In [32]:
# Process train_df in chunks
# Stream the CSV once and pull reads[k] rows per piece. Re-opening the file
# with skiprows for every piece would re-parse all preceding rows each time.
# NOTE(review): like the reads/skips computation, this assumes the rows of a
# session are contiguous and sessions appear in ascending session_id order.
all_chunks = []
reader = pd.read_csv(train_csv_path, iterator=True)
try:
    for k in range(pieces):
        if reads[k] == 0:
            # No sessions fell into this piece; nothing to read.
            continue
        # Every piece (including the first) is read fresh, so the result does
        # not depend on the state of a frame loaded in an earlier cell.
        train_df = reader.get_chunk(reads[k])

        df = feature_engineer(train_df)
        all_chunks.append(df)
finally:
    reader.close()

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Clean memory: the raw event frame is no longer needed once its features
# have been extracted. `_ =` keeps the collector's return value out of the
# cell output.
del train_df
_ = gc.collect()

In [None]:
# Concatenate all chunks row-wise into a single feature frame
# (indexed by session_id, one row per session / level group).
df = pd.concat(all_chunks, axis=0)

In [None]:
# Model inputs are every column except the level_group key; `users` is the
# set of distinct sessions (the index of df).
features = [c for c in df.columns if c != "level_group"]
users = df.index.unique()

In [None]:
# Cross-validation setup.
# gkf    : 7-fold splitter that keeps all rows of a group in the same fold
# oof    : out-of-fold predictions, one row per session and one column per
#          question (18 columns, zero-initialised)
# models : fitted classifiers keyed by "<level_group>_<question>"
gkf = GroupKFold(n_splits=7)
oof = pd.DataFrame(
    data=np.zeros((len(users), 18)),
    index=users,
)
models = {}

In [None]:
# Train model for each group and question
# Hyper-parameters are the same for every fold and question, so build them once.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "learning_rate": 0.05,
    "max_depth": 4,
    "n_estimators": 1000,
    "early_stopping_rounds": 50,
    "tree_method": "hist",
    "subsample": 0.8,
    "colsample_bytree": 0.4,
    "use_label_encoder": False,
}

# Labels of each question indexed by session. Built once here instead of
# twice per (fold, question) inside the loop.
labels_by_question = {
    q: target_df.loc[target_df.q == q].set_index("session") for q in range(1, 19)
}

for i, (train_index, test_index) in enumerate(gkf.split(X=df, groups=df.index)):
    print(f"Fold {i + 1} => ", end="")

    # Grouping by the session index keeps all rows of a session in one fold.
    fold_train = df.iloc[train_index]
    fold_valid = df.iloc[test_index]

    for t in range(1, 19):
        # Each question is predicted from the features of one level group:
        # questions 1-3 -> "0-4", 4-13 -> "5-12", 14-18 -> "13-22".
        if t <= 3:
            grp = "0-4"
        elif t <= 13:
            grp = "5-12"
        else:
            grp = "13-22"

        # Train data
        train_x = fold_train.loc[fold_train.level_group == grp]
        train_users = train_x.index.values
        train_y = labels_by_question[t].loc[train_users]

        # Valid data
        valid_x = fold_valid.loc[fold_valid.level_group == grp]
        valid_users = valid_x.index.values
        valid_y = labels_by_question[t].loc[valid_users]

        # Train model; the validation split doubles as the early-stopping set.
        clf = XGBClassifier(**xgb_params)
        clf.fit(
            train_x[features].astype("float32"),
            train_y["correct"],
            eval_set=[(valid_x[features].astype("float32"), valid_y["correct"])],
            verbose=0,
        )
        # `best_ntree_limit` was removed in XGBoost 2.0. `best_iteration` is
        # 0-based, so + 1 gives the number of trees that was reported before.
        print(f"{t}({clf.best_iteration + 1}), ", end="")

        # Save model and predict valid oof.
        # The key has no fold component, so each fold overwrites the previous
        # fold's model and only the last fold's models remain in `models`.
        models[f"{grp}_{t}"] = clf
        oof.loc[valid_users, t - 1] = clf.predict_proba(
            valid_x[features].astype("float32")
        )[:, 1]

    print()

In [None]:
# Ground-truth frame with the same shape and index as `oof`:
# column k holds the `correct` labels of question k + 1, ordered like `users`.
true = oof.copy()
for k in range(18):
    question_rows = target_df.q == k + 1
    tmp = target_df.loc[question_rows].set_index("session").loc[users]
    true[k] = tmp.correct.values

In [None]:
# Sweep decision thresholds from 0.40 to 0.80 and keep the one with the
# highest macro F1 over all (session, question) predictions.
scores = []
thresholds = []

best_score = 0
best_threshold = 0

# The flattened labels and probabilities do not change between thresholds.
flat_true = true.values.reshape((-1))
flat_oof = oof.values.reshape((-1))

for threshold in np.arange(0.4, 0.81, 0.01):
    print(f"{threshold:.02f}, ", end="")
    preds = (flat_oof > threshold).astype("int")
    m = f1_score(flat_true, preds, average="macro")
    thresholds.append(threshold)
    scores.append(m)
    if m > best_score:
        best_score, best_threshold = m, threshold

In [None]:
# Validation F1 as a function of the threshold, with the best point enlarged.
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(thresholds, scores, "-o", color="blue")
ax.scatter([best_threshold], [best_score], color="blue", s=300, alpha=1)
ax.set_xlabel("Threshold", size=14)
ax.set_ylabel("Validation F1 Score", size=14)
ax.set_title(
    f"Threshold vs. F1_Score with Best F1_Score = {best_score:.3f} at Best Threshold = {best_threshold:.3}",
    size=18,
)
plt.show()

In [None]:
# Print f1 score for each question
print("When using optimal threshold...")
for k in range(18):
    # Compute f1 score for each question
    m = f1_score(
        true[k].values, (oof[k].values > best_threshold).astype("int"), average="macro"
    )
    # Column k holds question k + 1 (labels are q1..q18), so report it under
    # its real question number rather than the 0-based column index.
    print(f"Q{k + 1}: F1 =", m)

# Compute overall F1 score
m = f1_score(
    true.values.reshape((-1)),
    (oof.values.reshape((-1)) > best_threshold).astype("int"),
    average="macro",
)
print("==> Overall F1 =", m)