# SETUP


In [None]:
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier

from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import f1_score, confusion_matrix

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# LOAD DATASETS

In [None]:
# Explicit dtypes for read_csv: narrow numeric types for the numeric columns and
# pandas categoricals for the string-like columns. Columns not listed here are
# left to pandas' own type inference.
FLOAT32_COLUMNS = (
    'room_coor_x', 'room_coor_y',
    'screen_coor_x', 'screen_coor_y',
    'hover_duration',
)
CATEGORY_COLUMNS = (
    'event_name', 'name', 'text',
    'fqid', 'room_fqid', 'text_fqid',
    'fullscreen', 'hq', 'music',
    'level_group',
)

dtypes = {
    'elapsed_time': np.int32,
    'level': np.uint8,
    **{column: np.float32 for column in FLOAT32_COLUMNS},
    **{column: 'category' for column in CATEGORY_COLUMNS},
}

TRAIN_DATASET_PATH = '/kaggle/input/student-performance-and-game-play/train.csv'
train_df = pd.read_csv(TRAIN_DATASET_PATH, dtype = dtypes)

train_shape = train_df.shape
print(f'train_df.shape = {train_shape}')
train_df.head()

In [None]:
# Load the label table; dtypes are left to pandas' inference.
TRAIN_LABELS_DATASET_PATH = '/kaggle/input/student-performance-and-game-play/train_labels.csv'
train_labels_df = pd.read_csv(filepath_or_buffer = TRAIN_LABELS_DATASET_PATH)

labels_shape = train_labels_df.shape
print(f'train_labels_df.shape = {labels_shape}')
train_labels_df.head()

## PREPARE DATASETS

### Train dataset

In [None]:
# Columns summarised by their number of distinct values per group.
FEATURE_CATS = ['event_name', 'name','fqid', 'room_fqid', 'text_fqid']
# Columns summarised by their mean and standard deviation per group.
FEATURE_NUMS = ['elapsed_time','level','page','room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'hover_duration']

def feature_engineer(dataset, verbose = True):
    """Aggregate event rows into one feature row per (session_id, level_group).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Event-level frame. It must contain 'session_id', 'level_group',
        'index' and every column listed in FEATURE_CATS and FEATURE_NUMS.
    verbose : bool, default True
        Currently unused; kept so that existing callers passing it keep working.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'session_id', with a 'level_group' column followed by:
        '<col>_nunique' for each FEATURE_CATS column, '<col>' (group mean) and
        '<col>_std' for each FEATURE_NUMS column, 'total_time_spent',
        'total_action_count' and 'average_time_spent'. Missing values in the
        aggregated features are replaced with -1.
    """
    # observed = True keeps only the (session_id, level_group) pairs that occur
    # in the data. When 'level_group' is categorical and observed is False, pandas
    # emits a row for every category of the key, even for sessions with no events
    # in that group; fillna(-1) below would then turn those all-NaN rows into
    # rows that look like real samples.
    grouped = dataset.groupby(['session_id', 'level_group'], observed = True)

    new_features = []

    for feature in FEATURE_CATS:
        new_features.append(grouped[feature].agg('nunique').rename(feature + '_nunique'))

    # The mean keeps the original column name; only the std gets a suffix.
    for feature in FEATURE_NUMS:
        new_features.append(grouped[feature].agg('mean'))

    for feature in FEATURE_NUMS:
        new_features.append(grouped[feature].agg('std').rename(feature + '_std'))

    elapsed = grouped['elapsed_time']
    total_time_spent_feature = (elapsed.agg('max') - elapsed.agg('min')).rename('total_time_spent')
    new_features.append(total_time_spent_feature)

    # Largest 'index' value seen in the group plus one.
    # NOTE(review): this equals the number of events in the group only if
    # 'index' restarts at 0 within each group - confirm against the data.
    total_action_count_feature = (grouped['index'].agg('max') + 1).rename('total_action_count')
    new_features.append(total_action_count_feature)

    average_time_spent_feature = (total_time_spent_feature / total_action_count_feature).rename('average_time_spent')
    new_features.append(average_time_spent_feature)

    result = pd.concat(new_features, axis = 1)
    # e.g. the std of a single-row group or the mean of an all-NaN column is NaN.
    result = result.fillna(-1)
    # Move 'level_group' out of the index so that only 'session_id' indexes the rows.
    result = result.reset_index()
    result = result.set_index('session_id')

    return result

In [None]:
# Aggregate the raw event log into one feature row per (session, level group).
modified_train_df = feature_engineer(dataset = train_df)

# The raw events are not used again below, so release them to free memory.
# Note: re-running this cell requires reloading train_df first.
del train_df
gc.collect()

modified_train_df.head()

### Train labels dataset

In [None]:
# Label ids have the form '<session>_q<question>'; split them once and store the
# two integer parts in separate columns.
session_question_parts = train_labels_df['session_id'].str.split('_q')
train_labels_df['session']  = session_question_parts.str[0].astype('int64')
train_labels_df['question'] = session_question_parts.str[1].astype('int64')
train_labels_df.head()

## TRAIN MODEL

In [None]:
# Out-of-fold predicted probabilities: one row per session, one column per
# question (column q - 1 holds question q).
session_ids = modified_train_df.index.unique()
pred_df = pd.DataFrame(data = np.zeros((len(session_ids), 18)), index = session_ids)

# Fitted classifiers keyed by '<level_group>_<question>'. Every fold writes to the
# same keys, so after the loop only the last fold's models remain.
models = {}

# Model inputs: every engineered column except the 'level_group' label.
feature_columns = modified_train_df.columns.drop('level_group')

# Sessions are the groups, so all rows of a session land in the same fold.
groupKFold = GroupKFold(n_splits = 5)
fold_splits = groupKFold.split(X = modified_train_df, groups = modified_train_df.index)
for i, (train_index, test_index) in enumerate(fold_splits):
    print('#' * 25)
    print('### Fold', i + 1)
    print('#' * 25)

    fold_train_df = modified_train_df.iloc[train_index]
    fold_valid_df = modified_train_df.iloc[test_index]

    # Iterate over every question, from 1 to 18
    for question in range(1, 19):
        print(question, ', ', end = '')

        # Map the question to the level group it belongs to
        if question > 13:
            level_group = '13-22'
        elif question > 3:
            level_group = '5-12'
        else:
            level_group = '0-4'

        # Labels for this question, indexed by session
        question_labels = train_labels_df.loc[train_labels_df['question'] == question].set_index('session')

        # Training set
        train_X = fold_train_df.loc[fold_train_df['level_group'] == level_group]
        train_X_index = train_X.index.values
        train_y = question_labels.loc[train_X_index]

        # Validation set
        valid_X = fold_valid_df.loc[fold_valid_df['level_group'] == level_group]
        valid_X_index = valid_X.index.values
        valid_y = question_labels.loc[valid_X_index]

        train_features = train_X[feature_columns].astype('float32')
        valid_features = valid_X[feature_columns].astype('float32')

        # Train an XGBoost model on the training set; the validation set is the
        # eval_set that drives early stopping
        clf = XGBClassifier(
            objective = 'binary:logistic',
            eval_metric = 'logloss',
            learning_rate = 0.05,
            max_depth = 3,
            n_estimators = 1000,
            early_stopping_rounds = 50,
            tree_method = 'hist',
            subsample = 0.8,
            colsample_bytree = 0.2,
            use_label_encoder = False,
        )
        clf.fit(
            train_features,
            train_y['correct'],
            eval_set = [(valid_features, valid_y['correct'])],
            verbose = 0,
        )

        # Keep the model and store its predicted probability of class 1 on the
        # validation set
        models[f'{level_group}_{question}'] = clf
        pred_df.loc[valid_X_index, question - 1] = clf.predict_proba(valid_features)[:, 1]

    print()

In [None]:
# Ground-truth table with the same shape and row order as pred_df:
# column q - 1 holds the 'correct' label of question q for each session.
true_df = pred_df.copy()
ordered_sessions = modified_train_df.index.unique()
for question in range(1, 19):
    labels_by_session = train_labels_df.loc[train_labels_df['question'] == question].set_index('session')
    true_df[question - 1] = labels_by_session.loc[ordered_sessions, 'correct'].values

true_df.head()

In [None]:
def find_best_threshold(pred, true, min = 0.4, max = 0.81, step = 0.01):
    """Scan decision thresholds, plot macro F1 against them and return the best.

    Parameters
    ----------
    pred : object with ``to_numpy()`` (e.g. a DataFrame)
        Predicted scores; flattened and compared with each threshold using ``>``.
    true : object with ``to_numpy()``
        Ground-truth labels; flattened in the same way as ``pred``.
    min, max, step : float
        Arguments forwarded to ``np.arange(min, max, step)`` to build the grid
        of candidate thresholds (``max`` itself is excluded).

    Returns
    -------
    tuple
        ``(best_threshold, best_score)``. Both stay 0 when no threshold reaches
        a score above 0.
    """
    flat_truth = true.to_numpy().flatten()
    flat_scores = pred.to_numpy().flatten()

    thresholds = np.arange(min, max, step)
    scores = [
        f1_score(flat_truth, (flat_scores > cutoff).astype('int'), average = 'macro')
        for cutoff in thresholds
    ]

    # Strict '>' keeps the first threshold that reaches the highest score.
    best_score = 0
    best_threshold = 0
    for cutoff, score in zip(thresholds, scores):
        if score > best_score:
            best_score, best_threshold = score, cutoff

    plt.figure(figsize = (20, 5))
    # Large marker highlights the winning threshold on top of the full curve.
    plt.scatter([best_threshold], [best_score], color = 'g', s = 300)
    plt.plot(thresholds, scores, marker = 'o', color = 'g')
    plt.xlabel('Threshold')
    plt.ylabel('F1 Score')
    plt.title(f'Threshold vs F1 Score (best F1 Score = {best_score:.4f}, best threshold = {best_threshold:.4f})')
    plt.show()

    return best_threshold, best_score

# Pick one global threshold from the out-of-fold predictions.
best_threshold, best_score = find_best_threshold(pred = pred_df, true = true_df)
print(f'Threshold :    {best_threshold}')
print(f'F1-Score  :    {best_score}')

In [None]:
# Turn the out-of-fold probabilities into 0/1 predictions with the chosen threshold.
modified_pred_df = pred_df.gt(best_threshold).astype(int)
modified_pred_df.head()

In [None]:
# Confusion matrix over all (session, question) cells at once.
flat_actual = true_df.values.reshape(-1)
flat_predicted = modified_pred_df.values.reshape(-1)
confusion = confusion_matrix(flat_actual, flat_predicted)

fig, ax = plt.subplots(figsize = (8, 6))
sns.heatmap(confusion, annot = True, fmt = 'd', ax = ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Macro F1 per question, using the single global threshold.
for question in range(1, 19):
    column = question - 1
    question_truth = true_df[column].values
    question_predictions = (pred_df[column].values > best_threshold).astype('int')
    m = f1_score(question_truth, question_predictions, average = 'macro')
    print(f'Question {question:2}: F1 = ', m)

# Macro F1 over every (session, question) cell together.
all_truth = true_df.values.reshape((-1))
all_predictions = (pred_df.values.reshape((-1)) > best_threshold).astype('int')
m = f1_score(all_truth, all_predictions, average = 'macro')
print('==> Overall F1 =', m)

## SUBMISSION

In [None]:
import jo_wilder
env = jo_wilder.make_env()
iter_test = env.iter_test()

# Half-open question ranges [first, end) answered after each level group.
limits = {'0-4': (1, 4), '5-12': (4, 14), '13-22': (14, 19)}

for (test, sample_submission) in iter_test:
    test_df = feature_engineer(test, verbose = False)
    for level_group in test_df['level_group'].unique():
        in_group = test_df['level_group'] == level_group
        test_ds = test_df[in_group].drop(columns = 'level_group')
        if test_ds.shape[0] == 0:
            continue

        first_question, end_question = limits[level_group]
        for question in range(first_question, end_question):
            model = models[f'{level_group}_{question}']
            # Column of predict_proba that corresponds to the label 1.
            class_idx = list(model.classes_).index(1)
            probabilities = model.predict_proba(test_ds)[:, class_idx]
            # Rows of the submission whose id ends with this question's suffix.
            mask = sample_submission['session_id'].str.endswith(f'q{question}')
            sample_submission.loc[mask,'correct'] = (probabilities > best_threshold).astype('int').flatten()

    env.predict(sample_submission)

In [None]:
# Print the first lines of submission.csv as a quick sanity check.
! head submission.csv