<a href="https://www.kaggle.com/code/yutodennou/competition-simple-xgboost-with-gridsearchcv?scriptVersionId=171648223" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# 🎉Purpose

Building an **XGBoost** model to predict student performance, tuned with **grid search**.  

Grid search is useful for trying many combinations of hyperparameters.

I also explain grid search in more detail [here](https://www.kaggle.com/code/yutodennou/make-gridsearchcv-faster)  


# 🗃️Import Library

In [1]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/predict-student-performance-from-game-play/sample_submission.csv
/kaggle/input/predict-student-performance-from-game-play/train_labels.csv
/kaggle/input/predict-student-performance-from-game-play/train.csv
/kaggle/input/predict-student-performance-from-game-play/test.csv
/kaggle/input/predict-student-performance-from-game-play/jo_wilder_310/competition.cpython-310-x86_64-linux-gnu.so
/kaggle/input/predict-student-performance-from-game-play/jo_wilder_310/__init__.py
/kaggle/input/predict-student-performance-from-game-play/jo_wilder/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/predict-student-performance-from-game-play/jo_wilder/__init__.py


In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GroupKFold, StratifiedKFold, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

# 🔍Import data

In [3]:
# Columns read from train.csv, grouped by role. The order is unchanged.
COLS = [
    # identifiers and timing
    'session_id', 'index', 'elapsed_time',
    # event description
    'event_name', 'name', 'level', 'page',
    # cursor position
    'room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y',
    'hover_duration',
    # text and object identifiers
    'text', 'fqid', 'room_fqid', 'text_fqid',
    # grouping key used when aggregating
    'level_group',
]

In [4]:
# Explicit dtypes for train.csv. Columns not listed here (session_id,
# elapsed_time, text_fqid) keep whatever dtype pandas infers.
train_dtypes = {
    'index': np.int16,
    'level': np.int8,
    **dict.fromkeys(['page', 'room_coor_x', 'room_coor_y', 'hover_duration'], np.float32),
    **dict.fromkeys(['screen_coor_x', 'screen_coor_y'], np.float16),
    **dict.fromkeys(['event_name', 'name', 'text', 'fqid', 'room_fqid', 'level_group'], 'category'),
}

train = pd.read_csv(
    "/kaggle/input/predict-student-performance-from-game-play/train.csv",
    usecols=COLS,
    dtype=train_dtypes,
)

In [5]:
train_label = pd.read_csv("/kaggle/input/predict-student-performance-from-game-play/train_labels.csv")

# The label id is split on '_': the first piece is the numeric session, and
# the last piece is one prefix character followed by the question number.
label_id_parts = train_label['session_id'].str.split('_')
train_label['session'] = label_id_parts.str[0].astype('int64')
train_label['q'] = label_id_parts.str[-1].str[1:].astype('int64')

In [6]:
# Non-numeric columns, minus level_group, which is used as a grouping key
# rather than as a feature.
categorical_features = (
    train.select_dtypes(include=["category", "object", "bool"])
    .columns
    .drop("level_group")
)

# Integer and float columns, selected by their concrete dtypes.
numerical_features = train.select_dtypes(
    include=["int8", "int16", "int64", "float16", "float32", "float64"]
).columns

# 📏Preprocessing

In [7]:
def processing_na(train):
    """Fill missing values and ordinal-encode the categorical columns in place.

    Columns listed in the module-level ``numerical_features`` have their NaNs
    replaced by the per-column median of ``train``. Columns listed in
    ``categorical_features`` are replaced by the codes of a freshly fitted
    ``OrdinalEncoder``; NaNs still present after encoding are forward-filled.
    The number of missing values before and after each step is printed.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame containing every column named in ``numerical_features`` and
        ``categorical_features``. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``train`` object, after modification.
    """
    print("Na of numerical features: " + str(train[numerical_features].isnull().values.sum()))
    train[numerical_features] = train[numerical_features].fillna(train[numerical_features].median())
    print("-> " + str(train[numerical_features].isnull().values.sum()))

    # NOTE(review): the encoder is fitted on whatever frame is passed in, so
    # codes produced by separate calls (e.g. training data vs. a test batch)
    # are not guaranteed to agree. Aggregates that depend on the code values
    # themselves (such as sums) are affected -- confirm this is acceptable.
    encoder = OrdinalEncoder()
    train[categorical_features] = encoder.fit_transform(train[categorical_features])
    print("Na of categorical features: " + str(train[categorical_features].isnull().values.sum()))

    # DataFrame.ffill() replaces the deprecated fillna(method="ffill").
    # Forward-fill cannot fill NaNs that precede the first valid value.
    train[categorical_features] = train[categorical_features].ffill()
    print("-> " + str(train[categorical_features].isnull().values.sum()))
    return train

In [8]:
def processing_xy(train):
    """Add Euclidean-norm columns for the room and screen coordinates.

    Adds ``room_coor_d = sqrt(room_coor_x**2 + room_coor_y**2)`` and
    ``screen_coor_d = sqrt(screen_coor_x**2 + screen_coor_y**2)`` to ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with ``room_coor_x``, ``room_coor_y``, ``screen_coor_x`` and
        ``screen_coor_y`` columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``train`` object with the two new columns.
    """
    def _norm(x_col, y_col):
        x, y = train[x_col], train[y_col]
        # Widen to at least float32: squaring a float16 value above ~255
        # exceeds the float16 maximum (65504) and yields inf.
        wide = np.result_type(x.dtype, y.dtype, np.float32)
        return np.hypot(x.astype(wide), y.astype(wide))

    # The original combined x with x; the y coordinate was never used.
    train['room_coor_d'] = _norm('room_coor_x', 'room_coor_y')
    train['screen_coor_d'] = _norm('screen_coor_x', 'screen_coor_y')
    return train

In [9]:
def feature_engineer(train):
    """Aggregate event rows into one feature row per (session_id, level_group).

    For every column in the module-level ``categorical_features`` the result
    holds ``<col>_nunique``, ``<col>_count`` and ``<col>_sum``; for every
    column in ``numerical_features`` it holds ``<col>_mean``, ``<col>_std``
    and ``<col>_delta`` (max minus min). Output columns appear in the order
    nunique, mean, std, delta, count, sum.

    Parameters
    ----------
    train : pandas.DataFrame
        Event-level frame containing ``session_id``, ``level_group`` and all
        columns named in ``categorical_features`` / ``numerical_features``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``session_id`` with a ``level_group`` column followed
        by the aggregated features. Missing aggregates are filled with -1.
    """
    # Build the grouping once; every aggregate below shares the same keys.
    grouped = train.groupby(['session_id','level_group'])

    def _value_range(column_groups):
        # Spread of the values within each group.
        return column_groups.agg('max') - column_groups.agg('min')

    # (columns, suffix, aggregation), listed in the order the output columns
    # must appear, because downstream code selects features by this order.
    specs = [
        (categorical_features, '_nunique', lambda g: g.agg('nunique')),
        (numerical_features, '_mean', lambda g: g.agg('mean')),
        (numerical_features, '_std', lambda g: g.agg('std')),
        (numerical_features, '_delta', _value_range),
        (categorical_features, '_count', lambda g: g.agg('count')),
        (categorical_features, '_sum', lambda g: g.agg('sum')),
    ]

    dfs = []
    for columns, suffix, aggregate in specs:
        for c in columns:
            tmp = aggregate(grouped[c])
            tmp.name = c + suffix
            dfs.append(tmp)

    df_engineered = pd.concat(dfs,axis=1)
    df_engineered = df_engineered.fillna(-1)
    # Move both group keys to columns, then keep only session_id as the index.
    df_engineered = df_engineered.reset_index()
    df_engineered = df_engineered.set_index('session_id')
    return df_engineered

In [10]:
# Preprocessing pipeline: impute and encode, add distance columns, then
# aggregate to one row per (session_id, level_group).
# NOTE(review): numerical_features was computed before processing_xy adds
# room_coor_d / screen_coor_d, and feature_engineer only aggregates the columns
# in numerical_features / categorical_features, so the two distance columns do
# not reach df_tr.
train = processing_na(train)
train = processing_xy(train)
df_tr = feature_engineer(train)

Na of numerical features: 58320192
-> 0
Na of categorical features: 41633924
-> 0


In [11]:
# Every aggregated column is a model input except the level_group key.
FEATURES = df_tr.columns[df_tr.columns != 'level_group'].tolist()
# Distinct session ids; df_tr has one row per (session, level_group).
ALL_USERS = df_tr.index.unique()
print(len(FEATURES) ,'features, ', len(ALL_USERS) ,'users')

48 features,  23562 users


# 💪Train Data

## Gridsearch

Grid search is a computationally heavy process: depending on the data and the algorithm, it can take a tremendous amount of processing time, because it exhaustively evaluates every combination of the candidate values, as in the example below.
```
params = {'eta': [0.01, 0.1, 1.0], 'gamma': [0.1, 0.5, 0.8], 
                  'n_estimators': [10, 100, 500], 'max_depth':[2, 4, 6], 
                  'min_child_weight': [1, 2], 'nthread': [2] }
```

In [None]:
# Outer split: 5 folds grouped by session id, so a session never appears in
# both the training and validation part of a fold.
gkf = GroupKFold(n_splits=5)
# Inner split used by GridSearchCV to score each parameter combination.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)
# Out-of-fold predicted probabilities: one row per session, one column per
# question (column t-1 holds question t).
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS),18)), index=ALL_USERS)
# Fitted searches keyed by '<level_group>_<question>'.
models = {}
# Fixed XGBoost settings shared by every candidate in the grid.
xgb_params = {
    'objective' : 'binary:logistic',
    'eval_metric':'logloss',
}

# Any Params You Want to Scan like below
# 'eta':[0.01, 0.1, 1.0]
# 'gamma':[0.1, 0.4]
# 'learning_rate':[0.04, 0.06]
# 'n_estimators': [1500, 2000, 2500]
# 'max_depth': [5,6]
#
# NOTE(review): the XGBoost docs list `eta` as an alias of `learning_rate`;
# this grid varies `eta` while fixing `learning_rate`, so confirm which value
# the model actually uses -- the `eta` axis may only double the run time.
# NOTE(review): this grid has 2*2*1*3*3*2 = 72 combinations, but the saved cell
# output reports "2 candidates", so that output came from a different grid.
params = {'eta': [0.01, 0.1], 'gamma': [0.1, 0.4], 'learning_rate': [0.06],
                  'n_estimators': [1000, 2000, 3000], 'max_depth':[5, 6, 7], 
                  'min_child_weight': [1, 2], 'nthread': [2] }
# Labels depend only on the question number, so index them by session once
# instead of rebuilding the same lookup twice per (fold, question) pair.
labels_by_question = {
    t: train_label.loc[train_label.q==t].set_index('session')
    for t in range(1,19)
}

for i, (train_index, test_index) in enumerate(gkf.split(X=df_tr, groups=df_tr.index)):
    print('-'*25)
    print(i)

    # The fold-level row slices are the same for all 18 questions.
    fold_train = df_tr.iloc[train_index]
    fold_valid = df_tr.iloc[test_index]

    # 1->18
    for t in range(1,19):
        print(t,', ',end='')

        # Questions 1-3 use level group 0-4, 4-13 use 5-12, the rest use 13-22.
        # A plain `else` guarantees grp is always bound.
        if t<=3: grp = '0-4'
        elif t<=13: grp = '5-12'
        else: grp = '13-22'

        # Train
        train_x = fold_train.loc[fold_train.level_group == grp]
        train_users = train_x.index.values
        train_y = labels_by_question[t].loc[train_users]

        # Valid
        valid_x = fold_valid.loc[fold_valid.level_group == grp]
        valid_users = valid_x.index.values
        valid_y = labels_by_question[t].loc[valid_users]

        # Model
        model =  XGBClassifier(**xgb_params)
        clf = GridSearchCV(estimator=model, param_grid=params, 
                    cv=skf, scoring="accuracy", n_jobs=1, verbose=2, return_train_score=False)
        # eval_set and verbose are forwarded to the estimator's fit().
        clf.fit(train_x[FEATURES].astype('float32'), train_y['correct'],
                eval_set=[ (valid_x[FEATURES].astype('float32'), valid_y['correct']) ],
                verbose=0)

        # The key does not include the fold index, so each fold overwrites the
        # previous entry and only the last fold's search is kept per question.
        models[f'{grp}_{t}'] = clf
        oof.loc[valid_users, t-1] = clf.predict_proba(valid_x[FEATURES].astype('float32'))[:,1]


-------------------------
0
1 , Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV] END eta=0.01, gamma=0.1, learning_rate=0.06, max_depth=5, min_child_weight=1, n_estimators=2000, nthread=2; total time=  55.5s
[CV] END eta=0.01, gamma=0.1, learning_rate=0.06, max_depth=5, min_child_weight=1, n_estimators=2000, nthread=2; total time=  50.9s
[CV] END eta=0.01, gamma=0.1, learning_rate=0.06, max_depth=5, min_child_weight=2, n_estimators=2000, nthread=2; total time=  50.8s
[CV] END eta=0.01, gamma=0.1, learning_rate=0.06, max_depth=5, min_child_weight=2, n_estimators=2000, nthread=2; total time=  51.3s
2 , Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV] END eta=0.01, gamma=0.1, learning_rate=0.06, max_depth=5, min_child_weight=1, n_estimators=2000, nthread=2; total time=  44.5s
[CV] END eta=0.01, gamma=0.1, learning_rate=0.06, max_depth=5, min_child_weight=1, n_estimators=2000, nthread=2; total time=  44.7s
[CV] END eta=0.01, gamma=0.1, learning_rate=0.06, max_

In [None]:
# Ground-truth matrix with the same shape and index as `oof`:
# column q-1 holds the `correct` label of question q for every session.
true = oof.copy()
for question in range(1, 19):
    question_labels = train_label.loc[train_label.q == question].set_index('session')
    true[question - 1] = question_labels.loc[ALL_USERS, 'correct'].values

In [None]:
# Flatten once: every threshold is scored against the same pooled
# (session, question) predictions and labels.
oof_flat = oof.values.reshape((-1))
true_flat = true.values.reshape((-1))

scores, thresholds = [], []
best_score, best_threshold = 0, 0

# Scan cut-offs from 0.40 in steps of 0.01 and keep the first one that gives
# the highest macro F1.
for threshold in np.arange(0.4,0.81,0.01):
    print(f'{threshold:.02f}, ',end='')
    m = f1_score(true_flat, (oof_flat>threshold).astype('int'), average='macro')
    scores.append(m)
    thresholds.append(threshold)
    if m>best_score:
        best_score, best_threshold = m, threshold

In [None]:
# Macro F1 per question at the selected threshold.
for k in range(18):
    m = f1_score(true[k].values, (oof[k].values>best_threshold).astype('int'), average='macro')
    # Column k holds question k+1 (questions are numbered 1..18 in the labels),
    # so print the 1-based question number rather than the column index.
    print(f'Q{k+1}: F1 =',m)
    
# Macro F1 over all (session, question) pairs pooled together.
m = f1_score(true.values.reshape((-1)), (oof.values.reshape((-1))>best_threshold).astype('int'), average='macro')
print('-> All F1 =',m)

# 🥅Submit

In [None]:
import jo_wilder

# Best-effort reset of the competition environment so this cell can be re-run
# in the same kernel. Presumably jo_wilder only allows make_env() once per
# kernel and these flags lift that restriction -- confirm against the API.
try:
    jo_wilder.make_env.__called__ = False
    env.__called__ = False
    type(env)._state = type(type(env)._state).__dict__['INIT']
except Exception:
    # Expected on a fresh kernel, where `env` does not exist yet (NameError).
    # Catching Exception rather than using a bare `except:` lets
    # KeyboardInterrupt and SystemExit propagate.
    pass

env = jo_wilder.make_env()
iter_test = env.iter_test() 

In [None]:
# Half-open question ranges [first, stop) belonging to each level group.
limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}

for (test, sample_submission) in iter_test:
    # Same preprocessing chain as for the training data. processing_na fits
    # its encoder on this batch alone.
    test = test.pipe(processing_na).pipe(processing_xy)
    df = feature_engineer(test)

    # The group is read from the first row of the batch.
    grp = test.level_group.values[0]
    first_q, stop_q = limits[grp]

    # The model input is identical for every question of the batch.
    batch_input = df[FEATURES].astype('float32')
    for t in range(first_q, stop_q):
        proba = models[f'{grp}_{t}'].predict_proba(batch_input)[:,1]
        answers = [int(prob>best_threshold) for prob in proba]
        # Rows of the submission template that belong to question t.
        question_rows = sample_submission.session_id.str.endswith(f'q{t}')
        sample_submission.loc[question_rows,'correct'] = answers

    env.predict(sample_submission)

In [None]:
# Load the submission file from the working directory and preview its first
# 30 rows. Presumably the file is written by env.predict() -- confirm.
submit = pd.read_csv('submission.csv')
submit.head(30)