In [59]:
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
import datetime
import numpy as np
import pandas as pd
from pathlib import Path
import json
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import sys
sys.path.append('../src')
import pprint
from tqdm import tqdm_notebook
from time import time
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold

from dataset import DSB2019Dataset


In [5]:
train = DSB2019Dataset(mode='train')
test = DSB2019Dataset(mode='test')

In [38]:
from sklearn.metrics import confusion_matrix
def qwk(act,pred,n=4,hist_range=(0,3)):
    
    O = confusion_matrix(act,pred)
    O = np.divide(O,np.sum(O))
    
    W = np.zeros((n,n))
    for i in range(n):
        for j in range(n):
            W[i][j] = ((i-j)**2)/((n-1)**2)
            
    act_hist = np.histogram(act,bins=n,range=hist_range)[0]
    prd_hist = np.histogram(pred,bins=n,range=hist_range)[0]
    
    E = np.outer(act_hist,prd_hist)
    E = np.divide(E,np.sum(E))
    
    num = np.sum(np.multiply(W,O))
    den = np.sum(np.multiply(W,E))
        
    return 1-np.divide(num,den)
    

In [6]:
# encode title
list_of_user_activities = list(set(train.main_df['title'].value_counts().index).union(set(test.main_df['title'].value_counts().index)))
activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))

train.main_df['title'] = train.main_df['title'].map(activities_map)
test.main_df['title'] = test.main_df['title'].map(activities_map)
train.train_labels['title'] = train.train_labels['title'].map(activities_map)

In [8]:
activities_map

{'Chest Sorter (Assessment)': 0,
 'Crystal Caves - Level 1': 1,
 'Magma Peak - Level 1': 2,
 'Bug Measurer (Activity)': 3,
 'Watering Hole (Activity)': 4,
 'Happy Camel': 5,
 'Crystal Caves - Level 3': 6,
 'Crystal Caves - Level 2': 7,
 'Leaf Leader': 8,
 'Fireworks (Activity)': 9,
 'Treasure Map': 10,
 'Mushroom Sorter (Assessment)': 11,
 'Welcome to Lost Lagoon!': 12,
 'Bubble Bath': 13,
 '12 Monkeys': 14,
 'Bird Measurer (Assessment)': 15,
 'Rulers': 16,
 'Tree Top City - Level 1': 17,
 'Scrub-A-Dub': 18,
 'Air Show': 19,
 'Tree Top City - Level 3': 20,
 'All Star Sorting': 21,
 'Honey Cake': 22,
 'Magma Peak - Level 2': 23,
 'Costume Box': 24,
 'Heavy, Heavier, Heaviest': 25,
 'Balancing Act': 26,
 'Chicken Balancer (Activity)': 27,
 'Crystals Rule': 28,
 'Dino Drink': 29,
 'Bottle Filler (Activity)': 30,
 'Flower Waterer (Activity)': 31,
 'Slop Problem': 32,
 'Sandcastle Builder (Activity)': 33,
 'Egg Dropper (Activity)': 34,
 "Pirate's Tale": 35,
 'Chow Time': 36,
 'Dino Dive': 3

In [10]:
win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
win_code[activities_map['Bird Measurer (Assessment)']] = 4110

In [11]:
win_code

{0: 4100,
 1: 4100,
 2: 4100,
 3: 4100,
 4: 4100,
 5: 4100,
 6: 4100,
 7: 4100,
 8: 4100,
 9: 4100,
 10: 4100,
 11: 4100,
 12: 4100,
 13: 4100,
 14: 4100,
 15: 4110,
 16: 4100,
 17: 4100,
 18: 4100,
 19: 4100,
 20: 4100,
 21: 4100,
 22: 4100,
 23: 4100,
 24: 4100,
 25: 4100,
 26: 4100,
 27: 4100,
 28: 4100,
 29: 4100,
 30: 4100,
 31: 4100,
 32: 4100,
 33: 4100,
 34: 4100,
 35: 4100,
 36: 4100,
 37: 4100,
 38: 4100,
 39: 4100,
 40: 4100,
 41: 4100,
 42: 4100,
 43: 4100}

In [12]:
train.main_df['timestamp'] = pd.to_datetime(train.main_df['timestamp'])
test.main_df['timestamp'] = pd.to_datetime(test.main_df['timestamp'])

In [13]:
def get_data(user_sample, test_set=False):
    last_activity = 0
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    all_assessments = []
    accumulated_accuracy_group = 0
    accumulated_accuracy=0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0 
    accumulated_actions = 0
    counter = 0
    durations = []
    for i, session in user_sample.groupby('game_session', sort=False):
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        if test_set == True:
            second_condition = True
        else:
            if len(session)>1:
                second_condition = True
            else:
                second_condition= False
            
        if (session_type == 'Assessment') & (second_condition):
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()
            features = user_activities_count.copy()
            features['session_title'] = session['title'].iloc[0] 
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
            if durations == []:
                features['duration_mean'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            accumulated_accuracy += accuracy
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1

            features.update(accuracy_groups)
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            features['accumulated_actions'] = accumulated_actions
            accumulated_accuracy_group += features['accuracy_group']
            accuracy_groups[features['accuracy_group']] += 1
            if test_set == True:
                all_assessments.append(features)
            else:
                if true_attempts+false_attempts > 0:
                    all_assessments.append(features)
                
            counter += 1


        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activitiy = session_type

    if test_set:
        return all_assessments[-1] 
    return all_assessments


In [21]:
compiled_data = []
for i, (ins_id, user_sample) in tqdm_notebook(enumerate(train.main_df.groupby('installation_id', sort=False)), total=17000):
    compiled_data += get_data(user_sample)

HBox(children=(IntProgress(value=0, max=17000), HTML(value='')))



 19%|█▉        | 3262/17000 [00:53<03:47, 60.36it/s][A[A




In [23]:
new_train = pd.DataFrame(compiled_data)
del compiled_data
new_train.shape

(17690, 16)

In [25]:
new_train.head()

Unnamed: 0,Activity,Assessment,Clip,Game,accumulated_accuracy,accumulated_correct_attempts,accumulated_uncorrect_attempts,accuracy_group,duration_mean,session_title,0,1,2,3,accumulated_accuracy_group,accumulated_actions
0,3,0,11,4,0.0,0,0,3,0.0,11,0,0,0,0,0.0,647
1,4,1,14,6,1.0,1,0,0,39.0,15,0,0,0,1,3.0,1143
2,4,2,14,6,0.5,1,11,3,65.5,11,1,0,0,1,1.5,1230
3,9,4,24,10,0.5,2,11,2,41.25,11,2,0,0,2,1.5,2159
4,10,5,28,13,0.5,3,12,3,39.2,15,2,0,1,2,1.6,2586


In [26]:
all_features = [x for x in new_train.columns if x not in ['accuracy_group']]
cat_features = ['session_title']
X, y = new_train[all_features], new_train['accuracy_group']

In [28]:
default_param = {
        'nthread': -1,
        'n_estimators': 10000,
        'learning_rate': 0.1,
        'num_leaves': 34,
        'colsample_bytree': 0.9497036,
        'subsample': 0.8715623,
        'max_depth': 8,
        'reg_alpha': 0.041545473,
        'reg_lambda': 0.0735294,
        'min_split_gain': 0.0222415,
        'min_child_weight': 39.3259775,
        'silent': -1,
        'verbose': -1,
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
        'random_state': 2019,
    }

In [39]:
oof = np.zeros(len(X))
NFOLDS = 5
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=2019)

training_start_time = time()
for fold, (trn_idx, test_idx) in enumerate(folds.split(X, y)):
    start_time = time()
    print(f'Training on fold {fold+1}')
    clf = LGBMClassifier(**default_param)
    clf.fit(X.loc[trn_idx, all_features], y.loc[trn_idx], eval_set=(X.loc[test_idx, all_features], y.loc[test_idx]),
            verbose=100, early_stopping_rounds=100, 
            categorical_feature=cat_features)
    
    oof[test_idx] = clf.predict(X.loc[test_idx, all_features], num_iteration=clf.best_iteration_).reshape(len(test_idx))
    
    print('Fold {} finished in {}'.format(fold + 1, str(datetime.timedelta(seconds=time() - start_time))))
    
print('-' * 30)
print('OOF QWK:', qwk(y, oof))
print('-' * 30)

Training on fold 1


New categorical_feature is ['session_title']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is ['session_title']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 1.04576
Early stopping, best iteration is:
[51]	valid_0's multi_logloss: 1.04191


LGBMClassifier(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.9497036, device='gpu', gpu_device_id=0,
        gpu_platform_id=0, importance_type='split', learning_rate=0.1,
        max_depth=8, min_child_samples=20, min_child_weight=39.3259775,
        min_split_gain=0.0222415, n_estimators=10000, n_jobs=-1,
        nthread=-1, num_leaves=34, objective=None, random_state=2019,
        reg_alpha=0.041545473, reg_lambda=0.0735294, silent=-1,
        subsample=0.8715623, subsample_for_bin=200000, subsample_freq=0,
        verbose=-1)

Fold 1 finished in 0:00:05.342772
Training on fold 2


New categorical_feature is ['session_title']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is ['session_title']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 1.04431
Early stopping, best iteration is:
[59]	valid_0's multi_logloss: 1.04255


LGBMClassifier(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.9497036, device='gpu', gpu_device_id=0,
        gpu_platform_id=0, importance_type='split', learning_rate=0.1,
        max_depth=8, min_child_samples=20, min_child_weight=39.3259775,
        min_split_gain=0.0222415, n_estimators=10000, n_jobs=-1,
        nthread=-1, num_leaves=34, objective=None, random_state=2019,
        reg_alpha=0.041545473, reg_lambda=0.0735294, silent=-1,
        subsample=0.8715623, subsample_for_bin=200000, subsample_freq=0,
        verbose=-1)

Fold 2 finished in 0:00:05.149764
Training on fold 3


New categorical_feature is ['session_title']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is ['session_title']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 1.01512
Early stopping, best iteration is:
[84]	valid_0's multi_logloss: 1.01413


LGBMClassifier(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.9497036, device='gpu', gpu_device_id=0,
        gpu_platform_id=0, importance_type='split', learning_rate=0.1,
        max_depth=8, min_child_samples=20, min_child_weight=39.3259775,
        min_split_gain=0.0222415, n_estimators=10000, n_jobs=-1,
        nthread=-1, num_leaves=34, objective=None, random_state=2019,
        reg_alpha=0.041545473, reg_lambda=0.0735294, silent=-1,
        subsample=0.8715623, subsample_for_bin=200000, subsample_freq=0,
        verbose=-1)

Fold 3 finished in 0:00:06.257085
Training on fold 4


New categorical_feature is ['session_title']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is ['session_title']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.98826
Early stopping, best iteration is:
[59]	valid_0's multi_logloss: 0.986697


LGBMClassifier(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.9497036, device='gpu', gpu_device_id=0,
        gpu_platform_id=0, importance_type='split', learning_rate=0.1,
        max_depth=8, min_child_samples=20, min_child_weight=39.3259775,
        min_split_gain=0.0222415, n_estimators=10000, n_jobs=-1,
        nthread=-1, num_leaves=34, objective=None, random_state=2019,
        reg_alpha=0.041545473, reg_lambda=0.0735294, silent=-1,
        subsample=0.8715623, subsample_for_bin=200000, subsample_freq=0,
        verbose=-1)

Fold 4 finished in 0:00:05.453253
Training on fold 5


New categorical_feature is ['session_title']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is ['session_title']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 1.03385
Early stopping, best iteration is:
[56]	valid_0's multi_logloss: 1.03083


LGBMClassifier(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.9497036, device='gpu', gpu_device_id=0,
        gpu_platform_id=0, importance_type='split', learning_rate=0.1,
        max_depth=8, min_child_samples=20, min_child_weight=39.3259775,
        min_split_gain=0.0222415, n_estimators=10000, n_jobs=-1,
        nthread=-1, num_leaves=34, objective=None, random_state=2019,
        reg_alpha=0.041545473, reg_lambda=0.0735294, silent=-1,
        subsample=0.8715623, subsample_for_bin=200000, subsample_freq=0,
        verbose=-1)

Fold 5 finished in 0:00:05.591708
------------------------------
OOF QWK: 0.5086208560742616
------------------------------


In [52]:
# train model on all data once
clf = LGBMClassifier(**default_param)
clf.fit(X, y, verbose=100,categorical_feature=cat_features)

New categorical_feature is ['session_title']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


LGBMClassifier(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.9497036, device='gpu', gpu_device_id=0,
        gpu_platform_id=0, importance_type='split', learning_rate=0.1,
        max_depth=8, min_child_samples=20, min_child_weight=39.3259775,
        min_split_gain=0.0222415, n_estimators=10000, n_jobs=-1,
        nthread=-1, num_leaves=34, objective=None, random_state=2019,
        reg_alpha=0.041545473, reg_lambda=0.0735294, silent=-1,
        subsample=0.8715623, subsample_for_bin=200000, subsample_freq=0,
        verbose=-1)

In [43]:
# process test set
new_test = []
for ins_id, user_sample in tqdm(test.main_df.groupby('installation_id', sort=False), total=1000):
    a = get_data(user_sample, test_set=True)
    new_test.append(a)
    
X_test = pd.DataFrame(new_test)




  0%|          | 0/1000 [00:00<?, ?it/s][A[A[A


  0%|          | 1/1000 [00:01<19:05,  1.15s/it][A[A[A


  1%|          | 8/1000 [00:01<13:21,  1.24it/s][A[A[A


  2%|▏         | 18/1000 [00:01<09:18,  1.76it/s][A[A[A


  2%|▏         | 24/1000 [00:01<06:36,  2.46it/s][A[A[A


  3%|▎         | 31/1000 [00:01<04:42,  3.43it/s][A[A[A


  4%|▎         | 37/1000 [00:01<03:21,  4.77it/s][A[A[A


  4%|▍         | 42/1000 [00:02<02:37,  6.08it/s][A[A[A


  5%|▍         | 49/1000 [00:02<01:53,  8.36it/s][A[A[A


  6%|▌         | 56/1000 [00:02<01:25, 10.99it/s][A[A[A


  6%|▋         | 63/1000 [00:02<01:04, 14.59it/s][A[A[A


  7%|▋         | 69/1000 [00:02<00:49, 18.82it/s][A[A[A


  8%|▊         | 75/1000 [00:02<00:44, 20.83it/s][A[A[A


  8%|▊         | 81/1000 [00:02<00:36, 25.15it/s][A[A[A


  9%|▉         | 89/1000 [00:03<00:30, 29.81it/s][A[A[A


 10%|▉         | 96/1000 [00:03<00:25, 35.38it/s][A[A[A


 10%|█         | 102/1000 [00:03

In [50]:
set(X.columns).difference(set(X_test.columns))

set()

In [53]:
len(X.columns)

15

In [56]:
set(X_test.columns) - set(X.columns)

{'accuracy_group'}

In [57]:
preds = clf.predict(X_test[all_features], num_iteration=clf.best_iteration_)

In [58]:
preds

array([3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 0, 0, 3, 3, 3, 3, 1, 3,
       3, 3, 1, 3, 3, 3, 3, 3, 0, 0, 0, 1, 3, 3, 0, 3, 3, 1, 3, 0, 0, 3,
       3, 3, 3, 0, 3, 3, 0, 0, 3, 3, 3, 0, 0, 3, 3, 0, 3, 3, 3, 3, 0, 3,
       3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0,
       3, 3, 3, 3, 3, 1, 0, 3, 3, 1, 0, 3, 3, 0, 3, 3, 0, 3, 3, 3, 3, 3,
       0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 1,
       0, 3, 3, 3, 3, 1, 3, 2, 3, 3, 0, 0, 3, 3, 1, 3, 3, 3, 3, 1, 3, 0,
       0, 0, 3, 3, 3, 3, 3, 0, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 1, 0, 3, 0, 3, 0, 3, 0, 3, 1, 3, 3, 3, 3, 0, 0, 3, 3, 2, 0,
       3, 3, 3, 1, 0, 0, 3, 0, 3, 0, 3, 3, 3, 3, 0, 3, 3, 3, 3, 1, 0, 0,
       3, 3, 3, 0, 0, 3, 3, 1, 3, 2, 0, 3, 3, 3, 3, 3, 0, 3, 0, 0, 3, 0,
       3, 3, 3, 3, 3, 3, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1,
       3, 3, 3, 3, 0, 0, 0, 3, 3, 0, 1, 3, 0, 3, 3, 3, 3, 3, 0, 3, 3, 3,
       3, 0, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0,