- cancel data augmentation

In [1]:
import pandas as pd
import numpy as np
import warnings
import datetime
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from sklearn import preprocessing
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import mean_squared_error, log_loss, roc_auc_score, precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, cohen_kappa_score
import lightgbm as lgb
from functools import partial
import json
import copy
import time
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from hyperopt import hp, tpe, Trials, fmin, space_eval
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",1000)
np.set_printoptions(precision=8)
warnings.filterwarnings("ignore")
import random

In [2]:
def qwk(a1, a2):
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)
    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))
    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)
    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)
    e = e / a1.shape[0]
    return np.round(1 - o / e, 8)

In [3]:
class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])
        return -qwk(y, X_p)
        #return -mod_qwk(y, X_p, weights=weights)
    
    def fit(self, X, y, random_flg = False):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        if random_flg:
            initial_coef = [np.random.uniform(0.4,0.5), np.random.uniform(0.5,0.6), np.random.uniform(0.6,0.7)]
        else:
            initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead') #Powell
        
    def predict(self, X, coef):
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

    def coefficients(self):
        return self.coef_['x']

In [4]:
def eval_qwk_lgb_regr(y_pred, train_t):
    dist = Counter(train_t['accuracy_group'])
    for k in dist:
        dist[k] /= len(train_t)
    
    acum = 0
    bound = {}
    for i in range(3):
        acum += dist[i]
        bound[i] = np.percentile(y_pred, acum * 100)

    def classify(x):
        if x <= bound[0]:
            return 0
        elif x <= bound[1]:
            return 1
        elif x <= bound[2]:
            return 2
        else:
            return 3

    y_pred = np.array(list(map(classify, y_pred)))
    
    return y_pred

# install

In [5]:
%%time
train = pd.read_csv('../input/data-science-bowl-2019/train.csv')
train_labels = pd.read_csv('../input/data-science-bowl-2019/train_labels.csv')
test = pd.read_csv('../input/data-science-bowl-2019/test.csv')
#specs = pd.read_csv('../input/data-science-bowl-2019/specs.csv')
sample_submission = pd.read_csv('../input/data-science-bowl-2019/sample_submission.csv')

CPU times: user 1min 11s, sys: 7.23 s, total: 1min 19s
Wall time: 1min 18s


# Preprocess and Feature engineering

In [6]:
%%time
def encode_title(train, test):
    train['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), train['title'], train['event_code']))
    test['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), test['title'], test['event_code']))
    list_of_title_eventcode = sorted(list(set(train['title_event_code'].unique()).union(set(test['title_event_code'].unique()))))
    
    train['type_world'] = list(map(lambda x, y: str(x) + '_' + str(y), train['type'], train['world']))
    test['type_world'] = list(map(lambda x, y: str(x) + '_' + str(y), test['type'], test['world']))
    list_of_type_world = sorted(list(set(train['type_world'].unique()).union(set(test['type_world'].unique()))))
    
    list_of_user_activities = sorted(list(set(train['title'].unique()).union(set(test['title'].unique()))))
    list_of_event_code = sorted(list(set(train['event_code'].unique()).union(set(test['event_code'].unique()))))
    list_of_worlds = sorted(list(set(train['world'].unique()).union(set(test['world'].unique()))))
    activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
    activities_labels = dict(zip(np.arange(len(list_of_user_activities)), list_of_user_activities))
    activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))
    assess_titles = sorted(list(set(train[train['type'] == 'Assessment']['title'].value_counts().index).union(set(test[test['type'] == 'Assessment']['title'].value_counts().index))))

    train['title'] = train['title'].map(activities_map)
    test['title'] = test['title'].map(activities_map)
    train['world'] = train['world'].map(activities_world)
    test['world'] = test['world'].map(activities_world)

    win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
    win_code[activities_map['Bird Measurer (Assessment)']] = 4110
    
    train['timestamp'] = pd.to_datetime(train['timestamp'])
    test['timestamp'] = pd.to_datetime(test['timestamp'])
    
    train["misses"] = train["event_data"].apply(lambda x: json.loads(x)["misses"] if "\"misses\"" in x else np.nan)
    test["misses"] = test["event_data"].apply(lambda x: json.loads(x)["misses"] if "\"misses\"" in x else np.nan)
        
    train["true"] = train["event_data"].apply(lambda x: 1 if "true" in x and "correct" in x else 0)
    test["true"] = test["event_data"].apply(lambda x: 1 if "true" in x and "correct" in x else 0)

    train["false"] = train["event_data"].apply(lambda x: 1 if "false" in x and "correct" in x else 0)
    test["false"] = test["event_data"].apply(lambda x: 1 if "false" in x and "correct" in x else 0)
    
    train["game_complete"] = train["event_data"].apply(lambda x: 1 if "game_completed" in x else 0)
    test["game_complete"] = test["event_data"].apply(lambda x: 1 if "game_completed" in x else 0)
    
    train["level"] = train["event_data"].apply(lambda x: json.loads(x)["level"] if "\"level\"" in x else np.nan)
    test["level"] = test["event_data"].apply(lambda x: json.loads(x)["level"] if "\"level\"" in x else np.nan)
    
    train["round"] = train["event_data"].apply(lambda x: json.loads(x)["round"] if "\"round\"" in x else np.nan)
    test["round"] = test["event_data"].apply(lambda x: json.loads(x)["round"] if "\"round\"" in x else np.nan)
               
    return train, test, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, activities_world, list_of_title_eventcode, list_of_type_world

train, test, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, activities_world, list_of_title_eventcode, list_of_type_world = encode_title(train, test)

CPU times: user 3min 35s, sys: 9.86 s, total: 3min 44s
Wall time: 3min 44s


In [7]:
def get_data(user_sample, test_set=False):
    last_activity = 0
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    title_eventcode_count = {str(ele): 0 for ele in list_of_title_eventcode}
    user_world_count = {"world_"+str(wor) : 0 for wor in activities_world.values()}
    event_code_count = {str(ev): 0 for ev in list_of_event_code}
    title_count = {actv: 0 for actv in list_of_user_activities}
    type_world_count = {str(ev): 0 for ev in list_of_type_world}
    last_accuracy_title = {'acc_' + title: -1 for title in assess_titles}
    last_game_time_title = {'lgt_' + title: 0 for title in assess_titles}
    ac_game_time_title = {'agt_' + title: 0 for title in assess_titles}
    ac_true_attempts_title = {'ata_' + title: 0 for title in assess_titles}
    ac_false_attempts_title = {'afa_' + title: 0 for title in assess_titles}
    
    all_assessments = []
    accuracy_groups = {"0":0, "1":0, "2":0, "3":0}
    accumulated_accuracy_group = 0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0 
    accumulated_actions = 0
    counter = 0
    time_first_activity = user_sample.iloc[0]['timestamp']
    miss = 0
    crys_game_true = 0; crys_game_false = 0
    tree_game_true = 0; tree_game_false = 0
    magma_game_true = 0; magma_game_false = 0
    crys_game_acc = []; tree_game_acc = []; magma_game_acc = []
    durations = []
    prev_assess_title = -999
    assess_count = 1
    last_accuracy = -999
    prev_assess_start = -999; prev_assess_end = -999
    real_prev_assess_start = -999; real_prev_assess_end = -999
    real_assess_start = -999; real_assess_end = -999
    complete_games = 0
    no_result_count = 0
    crys_game_level = np.array([]); tree_game_level = np.array([]); magma_game_level = np.array([])
    crys_game_round = np.array([]); tree_game_round = np.array([]); magma_game_round = np.array([])
    
    for i, session in user_sample.groupby('game_session', sort=False):      
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_title_text = activities_labels[session_title]
        session_world = session["world"].iloc[0]
        
        if session_type != 'Assessment':
            if session_type == "Game":
                true = session['true'].sum()
                false = session['false'].sum() 
                if session_world == activities_world["CRYSTALCAVES"]:
                    crys_game_true += true
                    crys_game_false += false
                    crys_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
                    crys_game_level = np.concatenate([crys_game_level, session["level"]], axis=0)
                    crys_game_round = np.concatenate([crys_game_round, session["round"]], axis=0)
                elif session_world == activities_world["TREETOPCITY"]:
                    tree_game_true += true
                    tree_game_false += false
                    tree_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
                    tree_game_level = np.concatenate([tree_game_level, session["level"]], axis=0)
                    tree_game_round = np.concatenate([tree_game_round, session["round"]], axis=0)
                elif session_world == activities_world["MAGMAPEAK"]:
                    magma_game_true += true
                    magma_game_false += false
                    magma_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
                    magma_game_level = np.concatenate([magma_game_level, session["level"]], axis=0)
                    magma_game_round = np.concatenate([magma_game_round, session["round"]], axis=0)
                else:
                    pass
                
        if (session_type == 'Assessment') & (test_set or len(session)>1): 
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            true_attempts = all_attempts['event_data'].str.contains('true').sum() # true in target assess
            false_attempts = all_attempts['event_data'].str.contains('false').sum() # false in target assessment
            assess_start = session.iloc[0,2]
            assess_end = session.iloc[-1,2]
            
            # from start of installation_id to the start of target assessment ------------------------
            features = user_activities_count.copy() # appearance of each type without duplicates
            features.update(title_eventcode_count.copy()) # apperance of combi of title and event_code
            features.update(user_world_count.copy()) # appearance of world with duplicates
            features.update(event_code_count.copy())
            features.update(title_count.copy())
            features.update(type_world_count.copy())
            features.update(last_accuracy_title.copy())
            features.update(last_game_time_title.copy())
            features.update(ac_game_time_title.copy())
            features.update(ac_true_attempts_title.copy())
            features.update(ac_false_attempts_title.copy())
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
            ac_true_attempts_title['ata_' + session_title_text] += true_attempts
            ac_false_attempts_title['afa_' + session_title_text] += false_attempts
            last_game_time_title['lgt_' + session_title_text] = session['game_time'].iloc[-1]
            ac_game_time_title['agt_' + session_title_text] += session['game_time'].iloc[-1]
            features["misses"] = miss
            features['accumulated_actions'] = accumulated_actions
            features["no_complete_game"] = complete_games
            features["no_result_count"] = no_result_count 
            
            if true_attempts + false_attempts == 0:
                no_result_count += 1
            else:
                real_assess_start = session.iloc[0,2]
                real_assess_end = session.iloc[-1,2]

            if session_world == activities_world["CRYSTALCAVES"]:
                features["game_true"] = crys_game_true
                features["game_false"] = crys_game_false
                features['game_accuracy'] = crys_game_true / (crys_game_true + crys_game_false) if (crys_game_true + crys_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(crys_game_acc) if len(crys_game_acc) >=1 else 0
                features["last_game_acc"] = crys_game_acc[-1] if len(crys_game_acc) >=1 else 0
                features["hightest_level"] = np.nanmax(crys_game_level) if len(crys_game_level[~np.isnan(crys_game_level)]) >=1 else -1
                features["level_count"] = len(crys_game_level[~np.isnan(crys_game_level)])
                features["round_count"] = len(crys_game_round[~np.isnan(crys_game_round)])
            elif session_world == activities_world["TREETOPCITY"]:
                features["game_true"] = tree_game_true
                features["game_false"] = tree_game_false
                features['game_accuracy'] = tree_game_true / (tree_game_true + tree_game_false) if (tree_game_true + tree_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(tree_game_acc) if len(tree_game_acc) >=1 else 0
                features["last_game_acc"] = tree_game_acc[-1] if len(tree_game_acc) >=1 else 0
                features["hightest_level"] = np.nanmax(tree_game_level) if len(tree_game_level[~np.isnan(tree_game_level)]) >=1 else -1
                features["level_count"] = len(tree_game_level[~np.isnan(tree_game_level)])
                features["round_count"] = len(tree_game_round[~np.isnan(tree_game_round)])
            elif session_world == activities_world["MAGMAPEAK"]:
                features["game_true"] = magma_game_true
                features["game_false"] = magma_game_false
                features['game_accuracy'] = magma_game_true / (magma_game_true + magma_game_false) if (magma_game_true + magma_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(magma_game_acc) if len(magma_game_acc) >=1 else 0
                features["last_game_acc"] = magma_game_acc[-1] if len(magma_game_acc) >=1 else 0
                features["hightest_level"] = np.nanmax(magma_game_level) if len(magma_game_level[~np.isnan(magma_game_level)]) >=1 else -1
                features["level_count"] = len(magma_game_level[~np.isnan(magma_game_level)])
                features["round_count"] = len(magma_game_round[~np.isnan(magma_game_round)])
            
            features['installation_id'] = session['installation_id'].iloc[-1]
            features['session_title'] = session_title
            features["prev_assess_title"] = prev_assess_title
            prev_assess_title = session_title
            features["first_assessment"] = 1 if assess_count == 1 else 0
            assess_count += 1
            features["time_from_start"] = (assess_start - time_first_activity).seconds

            if prev_assess_end == -999:
                features["time_bet_assess"] = -999
            else:
                features["time_bet_assess"] = (assess_start - prev_assess_end).seconds
            prev_assess_start = assess_start
            prev_assess_end = assess_end
            if real_prev_assess_end == -999:
                features["time_bet_real_assess"] = -999
            else:
                features["time_bet_real_assess"] = (real_assess_start - real_prev_assess_end).seconds
            real_prev_assess_start = real_assess_start
            real_prev_assess_end = real_assess_end
            
            if durations == []: #span of timestamp in target assessment
                features['duration_mean'] = 0
                features['duration_std'] = 0
                features['duration_max'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
                features['duration_std'] = np.std(durations)
                features['duration_max'] = np.max(durations)
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2]).seconds)
            
            features["num_correct"] = true_attempts
            features["num_incorrect"] = false_attempts
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            features['last_assess_acc'] = last_accuracy
            last_accuracy_title['acc_' + session_title_text] = accuracy
            last_accuracy = accuracy
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            features.update(accuracy_groups)
            accuracy_groups[str(features['accuracy_group'])] += 1
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            
            if test_set:
                all_assessments.append(features)
            elif true_attempts+false_attempts > 0:
                all_assessments.append(features)
                
            counter += 1
            
        complete_games += np.sum(session["game_complete"])
        miss += np.sum(session["misses"])
        user_world_count["world_"+str(session_world)] += session.shape[0]
        
        n_of_type_world = Counter(session['type_world']) 
        for key in n_of_type_world.keys():
            type_world_count[str(key)] += n_of_type_world[key]
            
        n_of_title = Counter(session['title']) 
        for key in n_of_title.keys():
            title_count[activities_labels[key]] += n_of_title[key]
            
        n_of_eventcode = Counter(session['event_code']) 
        for key in n_of_eventcode.keys():
            event_code_count[str(key)] += n_of_eventcode[key]
                        
        n_of_title_eventcode = Counter(session['title_event_code']) 
        for key in n_of_title_eventcode.keys():
            title_eventcode_count[str(key)] += n_of_title_eventcode[key]
        
        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activitiy = session_type
    if test_set:
        return all_assessments[-1], all_assessments[:-1] # test previous data to incorporate into training
    return all_assessments

In [8]:
def get_train_and_test(train, test):
    compiled_train = []
    compiled_test = []
    compiled_val = []

    for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort=False)), total=train.installation_id.nunique(), desc='Installation_id', position=0):
        compiled_train += get_data(user_sample)
    del train
    for ins_id, user_sample in tqdm(test.groupby('installation_id', sort=False), total=test.installation_id.nunique(), desc='Installation_id', position=0):
        test_data, val_data = get_data(user_sample, test_set=True)
        compiled_test.append(test_data)
        compiled_val += val_data
    del test
    reduce_train = pd.DataFrame(compiled_train)
    reduce_test = pd.DataFrame(compiled_test)
    reduce_val = pd.DataFrame(compiled_val)

    categoricals = ['session_title']
    return reduce_train, reduce_test, reduce_val, categoricals
new_train, new_test, new_val, categoricals = get_train_and_test(train, test)

HBox(children=(IntProgress(value=0, description='Installation_id', max=17000, style=ProgressStyle(description_…




HBox(children=(IntProgress(value=0, description='Installation_id', max=1000, style=ProgressStyle(description_w…




In [9]:
tmp = new_train[new_train.Game==0].copy()
tmp = tmp[tmp.Activity == 0].copy()
tmp = tmp[tmp.Clip == 0].copy()
tmp = tmp[tmp.Assessment ==0].copy()
remove_train_index = tmp.index
new_train = new_train[~new_train.index.isin(remove_train_index)].copy()

In [10]:
#tmp = new_val[new_val.Game==0].copy()
#tmp = tmp[tmp.Activity == 0].copy()
#tmp = tmp[tmp.Clip == 0].copy()
#tmp = tmp[tmp.Assessment ==0].copy()
#remove_val_index = tmp.index
#new_val = new_val[~new_val.index.isin(remove_val_index)].copy()
tmp = new_test[new_test.Assessment !=0].copy()
keep_test_index = tmp.index
add_test = new_test[new_test.index.isin(keep_test_index)].copy()

In [11]:
print(new_train.shape)
print(new_test.shape)
print(new_val.shape)

(17577, 568)
(1000, 568)
(2347, 568)


In [12]:
mod_train = pd.concat([new_train, add_test], sort=False).reset_index(drop=True)
mod_train.tail(5)

Unnamed: 0,Clip,Activity,Assessment,Game,12 Monkeys_2000,Air Show_2000,Air Show_2020,Air Show_2030,Air Show_2060,Air Show_2070,Air Show_2075,Air Show_3010,Air Show_3020,Air Show_3021,Air Show_3110,Air Show_3120,Air Show_3121,Air Show_4010,Air Show_4020,Air Show_4070,Air Show_4080,Air Show_4090,Air Show_4100,Air Show_4110,All Star Sorting_2000,All Star Sorting_2020,All Star Sorting_2025,All Star Sorting_2030,All Star Sorting_3010,All Star Sorting_3020,All Star Sorting_3021,All Star Sorting_3110,All Star Sorting_3120,All Star Sorting_3121,All Star Sorting_4010,All Star Sorting_4020,All Star Sorting_4030,All Star Sorting_4035,All Star Sorting_4070,All Star Sorting_4080,All Star Sorting_4090,All Star Sorting_4095,Balancing Act_2000,Bird Measurer (Assessment)_2000,Bird Measurer (Assessment)_2010,Bird Measurer (Assessment)_2020,Bird Measurer (Assessment)_2030,Bird Measurer (Assessment)_3010,Bird Measurer (Assessment)_3020,Bird Measurer (Assessment)_3021,Bird Measurer (Assessment)_3110,Bird Measurer (Assessment)_3120,Bird Measurer (Assessment)_3121,Bird Measurer (Assessment)_4020,Bird Measurer (Assessment)_4025,Bird Measurer (Assessment)_4030,Bird Measurer (Assessment)_4035,Bird Measurer (Assessment)_4040,Bird Measurer (Assessment)_4070,Bird Measurer (Assessment)_4080,Bird Measurer (Assessment)_4090,Bird Measurer (Assessment)_4100,Bird Measurer (Assessment)_4110,Bottle Filler (Activity)_2000,Bottle Filler (Activity)_2010,Bottle Filler (Activity)_2020,Bottle Filler (Activity)_2030,Bottle Filler (Activity)_3010,Bottle Filler (Activity)_3110,Bottle Filler (Activity)_4020,Bottle Filler (Activity)_4030,Bottle Filler (Activity)_4035,Bottle Filler (Activity)_4070,Bottle Filler (Activity)_4080,Bottle Filler (Activity)_4090,Bubble Bath_2000,Bubble Bath_2020,Bubble Bath_2025,Bubble Bath_2030,Bubble Bath_2035,Bubble Bath_2080,Bubble Bath_2083,Bubble Bath_3010,Bubble Bath_3020,Bubble Bath_3021,Bubble Bath_3110,Bubble Bath_3120,Bubble Bath_3121,Bubble Bath_4010,Bubble Bath_4020,Bubble Bath_4040,Bubble Bath_4045,Bubble Bath_4070,Bubble Bath_4080,Bubble Bath_4090,Bubble Bath_4095,Bubble Bath_4220,Bubble Bath_4230,Bubble Bath_4235,Bug Measurer (Activity)_2000,Bug Measurer (Activity)_3010,Bug Measurer (Activity)_3110,Bug Measurer (Activity)_4025,Bug Measurer (Activity)_4030,Bug Measurer (Activity)_4035,Bug Measurer (Activity)_4070,Bug Measurer (Activity)_4080,Bug Measurer (Activity)_4090,Cart Balancer (Assessment)_2000,Cart Balancer (Assessment)_2010,Cart Balancer (Assessment)_2020,Cart Balancer (Assessment)_2030,Cart Balancer (Assessment)_3010,Cart Balancer (Assessment)_3020,Cart Balancer (Assessment)_3021,Cart Balancer (Assessment)_3110,Cart Balancer (Assessment)_3120,Cart Balancer (Assessment)_3121,Cart Balancer (Assessment)_4020,Cart Balancer (Assessment)_4030,Cart Balancer (Assessment)_4035,Cart Balancer (Assessment)_4040,Cart Balancer (Assessment)_4070,Cart Balancer (Assessment)_4080,Cart Balancer (Assessment)_4090,Cart Balancer (Assessment)_4100,Cauldron Filler (Assessment)_2000,Cauldron Filler (Assessment)_2010,Cauldron Filler (Assessment)_2020,Cauldron Filler (Assessment)_2030,Cauldron Filler (Assessment)_3010,Cauldron Filler (Assessment)_3020,Cauldron Filler (Assessment)_3021,Cauldron Filler (Assessment)_3110,Cauldron Filler (Assessment)_3120,Cauldron Filler (Assessment)_3121,Cauldron Filler (Assessment)_4020,Cauldron Filler (Assessment)_4025,Cauldron Filler (Assessment)_4030,Cauldron Filler (Assessment)_4035,Cauldron Filler (Assessment)_4040,Cauldron Filler (Assessment)_4070,Cauldron Filler (Assessment)_4080,Cauldron Filler (Assessment)_4090,Cauldron Filler (Assessment)_4100,Chest Sorter (Assessment)_2000,Chest Sorter (Assessment)_2010,Chest Sorter (Assessment)_2020,Chest Sorter (Assessment)_2030,Chest Sorter (Assessment)_3010,Chest Sorter (Assessment)_3020,Chest Sorter (Assessment)_3021,Chest Sorter (Assessment)_3110,Chest Sorter (Assessment)_3120,Chest Sorter (Assessment)_3121,Chest Sorter (Assessment)_4020,Chest Sorter (Assessment)_4025,Chest Sorter (Assessment)_4030,Chest Sorter (Assessment)_4035,Chest Sorter (Assessment)_4040,Chest Sorter (Assessment)_4070,Chest Sorter (Assessment)_4080,Chest Sorter (Assessment)_4090,Chest Sorter (Assessment)_4100,Chicken Balancer (Activity)_2000,Chicken Balancer (Activity)_3010,Chicken Balancer (Activity)_3110,Chicken Balancer (Activity)_4020,Chicken Balancer (Activity)_4022,Chicken Balancer (Activity)_4030,Chicken Balancer (Activity)_4035,Chicken Balancer (Activity)_4070,Chicken Balancer (Activity)_4080,Chicken Balancer (Activity)_4090,Chow Time_2000,Chow Time_2020,Chow Time_2030,Chow Time_3010,Chow Time_3020,Chow Time_3021,Chow Time_3110,Chow Time_3120,Chow Time_3121,Chow Time_4010,Chow Time_4020,Chow Time_4030,Chow Time_4035,Chow Time_4070,Chow Time_4080,Chow Time_4090,Chow Time_4095,Costume Box_2000,Crystal Caves - Level 1_2000,Crystal Caves - Level 2_2000,Crystal Caves - Level 3_2000,Crystals Rule_2000,Crystals Rule_2010,Crystals Rule_2020,Crystals Rule_2030,Crystals Rule_3010,Crystals Rule_3020,Crystals Rule_3021,Crystals Rule_3110,Crystals Rule_3120,Crystals Rule_3121,Crystals Rule_4010,Crystals Rule_4020,Crystals Rule_4050,Crystals Rule_4070,Crystals Rule_4090,Dino Dive_2000,Dino Dive_2020,Dino Dive_2030,Dino Dive_2060,Dino Dive_2070,Dino Dive_3010,Dino Dive_3020,Dino Dive_3021,Dino Dive_3110,Dino Dive_3120,Dino Dive_3121,Dino Dive_4010,Dino Dive_4020,Dino Dive_4070,Dino Dive_4080,Dino Dive_4090,Dino Drink_2000,Dino Drink_2020,Dino Drink_2030,Dino Drink_2060,Dino Drink_2070,Dino Drink_2075,Dino Drink_3010,Dino Drink_3020,Dino Drink_3021,Dino Drink_3110,Dino Drink_3120,Dino Drink_3121,Dino Drink_4010,Dino Drink_4020,Dino Drink_4030,Dino Drink_4031,Dino Drink_4070,Dino Drink_4080,Dino Drink_4090,Egg Dropper (Activity)_2000,Egg Dropper (Activity)_2020,Egg Dropper (Activity)_3010,Egg Dropper (Activity)_3110,Egg Dropper (Activity)_4020,Egg Dropper (Activity)_4025,Egg Dropper (Activity)_4070,Egg Dropper (Activity)_4080,Egg Dropper (Activity)_4090,Fireworks (Activity)_2000,Fireworks (Activity)_3010,Fireworks (Activity)_3110,Fireworks (Activity)_4020,Fireworks (Activity)_4030,Fireworks (Activity)_4070,Fireworks (Activity)_4080,Fireworks (Activity)_4090,Flower Waterer (Activity)_2000,Flower Waterer (Activity)_3010,Flower Waterer (Activity)_3110,Flower Waterer (Activity)_4020,Flower Waterer (Activity)_4022,Flower Waterer (Activity)_4025,Flower Waterer (Activity)_4030,Flower Waterer (Activity)_4070,Flower Waterer (Activity)_4080,Flower Waterer (Activity)_4090,Happy Camel_2000,Happy Camel_2020,Happy Camel_2030,Happy Camel_2080,Happy Camel_2081,Happy Camel_2083,Happy Camel_3010,Happy Camel_3020,Happy Camel_3021,Happy Camel_3110,Happy Camel_3120,Happy Camel_3121,Happy Camel_4010,Happy Camel_4020,Happy Camel_4030,Happy Camel_4035,Happy Camel_4040,Happy Camel_4045,Happy Camel_4070,Happy Camel_4080,Happy Camel_4090,Happy Camel_4095,"Heavy, Heavier, Heaviest_2000",Honey Cake_2000,Leaf Leader_2000,Leaf Leader_2020,Leaf Leader_2030,Leaf Leader_2060,Leaf Leader_2070,Leaf Leader_2075,Leaf Leader_3010,Leaf Leader_3020,Leaf Leader_3021,Leaf Leader_3110,Leaf Leader_3120,Leaf Leader_3121,Leaf Leader_4010,Leaf Leader_4020,Leaf Leader_4070,Leaf Leader_4080,Leaf Leader_4090,Leaf Leader_4095,Lifting Heavy Things_2000,Magma Peak - Level 1_2000,Magma Peak - Level 2_2000,Mushroom Sorter (Assessment)_2000,Mushroom Sorter (Assessment)_2010,Mushroom Sorter (Assessment)_2020,Mushroom Sorter (Assessment)_2025,Mushroom Sorter (Assessment)_2030,Mushroom Sorter (Assessment)_2035,Mushroom Sorter (Assessment)_3010,Mushroom Sorter (Assessment)_3020,Mushroom Sorter (Assessment)_3021,Mushroom Sorter (Assessment)_3110,Mushroom Sorter (Assessment)_3120,Mushroom Sorter (Assessment)_3121,Mushroom Sorter (Assessment)_4020,Mushroom Sorter (Assessment)_4025,Mushroom Sorter (Assessment)_4030,Mushroom Sorter (Assessment)_4035,Mushroom Sorter (Assessment)_4040,Mushroom Sorter (Assessment)_4070,Mushroom Sorter (Assessment)_4080,Mushroom Sorter (Assessment)_4090,Mushroom Sorter (Assessment)_4100,Ordering Spheres_2000,Pan Balance_2000,Pan Balance_2010,Pan Balance_2020,Pan Balance_2030,Pan Balance_3010,Pan Balance_3020,Pan Balance_3021,Pan Balance_3110,Pan Balance_3120,Pan Balance_3121,Pan Balance_4010,Pan Balance_4020,Pan Balance_4025,Pan Balance_4030,Pan Balance_4035,Pan Balance_4070,Pan Balance_4080,Pan Balance_4090,Pan Balance_4100,Pirate's Tale_2000,Rulers_2000,Sandcastle Builder (Activity)_2000,Sandcastle Builder (Activity)_2010,Sandcastle Builder (Activity)_3010,Sandcastle Builder (Activity)_3110,Sandcastle Builder (Activity)_4020,Sandcastle Builder (Activity)_4021,Sandcastle Builder (Activity)_4030,Sandcastle Builder (Activity)_4035,Sandcastle Builder (Activity)_4070,Sandcastle Builder (Activity)_4080,Sandcastle Builder (Activity)_4090,Scrub-A-Dub_2000,Scrub-A-Dub_2020,Scrub-A-Dub_2030,Scrub-A-Dub_2040,Scrub-A-Dub_2050,Scrub-A-Dub_2080,Scrub-A-Dub_2081,Scrub-A-Dub_2083,Scrub-A-Dub_3010,Scrub-A-Dub_3020,Scrub-A-Dub_3021,Scrub-A-Dub_3110,Scrub-A-Dub_3120,Scrub-A-Dub_3121,Scrub-A-Dub_4010,Scrub-A-Dub_4020,Scrub-A-Dub_4070,Scrub-A-Dub_4080,Scrub-A-Dub_4090,Slop Problem_2000,Treasure Map_2000,Tree Top City - Level 1_2000,Tree Top City - Level 2_2000,Tree Top City - Level 3_2000,Watering Hole (Activity)_2000,Watering Hole (Activity)_2010,Watering Hole (Activity)_3010,Watering Hole (Activity)_3110,Watering Hole (Activity)_4020,Watering Hole (Activity)_4021,Watering Hole (Activity)_4025,Watering Hole (Activity)_4070,Watering Hole (Activity)_4090,Watering Hole (Activity)_5000,Watering Hole (Activity)_5010,Welcome to Lost Lagoon!_2000,world_0,world_1,world_2,world_3,2000,2010,2020,2025,2030,2035,2040,2050,2060,2070,2075,2080,2081,2083,3010,3020,3021,3110,3120,3121,4010,4020,4021,4022,4025,4030,4031,4035,4040,4045,4050,4070,4080,4090,4095,4100,4110,4220,4230,4235,5000,5010,12 Monkeys,Air Show,All Star Sorting,Balancing Act,Bird Measurer (Assessment),Bottle Filler (Activity),Bubble Bath,Bug Measurer (Activity),Cart Balancer (Assessment),Cauldron Filler (Assessment),Chest Sorter (Assessment),Chicken Balancer (Activity),Chow Time,Costume Box,Crystal Caves - Level 1,Crystal Caves - Level 2,Crystal Caves - Level 3,Crystals Rule,Dino Dive,Dino Drink,Egg Dropper (Activity),Fireworks (Activity),Flower Waterer (Activity),Happy Camel,"Heavy, Heavier, Heaviest",Honey Cake,Leaf Leader,Lifting Heavy Things,Magma Peak - Level 1,Magma Peak - Level 2,Mushroom Sorter (Assessment),Ordering Spheres,Pan Balance,Pirate's Tale,Rulers,Sandcastle Builder (Activity),Scrub-A-Dub,Slop Problem,Treasure Map,Tree Top City - Level 1,Tree Top City - Level 2,Tree Top City - Level 3,Watering Hole (Activity),Welcome to Lost Lagoon!,Activity_CRYSTALCAVES,Activity_MAGMAPEAK,Activity_TREETOPCITY,Assessment_CRYSTALCAVES,Assessment_MAGMAPEAK,Assessment_TREETOPCITY,Clip_CRYSTALCAVES,Clip_MAGMAPEAK,Clip_NONE,Clip_TREETOPCITY,Game_CRYSTALCAVES,Game_MAGMAPEAK,Game_TREETOPCITY,acc_Bird Measurer (Assessment),acc_Cart Balancer (Assessment),acc_Cauldron Filler (Assessment),acc_Chest Sorter (Assessment),acc_Mushroom Sorter (Assessment),lgt_Bird Measurer (Assessment),lgt_Cart Balancer (Assessment),lgt_Cauldron Filler (Assessment),lgt_Chest Sorter (Assessment),lgt_Mushroom Sorter (Assessment),agt_Bird Measurer (Assessment),agt_Cart Balancer (Assessment),agt_Cauldron Filler (Assessment),agt_Chest Sorter (Assessment),agt_Mushroom Sorter (Assessment),ata_Bird Measurer (Assessment),ata_Cart Balancer (Assessment),ata_Cauldron Filler (Assessment),ata_Chest Sorter (Assessment),ata_Mushroom Sorter (Assessment),afa_Bird Measurer (Assessment),afa_Cart Balancer (Assessment),afa_Cauldron Filler (Assessment),afa_Chest Sorter (Assessment),afa_Mushroom Sorter (Assessment),accumulated_correct_attempts,accumulated_uncorrect_attempts,misses,accumulated_actions,no_complete_game,no_result_count,game_true,game_false,game_accuracy,game_accuracy_std,last_game_acc,hightest_level,level_count,round_count,installation_id,session_title,prev_assess_title,first_assessment,time_from_start,time_bet_assess,time_bet_real_assess,duration_mean,duration_std,duration_max,num_correct,num_incorrect,last_assess_acc,accuracy_group,0,1,2,3,accumulated_accuracy_group
18148,4,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4,1,3,8,1,3,7,1,3,1,16,16,0,26,0,0,1,0,2,1,3,2,8,0,1,8,0,1,3,16,20,0,1,14,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,4,0,4,4,0,4,3,3,3,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,210,8,2,8,2,6,1,0,0,0,0,0,0,0,0,20,1,8,19,1,8,1,22,0,0,19,39,0,0,1,0,0,41,0,0,1,2,1,0,0,0,0,0,0,0,92,0,82,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,33,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,115,0,0,1,3,0,0,92,1.0,-1.0,-1.0,-1.0,1.0,155008,0,0,0,41221,179267,0,0,0,41221,1,0,0,0,1,0,0,0,0,0,2,0,3.0,211,1,1,0,0,0.0,0.0,0.0,-1.0,0,0,fee254cf,9,4,0,27466,34,86245,72.666667,57.978923,154,0,0,1.0,0,1,0,0,2,2.0
18149,11,2,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,2,0,1,2,0,1,8,8,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4,4,16,1,26,9,14,0,0,1,3,3,5,5,4,4,5,4,1,7,9,2,3,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,10,10,11,11,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,3,3,2,0,1,2,3,3,1,3,3,1,23,29,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,245,0,1,56,16,1,7,0,7,0,0,0,2,0,1,0,0,0,23,8,8,21,8,8,2,65,0,1,0,54,0,11,0,0,0,56,0,0,2,1,0,0,0,0,0,0,1,0,0,2,0,0,0,0,28,0,0,75,57,1,1,1,1,0,0,0,0,52,0,0,1,0,79,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,75,0,52,28,0,0,6,0,1,4,136,0,0,-1.0,1.0,-1.0,-1.0,-1.0,0,36169,0,0,0,0,36169,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,6.0,302,0,0,20,10,0.666667,0.062112,0.695652,-1.0,0,136,ff57e602,10,8,0,71595,226,86363,36.0,0.0,36,0,0,1.0,0,0,0,0,1,3.0
18150,32,2,4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,4,4,7,0,6,7,0,6,7,3,9,1,1,9,0,0,2,1,0,1,0,4,4,0,4,4,0,6,17,39,11,5,39,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,24,24,34,36,63,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,1,4,1,1,1,1,1,1,4,2,4,4,2,4,4,3,4,0,0,6,0,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,1,0,9,9,2,4,9,3,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,153,132,1,239,38,3,6,1,5,1,0,0,0,0,0,0,0,0,48,6,10,48,6,10,0,53,4,0,23,97,0,15,6,0,0,136,0,0,0,9,0,0,0,0,0,0,1,0,0,3,0,0,0,0,0,70,139,0,0,1,1,2,0,0,0,0,0,182,0,0,1,3,0,4,1,4,46,1,0,1,3,56,0,1,1,1,1,1,0,1,0,56,182,139,70,46,14,6,1,11,0,0,0,-1.0,-1.0,1.0,0.0,0.333333,0,0,33319,149031,49331,0,0,67327,149031,49331,0,0,2,0,1,0,0,0,4,2,3,6,2.0,525,1,0,0,0,0.0,0.0,0.0,-1.0,0,0,ffc73fb2,8,9,0,76257,462,86366,66.0,48.363209,149,0,0,1.0,0,1,1,0,2,1.75
18151,11,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,28,28,6,18,12,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,5,2,10,4,4,10,4,4,12,0,19,1,6,17,0,2,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,4,0,0,3,0,0,1,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,1,1,0,1,0,0,0,0,0,0,0,0,1,4,139,1,114,18,0,6,0,2,0,0,0,1,0,1,0,0,0,44,4,4,42,4,4,1,12,0,0,6,37,0,13,6,0,0,45,0,2,0,6,0,0,0,0,0,0,0,0,0,1,0,0,0,110,0,109,0,0,0,0,1,2,0,0,0,15,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,11,0,0,0,1,2,1,2,1,0,13,110,0,109,0,4,2,1,4,0,15,0,-1.0,-1.0,0.0,-1.0,-1.0,0,0,5134,0,0,0,0,80674,0,0,0,0,2,0,0,0,0,4,0,0,2,4,4.0,258,0,1,0,0,0.0,0.0,0.0,-1.0,0,0,ffe00ca8,8,9,0,1066,210,86368,26.0,16.309506,43,0,0,0.0,0,1,1,1,0,1.0
18152,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,4,0,4,4,0,4,4,3,4,0,0,7,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,44,4,1,1,1,1,1,0,0,0,0,0,0,0,0,4,0,4,4,0,4,0,4,0,0,3,4,0,0,0,0,0,7,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,41,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,41,0,0,0,3,0,0,0,-1.0,-1.0,-1.0,-1.0,1.0,0,0,0,0,36607,0,0,0,0,36607,0,0,0,0,1,0,0,0,0,0,1,0,0.0,44,1,0,0,0,0.0,0.0,0.0,-1.0,0,0,ffe774cc,4,30,0,194,70,86363,36.0,0.0,36,0,0,1.0,0,0,0,0,1,3.0


# Feature selection

In [13]:
def exclude(reduce_train, reduce_test, features):
    to_exclude = [] 
    ajusted_test = reduce_test.copy()
    for feature in features:
        if feature not in ['accuracy_group', 'installation_id', 'session_title', 'hightest_level', "num_correct", "num_incorrect"]:
            data = reduce_train[feature]
            train_mean = data.mean()
            data = ajusted_test[feature] 
            test_mean = data.mean()
            try:
                ajust_factor = train_mean / test_mean
                if ajust_factor > 10 or ajust_factor < 0.1:# or error > 0.01:
                    to_exclude.append(feature)
                    print(feature)
                else:
                    ajusted_test[feature] *= ajust_factor
            except:
                to_exclude.append(feature)
                print(feature)
    return to_exclude, ajusted_test
features = [i for i in new_train.columns if i not in ["game_session"]]
to_exclude, ajusted_test = exclude(new_train, new_test, features)

Air Show_4080
Bottle Filler (Activity)_2010
Bubble Bath_4080
Bubble Bath_4090
Bug Measurer (Activity)_4080
Cart Balancer (Assessment)_4080
Chest Sorter (Assessment)_4080
Crystals Rule_2010
Dino Dive_4080
Dino Drink_4080
Egg Dropper (Activity)_4080
Fireworks (Activity)_4080
Happy Camel_4080
Leaf Leader_4080
Mushroom Sorter (Assessment)_4080
Mushroom Sorter (Assessment)_4090
Pan Balance_2010
Pan Balance_4080
Sandcastle Builder (Activity)_2010
Scrub-A-Dub_4080
Watering Hole (Activity)_2010
acc_Cart Balancer (Assessment)


# accuracy prediction

In [14]:
def importance_feature_selection2(X_train, y_train): 
    n_folds=5
    skf=GroupKFold(n_splits = n_folds)
    lgbm_params = {'objective': 'binary','eval_metric': 'auc','metric': 'auc', 'boosting_type': 'gbdt',
 'tree_learner': 'serial','bagging_fraction': 0.9605425291685099,'bagging_freq': 4,'colsample_bytree': 0.6784238046856443,
 'feature_fraction': 0.9792407844605087,'learning_rate': 0.017891320270412462,'max_depth': 7,
 'min_data_in_leaf': 8,'min_sum_hessian_in_leaf': 17,'num_leaves': 17}

    valid = pd.DataFrame(np.zeros([X_train.shape[0]]))
    features_list = [i for i in X_train.columns if i != "installation_id"]
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
    for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train, X_train["installation_id"])):
        print("Fold "+str(i+1))
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]
        X_train2 = X_train2.drop(['installation_id'],axis=1)
        
        X_test2 = X_train.iloc[test_index,:]
        y_test2 = y_train.iloc[test_index]
        X_test2 = X_test2.drop(['installation_id'],axis=1)
            
        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
        clf = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
            num_boost_round=10000,early_stopping_rounds=100,verbose_eval = 500)
            
        feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()
        test_predict = clf.predict(X_test2, num_iteration = clf.best_iteration)
        valid.iloc[test_index] = test_predict.reshape(X_test2.shape[0], 1)

    print("ROC = \t {}".format(roc_auc_score(y_train, valid)))
    feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]
    
    return feature_importance_df

X_train = mod_train.drop(['accuracy_group', "num_correct", "num_incorrect"],axis=1).copy()
y_train = mod_train.num_correct.copy()
lbl = preprocessing.LabelEncoder()
lbl.fit(list(X_train["installation_id"]))
X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
remove_features = [i for i in X_train.columns if "_4235" in i or i == "world_"+str(activities_world["NONE"])
                      or i in to_exclude]
for i in X_train.columns:
    if X_train[i].std() == 0 and i not in remove_features:
        remove_features.append(i)
X_train = X_train.drop(remove_features, axis=1)
X_train = X_train[sorted(X_train.columns.tolist())]
    
#df_for_correct  = importance_feature_selection2(X_train, y_train)

In [15]:
def num_correct_calc(train, test, fea, select_flg):
    X_train = train.drop(['accuracy_group',"num_correct", "num_incorrect"],axis=1) 
    y_train = train.num_correct.copy()
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(X_train["installation_id"]))
    X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
    remove_features = [i for i in X_train.columns if "_4235" in i or i == "world_"+str(activities_world["NONE"])
                      or i in to_exclude]
    for i in X_train.columns:
        if X_train[i].std() == 0 and i not in remove_features:
            remove_features.append(i)
    X_train = X_train.drop(remove_features, axis=1)
    X_train = X_train[sorted(X_train.columns.tolist())]

    X_test = test.drop(["installation_id","accuracy_group","num_correct", "num_incorrect"], axis=1)
    X_test = X_test.drop(remove_features, axis=1)
    X_test = X_test[sorted(X_test.columns.tolist())]
    if select_flg == True:
        X_test = X_test[fea]
    print(X_test.shape[1])
    
    n_folds=5
    skf=GroupKFold(n_splits = n_folds)
    models = []
    lgbm_params = {'objective': 'binary','eval_metric': 'auc','metric': 'auc', 'boosting_type': 'gbdt',
 'tree_learner': 'serial','bagging_fraction': 0.9605425291685099,'bagging_freq': 4,'colsample_bytree': 0.6784238046856443,
 'feature_fraction': 0.9792407844605087,'learning_rate': 0.017891320270412462,'max_depth': 7,
 'min_data_in_leaf': 8,'min_sum_hessian_in_leaf': 17,'num_leaves': 17, "random_seed":42}

    valid_correct_num = pd.DataFrame(np.zeros([X_train.shape[0]]))
    init = lgbm_params["random_seed"]
    for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train, X_train["installation_id"])):
        print("Fold "+str(i+1))
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]
        X_train2 = X_train2.drop(['installation_id'],axis=1)
    
        X_test2 = X_train.iloc[test_index,:]
        y_test2 = y_train.iloc[test_index]
        X_test2 = X_test2.drop(['installation_id'],axis=1)
        if select_flg == True:
            X_train2 = X_train2[fea] 
            X_test2 = X_test2[fea]
        lgbm_params["random_seed"] = init + i 
        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
        clf = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
            num_boost_round=10000,early_stopping_rounds=100,verbose_eval = 500,categorical_feature = categoricals)
        train_predict = clf.predict(X_train2, num_iteration = clf.best_iteration)
        test_predict = clf.predict(X_test2, num_iteration = clf.best_iteration)
            
        models.append(clf)
        valid_correct_num.iloc[test_index] = test_predict.reshape(X_test2.shape[0], 1)
                
    print("logloss = \t {}".format(log_loss(y_train, valid_correct_num)))
    print("ROC = \t {}".format(roc_auc_score(y_train, valid_correct_num)))
    print('Accuracy score = \t {}'.format(accuracy_score(y_train, np.round(valid_correct_num))))
    print('Precision score = \t {}'.format(precision_score(y_train, np.round(valid_correct_num))))
    print('Recall score =   \t {}'.format(recall_score(y_train, np.round(valid_correct_num))))
    print('F1 score =      \t {}'.format(f1_score(y_train, np.round(valid_correct_num))))
    print(confusion_matrix(y_train, np.round(valid_correct_num)))
    pred_value = np.zeros([X_test.shape[0]])
    for model in models:
        pred_value += model.predict(X_test, num_iteration = model.best_iteration) / len(models)
    return pred_value, valid_correct_num
#tmp = df_for_correct.sort_values("Cv", ascending = True).reset_index(drop=True).copy()
#tmp = df_for_correct.sort_values("Average", ascending = False).reset_index(drop=True).copy()
#feat_for_correct = tmp[tmp.index <= 230]["Feature"]
feat_for_correct = []
pred_value, valid_correct_num = num_correct_calc(new_train, new_test, feat_for_correct, False)

540
Fold 1
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.921979	valid_1's auc: 0.86204
Early stopping, best iteration is:
[609]	training's auc: 0.930092	valid_1's auc: 0.862578
Fold 2
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.922155	valid_1's auc: 0.853865
Early stopping, best iteration is:
[551]	training's auc: 0.925792	valid_1's auc: 0.854138
Fold 3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[312]	training's auc: 0.906151	valid_1's auc: 0.851561
Fold 4
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.922214	valid_1's auc: 0.861863
Early stopping, best iteration is:
[435]	training's auc: 0.916354	valid_1's auc: 0.86199
Fold 5
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.922036	valid_1's auc: 0.862544
Early stopping, best iteration is:
[488]	training's auc: 0.921099	valid_1

In [16]:
def importance_feature_selection(X_train, y_train): 
    n_folds=5
    skf=GroupKFold(n_splits = n_folds)
    lgbm_params = {'boosting_type': 'gbdt', 'metric': 'rmse', 'objective': 'regression', 'eval_metric': 'cappa',
                    'tree_learner': 'serial', 'bagging_fraction': 0.5644726008433546,  'bagging_freq': 5,
                    'colsample_bytree': 0.5385657738791669, 'feature_fraction': 0.8415810198341436,
                    'learning_rate': 0.007905671344935965, 'max_depth': 25, 'min_data_in_leaf': 99,
                    'min_sum_hessian_in_leaf': 7, 'num_leaves': 28}

    valid = pd.DataFrame(np.zeros([X_train.shape[0]]))
    features_list = [i for i in X_train.columns if i != "installation_id"]
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
    for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train, X_train["installation_id"])):
        print("Fold "+str(i+1))
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]
        X_train2 = X_train2.drop(['installation_id'],axis=1)
    
        X_test2 = X_train.iloc[test_index,:]
        y_test2 = y_train.iloc[test_index]
        X_test2 = X_test2.drop(['installation_id'],axis=1)
        
        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
        clf = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
            num_boost_round=10000,early_stopping_rounds=100,verbose_eval = 500)
        test_predict = clf.predict(X_test2, num_iteration = clf.best_iteration)
            
        feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()
        valid.iloc[test_index] = test_predict.reshape(X_test2.shape[0], 1)
            
    print("RMSE = \t {}".format(np.sqrt(mean_squared_error(y_train, valid))))
    feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]
    
    return feature_importance_df

X_train = mod_train.drop(['accuracy_group', "num_correct", "num_incorrect"],axis=1).copy()
y_train = mod_train.num_incorrect.copy()
y_train.loc[y_train >=2] = 2
lbl = preprocessing.LabelEncoder()
lbl.fit(list(X_train["installation_id"]))
X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
remove_features = [i for i in X_train.columns if "_4235" in i or i == "world_"+str(activities_world["NONE"]) 
                       or i in to_exclude]
for i in X_train.columns:
    if X_train[i].std() == 0 and i not in remove_features:
        remove_features.append(i)
X_train = X_train.drop(remove_features, axis=1)
X_train = X_train[sorted(X_train.columns.tolist())]
#df_for_incorrect  = importance_feature_selection(X_train, y_train)

In [17]:
def num_incorrect_calc(train, test, fea, select_flg): 
    X_train = train.drop(['accuracy_group', "num_correct", "num_incorrect"],axis=1) 
    y_train = train.num_incorrect.copy()
    y_train.loc[y_train >=2] = 2
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(X_train["installation_id"]))
    X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
    remove_features = [i for i in X_train.columns if "_4235" in i or i == "world_"+str(activities_world["NONE"]) 
                       or i in to_exclude]
    for i in X_train.columns:
        if X_train[i].std() == 0 and i not in remove_features:
            remove_features.append(i)
    X_train = X_train.drop(remove_features, axis=1)
    X_train = X_train[sorted(X_train.columns.tolist())]

    X_test = test.drop(["installation_id","accuracy_group","num_correct", "num_incorrect"], axis=1)
    X_test = X_test.drop(remove_features, axis=1)
    X_test = X_test[sorted(X_test.columns.tolist())]
    if select_flg == True:
        X_test = X_test[fea]
    print(X_test.shape[1])
    
    n_folds=5
    skf=GroupKFold(n_splits = n_folds)
    models = []
    lgbm_params = {'boosting_type': 'gbdt', 'metric': 'rmse', 'objective': 'regression', 'eval_metric': 'cappa',
                 'tree_learner': 'serial', 'bagging_fraction': 0.5644726008433546, 'bagging_freq': 5,
                    'colsample_bytree': 0.5385657738791669,
                     'feature_fraction': 0.8415810198341436,
                     'learning_rate': 0.007905671344935965,
                     'max_depth': 25,
                     'min_data_in_leaf': 99,
                     'min_sum_hessian_in_leaf': 7,
                     'num_leaves': 28}

    valid_incorrect_num = pd.DataFrame(np.zeros([X_train.shape[0]]))
    for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train, X_train["installation_id"])):
        print("Fold "+str(i+1))
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]
        X_train2 = X_train2.drop(['installation_id'],axis=1)
    
        X_test2 = X_train.iloc[test_index,:]
        y_test2 = y_train.iloc[test_index]
        X_test2 = X_test2.drop(['installation_id'],axis=1)
        if select_flg == True:
            X_train2 = X_train2[fea] 
            X_test2 = X_test2[fea] 
            
        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
        clf = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
            num_boost_round=10000,early_stopping_rounds=100,verbose_eval = 500)
        train_predict = clf.predict(X_train2, num_iteration = clf.best_iteration)
        test_predict = clf.predict(X_test2, num_iteration = clf.best_iteration)
            
        models.append(clf)
        valid_incorrect_num.iloc[test_index] = test_predict.reshape(X_test2.shape[0], 1)
    print("RMSE = \t {}".format(np.sqrt(mean_squared_error(y_train, valid_incorrect_num))))    
    pred_value_incorrect = np.zeros([X_test.shape[0]])
    for model in models:
        pred_value_incorrect += model.predict(X_test, num_iteration = model.best_iteration) / len(models)
    return pred_value_incorrect, valid_incorrect_num
#tmp = df_for_incorrect.sort_values("Cv", ascending = True).reset_index(drop=True).copy()
#tmp = df_for_incorrect.sort_values("Average", ascending = False).reset_index(drop=True).copy()
#feat_for_incorrect = tmp[tmp.index <= 230]["Feature"]
feat_for_incorrect = []
pred_value_incorrect, valid_incorrect_num  = num_incorrect_calc(new_train, new_test, feat_for_incorrect, False)

540
Fold 1
Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 0.680396	valid_1's rmse: 0.73458
Early stopping, best iteration is:
[774]	training's rmse: 0.657585	valid_1's rmse: 0.733215
Fold 2
Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 0.682438	valid_1's rmse: 0.724129
Early stopping, best iteration is:
[634]	training's rmse: 0.670341	valid_1's rmse: 0.723031
Fold 3
Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 0.680036	valid_1's rmse: 0.733523
Early stopping, best iteration is:
[695]	training's rmse: 0.663264	valid_1's rmse: 0.732341
Fold 4
Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 0.679435	valid_1's rmse: 0.735905
Early stopping, best iteration is:
[834]	training's rmse: 0.651959	valid_1's rmse: 0.734105
Fold 5
Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 0.681296	valid_1's rmse: 0.729116

In [18]:
train_exp_accuracy = valid_correct_num / (valid_correct_num + valid_incorrect_num)
test_exp_accuracy = pred_value / (pred_value + pred_value_incorrect)
best_score = 0
for i in range(10):
    optR = OptimizedRounder()
    optR.fit(np.array(train_exp_accuracy).reshape(-1,), new_train.accuracy_group, random_flg=True)
    coefficients = optR.coefficients()
    final_valid_pred = optR.predict(np.array(train_exp_accuracy).reshape(-1,), coefficients)
    score = qwk(new_train.accuracy_group, final_valid_pred)
    print(i, np.sort(coefficients), score)
    if score > best_score:
        best_score = score
        best_coefficients = coefficients
final_test_pred = pd.cut(np.array(test_exp_accuracy).reshape(-1,), [-np.inf] + list(np.sort(best_coefficients)) + [np.inf], labels = [0, 1, 2, 3])
#final_test_pred = pd.cut(np.array(test_exp_accuracy).reshape(-1,), [-np.inf] + list(np.sort([0.30278718, 0.42545025, 0.56396292])) + [np.inf], labels = [0, 1, 2, 3])
#final_test_pred = eval_qwk_lgb_regr(np.array(test_exp_accuracy).reshape(-1,), new_train)

sample_submission["accuracy_group"] = final_test_pred.astype(int)
sample_submission.to_csv('submission.csv', index=False)
sample_submission["accuracy_group"].value_counts(normalize = True)

0 [0.31438651 0.43353001 0.60703709] 0.59946001
1 [0.31176741 0.42579909 0.56095599] 0.60123795
2 [0.28803786 0.43346575 0.55887932] 0.60186744
3 [0.28703415 0.42814609 0.55868426] 0.60161094
4 [0.28790723 0.43195362 0.55884442] 0.60176298
5 [0.31447469 0.42420434 0.56084804] 0.60089172
6 [0.29183622 0.43353847 0.55877847] 0.60186475
7 [0.28520798 0.43197057 0.55872551] 0.60186064
8 [0.31440706 0.43350014 0.59364198] 0.5989458
9 [0.28756619 0.42593863 0.55864516] 0.60174009


3    0.534
2    0.185
0    0.157
1    0.124
Name: accuracy_group, dtype: float64