- increase random truncation trials to 40 in classification
- add accuracies in each assessments

In [1]:
import pandas as pd
import numpy as np
import warnings
import datetime
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from sklearn import preprocessing
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import mean_squared_error, log_loss, roc_auc_score, precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, cohen_kappa_score
import lightgbm as lgb
from functools import partial
import json
import copy
import time
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from hyperopt import hp, tpe, Trials, fmin, space_eval
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",1000)
np.set_printoptions(precision=8)
warnings.filterwarnings("ignore")
import random

In [2]:
def qwk(a1, a2):
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)
    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))
    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)
    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)
    e = e / a1.shape[0]
    return np.round(1 - o / e, 8)

In [3]:
class OptimizedRounder_cla(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])
        return -qwk(y, X_p)
        #return -mod_qwk(y, X_p, weights=weights)
    
    def fit(self, X, y, random_flg = False):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        if random_flg:
            initial_coef = [np.random.uniform(0.4,0.5), np.random.uniform(0.5,0.6), np.random.uniform(0.6,0.7)]
        else:
            initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead') #Powell
        
    def predict(self, X, coef):
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

    def coefficients(self):
        return self.coef_['x']
    
class OptimizedRounder_reg(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])
        return -qwk(y, X_p)
        #return -mod_qwk(y, X_p, weights=weights)
    
    def fit(self, X, y, random_flg = False):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        if random_flg:
            initial_coef = [np.random.uniform(1.0,1.1), np.random.uniform(1.7,1.8), np.random.uniform(2.1,2.2)]
        else:
            initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead') #Powell
        
    def predict(self, X, coef):
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

    def coefficients(self):
        return self.coef_['x']

In [4]:
def eval_qwk_lgb_regr(y_pred, train_t):
    dist = Counter(train_t['accuracy_group'])
    for k in dist:
        dist[k] /= len(train_t)
    
    acum = 0
    bound = {}
    for i in range(3):
        acum += dist[i]
        bound[i] = np.percentile(y_pred, acum * 100)

    def classify(x):
        if x <= bound[0]:
            return 0
        elif x <= bound[1]:
            return 1
        elif x <= bound[2]:
            return 2
        else:
            return 3

    y_pred = np.array(list(map(classify, y_pred)))
    
    return y_pred

# install

In [5]:
%%time
train = pd.read_csv('../input/data-science-bowl-2019/train.csv')
train_labels = pd.read_csv('../input/data-science-bowl-2019/train_labels.csv')
test = pd.read_csv('../input/data-science-bowl-2019/test.csv')
sample_submission = pd.read_csv('../input/data-science-bowl-2019/sample_submission.csv')

CPU times: user 1min 13s, sys: 11.8 s, total: 1min 25s
Wall time: 1min 25s


# Preprocess and Feature engineering

In [6]:
%%time
def encode_title(train, test):
    train['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), train['title'], train['event_code']))
    test['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), test['title'], test['event_code']))
    list_of_title_eventcode = sorted(list(set(train['title_event_code'].unique()).union(set(test['title_event_code'].unique()))))
    
    train['type_world'] = list(map(lambda x, y: str(x) + '_' + str(y), train['type'], train['world']))
    test['type_world'] = list(map(lambda x, y: str(x) + '_' + str(y), test['type'], test['world']))
    list_of_type_world = sorted(list(set(train['type_world'].unique()).union(set(test['type_world'].unique()))))
    
    list_of_user_activities = sorted(list(set(train['title'].unique()).union(set(test['title'].unique()))))
    list_of_event_code = sorted(list(set(train['event_code'].unique()).union(set(test['event_code'].unique()))))
    list_of_worlds = sorted(list(set(train['world'].unique()).union(set(test['world'].unique()))))
    activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
    activities_labels = dict(zip(np.arange(len(list_of_user_activities)), list_of_user_activities))
    activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))
    assess_titles = sorted(list(set(train[train['type'] == 'Assessment']['title'].value_counts().index).union(set(test[test['type'] == 'Assessment']['title'].value_counts().index))))

    train['title'] = train['title'].map(activities_map)
    test['title'] = test['title'].map(activities_map)
    train['world'] = train['world'].map(activities_world)
    test['world'] = test['world'].map(activities_world)

    win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
    win_code[activities_map['Bird Measurer (Assessment)']] = 4110
    
    train['timestamp'] = pd.to_datetime(train['timestamp'])
    test['timestamp'] = pd.to_datetime(test['timestamp'])
    
    train["misses"] = train["event_data"].apply(lambda x: json.loads(x)["misses"] if "\"misses\"" in x else np.nan)
    test["misses"] = test["event_data"].apply(lambda x: json.loads(x)["misses"] if "\"misses\"" in x else np.nan)
        
    train["true"] = train["event_data"].apply(lambda x: 1 if "true" in x and "correct" in x else 0)
    test["true"] = test["event_data"].apply(lambda x: 1 if "true" in x and "correct" in x else 0)

    train["false"] = train["event_data"].apply(lambda x: 1 if "false" in x and "correct" in x else 0)
    test["false"] = test["event_data"].apply(lambda x: 1 if "false" in x and "correct" in x else 0)
    
    train["game_complete"] = train["event_data"].apply(lambda x: 1 if "game_completed" in x else 0)
    test["game_complete"] = test["event_data"].apply(lambda x: 1 if "game_completed" in x else 0)
               
    return train, test, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, activities_world, list_of_title_eventcode, list_of_type_world

train, test, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, activities_world, list_of_title_eventcode, list_of_type_world = encode_title(train, test)

CPU times: user 1min 53s, sys: 11.3 s, total: 2min 4s
Wall time: 2min 3s


In [7]:
def make_ratio(features, dic):
    total = sum(dic.values())
    if total != 0:
        for key in dic.keys():
            features[str(key)] = features[str(key)] / total
    else:
        pass
    return features

def get_data(user_sample, test_set=False):
    last_activity = 0
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    title_eventcode_count = {str(ele): 0 for ele in list_of_title_eventcode}
    user_world_count = {"world_"+str(wor) : 0 for wor in activities_world.values()}
    event_code_count = {str(ev): 0 for ev in list_of_event_code}
    title_count = {actv: 0 for actv in list_of_user_activities}
    type_world_count = {str(ev): 0 for ev in list_of_type_world}
    last_accuracy_title = {'acc_' + title: -1 for title in assess_titles}
    last_game_time_title = {'lgt_' + title: 0 for title in assess_titles}
    ac_game_time_title = {'agt_' + title: 0 for title in assess_titles}
    ac_true_attempts_title = {'ata_' + title: 0 for title in assess_titles}
    ac_false_attempts_title = {'afa_' + title: 0 for title in assess_titles}
    as_accuracy_title = {'aat_' + title: 0 for title in assess_titles} ####
    
    all_assessments = []
    accuracy_groups = {"0":0, "1":0, "2":0, "3":0}
    accumulated_accuracy_group = 0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0 
    accumulated_actions = 0
    counter = 0
    time_first_activity = user_sample.iloc[0]['timestamp']
    miss = 0
    crys_game_true = 0; crys_game_false = 0
    tree_game_true = 0; tree_game_false = 0
    magma_game_true = 0; magma_game_false = 0
    crys_game_acc = []; tree_game_acc = []; magma_game_acc = []
    durations = []
    prev_assess_title = -999
    assess_count = 1
    last_accuracy = -999
    prev_assess_start = -999; prev_assess_end = -999
    real_prev_assess_start = -999; real_prev_assess_end = -999
    real_assess_start = -999; real_assess_end = -999
    complete_games = 0
    no_result_count = 0
    crys_game_level = np.array([]); tree_game_level = np.array([]); magma_game_level = np.array([])
    crys_game_round = np.array([]); tree_game_round = np.array([]); magma_game_round = np.array([])
    
    for i, session in user_sample.groupby('game_session', sort=False):      
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_title_text = activities_labels[session_title]
        session_world = session["world"].iloc[0]
        
        if session_type != 'Assessment':
            if session_type == "Game":
                true = session['true'].sum()
                false = session['false'].sum() 
                if session_world == activities_world["CRYSTALCAVES"]:
                    crys_game_true += true
                    crys_game_false += false
                    crys_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
                elif session_world == activities_world["TREETOPCITY"]:
                    tree_game_true += true
                    tree_game_false += false
                    tree_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
                elif session_world == activities_world["MAGMAPEAK"]:
                    magma_game_true += true
                    magma_game_false += false
                    magma_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
                else:
                    pass
                
        if (session_type == 'Assessment') & (test_set or len(session)>1): 
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            true_attempts = all_attempts['event_data'].str.contains('true').sum() # true in target assess
            false_attempts = all_attempts['event_data'].str.contains('false').sum() # false in target assessment
            assess_start = session.iloc[0,2]
            assess_end = session.iloc[-1,2]
            
            # from start of installation_id to the start of target assessment ------------------------
            features = user_activities_count.copy() # appearance of each type without duplicates
            features = make_ratio(features, user_activities_count)
            features.update(title_eventcode_count.copy()) # apperance of combi of title and event_code
            features = make_ratio(features, title_eventcode_count)
            features.update(user_world_count.copy()) # appearance of world with duplicates
            features = make_ratio(features, user_world_count)
            features.update(event_code_count.copy())
            features = make_ratio(features, event_code_count)
            features.update(title_count.copy())
            features = make_ratio(features, title_count)
            features.update(type_world_count.copy())
            features = make_ratio(features, type_world_count)
            features.update(last_accuracy_title.copy())
            features.update(last_game_time_title.copy())
            features.update(ac_game_time_title.copy())
            features.update(ac_true_attempts_title.copy())
            features.update(ac_false_attempts_title.copy())
            features.update(as_accuracy_title.copy()) ###
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
            for tmp_title in assess_titles: ###
                as_accuracy_title['aat_' + tmp_title] = ac_true_attempts_title['ata_' + tmp_title] / (ac_true_attempts_title['ata_' + tmp_title] + 
                                                                                                      ac_false_attempts_title['afa_' + tmp_title]) if (ac_true_attempts_title['ata_' + tmp_title] + 
                                                                                                      ac_false_attempts_title['afa_' + tmp_title]) > 0 else 0
            
            ac_true_attempts_title['ata_' + session_title_text] += true_attempts
            ac_false_attempts_title['afa_' + session_title_text] += false_attempts
            last_game_time_title['lgt_' + session_title_text] = session['game_time'].iloc[-1]
            ac_game_time_title['agt_' + session_title_text] += session['game_time'].iloc[-1]
            features["misses"] = miss
            features['accumulated_actions'] = accumulated_actions
            features["no_complete_game"] = complete_games
            features["no_result_count"] = no_result_count 
            
            if true_attempts + false_attempts == 0:
                no_result_count += 1
            else:
                real_assess_start = session.iloc[0,2]
                real_assess_end = session.iloc[-1,2]
             
            #features["crys_game_true"] = crys_game_true
            #features["crys_game_false"] = crys_game_false
            #features['crys_game_accuracy'] = crys_game_true / (crys_game_true + crys_game_false) if (crys_game_true + crys_game_false) != 0 else 0
            #features["crys_game_accuracy_std"] = np.std(crys_game_acc) if len(crys_game_acc) >=1 else 0
            #features["cryslast_game_acc"] = crys_game_acc[-1] if len(crys_game_acc) >=1 else 0
            #features["tree_game_true"] = tree_game_true
            #features["tree_game_false"] = tree_game_false
            #features['tree_game_accuracy'] = tree_game_true / (tree_game_true + tree_game_false) if (tree_game_true + tree_game_false) != 0 else 0
            #features["tree_game_accuracy_std"] = np.std(tree_game_acc) if len(tree_game_acc) >=1 else 0
            #features["tree_last_game_acc"] = tree_game_acc[-1] if len(tree_game_acc) >=1 else 0
            #features["magma_game_true"] = magma_game_true
            #features["magma_game_false"] = magma_game_false
            #features['magma_game_accuracy'] = magma_game_true / (magma_game_true + magma_game_false) if (magma_game_true + magma_game_false) != 0 else 0
            #features["magma_game_accuracy_std"] = np.std(magma_game_acc) if len(magma_game_acc) >=1 else 0
            #features["magma_last_game_acc"] = magma_game_acc[-1] if len(magma_game_acc) >=1 else 0
            
            if session_world == activities_world["CRYSTALCAVES"]:
                features["game_true"] = crys_game_true
                features["game_false"] = crys_game_false
                features['game_accuracy'] = crys_game_true / (crys_game_true + crys_game_false) if (crys_game_true + crys_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(crys_game_acc) if len(crys_game_acc) >=1 else 0
                features["last_game_acc"] = crys_game_acc[-1] if len(crys_game_acc) >=1 else 0
            elif session_world == activities_world["TREETOPCITY"]:
                features["game_true"] = tree_game_true
                features["game_false"] = tree_game_false
                features['game_accuracy'] = tree_game_true / (tree_game_true + tree_game_false) if (tree_game_true + tree_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(tree_game_acc) if len(tree_game_acc) >=1 else 0
                features["last_game_acc"] = tree_game_acc[-1] if len(tree_game_acc) >=1 else 0
            elif session_world == activities_world["MAGMAPEAK"]:
                features["game_true"] = magma_game_true
                features["game_false"] = magma_game_false
                features['game_accuracy'] = magma_game_true / (magma_game_true + magma_game_false) if (magma_game_true + magma_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(magma_game_acc) if len(magma_game_acc) >=1 else 0
                features["last_game_acc"] = magma_game_acc[-1] if len(magma_game_acc) >=1 else 0
            
            features['installation_id'] = session['installation_id'].iloc[-1]
            features['session_title'] = session_title
            features["prev_assess_title"] = prev_assess_title
            prev_assess_title = session_title
            features["first_assessment"] = 1 if assess_count == 1 else 0
            assess_count += 1
            features["time_from_start"] = (assess_start - time_first_activity).seconds

            if prev_assess_end == -999:
                features["time_bet_assess"] = -999
            else:
                features["time_bet_assess"] = (assess_start - prev_assess_end).seconds
            prev_assess_start = assess_start
            prev_assess_end = assess_end
            if real_prev_assess_end == -999:
                features["time_bet_real_assess"] = -999
            else:
                features["time_bet_real_assess"] = (real_assess_start - real_prev_assess_end).seconds
            real_prev_assess_start = real_assess_start
            real_prev_assess_end = real_assess_end
            
            if durations == []: #span of timestamp in target assessment
                features['duration_mean'] = 0
                features['duration_std'] = 0
                features['duration_max'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
                features['duration_std'] = np.std(durations)
                features['duration_max'] = np.max(durations)
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2]).seconds)
            
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            features['last_assess_acc'] = last_accuracy
            last_accuracy_title['acc_' + session_title_text] = accuracy
            last_accuracy = accuracy
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            features.update(accuracy_groups)
            accuracy_groups[str(features['accuracy_group'])] += 1
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            
            if test_set:
                all_assessments.append(features)
            elif true_attempts+false_attempts > 0:
                all_assessments.append(features)
                
            counter += 1
            
        complete_games += np.sum(session["game_complete"])
        miss += np.sum(session["misses"])
        user_world_count["world_"+str(session_world)] += session.shape[0]
        
        n_of_type_world = Counter(session['type_world']) 
        for key in n_of_type_world.keys():
            type_world_count[str(key)] += n_of_type_world[key]
            
        n_of_title = Counter(session['title']) 
        for key in n_of_title.keys():
            title_count[activities_labels[key]] += n_of_title[key]
            
        n_of_eventcode = Counter(session['event_code']) 
        for key in n_of_eventcode.keys():
            event_code_count[str(key)] += n_of_eventcode[key]
                        
        n_of_title_eventcode = Counter(session['title_event_code']) 
        for key in n_of_title_eventcode.keys():
            title_eventcode_count[str(key)] += n_of_title_eventcode[key]
        
        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activitiy = session_type
    if test_set:
        return all_assessments[-1], all_assessments[:-1] # test previous data to incorporate into training
    return all_assessments

In [8]:
def get_train_and_test(train, test):
    compiled_train = []
    compiled_test = []
    compiled_val = []

    for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort=False)), total=train.installation_id.nunique(), desc='Installation_id', position=0):
        compiled_train += get_data(user_sample)
    del train
    for ins_id, user_sample in tqdm(test.groupby('installation_id', sort=False), total=test.installation_id.nunique(), desc='Installation_id', position=0):
        test_data, val_data = get_data(user_sample, test_set=True)
        compiled_test.append(test_data)
        compiled_val += val_data
    del test
    reduce_train = pd.DataFrame(compiled_train)
    reduce_test = pd.DataFrame(compiled_test)
    reduce_val = pd.DataFrame(compiled_val)

    categoricals = ['session_title']
    return reduce_train, reduce_test, reduce_val, categoricals
new_train, new_test, new_val, categoricals = get_train_and_test(train, test)

HBox(children=(IntProgress(value=0, description='Installation_id', max=17000, style=ProgressStyle(description_…




HBox(children=(IntProgress(value=0, description='Installation_id', max=1000, style=ProgressStyle(description_w…




In [9]:
tmp = new_train[new_train.Game==0].copy()
tmp = tmp[tmp.Activity == 0].copy()
tmp = tmp[tmp.Clip == 0].copy()
tmp = tmp[tmp.Assessment ==0].copy()
remove_train_index = tmp.index
new_train = new_train[~new_train.index.isin(remove_train_index)].copy()

In [10]:
print(new_train.shape)
print(new_test.shape)
print(new_val.shape)

(17577, 568)
(1000, 568)
(2347, 568)


In [11]:
# data augmentation
# tmp = new_val[new_val.Game==0].copy()
# tmp = tmp[tmp.Activity == 0].copy()
# tmp = tmp[tmp.Clip == 0].copy()
# tmp = tmp[tmp.Assessment ==0].copy()
# remove_val_index = tmp.index
# add_val = new_val[~new_val.index.isin(remove_val_index)].copy()

# mod_train = pd.concat([new_train, add_val], ignore_index=True)
# mod_train.shape

In [12]:
new_train.head()

Unnamed: 0,Clip,Activity,Assessment,Game,12 Monkeys_2000,Air Show_2000,Air Show_2020,Air Show_2030,Air Show_2060,Air Show_2070,Air Show_2075,Air Show_3010,Air Show_3020,Air Show_3021,Air Show_3110,Air Show_3120,Air Show_3121,Air Show_4010,Air Show_4020,Air Show_4070,Air Show_4080,Air Show_4090,Air Show_4100,Air Show_4110,All Star Sorting_2000,All Star Sorting_2020,All Star Sorting_2025,All Star Sorting_2030,All Star Sorting_3010,All Star Sorting_3020,All Star Sorting_3021,All Star Sorting_3110,All Star Sorting_3120,All Star Sorting_3121,All Star Sorting_4010,All Star Sorting_4020,All Star Sorting_4030,All Star Sorting_4035,All Star Sorting_4070,All Star Sorting_4080,All Star Sorting_4090,All Star Sorting_4095,Balancing Act_2000,Bird Measurer (Assessment)_2000,Bird Measurer (Assessment)_2010,Bird Measurer (Assessment)_2020,Bird Measurer (Assessment)_2030,Bird Measurer (Assessment)_3010,Bird Measurer (Assessment)_3020,Bird Measurer (Assessment)_3021,Bird Measurer (Assessment)_3110,Bird Measurer (Assessment)_3120,Bird Measurer (Assessment)_3121,Bird Measurer (Assessment)_4020,Bird Measurer (Assessment)_4025,Bird Measurer (Assessment)_4030,Bird Measurer (Assessment)_4035,Bird Measurer (Assessment)_4040,Bird Measurer (Assessment)_4070,Bird Measurer (Assessment)_4080,Bird Measurer (Assessment)_4090,Bird Measurer (Assessment)_4100,Bird Measurer (Assessment)_4110,Bottle Filler (Activity)_2000,Bottle Filler (Activity)_2010,Bottle Filler (Activity)_2020,Bottle Filler (Activity)_2030,Bottle Filler (Activity)_3010,Bottle Filler (Activity)_3110,Bottle Filler (Activity)_4020,Bottle Filler (Activity)_4030,Bottle Filler (Activity)_4035,Bottle Filler (Activity)_4070,Bottle Filler (Activity)_4080,Bottle Filler (Activity)_4090,Bubble Bath_2000,Bubble Bath_2020,Bubble Bath_2025,Bubble Bath_2030,Bubble Bath_2035,Bubble Bath_2080,Bubble Bath_2083,Bubble Bath_3010,Bubble Bath_3020,Bubble Bath_3021,Bubble Bath_3110,Bubble Bath_3120,Bubble Bath_3121,Bubble Bath_4010,Bubble Bath_4020,Bubble Bath_4040,Bubble Bath_4045,Bubble Bath_4070,Bubble Bath_4080,Bubble Bath_4090,Bubble Bath_4095,Bubble Bath_4220,Bubble Bath_4230,Bubble Bath_4235,Bug Measurer (Activity)_2000,Bug Measurer (Activity)_3010,Bug Measurer (Activity)_3110,Bug Measurer (Activity)_4025,Bug Measurer (Activity)_4030,Bug Measurer (Activity)_4035,Bug Measurer (Activity)_4070,Bug Measurer (Activity)_4080,Bug Measurer (Activity)_4090,Cart Balancer (Assessment)_2000,Cart Balancer (Assessment)_2010,Cart Balancer (Assessment)_2020,Cart Balancer (Assessment)_2030,Cart Balancer (Assessment)_3010,Cart Balancer (Assessment)_3020,Cart Balancer (Assessment)_3021,Cart Balancer (Assessment)_3110,Cart Balancer (Assessment)_3120,Cart Balancer (Assessment)_3121,Cart Balancer (Assessment)_4020,Cart Balancer (Assessment)_4030,Cart Balancer (Assessment)_4035,Cart Balancer (Assessment)_4040,Cart Balancer (Assessment)_4070,Cart Balancer (Assessment)_4080,Cart Balancer (Assessment)_4090,Cart Balancer (Assessment)_4100,Cauldron Filler (Assessment)_2000,Cauldron Filler (Assessment)_2010,Cauldron Filler (Assessment)_2020,Cauldron Filler (Assessment)_2030,Cauldron Filler (Assessment)_3010,Cauldron Filler (Assessment)_3020,Cauldron Filler (Assessment)_3021,Cauldron Filler (Assessment)_3110,Cauldron Filler (Assessment)_3120,Cauldron Filler (Assessment)_3121,Cauldron Filler (Assessment)_4020,Cauldron Filler (Assessment)_4025,Cauldron Filler (Assessment)_4030,Cauldron Filler (Assessment)_4035,Cauldron Filler (Assessment)_4040,Cauldron Filler (Assessment)_4070,Cauldron Filler (Assessment)_4080,Cauldron Filler (Assessment)_4090,Cauldron Filler (Assessment)_4100,Chest Sorter (Assessment)_2000,Chest Sorter (Assessment)_2010,Chest Sorter (Assessment)_2020,Chest Sorter (Assessment)_2030,Chest Sorter (Assessment)_3010,Chest Sorter (Assessment)_3020,Chest Sorter (Assessment)_3021,Chest Sorter (Assessment)_3110,Chest Sorter (Assessment)_3120,Chest Sorter (Assessment)_3121,Chest Sorter (Assessment)_4020,Chest Sorter (Assessment)_4025,Chest Sorter (Assessment)_4030,Chest Sorter (Assessment)_4035,Chest Sorter (Assessment)_4040,Chest Sorter (Assessment)_4070,Chest Sorter (Assessment)_4080,Chest Sorter (Assessment)_4090,Chest Sorter (Assessment)_4100,Chicken Balancer (Activity)_2000,Chicken Balancer (Activity)_3010,Chicken Balancer (Activity)_3110,Chicken Balancer (Activity)_4020,Chicken Balancer (Activity)_4022,Chicken Balancer (Activity)_4030,Chicken Balancer (Activity)_4035,Chicken Balancer (Activity)_4070,Chicken Balancer (Activity)_4080,Chicken Balancer (Activity)_4090,Chow Time_2000,Chow Time_2020,Chow Time_2030,Chow Time_3010,Chow Time_3020,Chow Time_3021,Chow Time_3110,Chow Time_3120,Chow Time_3121,Chow Time_4010,Chow Time_4020,Chow Time_4030,Chow Time_4035,Chow Time_4070,Chow Time_4080,Chow Time_4090,Chow Time_4095,Costume Box_2000,Crystal Caves - Level 1_2000,Crystal Caves - Level 2_2000,Crystal Caves - Level 3_2000,Crystals Rule_2000,Crystals Rule_2010,Crystals Rule_2020,Crystals Rule_2030,Crystals Rule_3010,Crystals Rule_3020,Crystals Rule_3021,Crystals Rule_3110,Crystals Rule_3120,Crystals Rule_3121,Crystals Rule_4010,Crystals Rule_4020,Crystals Rule_4050,Crystals Rule_4070,Crystals Rule_4090,Dino Dive_2000,Dino Dive_2020,Dino Dive_2030,Dino Dive_2060,Dino Dive_2070,Dino Dive_3010,Dino Dive_3020,Dino Dive_3021,Dino Dive_3110,Dino Dive_3120,Dino Dive_3121,Dino Dive_4010,Dino Dive_4020,Dino Dive_4070,Dino Dive_4080,Dino Dive_4090,Dino Drink_2000,Dino Drink_2020,Dino Drink_2030,Dino Drink_2060,Dino Drink_2070,Dino Drink_2075,Dino Drink_3010,Dino Drink_3020,Dino Drink_3021,Dino Drink_3110,Dino Drink_3120,Dino Drink_3121,Dino Drink_4010,Dino Drink_4020,Dino Drink_4030,Dino Drink_4031,Dino Drink_4070,Dino Drink_4080,Dino Drink_4090,Egg Dropper (Activity)_2000,Egg Dropper (Activity)_2020,Egg Dropper (Activity)_3010,Egg Dropper (Activity)_3110,Egg Dropper (Activity)_4020,Egg Dropper (Activity)_4025,Egg Dropper (Activity)_4070,Egg Dropper (Activity)_4080,Egg Dropper (Activity)_4090,Fireworks (Activity)_2000,Fireworks (Activity)_3010,Fireworks (Activity)_3110,Fireworks (Activity)_4020,Fireworks (Activity)_4030,Fireworks (Activity)_4070,Fireworks (Activity)_4080,Fireworks (Activity)_4090,Flower Waterer (Activity)_2000,Flower Waterer (Activity)_3010,Flower Waterer (Activity)_3110,Flower Waterer (Activity)_4020,Flower Waterer (Activity)_4022,Flower Waterer (Activity)_4025,Flower Waterer (Activity)_4030,Flower Waterer (Activity)_4070,Flower Waterer (Activity)_4080,Flower Waterer (Activity)_4090,Happy Camel_2000,Happy Camel_2020,Happy Camel_2030,Happy Camel_2080,Happy Camel_2081,Happy Camel_2083,Happy Camel_3010,Happy Camel_3020,Happy Camel_3021,Happy Camel_3110,Happy Camel_3120,Happy Camel_3121,Happy Camel_4010,Happy Camel_4020,Happy Camel_4030,Happy Camel_4035,Happy Camel_4040,Happy Camel_4045,Happy Camel_4070,Happy Camel_4080,Happy Camel_4090,Happy Camel_4095,"Heavy, Heavier, Heaviest_2000",Honey Cake_2000,Leaf Leader_2000,Leaf Leader_2020,Leaf Leader_2030,Leaf Leader_2060,Leaf Leader_2070,Leaf Leader_2075,Leaf Leader_3010,Leaf Leader_3020,Leaf Leader_3021,Leaf Leader_3110,Leaf Leader_3120,Leaf Leader_3121,Leaf Leader_4010,Leaf Leader_4020,Leaf Leader_4070,Leaf Leader_4080,Leaf Leader_4090,Leaf Leader_4095,Lifting Heavy Things_2000,Magma Peak - Level 1_2000,Magma Peak - Level 2_2000,Mushroom Sorter (Assessment)_2000,Mushroom Sorter (Assessment)_2010,Mushroom Sorter (Assessment)_2020,Mushroom Sorter (Assessment)_2025,Mushroom Sorter (Assessment)_2030,Mushroom Sorter (Assessment)_2035,Mushroom Sorter (Assessment)_3010,Mushroom Sorter (Assessment)_3020,Mushroom Sorter (Assessment)_3021,Mushroom Sorter (Assessment)_3110,Mushroom Sorter (Assessment)_3120,Mushroom Sorter (Assessment)_3121,Mushroom Sorter (Assessment)_4020,Mushroom Sorter (Assessment)_4025,Mushroom Sorter (Assessment)_4030,Mushroom Sorter (Assessment)_4035,Mushroom Sorter (Assessment)_4040,Mushroom Sorter (Assessment)_4070,Mushroom Sorter (Assessment)_4080,Mushroom Sorter (Assessment)_4090,Mushroom Sorter (Assessment)_4100,Ordering Spheres_2000,Pan Balance_2000,Pan Balance_2010,Pan Balance_2020,Pan Balance_2030,Pan Balance_3010,Pan Balance_3020,Pan Balance_3021,Pan Balance_3110,Pan Balance_3120,Pan Balance_3121,Pan Balance_4010,Pan Balance_4020,Pan Balance_4025,Pan Balance_4030,Pan Balance_4035,Pan Balance_4070,Pan Balance_4080,Pan Balance_4090,Pan Balance_4100,Pirate's Tale_2000,Rulers_2000,Sandcastle Builder (Activity)_2000,Sandcastle Builder (Activity)_2010,Sandcastle Builder (Activity)_3010,Sandcastle Builder (Activity)_3110,Sandcastle Builder (Activity)_4020,Sandcastle Builder (Activity)_4021,Sandcastle Builder (Activity)_4030,Sandcastle Builder (Activity)_4035,Sandcastle Builder (Activity)_4070,Sandcastle Builder (Activity)_4080,Sandcastle Builder (Activity)_4090,Scrub-A-Dub_2000,Scrub-A-Dub_2020,Scrub-A-Dub_2030,Scrub-A-Dub_2040,Scrub-A-Dub_2050,Scrub-A-Dub_2080,Scrub-A-Dub_2081,Scrub-A-Dub_2083,Scrub-A-Dub_3010,Scrub-A-Dub_3020,Scrub-A-Dub_3021,Scrub-A-Dub_3110,Scrub-A-Dub_3120,Scrub-A-Dub_3121,Scrub-A-Dub_4010,Scrub-A-Dub_4020,Scrub-A-Dub_4070,Scrub-A-Dub_4080,Scrub-A-Dub_4090,Slop Problem_2000,Treasure Map_2000,Tree Top City - Level 1_2000,Tree Top City - Level 2_2000,Tree Top City - Level 3_2000,Watering Hole (Activity)_2000,Watering Hole (Activity)_2010,Watering Hole (Activity)_3010,Watering Hole (Activity)_3110,Watering Hole (Activity)_4020,Watering Hole (Activity)_4021,Watering Hole (Activity)_4025,Watering Hole (Activity)_4070,Watering Hole (Activity)_4090,Watering Hole (Activity)_5000,Watering Hole (Activity)_5010,Welcome to Lost Lagoon!_2000,world_0,world_1,world_2,world_3,2000,2010,2020,2025,2030,2035,2040,2050,2060,2070,2075,2080,2081,2083,3010,3020,3021,3110,3120,3121,4010,4020,4021,4022,4025,4030,4031,4035,4040,4045,4050,4070,4080,4090,4095,4100,4110,4220,4230,4235,5000,5010,12 Monkeys,Air Show,All Star Sorting,Balancing Act,Bird Measurer (Assessment),Bottle Filler (Activity),Bubble Bath,Bug Measurer (Activity),Cart Balancer (Assessment),Cauldron Filler (Assessment),Chest Sorter (Assessment),Chicken Balancer (Activity),Chow Time,Costume Box,Crystal Caves - Level 1,Crystal Caves - Level 2,Crystal Caves - Level 3,Crystals Rule,Dino Dive,Dino Drink,Egg Dropper (Activity),Fireworks (Activity),Flower Waterer (Activity),Happy Camel,"Heavy, Heavier, Heaviest",Honey Cake,Leaf Leader,Lifting Heavy Things,Magma Peak - Level 1,Magma Peak - Level 2,Mushroom Sorter (Assessment),Ordering Spheres,Pan Balance,Pirate's Tale,Rulers,Sandcastle Builder (Activity),Scrub-A-Dub,Slop Problem,Treasure Map,Tree Top City - Level 1,Tree Top City - Level 2,Tree Top City - Level 3,Watering Hole (Activity),Welcome to Lost Lagoon!,Activity_CRYSTALCAVES,Activity_MAGMAPEAK,Activity_TREETOPCITY,Assessment_CRYSTALCAVES,Assessment_MAGMAPEAK,Assessment_TREETOPCITY,Clip_CRYSTALCAVES,Clip_MAGMAPEAK,Clip_NONE,Clip_TREETOPCITY,Game_CRYSTALCAVES,Game_MAGMAPEAK,Game_TREETOPCITY,acc_Bird Measurer (Assessment),acc_Cart Balancer (Assessment),acc_Cauldron Filler (Assessment),acc_Chest Sorter (Assessment),acc_Mushroom Sorter (Assessment),lgt_Bird Measurer (Assessment),lgt_Cart Balancer (Assessment),lgt_Cauldron Filler (Assessment),lgt_Chest Sorter (Assessment),lgt_Mushroom Sorter (Assessment),agt_Bird Measurer (Assessment),agt_Cart Balancer (Assessment),agt_Cauldron Filler (Assessment),agt_Chest Sorter (Assessment),agt_Mushroom Sorter (Assessment),ata_Bird Measurer (Assessment),ata_Cart Balancer (Assessment),ata_Cauldron Filler (Assessment),ata_Chest Sorter (Assessment),ata_Mushroom Sorter (Assessment),afa_Bird Measurer (Assessment),afa_Cart Balancer (Assessment),afa_Cauldron Filler (Assessment),afa_Chest Sorter (Assessment),afa_Mushroom Sorter (Assessment),aat_Bird Measurer (Assessment),aat_Cart Balancer (Assessment),aat_Cauldron Filler (Assessment),aat_Chest Sorter (Assessment),aat_Mushroom Sorter (Assessment),accumulated_correct_attempts,accumulated_uncorrect_attempts,misses,accumulated_actions,no_complete_game,no_result_count,game_true,game_false,game_accuracy,game_accuracy_std,last_game_acc,installation_id,session_title,prev_assess_title,first_assessment,time_from_start,time_bet_assess,time_bet_real_assess,duration_mean,duration_std,duration_max,last_assess_acc,accuracy_group,0,1,2,3,accumulated_accuracy_group
0,0.611111,0.166667,0.0,0.222222,0.001546,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004637,0.007728,0.006182,0.004637,0.018547,0.006182,0.004637,0.017002,0.006182,0.004637,0.004637,0.040185,0.041731,0.001546,0.064915,0.0,0.003091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001546,0.023184,0.023184,0.035549,0.035549,0.010819,0.0,0.0,0.001546,0.026275,0.026275,0.027821,0.047913,0.029366,0.07728,0.020093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001546,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001546,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001546,0.0,0.001546,0.0,0.030912,0.029366,0.010819,0.021638,0.032457,0.0,0.027821,0.0,0.003091,0.001546,0.023184,0.023184,0.009274,0.009274,0.006182,0.001546,0.003091,0.023184,0.004637,0.009274,0.023184,0.004637,0.009274,0.001546,0.027821,0.021638,0.0,0.0,0.001546,0.0,0.001546,0.001546,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003091,0.0,0.363215,0.003091,0.633694,0.027821,0.0,0.030912,0.006182,0.027821,0.0,0.009274,0.009274,0.0,0.0,0.0,0.006182,0.001546,0.003091,0.122102,0.010819,0.01391,0.119011,0.010819,0.01391,0.006182,0.142195,0.021638,0.047913,0.029366,0.187017,0.0,0.001546,0.0,0.0,0.0,0.145286,0.0,0.006182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001546,0.0,0.236476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12983,0.256569,0.0,0.0,0.0,0.0,0.0,0.001546,0.0,0.0,0.001546,0.0,0.001546,0.0,0.157651,0.202473,0.001546,0.0,0.001546,0.001546,0.0,0.0,0.003091,0.0,0.157651,0.386399,0.0,0.0,0.0,0.0,0.003091,0.003091,0.010819,0.0,0.202473,0.236476,-1.0,-1.0,-1.0,-1.0,-1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,15.0,647,0,0,13,13,0.5,0.234362,0.571429,0006a69f,30,-999,1,1482,-999,-999,0.0,0.0,0,-999.0,3,0,0,0,0,0.0
1,0.56,0.16,0.04,0.24,0.000875,0.000875,0.00175,0.000875,0.000875,0.000875,0.0,0.045494,0.002625,0.000875,0.044619,0.002625,0.000875,0.000875,0.022747,0.026247,0.0,0.0,0.0035,0.00175,0.002625,0.004374,0.0035,0.002625,0.010499,0.0035,0.002625,0.009624,0.0035,0.002625,0.002625,0.022747,0.023622,0.000875,0.036745,0.0,0.00175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000875,0.067367,0.067367,0.013123,0.017498,0.004374,0.007874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00175,0.0,0.0,0.0,0.000875,0.0,0.002625,0.00175,0.012248,0.000875,0.00175,0.012248,0.000875,0.00175,0.000875,0.002625,0.0,0.014873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000875,0.013123,0.013123,0.020122,0.020122,0.006124,0.0,0.0,0.000875,0.014873,0.014873,0.015748,0.027122,0.016623,0.043745,0.011374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000875,0.0,0.000875,0.000875,0.000875,0.000875,0.000875,0.000875,0.0035,0.0,0.0035,0.0035,0.0,0.0035,0.005249,0.002625,0.006999,0.0,0.00175,0.005249,0.0,0.0,0.000875,0.000875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000875,0.000875,0.000875,0.0,0.017498,0.016623,0.006124,0.012248,0.018373,0.0,0.015748,0.0,0.00175,0.000875,0.013123,0.013123,0.005249,0.005249,0.0035,0.000875,0.00175,0.013123,0.002625,0.005249,0.013123,0.002625,0.005249,0.000875,0.015748,0.012248,0.0,0.0,0.000875,0.000875,0.000875,0.000875,0.000875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00175,0.0,0.205599,0.00175,0.792651,0.021872,0.000875,0.022747,0.004374,0.019248,0.000875,0.005249,0.005249,0.000875,0.000875,0.0,0.0035,0.000875,0.00175,0.197725,0.009624,0.013998,0.195101,0.009624,0.013998,0.005249,0.111111,0.012248,0.027122,0.032371,0.130359,0.0,0.005249,0.00175,0.0,0.0,0.136483,0.0,0.0035,0.0,0.004374,0.00175,0.0,0.0,0.0,0.0,0.0,0.000875,0.15748,0.133858,0.0,0.0,0.0,0.0,0.178478,0.0,0.0,0.0,0.0,0.0,0.00175,0.0,0.0,0.0,0.053368,0.0,0.0,0.0,0.073491,0.145232,0.0,0.0,0.0,0.0,0.0,0.000875,0.0,0.041995,0.000875,0.0,0.000875,0.000875,0.089239,0.114611,0.000875,0.000875,0.000875,0.000875,0.000875,0.0,0.00175,0.0,0.089239,0.3972,0.0,0.0,0.041995,0.0,0.00175,0.00175,0.008749,0.0,0.114611,0.344707,-1.0,-1.0,-1.0,-1.0,1.0,0,0,0,0,39803,0,0,0,0,39803,0,0,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1,0,16.0,1143,1,0,21,38,0.355932,0.242691,0.666667,0006a69f,4,30,0,2280,758,758,39.0,0.0,39,1.0,0,0,0,0,1,3.0
2,0.538462,0.153846,0.076923,0.230769,0.000813,0.000813,0.001626,0.000813,0.000813,0.000813,0.0,0.042276,0.002439,0.000813,0.041463,0.002439,0.000813,0.000813,0.021138,0.02439,0.0,0.0,0.003252,0.001626,0.002439,0.004065,0.003252,0.002439,0.009756,0.003252,0.002439,0.008943,0.003252,0.002439,0.002439,0.021138,0.021951,0.000813,0.034146,0.0,0.001626,0.0,0.0,0.000813,0.0,0.000813,0.0,0.001626,0.008943,0.0,0.001626,0.008943,0.0,0.0,0.017886,0.017886,0.0,0.0,0.003252,0.0,0.0,0.0,0.008943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000813,0.062602,0.062602,0.012195,0.01626,0.004065,0.007317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001626,0.0,0.0,0.0,0.000813,0.0,0.002439,0.001626,0.011382,0.000813,0.001626,0.011382,0.000813,0.001626,0.000813,0.002439,0.0,0.013821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000813,0.012195,0.012195,0.018699,0.018699,0.005691,0.0,0.0,0.000813,0.013821,0.013821,0.014634,0.025203,0.015447,0.04065,0.010569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000813,0.0,0.000813,0.000813,0.000813,0.000813,0.000813,0.000813,0.003252,0.0,0.003252,0.003252,0.0,0.003252,0.004878,0.002439,0.006504,0.0,0.001626,0.004878,0.0,0.0,0.000813,0.000813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000813,0.000813,0.000813,0.0,0.01626,0.015447,0.005691,0.011382,0.017073,0.0,0.014634,0.0,0.001626,0.000813,0.012195,0.012195,0.004878,0.004878,0.003252,0.000813,0.001626,0.012195,0.002439,0.004878,0.012195,0.002439,0.004878,0.000813,0.014634,0.011382,0.0,0.0,0.000813,0.000813,0.000813,0.000813,0.000813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001626,0.0,0.191057,0.001626,0.807317,0.021138,0.000813,0.021951,0.004065,0.017886,0.000813,0.004878,0.004878,0.000813,0.000813,0.0,0.003252,0.000813,0.001626,0.185366,0.017886,0.013008,0.182927,0.017886,0.013008,0.004878,0.103252,0.011382,0.025203,0.047967,0.139024,0.0,0.004878,0.001626,0.0,0.0,0.130081,0.0,0.003252,0.0,0.004065,0.010569,0.0,0.0,0.0,0.0,0.0,0.000813,0.146341,0.12439,0.0,0.070732,0.0,0.0,0.165854,0.0,0.0,0.0,0.0,0.0,0.001626,0.0,0.0,0.0,0.049593,0.0,0.0,0.0,0.068293,0.134959,0.0,0.0,0.0,0.0,0.0,0.000813,0.0,0.039024,0.000813,0.0,0.000813,0.000813,0.082927,0.106504,0.000813,0.000813,0.000813,0.000813,0.000813,0.0,0.001626,0.0,0.082927,0.369106,0.0,0.0,0.109756,0.0,0.001626,0.001626,0.00813,0.0,0.106504,0.320325,0.0,-1.0,-1.0,-1.0,1.0,92799,0,0,0,39803,92799,0,0,0,39803,0,0,0,0,1,11,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1,11,16.0,1230,1,0,21,38,0.355932,0.242691,0.666667,0006a69f,30,4,0,2431,58,58,65.5,26.5,92,0.0,3,1,0,0,1,1.5
3,0.510638,0.191489,0.085106,0.212766,0.000926,0.000463,0.000926,0.000463,0.000463,0.000463,0.0,0.024085,0.00139,0.000463,0.023622,0.00139,0.000463,0.000463,0.012043,0.013895,0.0,0.0,0.001853,0.000926,0.001853,0.003705,0.001853,0.002779,0.007411,0.001853,0.002779,0.006948,0.001853,0.002779,0.001853,0.017601,0.018064,0.000463,0.021306,0.0,0.000926,0.0,0.0,0.000463,0.0,0.000463,0.0,0.000926,0.005095,0.0,0.000926,0.005095,0.0,0.0,0.01019,0.01019,0.0,0.0,0.001853,0.0,0.0,0.0,0.005095,0.000463,0.0,0.003705,0.003242,0.006948,0.006948,0.009727,0.012969,0.003242,0.003705,0.0,0.0,0.000463,0.000926,0.000926,0.000926,0.000926,0.000463,0.000463,0.006484,0.000926,0.002779,0.006484,0.000926,0.002779,0.000463,0.000926,0.003242,0.000926,0.017601,0.0,0.0,0.000463,0.004169,0.0,0.0,0.000463,0.035665,0.035665,0.006948,0.009264,0.002316,0.004169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00139,0.0,0.0,0.0,0.000463,0.0,0.00139,0.000926,0.006484,0.000463,0.000926,0.006484,0.000463,0.000926,0.000463,0.00139,0.0,0.007874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000463,0.00139,0.000926,0.000463,0.000463,0.0,0.003242,0.000463,0.002316,0.002779,0.000463,0.002316,0.000463,0.002779,0.003705,0.002779,0.017138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000926,0.021769,0.021769,0.026401,0.032422,0.035201,0.0,0.0,0.000926,0.012043,0.012043,0.017138,0.020843,0.017601,0.038444,0.009727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000926,0.000463,0.00139,0.000926,0.00139,0.00139,0.000926,0.00139,0.004169,0.0,0.005095,0.004169,0.0,0.005095,0.004169,0.004169,0.005095,0.0,0.000926,0.005095,0.0,0.0,0.000926,0.000926,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000926,0.000463,0.000926,0.0,0.015748,0.015285,0.005095,0.009727,0.015285,0.000463,0.011116,0.0,0.000926,0.000926,0.01019,0.009727,0.004632,0.004169,0.003242,0.000926,0.001853,0.010653,0.00139,0.004169,0.01019,0.00139,0.004169,0.000926,0.011116,0.010653,0.0,0.0,0.000926,0.000463,0.000926,0.000926,0.000463,0.000463,0.0,0.002316,0.002316,0.004169,0.003705,0.004169,0.001853,0.0,0.002316,0.002316,0.00139,0.0,0.337193,0.00139,0.661417,0.021769,0.000926,0.024085,0.004169,0.019917,0.002316,0.004632,0.004169,0.000926,0.000926,0.0,0.003705,0.000926,0.002316,0.157943,0.011579,0.018527,0.155628,0.011579,0.018527,0.004632,0.112552,0.013432,0.020843,0.043075,0.145438,0.002779,0.006484,0.004169,0.000926,0.0,0.161186,0.0,0.001853,0.000463,0.002779,0.006021,0.004169,0.0,0.0,0.002316,0.002316,0.000926,0.083372,0.094025,0.0,0.040296,0.05095,0.053265,0.094488,0.0,0.0,0.0,0.0,0.0,0.00139,0.0,0.0,0.0,0.028254,0.0,0.042149,0.0,0.13849,0.128763,0.0,0.0,0.0,0.0,0.0,0.000926,0.000463,0.046318,0.000926,0.0,0.000926,0.000463,0.074572,0.09032,0.000926,0.000463,0.000926,0.000926,0.000463,0.023622,0.00139,0.0,0.149143,0.361742,0.0,0.0,0.086614,0.0,0.002316,0.00139,0.007411,0.0,0.185734,0.205651,0.0,-1.0,-1.0,-1.0,0.0,92799,0,0,0,8789,92799,0,0,0,75419,0,0,0,0,2,11,0,0,0,0,0.0,0.0,0.0,0.0,1.0,2,11,19.0,2159,2,1,33,38,0.464789,0.327096,1.0,0006a69f,30,30,0,56254,9,53796,41.25,31.29996,92,0.0,2,2,0,0,2,1.5
4,0.5,0.178571,0.089286,0.232143,0.000773,0.000773,0.001933,0.00116,0.000773,0.000387,0.000387,0.031323,0.001933,0.00116,0.030549,0.001933,0.00116,0.000773,0.017401,0.018561,0.0,0.0,0.003094,0.000773,0.001547,0.003094,0.001547,0.00232,0.006187,0.001547,0.00232,0.0058,0.001547,0.00232,0.001547,0.014695,0.015081,0.000387,0.017788,0.0,0.000773,0.0,0.0,0.000387,0.0,0.000387,0.0,0.000773,0.004254,0.0,0.000773,0.004254,0.0,0.0,0.008507,0.008507,0.0,0.0,0.001547,0.0,0.0,0.0,0.004254,0.000387,0.0,0.003094,0.002707,0.0058,0.0058,0.008121,0.010828,0.002707,0.003094,0.0,0.0,0.000387,0.000773,0.000773,0.000773,0.000773,0.000387,0.000387,0.005414,0.000773,0.00232,0.005414,0.000773,0.00232,0.000387,0.000773,0.002707,0.000773,0.014695,0.0,0.0,0.000387,0.00348,0.0,0.0,0.000773,0.04679,0.04679,0.009281,0.011601,0.00232,0.0058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00116,0.0,0.0,0.0,0.00116,0.0,0.004254,0.00348,0.023202,0.00116,0.00348,0.023202,0.00116,0.00348,0.000773,0.00464,0.0,0.011988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000387,0.00116,0.000773,0.000387,0.000387,0.0,0.002707,0.000387,0.001933,0.00232,0.000387,0.001933,0.000387,0.00232,0.003094,0.00232,0.014308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000773,0.018175,0.018175,0.022042,0.027069,0.029389,0.0,0.0,0.000773,0.010054,0.010054,0.014308,0.017401,0.014695,0.032096,0.008121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000773,0.000387,0.001547,0.00116,0.001547,0.001547,0.00116,0.001547,0.00464,0.000387,0.0058,0.00464,0.000387,0.0058,0.0058,0.00464,0.006961,0.0,0.00116,0.00464,0.0,0.0,0.001547,0.000773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000773,0.000773,0.000773,0.0,0.013148,0.012761,0.004254,0.008121,0.012761,0.000387,0.009281,0.0,0.000773,0.000773,0.008507,0.008121,0.003867,0.00348,0.002707,0.000773,0.001547,0.008894,0.00116,0.00348,0.008507,0.00116,0.00348,0.000773,0.009281,0.008894,0.0,0.0,0.000773,0.00116,0.000773,0.000773,0.000773,0.000387,0.0,0.001933,0.001933,0.00348,0.003094,0.00348,0.001547,0.0,0.001933,0.001933,0.00116,0.0,0.281516,0.00116,0.717324,0.021655,0.00116,0.024749,0.003867,0.020495,0.00232,0.003867,0.00348,0.00116,0.000773,0.000387,0.003094,0.000773,0.001933,0.179041,0.011601,0.020495,0.176721,0.011601,0.020495,0.00464,0.107115,0.011214,0.017401,0.040603,0.127997,0.00232,0.0058,0.003867,0.000773,0.0,0.149652,0.0,0.001547,0.000387,0.00464,0.005027,0.00348,0.0,0.0,0.001933,0.001933,0.000773,0.114076,0.0785,0.0,0.033643,0.042537,0.04447,0.123357,0.0,0.0,0.0,0.0,0.0,0.00116,0.0,0.0,0.0,0.08198,0.0,0.035189,0.0,0.115623,0.107502,0.0,0.0,0.0,0.0,0.0,0.000773,0.000387,0.054911,0.000773,0.0,0.000773,0.000773,0.062258,0.075406,0.000773,0.00116,0.000773,0.000773,0.000773,0.019722,0.00116,0.0,0.124517,0.346481,0.0,0.0,0.088554,0.0,0.001933,0.00116,0.007734,0.0,0.155066,0.274555,0.0,-1.0,-1.0,-1.0,0.5,92799,0,0,0,31843,92799,0,0,0,107262,0,0,0,0,3,11,0,0,0,1,0.0,0.0,0.0,0.0,1.0,3,12,22.0,2586,3,1,55,48,0.533981,0.331841,0.777778,0006a69f,4,30,0,57160,873,873,39.2,28.294169,92,0.5,3,2,0,1,2,1.6


# Feature selection

In [13]:
def exclude(reduce_train, reduce_test, features):
    to_exclude = [] 
    ajusted_test = reduce_test.copy()
    for feature in features:
        if feature not in ['accuracy_group', 'installation_id', 'session_title', 'hightest_level']:
            data = reduce_train[feature]
            train_mean = data.mean()
            data = ajusted_test[feature] 
            test_mean = data.mean()
            try:
                ajust_factor = train_mean / test_mean
                if ajust_factor > 10 or ajust_factor < 0.1:# or error > 0.01:
                    to_exclude.append(feature)
                    print(feature)
                else:
                    ajusted_test[feature] *= ajust_factor
            except:
                to_exclude.append(feature)
                print(feature)
    return to_exclude, ajusted_test
features = [i for i in new_train.columns if i not in ["game_session"]]
to_exclude, ajusted_test = exclude(new_train, new_test, features)

Air Show_4080
Bottle Filler (Activity)_2010
Bubble Bath_4080
Bubble Bath_4090
Bug Measurer (Activity)_4080
Cart Balancer (Assessment)_4080
Chest Sorter (Assessment)_4080
Crystals Rule_2010
Dino Dive_4080
Dino Drink_4080
Egg Dropper (Activity)_4080
Fireworks (Activity)_4080
Happy Camel_4080
Leaf Leader_4080
Mushroom Sorter (Assessment)_4080
Pan Balance_2010
Pan Balance_4080
Sandcastle Builder (Activity)_2010
Scrub-A-Dub_4080
Watering Hole (Activity)_2010
acc_Cart Balancer (Assessment)


In [14]:
def get_random_assessment(reduce_train):
    used_idx = []
    #for iid in tqdm(set(reduce_train['installation_id']), miniters=200):
    for iid in set(reduce_train['installation_id']):
        list_ = list(reduce_train[reduce_train['installation_id'] == iid].index)
        cur = random.choices(list_, k=1)[0]
        used_idx.append(cur)
    reduce_train_t = reduce_train.loc[used_idx]
    return reduce_train_t, used_idx

def feature_selection(train):
    X_train = train.drop(['accuracy_group'],axis=1) 
    y_train = train.accuracy_group.copy()
    y_train.loc[y_train <=1] = 0
    y_train.loc[y_train >=2] = 1
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(X_train["installation_id"]))
    X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
    remove_features = [i for i in X_train.columns if "_4235" in i or i == "world_"+str(activities_world["NONE"])
                      or i in to_exclude]
    for i in X_train.columns:
        if X_train[i].std() == 0 and i not in remove_features:
            remove_features.append(i)
    X_train = X_train.drop(remove_features, axis=1)
    X_train = X_train[sorted(X_train.columns.tolist())]

    n_folds=5
    skf=GroupKFold(n_splits = n_folds)
    models = []
    lgbm_params = {'objective': 'binary','eval_metric': 'auc','metric': 'auc', 'boosting_type': 'gbdt',
 'tree_learner': 'serial','bagging_fraction': 0.9605425291685099,'bagging_freq': 4,'colsample_bytree': 0.6784238046856443,
 'feature_fraction': 1,'learning_rate': 0.017891320270412462,'max_depth': 7,'min_data_in_leaf': 8,'min_sum_hessian_in_leaf': 17,'num_leaves': 17}

    features_list = [i for i in X_train.columns if i != "installation_id"]
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
    feat_values = np.zeros([len(features_list)])
    random_try = 10
    for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train, X_train["installation_id"])):
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]
        X_train2 = X_train2.drop(['installation_id'],axis=1)
        
        for try_time in range(random_try): 
            print("Fold "+str(i+1)+" random try " +str(try_time+1))
            X_test2 = X_train.iloc[test_index,:]
            y_test2 = y_train.iloc[test_index]
            
            X_test2, idx_val = get_random_assessment(X_test2)
            #tmp_target = X_test2.loc[idx_val]["past_target"]
            X_test2.drop(['installation_id'], inplace=True, axis=1) # 'past_target'
            y_test2 = y_test2.loc[idx_val]
            print("After truncation:", (X_test2.shape, y_test2.shape))
            
            lgb_train = lgb.Dataset(X_train2, y_train2)
            lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
            clf = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
                num_boost_round=10000,early_stopping_rounds=100,verbose_eval = 500,categorical_feature = categoricals)
            valid = np.array(clf.predict(X_test2, num_iteration = clf.best_iteration).reshape(X_test2.shape[0], ))
            real = np.array(y_test2)
            feat_values += np.array(clf.feature_importance()) / random_try
                
            #print("logloss = \t {}".format(log_loss(real, valid)))
            print("ROC = \t {}".format(roc_auc_score(real, valid)))
        feature_importance_df["Fold_"+str(i+1)] = feat_values
        
    feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]
    return feature_importance_df.sort_values("Average", ascending=False).reset_index(drop=True)
df_for_classification = feature_selection(new_train)
feat = sorted(list(df_for_classification.iloc[:300]["Feature"]))

Fold 1 random try 1
After truncation: ((714, 543), (714,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[396]	training's auc: 0.889951	valid_1's auc: 0.816056
ROC = 	 0.816055862390205
Fold 1 random try 2
After truncation: ((714, 543), (714,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[276]	training's auc: 0.87624	valid_1's auc: 0.805849
ROC = 	 0.80584947682481
Fold 1 random try 3
After truncation: ((714, 543), (714,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[394]	training's auc: 0.889735	valid_1's auc: 0.794711
ROC = 	 0.7947113780447115
Fold 1 random try 4
After truncation: ((714, 543), (714,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[271]	training's auc: 0.875634	valid_1's auc: 0.80317
ROC = 	 0.8031698365031699
Fold 1 random try 5
After truncation: ((714, 543), (7

# parameter tuning

In [15]:
def my_hyperopt(X, Y):
    def para_tuning_obj(params):
        params = {
        'boosting_type': 'gbdt', 
        'metric': "auc", 
        'objective': 'binary', 
        'eval_metric': 'cappa', 
        "tree_learner": "serial",
        'max_depth': int(params['max_depth']),
        'bagging_freq': int(params['bagging_freq']),
        'bagging_fraction': float(params['bagging_fraction']),
        'num_leaves': int(params['num_leaves']),
        'learning_rate': float(params['learning_rate']),
        'min_data_in_leaf': int(params['min_data_in_leaf']),
        'min_sum_hessian_in_leaf': int(params['min_sum_hessian_in_leaf']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
}
    
        real = np.array([])
        pred = np.array([])
        skf = GroupKFold(n_splits=10)
        for trn_idx, val_idx in skf.split(X, Y, X["installation_id"]):
            x_train, x_val = X.iloc[trn_idx, :], X.iloc[val_idx, :]
            y_train, y_val = Y.iloc[trn_idx], Y.iloc[val_idx]
            x_val['accuracy_group'] = y_val
            np.random.seed(0)
            x_val_mod = x_val.groupby('installation_id').agg(np.random.choice).reset_index(drop=False)
            y_val_mod = x_val_mod.accuracy_group.copy()
            x_train.drop('installation_id', inplace = True, axis = 1)
            x_val_mod.drop(['installation_id', "accuracy_group"], inplace = True, axis = 1)
            train_set = lgb.Dataset(x_train, y_train, categorical_feature = ['session_title'])
            val_set = lgb.Dataset(x_val_mod, y_val_mod, categorical_feature = ['session_title'])
        
            clf = lgb.train(params, train_set, num_boost_round = 100000, early_stopping_rounds = 100, 
                         valid_sets = [train_set, val_set], verbose_eval = 300)
            pred = np.concatenate((pred, np.array(clf.predict(x_val_mod, num_iteration = clf.best_iteration))), axis=0) 
            real = np.concatenate((real, np.array(y_val_mod)), axis=0) 
        score = roc_auc_score(real, pred)
    
        return - score

    trials = Trials()

    space ={
        'max_depth': hp.quniform('max_depth', 1, 15, 1),
        'bagging_freq': hp.quniform('bagging_freq', 1, 10, 1),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.2, 1.0),
        'num_leaves': hp.quniform('num_leaves', 8, 64, 1),
        'learning_rate': hp.uniform('learning_rate', 0.001, 0.1),
        'min_data_in_leaf': hp.quniform('min_data_in_leaf', 8, 64, 1),
        'min_sum_hessian_in_leaf': hp.quniform('min_sum_hessian_in_leaf', 5, 30, 1),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0)
    }

    best = fmin(para_tuning_obj, space = space, algo=tpe.suggest, max_evals=10, trials=trials, verbose=1)

    best_params = space_eval(space, best)
    return best_params

#X_train = new_train.drop(["accuracy_group"], axis=1).copy()
#Y = new_train.accuracy_group.copy()
#Y.loc[Y <=1] = 0
#Y.loc[Y >=2] = 1
#lbl = preprocessing.LabelEncoder()
#lbl.fit(list(X_train["installation_id"]))
#X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
#remove_features = [i for i in X_train.columns if "_4235" in i or i == "world_"+str(activities_world["NONE"])
#                      or i in to_exclude]
#for i in X_train.columns:
#    if X_train[i].std() == 0 and i not in remove_features:
#        remove_features.append(i)
#X_train = X_train.drop(remove_features, axis=1)
#X_train = X_train[sorted(X_train.columns.tolist())]

#random_state = 42
#my_hyperopt(X_train, Y)

# truncated classification

In [16]:
def accuracy_class(train, test, fea, select_flg):
    y_train = train.accuracy_group.copy()
    X_train = train.rename(columns={"accuracy_group": "past_target"})
    y_train.loc[y_train <=1] = 0
    y_train.loc[y_train >=2] = 1
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(X_train["installation_id"]))
    X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
    remove_features = [i for i in X_train.columns if "_4235" in i or i == "world_"+str(activities_world["NONE"])
                      or i in to_exclude]
    for i in X_train.columns:
        if X_train[i].std() == 0 and i not in remove_features:
            remove_features.append(i)
    X_train = X_train.drop(remove_features, axis=1)
    if select_flg == True:
        X_train = X_train[fea + ["installation_id", "past_target"]]
    X_train = X_train[sorted(X_train.columns.tolist())]

    X_test = test.drop(["installation_id","accuracy_group"], axis=1)
    X_test = X_test.drop(remove_features, axis=1)
    if select_flg == True:
        X_test = X_test[fea]
    X_test = X_test[sorted(X_test.columns.tolist())]
    
    n_folds = 5
    skf=GroupKFold(n_splits = n_folds)
    models = []
    lgbm_params = {'objective': 'binary','eval_metric': 'auc','metric': 'auc', 'boosting_type': 'gbdt',
 'tree_learner': 'serial','bagging_fraction': 0.5698056418890787,'bagging_freq': 4,
 'colsample_bytree': 0.37564408454469,'learning_rate': 0.015433389422506185,'max_depth': 8,
 'min_data_in_leaf': 51,'min_sum_hessian_in_leaf': 10,'num_leaves': 48}
    random_try = 40
    mean_qwk_score = 0
    # model learning ---------------------------
    for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train, X_train["installation_id"])):
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]
        X_train2 = X_train2.drop(['installation_id', 'past_target'],axis=1)
    
        for try_time in range(random_try):
            print("Fold "+str(i+1)+" random try " +str(try_time))
            X_test2 = X_train.iloc[test_index,:]
            y_test2 = y_train.iloc[test_index]
            
            # random truncation ------------------------------------------
            X_test2, idx_val = get_random_assessment(X_test2)
            tmp_target = X_test2.loc[idx_val]["past_target"]
            X_test2.drop(['installation_id', 'past_target'], inplace=True, axis=1)
            y_test2 = y_test2.loc[idx_val]
            print("After truncation:", (X_test2.shape, y_test2.shape))
            # ------------------------------------------------------------
            
            lgb_train = lgb.Dataset(X_train2, y_train2)
            lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
            clf = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
                num_boost_round=10000,early_stopping_rounds=100,verbose_eval = 500,categorical_feature = categoricals)
            valid = np.array(clf.predict(X_test2, num_iteration = clf.best_iteration).reshape(X_test2.shape[0], ))
            real = np.array(y_test2)
            target = np.array(tmp_target)
            
            models.append(clf)
            #print("logloss = \t {}".format(log_loss(real, valid)))
            print("ROC = \t {}".format(roc_auc_score(real, valid)))
        
            # threshold optimization --------------
            best_score = 0
            for j in range(20):
                optR = OptimizedRounder_cla()
                optR.fit(np.array(valid).reshape(-1,), target, random_flg=True)
                coefficients = optR.coefficients()
                final_valid_pred = optR.predict(np.array(valid).reshape(-1,), coefficients)
                score = qwk(target, final_valid_pred)
                print(j, np.sort(coefficients), score)
                if score > best_score:
                    best_score = score
                    best_coefficients = coefficients
            mean_qwk_score += best_score / (random_try * n_folds)
            if try_time == 0 and i == 0:
                final_coefficients = np.sort(best_coefficients) / (random_try * n_folds)
            else:
                final_coefficients += np.sort(best_coefficients) / (random_try * n_folds)
            
    print("MEAN QWK = \t {}".format(mean_qwk_score))
    # test prediction  ------------------------
    pred_value = np.zeros([X_test.shape[0]])
    for model in models:
        pred_value += model.predict(X_test, num_iteration = model.best_iteration) / len(models)
    return pred_value, valid, final_coefficients

pred_value, valid, final_coefficients = accuracy_class(new_train, new_test, feat, True)
final_test_pred = pd.cut(np.array(pred_value).reshape(-1,), [-np.inf] + list(np.sort(final_coefficients)) + [np.inf], labels = [0, 1, 2, 3])
#final_test_pred = pd.cut(np.array(pred_value).reshape(-1,), [-np.inf] + list(np.sort([0.32229148, 0.51887455, 0.77529457])) + [np.inf], labels = [0, 1, 2, 3])

sample_submission["accuracy_group"] = final_test_pred.astype(int)
sample_submission.to_csv('submission.csv', index=False)
sample_submission["accuracy_group"].value_counts(normalize = True)

Fold 1 random try 0
After truncation: ((714, 300), (714,))
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.935294	valid_1's auc: 0.845054
Early stopping, best iteration is:
[412]	training's auc: 0.926404	valid_1's auc: 0.845911
ROC = 	 0.8459110946318956
0 [0.42589484 0.52825832 0.68521029] 0.57893707
1 [0.41419134 0.52648226 0.75630002] 0.58575075
2 [0.41456457 0.58439157 0.75574249] 0.59300522
3 [0.47958177 0.58589665 0.75598641] 0.59039719
4 [0.41988501 0.58578465 0.7591388 ] 0.59270849
5 [0.42109464 0.56427043 0.63561976] 0.57040555
6 [0.42264537 0.60070936 0.63109239] 0.57583624
7 [0.44332646 0.59825542 0.61646784] 0.57271845
8 [0.41288079 0.59944072 0.75860869] 0.59284749
9 [0.48721269 0.59882938 0.63109753] 0.5737237
10 [0.4744058  0.59866248 0.75846456] 0.59113283
11 [0.42436655 0.601313   0.75862386] 0.5939537
12 [0.4141464  0.59912108 0.63128748] 0.57555714
13 [0.41766432 0.56453056 0.63064237] 0.57122178
14 [0.41222903 0.56453073 0.73478

3    0.449
2    0.228
0    0.175
1    0.148
Name: accuracy_group, dtype: float64