- modify truncated validation

In [1]:
import pandas as pd
import numpy as np
import warnings
import datetime
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from sklearn import preprocessing
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import mean_squared_error, log_loss, roc_auc_score, precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, cohen_kappa_score
import optuna.integration.lightgbm as lgb_opt
import lightgbm as lgb
from functools import partial
import json
import copy
import time
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from hyperopt import hp, tpe, Trials, fmin, space_eval
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",1000)
np.set_printoptions(precision=8)
warnings.filterwarnings("ignore")
import random

In [2]:
def qwk(a1, a2):
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)
    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))
    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)
    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)
    e = e / a1.shape[0]
    return np.round(1 - o / e, 8)

In [3]:
class OptimizedRounder_cla(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])
        return -qwk(y, X_p)
        #return -mod_qwk(y, X_p, weights=weights)
    
    def fit(self, X, y, random_flg = False):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        if random_flg:
            initial_coef = [np.random.uniform(0.4,0.5), np.random.uniform(0.5,0.6), np.random.uniform(0.6,0.7)]
        else:
            initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead') #Powell
        
    def predict(self, X, coef):
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

    def coefficients(self):
        return self.coef_['x']
    
class OptimizedRounder_reg(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])
        return -qwk(y, X_p)
        #return -mod_qwk(y, X_p, weights=weights)
    
    def fit(self, X, y, random_flg = False):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        if random_flg:
            initial_coef = [np.random.uniform(1.0,1.1), np.random.uniform(1.7,1.8), np.random.uniform(2.1,2.2)]
        else:
            initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead') #Powell
        
    def predict(self, X, coef):
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

    def coefficients(self):
        return self.coef_['x']

In [4]:
def eval_qwk_lgb_regr(y_pred, train_t):
    dist = Counter(train_t['accuracy_group'])
    for k in dist:
        dist[k] /= len(train_t)
    
    acum = 0
    bound = {}
    for i in range(3):
        acum += dist[i]
        bound[i] = np.percentile(y_pred, acum * 100)

    def classify(x):
        if x <= bound[0]:
            return 0
        elif x <= bound[1]:
            return 1
        elif x <= bound[2]:
            return 2
        else:
            return 3

    y_pred = np.array(list(map(classify, y_pred)))
    
    return y_pred

# install

In [5]:
%%time
train = pd.read_csv('../input/data-science-bowl-2019/train.csv')
train_labels = pd.read_csv('../input/data-science-bowl-2019/train_labels.csv')
test = pd.read_csv('../input/data-science-bowl-2019/test.csv')
sample_submission = pd.read_csv('../input/data-science-bowl-2019/sample_submission.csv')

CPU times: user 1min 10s, sys: 7.61 s, total: 1min 18s
Wall time: 1min 18s


# Preprocess and Feature engineering

In [6]:
%%time
def encode_title(train, test):
    train['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), train['title'], train['event_code']))
    test['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), test['title'], test['event_code']))
    list_of_title_eventcode = sorted(list(set(train['title_event_code'].unique()).union(set(test['title_event_code'].unique()))))
    
    train['type_world'] = list(map(lambda x, y: str(x) + '_' + str(y), train['type'], train['world']))
    test['type_world'] = list(map(lambda x, y: str(x) + '_' + str(y), test['type'], test['world']))
    list_of_type_world = sorted(list(set(train['type_world'].unique()).union(set(test['type_world'].unique()))))
    
    list_of_user_activities = sorted(list(set(train['title'].unique()).union(set(test['title'].unique()))))
    list_of_event_code = sorted(list(set(train['event_code'].unique()).union(set(test['event_code'].unique()))))
    list_of_worlds = sorted(list(set(train['world'].unique()).union(set(test['world'].unique()))))
    activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
    activities_labels = dict(zip(np.arange(len(list_of_user_activities)), list_of_user_activities))
    activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))
    assess_titles = sorted(list(set(train[train['type'] == 'Assessment']['title'].value_counts().index).union(set(test[test['type'] == 'Assessment']['title'].value_counts().index))))

    train['title'] = train['title'].map(activities_map)
    test['title'] = test['title'].map(activities_map)
    train['world'] = train['world'].map(activities_world)
    test['world'] = test['world'].map(activities_world)

    win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
    win_code[activities_map['Bird Measurer (Assessment)']] = 4110
    
    train['timestamp'] = pd.to_datetime(train['timestamp'])
    test['timestamp'] = pd.to_datetime(test['timestamp'])
    
    train["misses"] = train["event_data"].apply(lambda x: json.loads(x)["misses"] if "\"misses\"" in x else np.nan)
    test["misses"] = test["event_data"].apply(lambda x: json.loads(x)["misses"] if "\"misses\"" in x else np.nan)
        
    train["true"] = train["event_data"].apply(lambda x: 1 if "true" in x and "correct" in x else 0)
    test["true"] = test["event_data"].apply(lambda x: 1 if "true" in x and "correct" in x else 0)

    train["false"] = train["event_data"].apply(lambda x: 1 if "false" in x and "correct" in x else 0)
    test["false"] = test["event_data"].apply(lambda x: 1 if "false" in x and "correct" in x else 0)
    
    train["game_complete"] = train["event_data"].apply(lambda x: 1 if "game_completed" in x else 0)
    test["game_complete"] = test["event_data"].apply(lambda x: 1 if "game_completed" in x else 0)
    
    #train["level"] = train["event_data"].apply(lambda x: json.loads(x)["level"] if "\"level\"" in x else np.nan)
    #test["level"] = test["event_data"].apply(lambda x: json.loads(x)["level"] if "\"level\"" in x else np.nan)
    
    #train["round"] = train["event_data"].apply(lambda x: json.loads(x)["round"] if "\"round\"" in x else np.nan)
    #test["round"] = test["event_data"].apply(lambda x: json.loads(x)["round"] if "\"round\"" in x else np.nan)
               
    return train, test, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, activities_world, list_of_title_eventcode, list_of_type_world

train, test, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, activities_world, list_of_title_eventcode, list_of_type_world = encode_title(train, test)

CPU times: user 1min 47s, sys: 9.03 s, total: 1min 56s
Wall time: 1min 56s


In [7]:
def make_ratio(features, dic):
    total = sum(dic.values())
    if total != 0:
        for key in dic.keys():
            features[str(key)] = features[str(key)] / total
    else:
        pass
    return features

def get_data(user_sample, test_set=False):
    last_activity = 0
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    title_eventcode_count = {str(ele): 0 for ele in list_of_title_eventcode}
    user_world_count = {"world_"+str(wor) : 0 for wor in activities_world.values()}
    event_code_count = {str(ev): 0 for ev in list_of_event_code}
    title_count = {actv: 0 for actv in list_of_user_activities}
    type_world_count = {str(ev): 0 for ev in list_of_type_world}
    last_accuracy_title = {'acc_' + title: -1 for title in assess_titles}
    last_game_time_title = {'lgt_' + title: 0 for title in assess_titles}
    ac_game_time_title = {'agt_' + title: 0 for title in assess_titles}
    ac_true_attempts_title = {'ata_' + title: 0 for title in assess_titles}
    ac_false_attempts_title = {'afa_' + title: 0 for title in assess_titles}
    
    all_assessments = []
    accuracy_groups = {"0":0, "1":0, "2":0, "3":0}
    accumulated_accuracy_group = 0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0 
    accumulated_actions = 0
    counter = 0
    time_first_activity = user_sample.iloc[0]['timestamp']
    miss = 0
    crys_game_true = 0; crys_game_false = 0
    tree_game_true = 0; tree_game_false = 0
    magma_game_true = 0; magma_game_false = 0
    crys_game_acc = []; tree_game_acc = []; magma_game_acc = []
    durations = []
    prev_assess_title = -999
    assess_count = 1
    last_accuracy = -999
    prev_assess_start = -999; prev_assess_end = -999
    real_prev_assess_start = -999; real_prev_assess_end = -999
    real_assess_start = -999; real_assess_end = -999
    complete_games = 0
    no_result_count = 0
    crys_game_level = np.array([]); tree_game_level = np.array([]); magma_game_level = np.array([])
    crys_game_round = np.array([]); tree_game_round = np.array([]); magma_game_round = np.array([])
    
    for i, session in user_sample.groupby('game_session', sort=False):      
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_title_text = activities_labels[session_title]
        session_world = session["world"].iloc[0]
        
        if session_type != 'Assessment':
            if session_type == "Game":
                true = session['true'].sum()
                false = session['false'].sum() 
                if session_world == activities_world["CRYSTALCAVES"]:
                    crys_game_true += true
                    crys_game_false += false
                    crys_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
                    #crys_game_level = np.concatenate([crys_game_level, session["level"]], axis=0)
                    #crys_game_round = np.concatenate([crys_game_round, session["round"]], axis=0)
                elif session_world == activities_world["TREETOPCITY"]:
                    tree_game_true += true
                    tree_game_false += false
                    tree_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
                    #tree_game_level = np.concatenate([tree_game_level, session["level"]], axis=0)
                    #tree_game_round = np.concatenate([tree_game_round, session["round"]], axis=0)
                elif session_world == activities_world["MAGMAPEAK"]:
                    magma_game_true += true
                    magma_game_false += false
                    magma_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
                    #magma_game_level = np.concatenate([magma_game_level, session["level"]], axis=0)
                    #magma_game_round = np.concatenate([magma_game_round, session["round"]], axis=0)
                else:
                    pass
                
        if (session_type == 'Assessment') & (test_set or len(session)>1): 
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            true_attempts = all_attempts['event_data'].str.contains('true').sum() # true in target assess
            false_attempts = all_attempts['event_data'].str.contains('false').sum() # false in target assessment
            assess_start = session.iloc[0,2]
            assess_end = session.iloc[-1,2]
            
            # from start of installation_id to the start of target assessment ------------------------
            features = user_activities_count.copy() # appearance of each type without duplicates
            features = make_ratio(features, user_activities_count)
            features.update(title_eventcode_count.copy()) # apperance of combi of title and event_code
            features = make_ratio(features, title_eventcode_count)
            features.update(user_world_count.copy()) # appearance of world with duplicates
            features = make_ratio(features, user_world_count)
            features.update(event_code_count.copy())
            features = make_ratio(features, event_code_count)
            features.update(title_count.copy())
            features = make_ratio(features, title_count)
            features.update(type_world_count.copy())
            features = make_ratio(features, type_world_count)
            features.update(last_accuracy_title.copy())
            features.update(last_game_time_title.copy())
            features.update(ac_game_time_title.copy())
            features.update(ac_true_attempts_title.copy())
            features.update(ac_false_attempts_title.copy())
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
            ac_true_attempts_title['ata_' + session_title_text] += true_attempts
            ac_false_attempts_title['afa_' + session_title_text] += false_attempts
            last_game_time_title['lgt_' + session_title_text] = session['game_time'].iloc[-1]
            ac_game_time_title['agt_' + session_title_text] += session['game_time'].iloc[-1]
            features["misses"] = miss
            features['accumulated_actions'] = accumulated_actions
            features["no_complete_game"] = complete_games
            features["no_result_count"] = no_result_count 
            
            if true_attempts + false_attempts == 0:
                no_result_count += 1
            else:
                real_assess_start = session.iloc[0,2]
                real_assess_end = session.iloc[-1,2]
             
            #features["crys_game_true"] = crys_game_true
            #features["crys_game_false"] = crys_game_false
            #features['crys_game_accuracy'] = crys_game_true / (crys_game_true + crys_game_false) if (crys_game_true + crys_game_false) != 0 else 0
            #features["crys_game_accuracy_std"] = np.std(crys_game_acc) if len(crys_game_acc) >=1 else 0
            #features["cryslast_game_acc"] = crys_game_acc[-1] if len(crys_game_acc) >=1 else 0
            #features["tree_game_true"] = tree_game_true
            #features["tree_game_false"] = tree_game_false
            #features['tree_game_accuracy'] = tree_game_true / (tree_game_true + tree_game_false) if (tree_game_true + tree_game_false) != 0 else 0
            #features["tree_game_accuracy_std"] = np.std(tree_game_acc) if len(tree_game_acc) >=1 else 0
            #features["tree_last_game_acc"] = tree_game_acc[-1] if len(tree_game_acc) >=1 else 0
            #features["magma_game_true"] = magma_game_true
            #features["magma_game_false"] = magma_game_false
            #features['magma_game_accuracy'] = magma_game_true / (magma_game_true + magma_game_false) if (magma_game_true + magma_game_false) != 0 else 0
            #features["magma_game_accuracy_std"] = np.std(magma_game_acc) if len(magma_game_acc) >=1 else 0
            #features["magma_last_game_acc"] = magma_game_acc[-1] if len(magma_game_acc) >=1 else 0
            
            if session_world == activities_world["CRYSTALCAVES"]:
                features["game_true"] = crys_game_true
                features["game_false"] = crys_game_false
                features['game_accuracy'] = crys_game_true / (crys_game_true + crys_game_false) if (crys_game_true + crys_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(crys_game_acc) if len(crys_game_acc) >=1 else 0
                features["last_game_acc"] = crys_game_acc[-1] if len(crys_game_acc) >=1 else 0
            elif session_world == activities_world["TREETOPCITY"]:
                features["game_true"] = tree_game_true
                features["game_false"] = tree_game_false
                features['game_accuracy'] = tree_game_true / (tree_game_true + tree_game_false) if (tree_game_true + tree_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(tree_game_acc) if len(tree_game_acc) >=1 else 0
                features["last_game_acc"] = tree_game_acc[-1] if len(tree_game_acc) >=1 else 0
            elif session_world == activities_world["MAGMAPEAK"]:
                features["game_true"] = magma_game_true
                features["game_false"] = magma_game_false
                features['game_accuracy'] = magma_game_true / (magma_game_true + magma_game_false) if (magma_game_true + magma_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(magma_game_acc) if len(magma_game_acc) >=1 else 0
                features["last_game_acc"] = magma_game_acc[-1] if len(magma_game_acc) >=1 else 0
            
            features['installation_id'] = session['installation_id'].iloc[-1]
            features['session_title'] = session_title
            features["prev_assess_title"] = prev_assess_title
            prev_assess_title = session_title
            features["first_assessment"] = 1 if assess_count == 1 else 0
            #features["assess_count"] = assess_count
            assess_count += 1
            features["time_from_start"] = (assess_start - time_first_activity).seconds

            if prev_assess_end == -999:
                features["time_bet_assess"] = -999
            else:
                features["time_bet_assess"] = (assess_start - prev_assess_end).seconds
            prev_assess_start = assess_start
            prev_assess_end = assess_end
            if real_prev_assess_end == -999:
                features["time_bet_real_assess"] = -999
            else:
                features["time_bet_real_assess"] = (real_assess_start - real_prev_assess_end).seconds
            real_prev_assess_start = real_assess_start
            real_prev_assess_end = real_assess_end
            
            if durations == []: #span of timestamp in target assessment
                features['duration_mean'] = 0
                features['duration_std'] = 0
                features['duration_max'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
                features['duration_std'] = np.std(durations)
                features['duration_max'] = np.max(durations)
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2]).seconds)
            
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            features['last_assess_acc'] = last_accuracy
            last_accuracy_title['acc_' + session_title_text] = accuracy
            last_accuracy = accuracy
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            features.update(accuracy_groups)
            accuracy_groups[str(features['accuracy_group'])] += 1
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            
            if test_set:
                all_assessments.append(features)
            elif true_attempts+false_attempts > 0:
                all_assessments.append(features)
                
            counter += 1
            
        complete_games += np.sum(session["game_complete"])
        miss += np.sum(session["misses"])
        user_world_count["world_"+str(session_world)] += session.shape[0]
        
        n_of_type_world = Counter(session['type_world']) 
        for key in n_of_type_world.keys():
            type_world_count[str(key)] += n_of_type_world[key]
            
        n_of_title = Counter(session['title']) 
        for key in n_of_title.keys():
            title_count[activities_labels[key]] += n_of_title[key]
            
        n_of_eventcode = Counter(session['event_code']) 
        for key in n_of_eventcode.keys():
            event_code_count[str(key)] += n_of_eventcode[key]
                        
        n_of_title_eventcode = Counter(session['title_event_code']) 
        for key in n_of_title_eventcode.keys():
            title_eventcode_count[str(key)] += n_of_title_eventcode[key]
        
        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activitiy = session_type
    if test_set:
        return all_assessments[-1], all_assessments[:-1] # test previous data to incorporate into training
    return all_assessments

In [8]:
def get_train_and_test(train, test):
    compiled_train = []
    compiled_test = []
    compiled_val = []

    for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort=False)), total=train.installation_id.nunique(), desc='Installation_id', position=0):
        compiled_train += get_data(user_sample)
    del train
    for ins_id, user_sample in tqdm(test.groupby('installation_id', sort=False), total=test.installation_id.nunique(), desc='Installation_id', position=0):
        test_data, val_data = get_data(user_sample, test_set=True)
        compiled_test.append(test_data)
        compiled_val += val_data
    del test
    reduce_train = pd.DataFrame(compiled_train)
    reduce_test = pd.DataFrame(compiled_test)
    reduce_val = pd.DataFrame(compiled_val)

    categoricals = ['session_title']
    return reduce_train, reduce_test, reduce_val, categoricals
new_train, new_test, new_val, categoricals = get_train_and_test(train, test)

HBox(children=(IntProgress(value=0, description='Installation_id', max=17000, style=ProgressStyle(description_…




HBox(children=(IntProgress(value=0, description='Installation_id', max=1000, style=ProgressStyle(description_w…




In [9]:
tmp = new_train[new_train.Game==0].copy()
tmp = tmp[tmp.Activity == 0].copy()
tmp = tmp[tmp.Clip == 0].copy()
tmp = tmp[tmp.Assessment ==0].copy()
remove_train_index = tmp.index
new_train = new_train[~new_train.index.isin(remove_train_index)].copy()

In [10]:
print(new_train.shape)
print(new_test.shape)
print(new_val.shape)

(17577, 563)
(1000, 563)
(2347, 563)


In [11]:
# data augmentation
tmp = new_val[new_val.Game==0].copy()
tmp = tmp[tmp.Activity == 0].copy()
tmp = tmp[tmp.Clip == 0].copy()
tmp = tmp[tmp.Assessment ==0].copy()
remove_val_index = tmp.index
add_val = new_val[~new_val.index.isin(remove_val_index)].copy()

#tmp = add_val.installation_id.value_counts().reset_index(drop=False) # include some part of new_test installation_id
#val_id = list(tmp[tmp.installation_id >= 20]["index"])
#add_val = add_val[add_val.installation_id.isin(val_id)]

mod_train = pd.concat([new_train, add_val], ignore_index=True)
mod_train.shape

(19906, 563)

# Feature selection

In [12]:
def exclude(reduce_train, reduce_test, features):
    to_exclude = [] 
    ajusted_test = reduce_test.copy()
    for feature in features:
        if feature not in ['accuracy_group', 'installation_id', 'session_title', 'hightest_level']:
            data = reduce_train[feature]
            train_mean = data.mean()
            data = ajusted_test[feature] 
            test_mean = data.mean()
            try:
                ajust_factor = train_mean / test_mean
                if ajust_factor > 10 or ajust_factor < 0.1:# or error > 0.01:
                    to_exclude.append(feature)
                    print(feature)
                else:
                    ajusted_test[feature] *= ajust_factor
            except:
                to_exclude.append(feature)
                print(feature)
    return to_exclude, ajusted_test
features = [i for i in new_train.columns if i not in ["game_session"]]
to_exclude, ajusted_test = exclude(new_train, new_test, features)

Air Show_4080
Bottle Filler (Activity)_2010
Bubble Bath_4080
Bubble Bath_4090
Bug Measurer (Activity)_4080
Cart Balancer (Assessment)_4080
Chest Sorter (Assessment)_4080
Crystals Rule_2010
Dino Dive_4080
Dino Drink_4080
Egg Dropper (Activity)_4080
Fireworks (Activity)_4080
Happy Camel_4080
Leaf Leader_4080
Mushroom Sorter (Assessment)_4080
Pan Balance_2010
Pan Balance_4080
Sandcastle Builder (Activity)_2010
Scrub-A-Dub_4080
Watering Hole (Activity)_2010
acc_Cart Balancer (Assessment)


# classification

In [13]:
def feature_selection(train):
    X_train = train.drop(['accuracy_group'],axis=1) 
    y_train = train.accuracy_group.copy()
    y_train.loc[y_train <=1] = 0
    y_train.loc[y_train >=2] = 1
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(X_train["installation_id"]))
    X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
    remove_features = [i for i in X_train.columns if "_4235" in i or i == "world_"+str(activities_world["NONE"])
                      or i in to_exclude]
    for i in X_train.columns:
        if X_train[i].std() == 0 and i not in remove_features:
            remove_features.append(i)
    X_train = X_train.drop(remove_features, axis=1)
    X_train = X_train[sorted(X_train.columns.tolist())]

    n_folds=5
    skf=GroupKFold(n_splits = n_folds)
    models = []
    lgbm_params = {'objective': 'binary','eval_metric': 'auc','metric': 'auc', 'boosting_type': 'gbdt',
 'tree_learner': 'serial','bagging_fraction': 0.9605425291685099,'bagging_freq': 4,'colsample_bytree': 0.6784238046856443,
 'feature_fraction': 1,'learning_rate': 0.017891320270412462,'max_depth': 7,
 'min_data_in_leaf': 8,'min_sum_hessian_in_leaf': 17,'num_leaves': 17}

    valid = pd.DataFrame(np.zeros([X_train.shape[0]]))
    features_list = [i for i in X_train.columns if i != "installation_id"]
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
    for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train, X_train["installation_id"])):
        print("Fold "+str(i+1))
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]
        X_train2 = X_train2.drop(['installation_id'],axis=1)
    
        X_test2 = X_train.iloc[test_index,:]
        y_test2 = y_train.iloc[test_index]
        X_test2 = X_test2.drop(['installation_id'],axis=1)
            
        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
        clf = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
            num_boost_round=10000,early_stopping_rounds=100,verbose_eval = 500,categorical_feature = categoricals)
        test_predict = clf.predict(X_test2, num_iteration = clf.best_iteration)
        valid.iloc[test_index] = test_predict.reshape(X_test2.shape[0], 1)
        feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()
                
    print("logloss = \t {}".format(log_loss(y_train, valid)))
    print("ROC = \t {}".format(roc_auc_score(y_train, valid)))
    feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]
    return feature_importance_df
#df_for_classification = feature_selection(new_train)

In [14]:
def accuracy_class(train, test, fea, select_flg):
    X_train = train.drop(['accuracy_group'],axis=1) 
    y_train = train.accuracy_group.copy()
    y_train.loc[y_train <=1] = 0
    y_train.loc[y_train >=2] = 1
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(X_train["installation_id"]))
    X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
    remove_features = [i for i in X_train.columns if "_4235" in i or i == "world_"+str(activities_world["NONE"])
                      or i in to_exclude ]
    for i in X_train.columns:
        if X_train[i].std() == 0 and i not in remove_features:
            remove_features.append(i)
    X_train = X_train.drop(remove_features, axis=1)
    if select_flg == True:
        X_train = X_train[fea]
    X_train = X_train[sorted(X_train.columns.tolist())]

    X_test = test.drop(["installation_id","accuracy_group"], axis=1)
    X_test = X_test.drop(remove_features, axis=1)
    if select_flg == True:
        X_test = X_test[fea]
    X_test = X_test[sorted(X_test.columns.tolist())]
    print(X_test.shape[1])
    
    n_folds=5
    skf=GroupKFold(n_splits = n_folds)
    models = []
    lgbm_params = {'objective': 'binary','eval_metric': 'auc','metric': 'auc', 'boosting_type': 'gbdt',
 'tree_learner': 'serial','bagging_fraction': 0.9605425291685099,'bagging_freq': 4,'colsample_bytree': 0.6784238046856443,
 'feature_fraction': 1,'learning_rate': 0.017891320270412462,'max_depth': 7,
 'min_data_in_leaf': 8,'min_sum_hessian_in_leaf': 17,'num_leaves': 17}

    valid = pd.DataFrame(np.zeros([X_train.shape[0]]))
    for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train, X_train["installation_id"])):
        print("Fold "+str(i+1))
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]
        X_train2 = X_train2.drop(['installation_id'],axis=1)
    
        X_test2 = X_train.iloc[test_index,:]
        y_test2 = y_train.iloc[test_index]
        X_test2 = X_test2.drop(['installation_id'],axis=1)
        if select_flg == True:
            X_train2 = X_train2[fea] 
            X_test2 = X_test2[fea]
            
        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
        
        clf = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
            num_boost_round=10000,early_stopping_rounds=100,verbose_eval = 500,categorical_feature = categoricals)
        test_predict = clf.predict(X_test2, num_iteration = clf.best_iteration)
        models.append(clf)
        valid.iloc[test_index] = test_predict.reshape(X_test2.shape[0], 1)
                
    print("logloss = \t {}".format(log_loss(y_train, valid)))
    print("ROC = \t {}".format(roc_auc_score(y_train, valid)))
    print('Accuracy score = \t {}'.format(accuracy_score(y_train, np.round(valid))))
    print('Precision score = \t {}'.format(precision_score(y_train, np.round(valid))))
    print('Recall score =   \t {}'.format(recall_score(y_train, np.round(valid))))
    print('F1 score =      \t {}'.format(f1_score(y_train, np.round(valid))))
    print(confusion_matrix(y_train, np.round(valid)))
    pred_value = np.zeros([X_test.shape[0]])
    for model in models:
        pred_value += model.predict(X_test, num_iteration = model.best_iteration) / len(models)
    return pred_value, valid

#tmp = df_for_classification.sort_values("Cv", ascending = True).reset_index(drop=True).copy()
#feat = tmp[tmp.index <= 120]["Feature"]
#feat = []
#pred_value, valid = accuracy_class(new_train, new_test, feat, False)
#pred_value, valid = accuracy_class(mod_train, new_test, feat, False)

In [15]:
#best_score = 0
#for i in range(10):
#    optR = OptimizedRounder_cla()
#    #optR.fit(np.array(valid).reshape(-1,), new_train.accuracy_group, random_flg=True)
#    optR.fit(np.array(valid).reshape(-1,), mod_train.accuracy_group, random_flg=True)
#    coefficients = optR.coefficients()
#    final_valid_pred = optR.predict(np.array(valid).reshape(-1,), coefficients)
    #score = qwk(new_train.accuracy_group, final_valid_pred)
    #score = qwk(mod_train.accuracy_group, final_valid_pred)
#    print(i, np.sort(coefficients), score)
#    if score > best_score:
#        best_score = score
#        best_coefficients = coefficients
#final_test_pred = pd.cut(np.array(pred_value).reshape(-1,), [-np.inf] + list(np.sort(best_coefficients)) + [np.inf], labels = [0, 1, 2, 3])
#final_test_pred = pd.cut(np.array(pred_value).reshape(-1,), [-np.inf] + list(np.sort([0.31635244, 0.54903181, 0.74679975])) + [np.inf], labels = [0, 1, 2, 3])

#sample_submission["accuracy_group"] = final_test_pred.astype(int)
#sample_submission.to_csv('submission.csv', index=False)
#sample_submission["accuracy_group"].value_counts(normalize = True)

# truncated version

In [16]:
def feature_selection_mod(train):
    X_train = train.drop(['accuracy_group'],axis=1) 
    y_train = train.accuracy_group.copy()
    y_train.loc[y_train <=1] = 0
    y_train.loc[y_train >=2] = 1
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(X_train["installation_id"]))
    X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
    remove_features = [i for i in X_train.columns if "_4235" in i or i == "world_"+str(activities_world["NONE"])
                      or i in to_exclude]
    for i in X_train.columns:
        if X_train[i].std() == 0 and i not in remove_features:
            remove_features.append(i)
    X_train = X_train.drop(remove_features, axis=1)
    X_train = X_train[sorted(X_train.columns.tolist())]

    n_folds=5
    skf=GroupKFold(n_splits = n_folds)
    models = []
    lgbm_params = {'objective': 'binary','eval_metric': 'auc','metric': 'auc', 'boosting_type': 'gbdt',
 'tree_learner': 'serial','bagging_fraction': 0.9605425291685099,'bagging_freq': 4,'colsample_bytree': 0.6784238046856443,
 'feature_fraction': 1,'learning_rate': 0.017891320270412462,'max_depth': 7,
 'min_data_in_leaf': 8,'min_sum_hessian_in_leaf': 17,'num_leaves': 17}

    valid = pd.DataFrame(np.zeros([X_train.shape[0]]))
    features_list = [i for i in X_train.columns if i != "installation_id"]
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
    random_try = 10
    for try_time in range(random_try):
        valid = np.array([])
        real = np.array([])
        for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train, X_train["installation_id"])):
            print("Fold "+str(i+1))
            X_train2 = X_train.iloc[train_index,:]
            y_train2 = y_train.iloc[train_index]
            X_train2 = X_train2.drop(['installation_id'],axis=1)
    
            X_test2 = X_train.iloc[test_index,:]
            y_test2 = y_train.iloc[test_index]
            print("Before truncation:", (X_test2.shape, y_test2.shape))
            X_test2['accuracy_group'] = y_test2
            np.random.seed(try_time)
            X_test2 = X_test2.groupby('installation_id').agg(np.random.choice).reset_index(drop=False)
            y_test2 = X_test2.accuracy_group.copy()
            X_test2.drop(["accuracy_group"], axis=1, inplace=True)
            print("After truncation:", (X_test2.shape, y_test2.shape))
            X_test2 = X_test2.drop(['installation_id'],axis=1)
            
            lgb_train = lgb.Dataset(X_train2, y_train2)
            lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
            clf = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
                num_boost_round=10000,early_stopping_rounds=100,verbose_eval = 500,categorical_feature = categoricals)
            valid_predict = clf.predict(X_test2, num_iteration = clf.best_iteration).reshape(X_test2.shape[0], )
            #mean_score += average_precision_score(y_test2,valid_predict) / n_folds
            valid = np.concatenate([valid, valid_predict])
            real = np.concatenate([real, y_test2])
            feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()
                
        print("logloss = \t {}".format(log_loss(real, valid)))
        print("ROC = \t {}".format(roc_auc_score(real, valid)))
        feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
        feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
        feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]
    return feature_importance_df
#df_for_classification = feature_selection_mod(new_train)

In [17]:
def get_random_assessment(reduce_train):
    used_idx = []
    for iid in tqdm(set(reduce_train['installation_id']), miniters=200):
        list_ = list(reduce_train[reduce_train['installation_id'] == iid].index)
        cur = random.choices(list_, k=1)[0]
        used_idx.append(cur)
    reduce_train_t = reduce_train.loc[used_idx]
    return reduce_train_t, used_idx

def accuracy_class_mod(train, test, fea, select_flg):
    #X_train = train.drop(["accuracy_group"],axis=1) 
    y_train = train.accuracy_group.copy()
    X_train = train.rename(columns={"accuracy_group": "past_target"})
    y_train.loc[y_train <=1] = 0
    y_train.loc[y_train >=2] = 1
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(X_train["installation_id"]))
    X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
    remove_features = [i for i in X_train.columns if "_4235" in i or i == "world_"+str(activities_world["NONE"])
                      or i in to_exclude]
    for i in X_train.columns:
        if X_train[i].std() == 0 and i not in remove_features:
            remove_features.append(i)
    X_train = X_train.drop(remove_features, axis=1)
    if select_flg == True:
        X_train = X_train[fea]
    X_train = X_train[sorted(X_train.columns.tolist())]

    X_test = test.drop(["installation_id","accuracy_group"], axis=1)
    X_test = X_test.drop(remove_features, axis=1)
    if select_flg == True:
        X_test = X_test[fea]
    X_test = X_test[sorted(X_test.columns.tolist())]
    
    n_folds = 10
    skf=GroupKFold(n_splits = n_folds)
    models = []
    lgbm_params = {'objective': 'binary','eval_metric': 'auc','metric': 'auc', 'boosting_type': 'gbdt',
 'tree_learner': 'serial','bagging_fraction': 0.5698056418890787,'bagging_freq': 4,
 'colsample_bytree': 0.37564408454469,'learning_rate': 0.015433389422506185,'max_depth': 8,
 'min_data_in_leaf': 51,'min_sum_hessian_in_leaf': 10,'num_leaves': 48}
    random_try = 5
    mean_qwk_score = 0
    for try_time in range(random_try):
        # model learning ---------------------------
        for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train, X_train["installation_id"])):
            target = np.array([])
            print("Fold "+str(i+1))
            X_train2 = X_train.iloc[train_index,:]
            y_train2 = y_train.iloc[train_index]
            X_train2 = X_train2.drop(['installation_id', 'past_target'],axis=1)
    
            X_test2 = X_train.iloc[test_index,:]
            y_test2 = y_train.iloc[test_index]
            
            X_test2, idx_val = get_random_assessment(X_test2)
            tmp_target = X_test2.loc[idx_val]["past_target"]
            X_test2.drop(['installation_id', 'past_target'], inplace=True, axis=1)
            y_test2 = y_test2.loc[idx_val]
            print("After truncation:", (X_test2.shape, y_test2.shape))
            
            lgb_train = lgb.Dataset(X_train2, y_train2)
            lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
            clf = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
                num_boost_round=10000,early_stopping_rounds=100,verbose_eval = 500,categorical_feature = categoricals)
            valid = np.array(clf.predict(X_test2, num_iteration = clf.best_iteration).reshape(X_test2.shape[0], ))
            real = np.array(y_test2)
            target = np.array(tmp_target)
            
            models.append(clf)
            print("logloss = \t {}".format(log_loss(real, valid)))
            print("ROC = \t {}".format(roc_auc_score(real, valid)))
        
            # threshold optimization --------------
            best_score = 0
            for i in range(20):
                optR = OptimizedRounder_cla()
                optR.fit(np.array(valid).reshape(-1,), target, random_flg=True)
                coefficients = optR.coefficients()
                final_valid_pred = optR.predict(np.array(valid).reshape(-1,), coefficients)
                score = qwk(target, final_valid_pred)
                print(i, np.sort(coefficients), score)
                if score > best_score:
                    best_score = score
                    best_coefficients = coefficients
            mean_qwk_score += best_score / (random_try * n_folds)
            if try_time == 0:
                final_coefficients = np.sort(best_coefficients) / (random_try * n_folds)
            else:
                final_coefficients += np.sort(best_coefficients) / (random_try * n_folds)
            
    print("MEAN QWK = \t {}".format(mean_qwk_score))
    # test prediction  ------------------------
    pred_value = np.zeros([X_test.shape[0]])
    for model in models:
        pred_value += model.predict(X_test, num_iteration = model.best_iteration) / len(models)
    return pred_value, valid, final_coefficients

#tmp = df_for_classification.sort_values("Cv", ascending = True).reset_index(drop=True).copy()
#feat = tmp[tmp.index <= 120]["Feature"]
feat = []
pred_value, valid, final_coefficients = accuracy_class_mod(new_train, new_test, feat, False)
final_test_pred = pd.cut(np.array(pred_value).reshape(-1,), [-np.inf] + list(np.sort(final_coefficients)) + [np.inf], labels = [0, 1, 2, 3])
#final_test_pred = pd.cut(np.array(pred_value).reshape(-1,), [-np.inf] + list(np.sort([0.32229148, 0.51887455, 0.77529457])) + [np.inf], labels = [0, 1, 2, 3])

sample_submission["accuracy_group"] = final_test_pred.astype(int)
sample_submission.to_csv('submission.csv', index=False)
sample_submission["accuracy_group"].value_counts(normalize = True)

Fold 1


HBox(children=(IntProgress(value=0, max=355), HTML(value='')))


After truncation: ((355, 538), (355,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[131]	training's auc: 0.884052	valid_1's auc: 0.802575
logloss = 	 0.5232098309406895
ROC = 	 0.8025750329243779
0 [0.41047781 0.59993622 0.61067068] 0.53606924
1 [0.40729469 0.55923765 0.67633519] 0.54828137
2 [0.41142905 0.56244625 0.67736436] 0.54964578
3 [0.4085878  0.59987651 0.67571715] 0.55667164
4 [0.43295025 0.56244407 0.60825133] 0.5231377
5 [0.41144516 0.5578541  0.67639367] 0.55047862
6 [0.456449   0.56357231 0.67850087] 0.54097211
7 [0.40975892 0.56483849 0.6091776 ] 0.52767878
8 [0.41231296 0.5686101  0.67631279] 0.54988841
9 [0.4349951  0.55973902 0.67629044] 0.54433579
10 [0.43515883 0.59995704 0.67449352] 0.55037421
11 [0.45855804 0.60019746 0.68205013] 0.54527648
12 [0.4337443  0.56383673 0.67722591] 0.54292853
13 [0.40982472 0.55971353 0.68014843] 0.54744696
14 [0.4124459  0.60932571 0.67880099] 0.55054154
15 [0.43732881 0.56267588 

HBox(children=(IntProgress(value=0, max=356), HTML(value='')))


After truncation: ((356, 538), (356,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	training's auc: 0.835264	valid_1's auc: 0.795905
logloss = 	 0.6565671319367186
ROC = 	 0.7959047998936312
0 [0.42634539 0.54939174 0.68881243] 0.0
1 [0.4910971  0.52340556 0.62534722] 0.25144619
2 [0.42334228 0.51983596 0.62353593] 0.2491031
3 [0.40315969 0.62303984 0.62895168] 0.47043208
4 [0.32229717 0.62156934 0.62534045] 0.45096455
5 [0.42401923 0.51503249 0.62505306] 0.25366876
6 [0.45790781 0.53712867 0.62535876] 0.25144619
7 [0.45908689 0.51457667 0.69876663] 0.0
8 [0.51444154 0.56526827 0.625393  ] 0.25144619
9 [0.46184359 0.62312522 0.6593146 ] 0.33502866
10 [0.40396972 0.62319938 0.62898035] 0.47433619
11 [0.42396339 0.55005226 0.62503329] 0.25366876
12 [0.44148874 0.5492105  0.62473044] 0.25363357
13 [0.46986671 0.54829282 0.62538303] 0.25144619
14 [0.44863492 0.6235677  0.62892063] 0.47208896
15 [0.44397292 0.56467819 0.62537741] 0.25

HBox(children=(IntProgress(value=0, max=357), HTML(value='')))


After truncation: ((357, 538), (357,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[54]	training's auc: 0.867125	valid_1's auc: 0.840544
logloss = 	 0.5621899614693733
ROC = 	 0.8405441274054414
0 [0.45333572 0.588183   0.6595589 ] 0.59376751
1 [0.52369468 0.58825677 0.66150841] 0.62414847
2 [0.45637119 0.61495603 0.66187636] 0.60059403
3 [0.54802922 0.58866254 0.66140671] 0.61751363
4 [0.53103365 0.60191494 0.66102577] 0.6206298
5 [0.4556603  0.58899544 0.66154536] 0.59594566
6 [0.53034753 0.62555447 0.65972789] 0.62356957
7 [0.48401663 0.58880391 0.66042246] 0.61106863
8 [0.53123494 0.61499698 0.65981883] 0.62938786
9 [0.47629835 0.57678324 0.65970938] 0.59809998
10 [0.5225735  0.6274666  0.65939618] 0.61978324
11 [0.44673203 0.58898677 0.62834146] 0.57359811
12 [0.47059801 0.58837895 0.65740904] 0.59833164
13 [0.44142196 0.54944198 0.65941233] 0.56398281
14 [0.53201345 0.62419208 0.65970554] 0.62181825
15 [0.5480372  0.58849871 0

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[338]	training's auc: 0.914993	valid_1's auc: 0.824194
logloss = 	 0.49303340592379785
ROC = 	 0.824193763326226
0 [0.48520855 0.48823931 0.71857778] 0.60834584
1 [0.44162268 0.54263039 0.67514895] 0.60147493
2 [0.46716724 0.48014277 0.67541237] 0.61310989
3 [0.48917998 0.60248128 0.67596883] 0.58780815
4 [0.48832297 0.4941435  0.67516291] 0.61438217
5 [0.48759949 0.53056742 0.6788529 ] 0.60944333
6 [0.47676829 0.56031674 0.68765071] 0.6028613
7 [0.49357692 0.56013533 0.67763174] 0.60376927
8 [0.48935856 0.49001148 0.67473266] 0.61438217
9 [0.47890901 0.53060393 0.68792825] 0.60981957
10 [0.47217901 0.47413152 0.67540245] 0.6131816
11 [0.48321996 0.60403239 0.63214272] 0.5704033
12 [0.44141174 0.53091223 0.6816547 ] 0.60721984
13 [0.43655845 0.48579439 0.67790402] 0.61030543
14 [0.48328888 0.5523536  0.67763272] 0.60376927
15 [0.48828658 0.49539623 0.

HBox(children=(IntProgress(value=0, max=359), HTML(value='')))


After truncation: ((359, 538), (359,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[368]	training's auc: 0.919733	valid_1's auc: 0.832432
logloss = 	 0.5160889915661622
ROC = 	 0.8324320050600885
0 [0.41508017 0.5415616  0.71073952] 0.56269867
1 [0.38935432 0.65335664 0.78042534] 0.57413342
2 [0.50033521 0.54358367 0.78029672] 0.584079
3 [0.4192843  0.50105625 0.75528117] 0.5801979
4 [0.41946022 0.49794868 0.75481103] 0.5801979
5 [0.40292769 0.53778156 0.75496231] 0.58276003
6 [0.41700293 0.54146107 0.77837482] 0.58373519
7 [0.40769993 0.54701509 0.7545469 ] 0.58603738
8 [0.48674997 0.51609462 0.75482128] 0.57779894
9 [0.41956789 0.54573761 0.76894243] 0.58545353
10 [0.4169473  0.66148498 0.76809723] 0.58015068
11 [0.36081537 0.54176096 0.7681881 ] 0.59127699
12 [0.37025488 0.50985118 0.78035452] 0.57936961
13 [0.41481987 0.65383598 0.76771564] 0.58158508
14 [0.35899781 0.65433253 0.76811335] 0.58132348
15 [0.4212753  0.50144371 0.7

HBox(children=(IntProgress(value=0, max=359), HTML(value='')))


After truncation: ((359, 538), (359,))
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.932027	valid_1's auc: 0.807868
Early stopping, best iteration is:
[401]	training's auc: 0.921993	valid_1's auc: 0.808488
logloss = 	 0.4909727344869291
ROC = 	 0.8084884994523549
0 [0.38064289 0.45744884 0.75784926] 0.51767733
1 [0.36913381 0.56288928 0.69942087] 0.50616855
2 [0.44872502 0.56150296 0.6983188 ] 0.48888544
3 [0.32771639 0.63713034 0.70048979] 0.52452668
4 [0.38158085 0.56125932 0.75554425] 0.51393993
5 [0.38345083 0.67615304 0.71840129] 0.52574971
6 [0.30972395 0.63720125 0.77358442] 0.52273554
7 [0.38316027 0.39953526 0.75575786] 0.52504253
8 [0.38297718 0.39795465 0.78183293] 0.52183888
9 [0.36700126 0.63861685 0.71837409] 0.52837475
10 [0.38345375 0.63857561 0.70148676] 0.52933954
11 [0.45699227 0.66862303 0.75657691] 0.50698437
12 [0.3671111 0.5613702 0.6995813] 0.50854254
13 [0.3836619  0.63866179 0.70852917] 0.52830406
14 [0.32767402 0.67656

HBox(children=(IntProgress(value=0, max=359), HTML(value='')))


After truncation: ((359, 538), (359,))
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.93262	valid_1's auc: 0.817228
Early stopping, best iteration is:
[530]	training's auc: 0.935084	valid_1's auc: 0.818959
logloss = 	 0.469416841903498
ROC = 	 0.8189588189588191
0 [0.44862687 0.54052976 0.7191448 ] 0.59297583
1 [0.44878135 0.5080936  0.718487  ] 0.59307305
2 [0.44953986 0.49905001 0.68891679] 0.58655859
3 [0.44861128 0.50660634 0.72087989] 0.59230498
4 [0.45122241 0.50358748 0.61458959] 0.59089351
5 [0.43813899 0.5359122  0.72032379] 0.58534018
6 [0.43833573 0.54015286 0.60904016] 0.5842129
7 [0.44959846 0.53240277 0.71903834] 0.58977851
8 [0.45127604 0.5356215  0.71865736] 0.59297583
9 [0.44932811 0.50300855 0.71410945] 0.58929227
10 [0.44088684 0.44885212 0.80563003] 0.58729659
11 [0.450168   0.50500189 0.6923052 ] 0.58897034
12 [0.5014123  0.5395939  0.61277687] 0.57883051
13 [0.50404088 0.53538899 0.61828871] 0.57883051
14 [0.44764214 0.53876

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[39]	training's auc: 0.860698	valid_1's auc: 0.825911
logloss = 	 0.5789530136337093
ROC = 	 0.825910931174089
0 [0.49674948 0.59224059 0.66715035] 0.59361545
1 [0.50361353 0.59122362 0.66713048] 0.59290885
2 [0.43693194 0.59084438 0.64500507] 0.53161699
3 [0.35920186 0.61979057 0.66944002] 0.55307639
4 [0.41654014 0.61982211 0.62086028] 0.53781975
5 [0.56125404 0.60030207 0.66692962] 0.61846128
6 [0.51906155 0.58500908 0.66628996] 0.590557
7 [0.56283198 0.59110169 0.66690025] 0.621221
8 [0.56121627 0.6198202  0.66948167] 0.62763195
9 [0.56139444 0.60830618 0.62163391] 0.59573634
10 [0.56333154 0.59112144 0.66530081] 0.61688817
11 [0.56110988 0.59609169 0.6671573 ] 0.62041468
12 [0.51876796 0.6194866  0.66693512] 0.61026439
13 [0.39033893 0.6166393  0.66694164] 0.55162261
14 [0.51870603 0.56321222 0.66698502] 0.58784105
15 [0.51840231 0.61961352 0.665

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.932693	valid_1's auc: 0.82137
Early stopping, best iteration is:
[568]	training's auc: 0.938661	valid_1's auc: 0.822924
logloss = 	 0.5021164307196836
ROC = 	 0.8229237493929092
0 [0.45805528 0.45881464 0.79827898] 0.5882823
1 [0.45879333 0.58709971 0.76965998] 0.56675059
2 [0.45627728 0.5569544  0.77163396] 0.56786079
3 [0.41098877 0.45828742 0.81819723] 0.57912561
4 [0.4584457  0.59567861 0.65115313] 0.53337023
5 [0.45437973 0.5023543  0.79547365] 0.56901574
6 [0.43076113 0.5572783  0.7712419 ] 0.55959811
7 [0.44646606 0.45814495 0.81907759] 0.58244138
8 [0.41255493 0.50564556 0.77901234] 0.562255
9 [0.41230808 0.4630712  0.81599338] 0.57674139
10 [0.33897206 0.46423519 0.81528483] 0.57723006
11 [0.45812452 0.45902428 0.7744762 ] 0.59019697
12 [0.34998327 0.58217031 0.7952757 ] 0.55413849
13 [0.45480351 0.46421387 0.80052459] 0.58385125
14 [0.45846833 0.524784

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[45]	training's auc: 0.863642	valid_1's auc: 0.78276
logloss = 	 0.6080213524863044
ROC = 	 0.7827604032049624
0 [0.45743022 0.64119496 0.66415212] 0.50570783
1 [0.51077705 0.63953617 0.64143029] 0.51326863
2 [0.50293607 0.62087552 0.66684983] 0.51910187
3 [0.4999259  0.64112125 0.6764783 ] 0.51926566
4 [0.50203324 0.62268886 0.66683046] 0.52029208
5 [0.45667064 0.62159935 0.67464681] 0.50122661
6 [0.50029345 0.53072227 0.66686216] 0.43382365
7 [0.45440906 0.53058686 0.66408501] 0.41245084
8 [0.5094258  0.63314845 0.66660486] 0.51462067
9 [0.49422813 0.59521421 0.66410217] 0.4932718
10 [0.55712366 0.62419929 0.67483568] 0.52396764
11 [0.45793712 0.62269681 0.64548798] 0.49796193
12 [0.50680648 0.64043164 0.66689886] 0.52046278
13 [0.50546199 0.64009771 0.67495181] 0.51822901
14 [0.50296719 0.63914209 0.66418563] 0.51969939
15 [0.50835706 0.62551849 0.

HBox(children=(IntProgress(value=0, max=355), HTML(value='')))


After truncation: ((355, 538), (355,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[53]	training's auc: 0.866071	valid_1's auc: 0.771169
logloss = 	 0.5858589440533024
ROC = 	 0.7711690840719848
0 [0.50035741 0.58469026 0.69157769] 0.48151985
1 [0.48565334 0.5030338  0.69390508] 0.46782498
2 [0.49825027 0.56028311 0.68258215] 0.47659492
3 [0.48616143 0.58537388 0.6406184 ] 0.47610161
4 [0.49941043 0.50014708 0.64040555] 0.46902367
5 [0.48672424 0.56588848 0.69460414] 0.4796621
6 [0.48899318 0.55854765 0.68253158] 0.47768414
7 [0.48548032 0.57326986 0.63942365] 0.47849179
8 [0.4867265  0.55665861 0.6398891 ] 0.47379423
9 [0.5003204  0.55163829 0.68266877] 0.46993869
10 [0.48503735 0.57355265 0.64164322] 0.4768709
11 [0.48578627 0.56081518 0.69501649] 0.48417151
12 [0.48678461 0.57033642 0.63899781] 0.47782585
13 [0.4873348  0.57336751 0.64566386] 0.4768709
14 [0.48543133 0.56306604 0.6342818 ] 0.47719765
15 [0.48736081 0.56955061 0.6

HBox(children=(IntProgress(value=0, max=356), HTML(value='')))


After truncation: ((356, 538), (356,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[137]	training's auc: 0.885135	valid_1's auc: 0.828291
logloss = 	 0.5127430080576866
ROC = 	 0.8282909775447088
0 [0.40987401 0.58405437 0.71866939] 0.62035648
1 [0.43483645 0.56443101 0.72060435] 0.61561006
2 [0.49975111 0.57949974 0.71653344] 0.61622784
3 [0.42435682 0.55839127 0.66847703] 0.59630344
4 [0.41892197 0.51757257 0.71915025] 0.60618573
5 [0.42341542 0.57364692 0.71919624] 0.62280585
6 [0.43167158 0.58362187 0.71888502] 0.62522598
7 [0.46798237 0.58267995 0.7186693 ] 0.62363846
8 [0.46591553 0.5676304  0.65802318] 0.59552841
9 [0.43054859 0.57921477 0.71778463] 0.62484234
10 [0.46719839 0.56398878 0.71892981] 0.61944617
11 [0.42283398 0.56328527 0.71885842] 0.62280585
12 [0.46849769 0.57181597 0.65118208] 0.59650143
13 [0.46878249 0.58425008 0.71924589] 0.62105432
14 [0.42833509 0.555276   0.71818407] 0.62046274
15 [0.47254725 0.58344167

HBox(children=(IntProgress(value=0, max=357), HTML(value='')))


After truncation: ((357, 538), (357,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[271]	training's auc: 0.90857	valid_1's auc: 0.83958
logloss = 	 0.47858039173143296
ROC = 	 0.8395798034564554
0 [0.43019337 0.54663142 0.73828596] 0.62947441
1 [0.42929747 0.50960121 0.65267679] 0.59927652
2 [0.45229676 0.54811848 0.74158835] 0.62343914
3 [0.43410489 0.59185816 0.59327636] 0.59099472
4 [0.41668701 0.54794132 0.71267057] 0.62234564
5 [0.43043211 0.54708623 0.74125877] 0.62865675
6 [0.43170477 0.54829055 0.74148043] 0.62865675
7 [0.43525494 0.50666912 0.74015304] 0.62827438
8 [0.43300661 0.5923564  0.59465688] 0.59099472
9 [0.42818836 0.47577998 0.74159128] 0.62339057
10 [0.38821663 0.50736378 0.74124721] 0.6335996
11 [0.38589384 0.50341263 0.74164287] 0.6335996
12 [0.42940688 0.5043099  0.74018012] 0.6282265
13 [0.4731863  0.59224567 0.67252585] 0.60310853
14 [0.46829723 0.54583936 0.60100255] 0.58513426
15 [0.4237258  0.50319093 0.7

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[131]	training's auc: 0.883567	valid_1's auc: 0.833406
logloss = 	 0.5065084702170728
ROC = 	 0.833406194303393
0 [0.49327705 0.49675796 0.72983296] 0.61326337
1 [0.49122297 0.52150642 0.73676079] 0.61134313
2 [0.42972812 0.4882032  0.73039316] 0.5894449
3 [0.49162007 0.5213388  0.72993803] 0.6130254
4 [0.45093314 0.52788441 0.62269534] 0.55660298
5 [0.49022329 0.49792067 0.73699833] 0.61150037
6 [0.48978624 0.49169202 0.72759343] 0.60859502
7 [0.49673717 0.49740776 0.56512975] 0.57134572
8 [0.45398782 0.53052319 0.69226001] 0.58536585
9 [0.49659224 0.49784333 0.7298394 ] 0.61523764
10 [0.43483248 0.48096972 0.72989946] 0.59256601
11 [0.43845298 0.49701196 0.72668957] 0.60086868
12 [0.45211169 0.49760164 0.72844504] 0.60469983
13 [0.49207225 0.49620574 0.69265074] 0.59515878
14 [0.49616475 0.49646504 0.73692267] 0.61357732
15 [0.49557843 0.49685366 0.

HBox(children=(IntProgress(value=0, max=359), HTML(value='')))


After truncation: ((359, 538), (359,))
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.932449	valid_1's auc: 0.807573
Early stopping, best iteration is:
[702]	training's auc: 0.947565	valid_1's auc: 0.812246
logloss = 	 0.5152961242259572
ROC = 	 0.8122462133419271
0 [0.42142323 0.54996339 0.76272601] 0.5868752
1 [0.43015368 0.58116279 0.62194223] 0.55915132
2 [0.42459374 0.56386462 0.61671797] 0.5631176
3 [0.39203965 0.56497187 0.76205821] 0.58756399
4 [0.32764786 0.52314448 0.76208609] 0.58765458
5 [0.43676428 0.61697131 0.71485105] 0.57572696
6 [0.42241762 0.55192785 0.63450908] 0.56217863
7 [0.41481731 0.54931542 0.61803473] 0.56324808
8 [0.47486302 0.55281657 0.71577758] 0.57062562
9 [0.40186819 0.6174159  0.63716157] 0.56306547
10 [0.3911006  0.51957145 0.76242807] 0.58125364
11 [0.42336928 0.56487891 0.71514609] 0.57926088
12 [0.49369054 0.55151846 0.61816445] 0.55724547
13 [0.3967038  0.54869642 0.76203749] 0.58886644
14 [0.4232166  0.5535

HBox(children=(IntProgress(value=0, max=359), HTML(value='')))


After truncation: ((359, 538), (359,))
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.932027	valid_1's auc: 0.819044
Early stopping, best iteration is:
[695]	training's auc: 0.947463	valid_1's auc: 0.822042
logloss = 	 0.4935857614340499
ROC = 	 0.8220416922133661
0 [0.3933226  0.6078395  0.74904325] 0.58752141
1 [0.35353827 0.61418303 0.75884983] 0.58238669
2 [0.2792323  0.59246283 0.74981451] 0.58209283
3 [0.30110465 0.66926345 0.75030529] 0.58084786
4 [0.39341914 0.59231218 0.75849536] 0.59097853
5 [0.39275589 0.52203035 0.71331108] 0.56764352
6 [0.39165887 0.61722372 0.71950538] 0.58275137
7 [0.49477775 0.61056513 0.61904416] 0.51862105
8 [0.39402808 0.61960503 0.74464324] 0.58770651
9 [0.37030089 0.60744881 0.71334316] 0.58107981
10 [0.46181857 0.61046459 0.74854028] 0.57100445
11 [0.46065623 0.6072123  0.75851358] 0.56997834
12 [0.38892892 0.60552703 0.74973811] 0.58549192
13 [0.39284249 0.60741959 0.75014807] 0.58752141
14 [0.4155811  0.62

HBox(children=(IntProgress(value=0, max=359), HTML(value='')))


After truncation: ((359, 538), (359,))
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.93262	valid_1's auc: 0.808601
Early stopping, best iteration is:
[444]	training's auc: 0.927447	valid_1's auc: 0.812434
logloss = 	 0.48839129790520025
ROC = 	 0.8124340670933258
0 [0.49392124 0.52941591 0.71953983] 0.51926797
1 [0.41625929 0.55344437 0.76564329] 0.52936922
2 [0.40845625 0.5354545  0.76541004] 0.5311489
3 [0.44779859 0.5360492  0.77140404] 0.53312593
4 [0.43607723 0.54932018 0.65025097] 0.50172782
5 [0.49117298 0.5357312  0.71956312] 0.52576946
6 [0.43886981 0.55185373 0.71957003] 0.5282472
7 [0.43531977 0.59030278 0.64970349] 0.50307967
8 [0.43637541 0.49154049 0.71959938] 0.52627998
9 [0.43337201 0.57983022 0.71950548] 0.52863386
10 [0.44802902 0.57746084 0.71917002] 0.52466677
11 [0.49484366 0.58765689 0.64573226] 0.50155981
12 [0.36125359 0.59136688 0.7206649 ] 0.53489428
13 [0.49135886 0.5357101  0.62655505] 0.49763551
14 [0.49459179 0.5581

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[84]	training's auc: 0.872799	valid_1's auc: 0.852682
logloss = 	 0.517842671225396
ROC = 	 0.8526817042606516
0 [0.49499765 0.53957896 0.64214772] 0.65078639
1 [0.49588857 0.53786439 0.64256707] 0.65078639
2 [0.42963085 0.54332559 0.65816398] 0.63658454
3 [0.5054928  0.5494737  0.64382158] 0.65182587
4 [0.50734309 0.5505671  0.64374036] 0.65182587
5 [0.42870723 0.56193016 0.64338792] 0.64115048
6 [0.426054   0.53577408 0.64373402] 0.63879921
7 [0.52848407 0.61615708 0.64328467] 0.64924992
8 [0.50440842 0.5496448  0.64349716] 0.65182587
9 [0.42934354 0.56048944 0.64325944] 0.64115048
10 [0.54534613 0.55346832 0.64354492] 0.6521615
11 [0.42448054 0.547697   0.64375271] 0.63811459
12 [0.50719718 0.56619151 0.64327622] 0.65183212
13 [0.44518997 0.61671171 0.70427042] 0.62787872
14 [0.42855368 0.53735179 0.70446615] 0.64205848
15 [0.49660857 0.56645063 0.

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[388]	training's auc: 0.922007	valid_1's auc: 0.799364
logloss = 	 0.5359086678925074
ROC = 	 0.7993643731129827
0 [0.46102179 0.55496234 0.68974443] 0.52718
1 [0.42527435 0.47823907 0.78677935] 0.5601184
2 [0.42805547 0.57442215 0.77496462] 0.55298753
3 [0.309726   0.57333338 0.78876786] 0.55926262
4 [0.4321996  0.57365909 0.81776893] 0.55078553
5 [0.47475967 0.55239841 0.69121175] 0.52953237
6 [0.40910493 0.5614717  0.7860076 ] 0.54970277
7 [0.46497542 0.571167   0.62721771] 0.51781704
8 [0.45524208 0.50987716 0.69021627] 0.51729336
9 [0.40196905 0.56647316 0.69242096] 0.52405597
10 [0.42312734 0.57131349 0.58631334] 0.52102111
11 [0.45444587 0.56463086 0.77514576] 0.54959111
12 [0.47499322 0.57148562 0.78689108] 0.55765564
13 [0.42489542 0.55805677 0.77587757] 0.55221175
14 [0.47514784 0.5567537  0.62780055] 0.51926898
15 [0.42570771 0.56511628 0.6

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[381]	training's auc: 0.918931	valid_1's auc: 0.852866
logloss = 	 0.4742242932426145
ROC = 	 0.8528656126482214
0 [0.37456729 0.53441996 0.71664604] 0.62273908
1 [0.42464772 0.71674349 0.7176069 ] 0.61540641
2 [0.3945112  0.63822403 0.67545025] 0.60093333
3 [0.4611826  0.53285406 0.67145137] 0.5932137
4 [0.37551267 0.53617651 0.66720574] 0.59493609
5 [0.3945612  0.63416765 0.7678549 ] 0.63591758
6 [0.38871295 0.53524936 0.75551501] 0.62960308
7 [0.46051655 0.53476418 0.66819448] 0.59450052
8 [0.37339672 0.60546762 0.75907494] 0.63685776
9 [0.39223502 0.6428512  0.75677009] 0.63686604
10 [0.44817808 0.53334779 0.6658763 ] 0.58917759
11 [0.42609798 0.53542298 0.75530147] 0.62225712
12 [0.39483878 0.53671917 0.7592019 ] 0.63180667
13 [0.46102106 0.54959069 0.66817793] 0.59081019
14 [0.51566562 0.61428487 0.75686219] 0.62005307
15 [0.37270512 0.66335006 

HBox(children=(IntProgress(value=0, max=355), HTML(value='')))


After truncation: ((355, 538), (355,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[53]	training's auc: 0.866071	valid_1's auc: 0.807265
logloss = 	 0.5662687296219309
ROC = 	 0.8072648693146738
0 [0.47566876 0.56479632 0.68159082] 0.53159893
1 [0.47695674 0.54492974 0.68162913] 0.52534207
2 [0.47640511 0.56199352 0.68680403] 0.52606938
3 [0.472166   0.54593172 0.68496853] 0.52177329
4 [0.45488365 0.58555789 0.68144952] 0.51996345
5 [0.45732592 0.54464888 0.68161857] 0.51535987
6 [0.4723261  0.54439235 0.68164438] 0.52282044
7 [0.44957829 0.54601836 0.68167411] 0.51009604
8 [0.47714004 0.58547466 0.6491086 ] 0.50374033
9 [0.47918195 0.56195998 0.68165866] 0.53159893
10 [0.47590818 0.56291281 0.6816804 ] 0.53159893
11 [0.47240105 0.56376333 0.68153545] 0.52911811
12 [0.47813558 0.58538732 0.68166141] 0.52960061
13 [0.4787572  0.54920429 0.68234757] 0.52365774
14 [0.47566927 0.56357313 0.68492859] 0.53064257
15 [0.4551835  0.56472451 

HBox(children=(IntProgress(value=0, max=356), HTML(value='')))


After truncation: ((356, 538), (356,))
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.93246	valid_1's auc: 0.796779
Early stopping, best iteration is:
[581]	training's auc: 0.939512	valid_1's auc: 0.799065
logloss = 	 0.5016517735412273
ROC = 	 0.799064935064935
0 [0.44893798 0.48018952 0.82440432] 0.58924585
1 [0.44569221 0.4661001  0.74525088] 0.56707995
2 [0.45292058 0.46668122 0.77275326] 0.57872631
3 [0.40230198 0.47796937 0.78314916] 0.5670818
4 [0.44275697 0.46632643 0.77674348] 0.57646701
5 [0.45887337 0.46799295 0.7097762 ] 0.56289352
6 [0.44792411 0.46548628 0.8222149 ] 0.59128393
7 [0.41613408 0.45978386 0.82265983] 0.57709945
8 [0.44827908 0.52888189 0.70923521] 0.55907246
9 [0.44896054 0.45945186 0.82260342] 0.58867288
10 [0.4553027  0.46712671 0.70924064] 0.56289352
11 [0.41129172 0.45822318 0.82230223] 0.57709945
12 [0.4497359  0.46626392 0.82198068] 0.59297284
13 [0.45659735 0.58460409 0.60969634] 0.53438328
14 [0.46617912 0.50239

HBox(children=(IntProgress(value=0, max=357), HTML(value='')))


After truncation: ((357, 538), (357,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[270]	training's auc: 0.908418	valid_1's auc: 0.836708
logloss = 	 0.4942879494512079
ROC = 	 0.8367077464788731
0 [0.41624712 0.56916921 0.68753409] 0.60282234
1 [0.49267739 0.55283804 0.76726457] 0.61727388
2 [0.42001613 0.58865344 0.60702629] 0.59428972
3 [0.45108202 0.54736724 0.77261465] 0.61831031
4 [0.47938369 0.57330432 0.59400544] 0.59541034
5 [0.41939997 0.53215422 0.64968066] 0.58619771
6 [0.42770166 0.5463527  0.77298984] 0.61821184
7 [0.41968732 0.55340225 0.76826776] 0.61983423
8 [0.44291228 0.58852186 0.60664553] 0.59306382
9 [0.43779407 0.58947339 0.59154623] 0.59647301
10 [0.47945468 0.54698144 0.6058837 ] 0.59558518
11 [0.44326171 0.58864998 0.59419726] 0.59647301
12 [0.47053853 0.54605184 0.59445718] 0.59686416
13 [0.47697699 0.52263458 0.77251146] 0.61596152
14 [0.4605873  0.51867782 0.73931808] 0.61013775
15 [0.42161475 0.57068568

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[377]	training's auc: 0.9198	valid_1's auc: 0.833863
logloss = 	 0.4758367728878802
ROC = 	 0.8338632750397457
0 [0.4397209  0.53720069 0.67646018] 0.60028092
1 [0.45376924 0.55101399 0.67783037] 0.59925184
2 [0.44597055 0.55961747 0.67682731] 0.59695152
3 [0.44132817 0.54630642 0.68195054] 0.59850436
4 [0.43782855 0.55250652 0.68320733] 0.60115978
5 [0.4455855  0.5501198  0.67900699] 0.59925184
6 [0.43082071 0.48437936 0.67902706] 0.60858228
7 [0.45949072 0.4631195  0.67947824] 0.61089379
8 [0.43762305 0.47302528 0.67903284] 0.61119451
9 [0.44127783 0.4866151  0.68086185] 0.61048348
10 [0.44002052 0.63186123 0.68497462] 0.59496749
11 [0.43606838 0.55901593 0.67959763] 0.60013688
12 [0.44122129 0.55048672 0.68299076] 0.60128529
13 [0.44560902 0.55073724 0.67859158] 0.59925184
14 [0.45091026 0.52443868 0.68036293] 0.59614172
15 [0.45991431 0.59190252 0

HBox(children=(IntProgress(value=0, max=359), HTML(value='')))


After truncation: ((359, 538), (359,))
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.932449	valid_1's auc: 0.808722
Early stopping, best iteration is:
[755]	training's auc: 0.950915	valid_1's auc: 0.814465
logloss = 	 0.5229361249211949
ROC = 	 0.8144654088050316
0 [0.32151539 0.43552939 0.77399595] 0.52857875
1 [0.3255011  0.63433429 0.79365262] 0.54728395
2 [0.34995117 0.61037981 0.79145189] 0.5408396
3 [0.43720109 0.49217618 0.65499816] 0.49324324
4 [0.40642004 0.62525468 0.77589242] 0.54044032
5 [0.28385771 0.61101377 0.7941322 ] 0.54992669
6 [0.30127618 0.6060949  0.77621542] 0.54727423
7 [0.48719975 0.6250525  0.64379122] 0.50316459
8 [0.34893147 0.4613067  0.77576199] 0.52074801
9 [0.35524123 0.60823214 0.77616582] 0.53989466
10 [0.399087   0.48774436 0.78950289] 0.52672047
11 [0.48719373 0.48745688 0.7243538 ] 0.52483927
12 [0.48096529 0.49092699 0.76710771] 0.5276151
13 [0.43745909 0.49103131 0.76691276] 0.52991941
14 [0.39815996 0.6132

HBox(children=(IntProgress(value=0, max=359), HTML(value='')))


After truncation: ((359, 538), (359,))
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.932027	valid_1's auc: 0.868937
Early stopping, best iteration is:
[552]	training's auc: 0.93643	valid_1's auc: 0.869385
logloss = 	 0.44246742365428315
ROC = 	 0.8693847319829131
0 [0.40623005 0.67003776 0.76167264] 0.63455588
1 [0.38242435 0.61507078 0.71876702] 0.6385946
2 [0.33400919 0.66794848 0.74585444] 0.64690705
3 [0.37056138 0.61357472 0.7166713 ] 0.63955544
4 [0.33736897 0.66958596 0.74648071] 0.64628698
5 [0.40033742 0.53332902 0.71675633] 0.62606447
6 [0.37167313 0.58937335 0.71708444] 0.64013428
7 [0.45118033 0.58762146 0.71635007] 0.63205501
8 [0.36815419 0.66749001 0.74624961] 0.64710817
9 [0.38628496 0.53269318 0.71611213] 0.62567672
10 [0.43459955 0.63137085 0.74493217] 0.63195956
11 [0.38168118 0.63173141 0.72198864] 0.63979537
12 [0.48964971 0.53438664 0.71555593] 0.61127681
13 [0.40528495 0.61571366 0.73122971] 0.6309246
14 [0.30003257 0.6139

HBox(children=(IntProgress(value=0, max=359), HTML(value='')))


After truncation: ((359, 538), (359,))
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.93262	valid_1's auc: 0.806844
Early stopping, best iteration is:
[433]	training's auc: 0.926315	valid_1's auc: 0.81049
logloss = 	 0.5010773391084503
ROC = 	 0.8104903118272102
0 [0.44174645 0.61721923 0.77011329] 0.55552914
1 [0.43461804 0.54125207 0.77205873] 0.54653825
2 [0.32771687 0.6401011  0.77073258] 0.56734235
3 [0.44697619 0.62700582 0.73234055] 0.54161213
4 [0.47806887 0.54325758 0.77018803] 0.5421548
5 [0.45688829 0.57329726 0.77456948] 0.54603909
6 [0.43542013 0.63732127 0.77006038] 0.56079288
7 [0.43458243 0.59452995 0.72501781] 0.53710339
8 [0.44994657 0.54896231 0.67196821] 0.51850455
9 [0.4572434  0.50476957 0.77319352] 0.54306577
10 [0.44622174 0.6373855  0.77163415] 0.56120474
11 [0.44878203 0.6268249  0.77021288] 0.559231
12 [0.43474271 0.65537761 0.73215209] 0.54268004
13 [0.45010255 0.55982959 0.77092218] 0.54872775
14 [0.47498229 0.5419722

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[42]	training's auc: 0.862597	valid_1's auc: 0.819701
logloss = 	 0.5702759956743101
ROC = 	 0.8197010869565218
0 [0.55208068 0.60623313 0.67103414] 0.59202446
1 [0.54369188 0.61476786 0.67589628] 0.59293032
2 [0.54650833 0.61153408 0.65990473] 0.59078262
3 [0.5476384  0.61035866 0.6709152 ] 0.59172114
4 [0.54350298 0.60605906 0.66499064] 0.59334208
5 [0.52861505 0.62843719 0.67095466] 0.58335387
6 [0.55650963 0.60642425 0.66779392] 0.59093016
7 [0.54650354 0.60743145 0.67037808] 0.59304842
8 [0.40321645 0.60614274 0.66758422] 0.51104457
9 [0.41150507 0.60762869 0.67083703] 0.51411509
10 [0.39285661 0.61149064 0.66055907] 0.50922547
11 [0.54734137 0.59924318 0.65981922] 0.58773791
12 [0.54359293 0.60835888 0.63821632] 0.58102612
13 [0.4200877  0.60967887 0.65229105] 0.50909998
14 [0.53173397 0.60600572 0.66971847] 0.58843752
15 [0.47868979 0.60717076 

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.932693	valid_1's auc: 0.79024
Early stopping, best iteration is:
[573]	training's auc: 0.939172	valid_1's auc: 0.792891
logloss = 	 0.536165413445355
ROC = 	 0.79289090444558
0 [0.45826971 0.50854074 0.77787395] 0.53943805
1 [0.2975061  0.54213601 0.86096832] 0.5361913
2 [0.43002563 0.4587968  0.81565041] 0.53941252
3 [0.40884862 0.46440413 0.81814691] 0.53918127
4 [0.41678814 0.45763279 0.77789182] 0.54662419
5 [0.45750726 0.54037453 0.67430803] 0.50641943
6 [0.39699397 0.47052689 0.81904383] 0.54074312
7 [0.39717913 0.45710765 0.81776891] 0.54618811
8 [0.41471185 0.4693978  0.81737862] 0.53544624
9 [0.37917006 0.46753005 0.8171153 ] 0.54036333
10 [0.4542516  0.45912669 0.8130244 ] 0.54427873
11 [0.45835059 0.54122518 0.65720739] 0.50418923
12 [0.45771282 0.47716235 0.77805704] 0.54582452
13 [0.45735603 0.46388372 0.77806946] 0.54653705
14 [0.41968412 0.4722424

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.931157	valid_1's auc: 0.808013
Early stopping, best iteration is:
[545]	training's auc: 0.935016	valid_1's auc: 0.80859
logloss = 	 0.5336891517987647
ROC = 	 0.8085897435897437
0 [0.39217122 0.57488695 0.753051  ] 0.56116458
1 [0.39257511 0.63642625 0.66903024] 0.53270796
2 [0.38823647 0.57143385 0.78042314] 0.5561184
3 [0.39076351 0.57418453 0.75610961] 0.56116458
4 [0.40047185 0.57696983 0.79585209] 0.55815462
5 [0.43795272 0.58478664 0.7526118 ] 0.55178947
6 [0.39171612 0.57613282 0.75580859] 0.56116458
7 [0.39175044 0.57492786 0.76116744] 0.56000591
8 [0.39134028 0.57422226 0.6797426 ] 0.53749354
9 [0.41109688 0.58516607 0.79505535] 0.55613387
10 [0.39082744 0.63708572 0.75600487] 0.5621164
11 [0.39212229 0.57443293 0.75354245] 0.56116458
12 [0.41435225 0.58447946 0.75528284] 0.5549306
13 [0.39157151 0.66264939 0.8062679 ] 0.56488734
14 [0.37962554 0.636379

HBox(children=(IntProgress(value=0, max=355), HTML(value='')))


After truncation: ((355, 538), (355,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[143]	training's auc: 0.886578	valid_1's auc: 0.769521
logloss = 	 0.5428794002341542
ROC = 	 0.7695213675213675
0 [0.43270478 0.53010426 0.6812769 ] 0.49605307
1 [0.44986093 0.50419098 0.68495131] 0.49444496
2 [0.48222285 0.52917688 0.68578029] 0.49547037
3 [0.40874194 0.59428119 0.68627309] 0.49434119
4 [0.4687348  0.50470206 0.62994335] 0.48264656
5 [0.47115259 0.59861765 0.69703335] 0.48930115
6 [0.50424951 0.52821192 0.67843646] 0.49946746
7 [0.4334778  0.53710434 0.68600607] 0.49522496
8 [0.47357569 0.59606442 0.6863934 ] 0.49559483
9 [0.43011466 0.55369482 0.68271801] 0.49066624
10 [0.47258553 0.57970462 0.68577444] 0.49410922
11 [0.47969004 0.53054169 0.68160006] 0.49513541
12 [0.48738307 0.6010839  0.68697513] 0.49169813
13 [0.45791823 0.59836387 0.68561813] 0.49122897
14 [0.44660411 0.52984753 0.68686288] 0.49409723
15 [0.48404349 0.5089124 

HBox(children=(IntProgress(value=0, max=356), HTML(value='')))


After truncation: ((356, 538), (356,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[137]	training's auc: 0.885135	valid_1's auc: 0.824728
logloss = 	 0.5165541107268128
ROC = 	 0.8247277127874142
0 [0.44940022 0.59520431 0.70433083] 0.61837015
1 [0.46869035 0.59414613 0.7163159 ] 0.62402158
2 [0.4319893  0.54619358 0.71794542] 0.61388074
3 [0.46229575 0.59253214 0.71775671] 0.6253563
4 [0.52395737 0.59525306 0.70406842] 0.61911618
5 [0.46032036 0.59196591 0.71061396] 0.62057548
6 [0.46201078 0.55088235 0.60694799] 0.58650191
7 [0.40549137 0.5473774  0.71583521] 0.61471213
8 [0.4143803  0.59193605 0.72074496] 0.62206187
9 [0.41851025 0.60157404 0.609897  ] 0.58652833
10 [0.40757999 0.59268387 0.71818285] 0.62100012
11 [0.48096842 0.57085217 0.60861804] 0.58496267
12 [0.43392491 0.59661562 0.71509174] 0.62116232
13 [0.41820353 0.59238549 0.71714924] 0.62303072
14 [0.49825796 0.59229886 0.62081918] 0.58889697
15 [0.43344556 0.5918262  

HBox(children=(IntProgress(value=0, max=357), HTML(value='')))


After truncation: ((357, 538), (357,))
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.93217	valid_1's auc: 0.805184
Early stopping, best iteration is:
[473]	training's auc: 0.930006	valid_1's auc: 0.805941
logloss = 	 0.5188994998407505
ROC = 	 0.8059414088215932
0 [0.50991291 0.59344039 0.74151791] 0.54421769
1 [0.41145984 0.6013958  0.74453292] 0.55659771
2 [0.50691696 0.59608726 0.63215834] 0.50334024
3 [0.40636649 0.59489257 0.74046673] 0.56462759
4 [0.37233255 0.59590828 0.7701197 ] 0.5571276
5 [0.5010841  0.55000688 0.59299677] 0.49592282
6 [0.41197402 0.59895327 0.74208314] 0.56222023
7 [0.40385452 0.56785177 0.74120034] 0.56058657
8 [0.3181077  0.5972538  0.74153733] 0.5710853
9 [0.40049893 0.50875118 0.65150966] 0.51239058
10 [0.34576507 0.59480835 0.77113215] 0.55995674
11 [0.41199448 0.51710992 0.74153393] 0.55531117
12 [0.40358782 0.59406514 0.63547491] 0.52057115
13 [0.48902907 0.52092937 0.61035146] 0.49360316
14 [0.29735976 0.56693

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[340]	training's auc: 0.915172	valid_1's auc: 0.83206
logloss = 	 0.49455361441020207
ROC = 	 0.8320598381532708
0 [0.45407047 0.55622223 0.67871856] 0.58317724
1 [0.45107141 0.55403591 0.66238215] 0.58517565
2 [0.45365196 0.5466301  0.67399054] 0.58417839
3 [0.44819996 0.47570626 0.67333613] 0.59129546
4 [0.45714143 0.55303643 0.68755286] 0.58250145
5 [0.4589042  0.55749663 0.67837217] 0.58317724
6 [0.45053978 0.55486276 0.67676472] 0.58317724
7 [0.45284292 0.55488717 0.67972711] 0.58317724
8 [0.44947163 0.54497915 0.67548486] 0.58317724
9 [0.45884707 0.55252993 0.67601705] 0.58317724
10 [0.45108488 0.55393055 0.66270192] 0.58517565
11 [0.45777062 0.57946864 0.68125426] 0.57705173
12 [0.458725   0.54926664 0.69533868] 0.58014467
13 [0.45492683 0.55867461 0.69370055] 0.58116042
14 [0.45696332 0.55399259 0.68724274] 0.58250145
15 [0.42450882 0.5573698 

HBox(children=(IntProgress(value=0, max=359), HTML(value='')))


After truncation: ((359, 538), (359,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[385]	training's auc: 0.9216	valid_1's auc: 0.816369
logloss = 	 0.5196683936812252
ROC = 	 0.8163688163688163
0 [0.35220998 0.54724145 0.7816768 ] 0.5691956
1 [0.46091615 0.55873683 0.7394895 ] 0.55387544
2 [0.44447049 0.55697789 0.77448199] 0.5555175
3 [0.44058286 0.54971549 0.75196602] 0.55942271
4 [0.35404882 0.60652126 0.76533679] 0.56363147
5 [0.440252   0.54412425 0.76780115] 0.55736449
6 [0.45475285 0.66500243 0.76426118] 0.56064727
7 [0.38964425 0.60682152 0.78175174] 0.56173853
8 [0.43775958 0.50203825 0.78173213] 0.54800371
9 [0.42788336 0.54925853 0.75216472] 0.55673828
10 [0.41073544 0.54685485 0.7807986 ] 0.55820154
11 [0.40812534 0.54706864 0.73952346] 0.55797288
12 [0.48482413 0.54465721 0.73949288] 0.56221786
13 [0.36009565 0.54684867 0.76455593] 0.56252188
14 [0.35897724 0.56561293 0.78177292] 0.5637258
15 [0.41666498 0.66555726 0.75

HBox(children=(IntProgress(value=0, max=359), HTML(value='')))


After truncation: ((359, 538), (359,))
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.932027	valid_1's auc: 0.839134
Early stopping, best iteration is:
[443]	training's auc: 0.926496	valid_1's auc: 0.841582
logloss = 	 0.46290372051069273
ROC = 	 0.8415815240527884
0 [0.44191363 0.63831151 0.74035889] 0.56766768
1 [0.3753263  0.61668517 0.73596849] 0.57639155
2 [0.34300854 0.62404183 0.74097161] 0.57369253
3 [0.45504712 0.64198548 0.74036364] 0.56877419
4 [0.47279117 0.64298286 0.74449842] 0.56882961
5 [0.47104833 0.63850599 0.69856172] 0.5632262
6 [0.2923549  0.64446516 0.74161014] 0.58113632
7 [0.48278008 0.49855118 0.71376024] 0.5448983
8 [0.42959958 0.64378591 0.74040499] 0.563751
9 [0.20706508 0.64217144 0.74187563] 0.59301702
10 [0.47953812 0.6141131  0.74031978] 0.56846068
11 [0.45425424 0.64488038 0.73294689] 0.56491386
12 [0.47506492 0.57874308 0.65597459] 0.53753593
13 [0.38254775 0.6162163  0.74035129] 0.57800934
14 [0.31669359 0.63853

HBox(children=(IntProgress(value=0, max=359), HTML(value='')))


After truncation: ((359, 538), (359,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[307]	training's auc: 0.913373	valid_1's auc: 0.818193
logloss = 	 0.49230710058315086
ROC = 	 0.8181926561754888
0 [0.4414499  0.57643501 0.71480763] 0.58067683
1 [0.46853025 0.59083096 0.62993513] 0.57185291
2 [0.41930898 0.57602498 0.71371285] 0.58388157
3 [0.40989124 0.59529851 0.60472656] 0.57325756
4 [0.45974439 0.53154414 0.67967632] 0.55713867
5 [0.46533115 0.57622714 0.60533871] 0.57419237
6 [0.44888747 0.53348035 0.6332453 ] 0.56130494
7 [0.41905199 0.53344455 0.60571713] 0.56249794
8 [0.46373743 0.53340272 0.71450422] 0.57091821
9 [0.4656495  0.5754834  0.60479229] 0.57419237
10 [0.42866055 0.57507876 0.74599565] 0.5794619
11 [0.3845269  0.58092054 0.77595379] 0.58893943
12 [0.47148584 0.53719581 0.74044888] 0.56444991
13 [0.46309214 0.60552925 0.71357734] 0.57816505
14 [0.46124799 0.5964476  0.71281049] 0.58175763
15 [0.46550601 0.53104172

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[264]	training's auc: 0.906152	valid_1's auc: 0.851779
logloss = 	 0.4679793026987472
ROC = 	 0.8517786561264823
0 [0.44844177 0.59838554 0.63944795] 0.65964771
1 [0.41183367 0.59472535 0.61283769] 0.66493619
2 [0.39754596 0.59348985 0.72380433] 0.66810606
3 [0.42783864 0.59032931 0.64343382] 0.66217802
4 [0.40145463 0.53126389 0.60909349] 0.65167604
5 [0.41237558 0.59305655 0.68569294] 0.66724986
6 [0.44825461 0.59142799 0.68635532] 0.66422138
7 [0.45717991 0.59410269 0.74424903] 0.66432013
8 [0.39069667 0.59418117 0.7475375 ] 0.67370118
9 [0.39859018 0.59302231 0.64687269] 0.66308373
10 [0.40836831 0.54320314 0.6418369 ] 0.65886472
11 [0.41224859 0.54546665 0.70709181] 0.66209206
12 [0.40923045 0.59338827 0.64604678] 0.66447829
13 [0.47625633 0.59585676 0.68536564] 0.65884863
14 [0.41264067 0.58793444 0.74476402] 0.67424446
15 [0.39642492 0.58928764

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.932693	valid_1's auc: 0.775418
Early stopping, best iteration is:
[617]	training's auc: 0.942244	valid_1's auc: 0.778829
logloss = 	 0.5473478743113888
ROC = 	 0.7788288288288289
0 [0.41878744 0.55037786 0.61110263] 0.47080976
1 [0.43749952 0.54966481 0.76760193] 0.48311173
2 [0.4132775  0.56480257 0.69762676] 0.47884235
3 [0.45413745 0.56438458 0.64029491] 0.47528475
4 [0.38099238 0.6053678  0.6694025 ] 0.47509058
5 [0.37755518 0.56364819 0.60414575] 0.47617971
6 [0.30578001 0.60353662 0.64803146] 0.47872457
7 [0.4527348  0.58836265 0.70454058] 0.47690353
8 [0.44511905 0.56387081 0.61031749] 0.47523323
9 [0.43957856 0.57837585 0.7083006 ] 0.47295611
10 [0.40832407 0.54920386 0.80885205] 0.4916693
11 [0.45412068 0.59630064 0.64938289] 0.47201422
12 [0.39355465 0.56661446 0.61148927] 0.47381144
13 [0.45286249 0.59574448 0.64737822] 0.47201422
14 [0.45564861 0.563

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[207]	training's auc: 0.897126	valid_1's auc: 0.835105
logloss = 	 0.5193457977310947
ROC = 	 0.835104845701808
0 [0.39625889 0.5877671  0.76242374] 0.56931092
1 [0.28726369 0.63922592 0.81481374] 0.59658521
2 [0.28246845 0.67421812 0.76148846] 0.60242144
3 [0.32750721 0.64708855 0.76282921] 0.59464706
4 [0.37477365 0.67422798 0.76153891] 0.59689485
5 [0.33088729 0.66700418 0.76128706] 0.59858185
6 [0.3281923  0.68728292 0.81621811] 0.59323785
7 [0.39893683 0.64149161 0.76187213] 0.59015137
8 [0.4050352  0.67414764 0.75307968] 0.58932091
9 [0.34869919 0.67430267 0.76318841] 0.59604357
10 [0.3419801  0.66498735 0.81229542] 0.58997664
11 [0.3979026  0.63963988 0.76152616] 0.59312297
12 [0.3949213  0.67634439 0.75379957] 0.58680136
13 [0.30308062 0.63959914 0.76126141] 0.59845464
14 [0.32266198 0.67537299 0.76156579] 0.59975821
15 [0.42019945 0.67429816 

HBox(children=(IntProgress(value=0, max=355), HTML(value='')))


After truncation: ((355, 538), (355,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[140]	training's auc: 0.885806	valid_1's auc: 0.82136
logloss = 	 0.5108887404955542
ROC = 	 0.8213604143947656
0 [0.40304625 0.61039807 0.68764889] 0.55451835
1 [0.40999664 0.62097612 0.69396541] 0.55723639
2 [0.42594437 0.55391767 0.69211187] 0.55514019
3 [0.41222497 0.6232092  0.68677913] 0.55330025
4 [0.44447529 0.54672679 0.69521385] 0.55392651
5 [0.38531755 0.62104393 0.68460394] 0.55243014
6 [0.41196458 0.5297764  0.71660032] 0.56012827
7 [0.40970815 0.52618116 0.68417492] 0.56531941
8 [0.37294438 0.52658203 0.6943597 ] 0.55880603
9 [0.44647601 0.56579702 0.69478933] 0.54926846
10 [0.40376079 0.55179684 0.68842416] 0.55977797
11 [0.41010489 0.59778108 0.69574415] 0.55829419
12 [0.41022957 0.52990082 0.68183965] 0.56314896
13 [0.40365038 0.5268322  0.70254469] 0.56029683
14 [0.44410408 0.52740731 0.69362474] 0.55534448
15 [0.4097267  0.55662026 

HBox(children=(IntProgress(value=0, max=356), HTML(value='')))


After truncation: ((356, 538), (356,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[3]	training's auc: 0.832944	valid_1's auc: 0.786507
logloss = 	 0.6523371712628168
ROC = 	 0.7865066559096411
0 [0.45219976 0.55852419 0.62606367] 0.23242216
1 [0.48386901 0.50345952 0.68110924] 0.0
2 [0.44242237 0.53268485 0.62196348] 0.22567472
3 [0.42985862 0.52543932 0.68231081] 0.0
4 [0.48550658 0.52045896 0.6217607 ] 0.22567472
5 [0.41883243 0.56220691 0.62217269] 0.22567472
6 [0.4507356  0.61961198 0.63017534] 0.45432174
7 [0.49461111 0.54005507 0.68070866] 0.0
8 [0.42187862 0.52527641 0.62186428] 0.22567472
9 [0.44388021 0.55450522 0.6956311 ] 0.0
10 [0.44704965 0.5646685  0.62606791] 0.23242216
11 [0.45626856 0.54127279 0.62609903] 0.23242216
12 [0.50175643 0.56993877 0.62306178] 0.22626611
13 [0.46304904 0.56901484 0.62150121] 0.22780968
14 [0.51560187 0.5273999  0.62609753] 0.23242216
15 [0.49635193 0.50546626 0.62609123] 0.23242216
16 [0.

HBox(children=(IntProgress(value=0, max=357), HTML(value='')))


After truncation: ((357, 538), (357,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[385]	training's auc: 0.921463	valid_1's auc: 0.833316
logloss = 	 0.4785215080472113
ROC = 	 0.8333161547447261
0 [0.40090782 0.56836921 0.67143986] 0.59621187
1 [0.38882286 0.56773211 0.67126897] 0.59745383
2 [0.40132719 0.57901502 0.67044218] 0.59696903
3 [0.43475723 0.58682579 0.67189173] 0.59051737
4 [0.42789578 0.58216431 0.68251574] 0.58845048
5 [0.40435266 0.57010413 0.68216783] 0.59498936
6 [0.39810426 0.57893256 0.68276431] 0.5944817
7 [0.383145   0.58073944 0.67593806] 0.59413948
8 [0.38547481 0.58743227 0.70002397] 0.59066712
9 [0.40108214 0.5699961  0.68354637] 0.59363244
10 [0.4750775  0.56691833 0.67076775] 0.58684549
11 [0.46607608 0.56291894 0.71682333] 0.58284049
12 [0.40552054 0.57938442 0.68419031] 0.59312882
13 [0.4350301  0.58268224 0.68889213] 0.58912623
14 [0.46160038 0.58260196 0.67030017] 0.58707997
15 [0.42943646 0.57994477 

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[118]	training's auc: 0.880792	valid_1's auc: 0.827671
logloss = 	 0.5364928112667875
ROC = 	 0.8276705276705277
0 [0.48688518 0.56781338 0.6814262 ] 0.58891047
1 [0.44450551 0.52624605 0.66860471] 0.57107099
2 [0.40463484 0.53932514 0.72822475] 0.57616576
3 [0.47320433 0.54584388 0.68048168] 0.58130114
4 [0.48675953 0.56165877 0.67849213] 0.58780925
5 [0.50854676 0.56974036 0.67914862] 0.58882677
6 [0.47087219 0.56691609 0.67952177] 0.58509904
7 [0.40612061 0.57736523 0.72909726] 0.58459432
8 [0.50794669 0.56640704 0.72892708] 0.59733328
9 [0.4042289  0.56883135 0.67907041] 0.5769918
10 [0.5081268  0.54602522 0.68181115] 0.58406883
11 [0.45765936 0.57571792 0.66894927] 0.57766453
12 [0.42780864 0.57083939 0.68318423] 0.5701715
13 [0.47026832 0.5080024  0.71011155] 0.58102687
14 [0.52684411 0.57362905 0.72770439] 0.59732268
15 [0.47790001 0.5282084  0

HBox(children=(IntProgress(value=0, max=359), HTML(value='')))


After truncation: ((359, 538), (359,))
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.932449	valid_1's auc: 0.845478
Early stopping, best iteration is:
[682]	training's auc: 0.946133	valid_1's auc: 0.850815
logloss = 	 0.4808238618650472
ROC = 	 0.8508149568552253
0 [0.41174608 0.52500801 0.77845569] 0.61816211
1 [0.49724639 0.53608667 0.62475722] 0.59292873
2 [0.48886897 0.52524414 0.79771464] 0.62316088
3 [0.49679728 0.53787395 0.79936939] 0.62388454
4 [0.47353253 0.61786866 0.78735823] 0.63030519
5 [0.44406784 0.54003518 0.787022  ] 0.62157635
6 [0.43541142 0.61558489 0.73115578] 0.61340462
7 [0.48088582 0.52264544 0.8024896 ] 0.62228804
8 [0.47211807 0.53619105 0.67607984] 0.60015726
9 [0.40505182 0.56354519 0.78740398] 0.62317509
10 [0.50386459 0.5264317  0.625441  ] 0.59028059
11 [0.43078368 0.52629489 0.7210937 ] 0.60504915
12 [0.46452807 0.61742167 0.71912474] 0.61578136
13 [0.43749385 0.59852259 0.62540448] 0.58917743
14 [0.49669142 0.61

HBox(children=(IntProgress(value=0, max=359), HTML(value='')))


After truncation: ((359, 538), (359,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[244]	training's auc: 0.903728	valid_1's auc: 0.804585
logloss = 	 0.5135595316094054
ROC = 	 0.8045847809796308
0 [0.4160283  0.51673502 0.74990395] 0.5024516
1 [0.48180418 0.51274406 0.72792528] 0.50334524
2 [0.43843947 0.61217381 0.75606901] 0.51505941
3 [0.42033058 0.51556731 0.72561715] 0.51090308
4 [0.41560903 0.67156829 0.72611972] 0.52322085
5 [0.43903238 0.62017732 0.72742935] 0.52104394
6 [0.41298002 0.53358379 0.72622457] 0.50852162
7 [0.43928582 0.5456429  0.7262188 ] 0.51112035
8 [0.4144704  0.54833082 0.72608287] 0.51181517
9 [0.45036022 0.64352539 0.72832969] 0.51822272
10 [0.41470513 0.58204083 0.62176707] 0.46129434
11 [0.41530949 0.5158206  0.7262726 ] 0.51015577
12 [0.39014824 0.51469765 0.72615351] 0.49750953
13 [0.47462911 0.62076988 0.72639641] 0.51763128
14 [0.44912036 0.61971674 0.72855005] 0.52321854
15 [0.41385497 0.6202602  

HBox(children=(IntProgress(value=0, max=359), HTML(value='')))


After truncation: ((359, 538), (359,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[51]	training's auc: 0.867023	valid_1's auc: 0.79714
logloss = 	 0.5584680908596833
ROC = 	 0.7971397938714809
0 [0.4971899  0.57615725 0.65383058] 0.53548527
1 [0.53173309 0.57940786 0.6720151 ] 0.54872974
2 [0.53062384 0.57665083 0.70547262] 0.5460178
3 [0.5291183  0.58533712 0.65505081] 0.54484179
4 [0.50452307 0.57411535 0.65727006] 0.540304
5 [0.50678213 0.58499218 0.67324261] 0.55099607
6 [0.42252232 0.54560674 0.62380118] 0.47292062
7 [0.50624517 0.57275103 0.65464396] 0.53906674
8 [0.49653323 0.53079544 0.65372952] 0.51859872
9 [0.57615138 0.57665501 0.62473489] 0.52695816
10 [0.50647723 0.54437278 0.66800518] 0.53426889
11 [0.53013273 0.55547775 0.68903287] 0.53222433
12 [0.51844376 0.54461083 0.67379291] 0.53379468
13 [0.50110541 0.54561815 0.67278392] 0.53089939
14 [0.53833491 0.58507298 0.65891542] 0.54137052
15 [0.53060971 0.57671136 0.67

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[22]	training's auc: 0.853073	valid_1's auc: 0.808188
logloss = 	 0.6133149558717008
ROC = 	 0.8081880086364391
0 [0.44386845 0.59339912 0.63837132] 0.5103667
1 [0.41555905 0.58991278 0.63826689] 0.51014932
2 [0.42594044 0.59271399 0.62071467] 0.51105706
3 [0.40008051 0.59299098 0.61217172] 0.50733945
4 [0.43405323 0.59318513 0.655459  ] 0.51104288
5 [0.47588964 0.58997411 0.61990016] 0.5108554
6 [0.58379758 0.58991518 0.64947529] 0.59430582
7 [0.58157906 0.61062645 0.63631198] 0.59516702
8 [0.42208587 0.58989077 0.62021799] 0.5108554
9 [0.41443373 0.61044673 0.62006705] 0.51143918
10 [0.42581337 0.59268647 0.62546772] 0.50679335
11 [0.41076236 0.60983593 0.62107198] 0.51143918
12 [0.43677363 0.61059632 0.62051048] 0.51143918
13 [0.41529077 0.61048868 0.6200295 ] 0.51143918
14 [0.5768056  0.59210953 0.65526832] 0.60069401
15 [0.46304146 0.59230964 0.6

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.932693	valid_1's auc: 0.818456
Early stopping, best iteration is:
[573]	training's auc: 0.939172	valid_1's auc: 0.821564
logloss = 	 0.49150643638230124
ROC = 	 0.8215638659543468
0 [0.50808472 0.53030634 0.64160575] 0.55402178
1 [0.43416695 0.57906617 0.64883579] 0.56256302
2 [0.51915272 0.57598189 0.58368707] 0.55244646
3 [0.36068003 0.55331237 0.81176112] 0.58099293
4 [0.42312437 0.55280733 0.69506446] 0.55849901
5 [0.43471107 0.55397476 0.6432042 ] 0.56310111
6 [0.42430438 0.56458797 0.64198299] 0.55864463
7 [0.40832107 0.58319562 0.70987342] 0.55936314
8 [0.434597   0.58328483 0.64437484] 0.56357766
9 [0.52990694 0.59001625 0.64254689] 0.55622761
10 [0.4337125  0.58435649 0.64426858] 0.56357766
11 [0.4349327  0.58475332 0.69550037] 0.56069777
12 [0.5088418  0.55133977 0.69543993] 0.55211895
13 [0.43443512 0.58293888 0.64743662] 0.56256302
14 [0.43280515 0.5

HBox(children=(IntProgress(value=0, max=358), HTML(value='')))


After truncation: ((358, 538), (358,))
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.931157	valid_1's auc: 0.83734
Early stopping, best iteration is:
[457]	training's auc: 0.927223	valid_1's auc: 0.838846
logloss = 	 0.5064461845026442
ROC = 	 0.8388461538461539
0 [0.3942053  0.55178124 0.81688465] 0.57505814
1 [0.41149856 0.54455183 0.80468423] 0.57259871
2 [0.45833926 0.53589241 0.80512454] 0.57584905
3 [0.35425667 0.63411176 0.77619282] 0.58648226
4 [0.53475601 0.54587221 0.76878309] 0.56333177
5 [0.39457763 0.54004634 0.73990216] 0.57285595
6 [0.35769076 0.68178354 0.76961584] 0.58743044
7 [0.29917056 0.67818577 0.8061302 ] 0.58517586
8 [0.42838105 0.55523253 0.77591746] 0.573612
9 [0.37156635 0.54766477 0.76987543] 0.57598413
10 [0.43022112 0.5342738  0.73493827] 0.5677933
11 [0.37097907 0.62879987 0.77616544] 0.59051479
12 [0.4562094  0.54554842 0.80460362] 0.57321787
13 [0.38710663 0.67649564 0.80625647] 0.59082034
14 [0.45469953 0.542691

3    0.674
0    0.130
2    0.107
1    0.089
Name: accuracy_group, dtype: float64

In [18]:
# https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation
from collections import defaultdict

def stratified_group_k_fold(X, y, groups, k, seed=None):
    labels_num = np.max(y) + 1
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in range(labels_num):
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)
    
    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for i in range(k):
        train_groups = all_groups - groups_per_fold[i]
        test_groups = groups_per_fold[i]

        train_indices = [i for i, g in enumerate(groups) if g in train_groups]
        test_indices = [i for i, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices
        
def modelling_sgk_reg(train, test, feat, select_flg): # stratify by target, group by session_id
    lgbm_params = {'objective': 'regression', 'boosting_type': 'gbdt',
     'tree_learner': 'serial','bagging_fraction': 0.5698056418890787,'bagging_freq': 4,
     'colsample_bytree': 0.37564408454469,'learning_rate': 0.015433389422506185,'max_depth': 8,
     'min_data_in_leaf': 51,'min_sum_hessian_in_leaf': 10,'num_leaves': 48}

    y_train = train.accuracy_group.copy()
    X_train = train.drop(["accuracy_group"],axis=1) 
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(X_train["installation_id"]))
    X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
    remove_features = [i for i in X_train.columns if "_4235" in i or i == "world_"+str(activities_world["NONE"])
                      or i in to_exclude]
    for i in X_train.columns:
        if X_train[i].std() == 0 and i not in remove_features:
            remove_features.append(i)
    X_train = X_train.drop(remove_features, axis=1)
    if select_flg == True:
        X_train = X_train[fea]
    X_train = X_train[sorted(X_train.columns.tolist())]

    X_test = test.drop(["installation_id","accuracy_group"], axis=1)
    X_test = X_test.drop(remove_features, axis=1)
    if select_flg == True:
        X_test = X_test[fea]
    X_test = X_test[sorted(X_test.columns.tolist())]
    groups = np.array(X_train.installation_id.values)

    models = []
    random_try = 1
    n_folds = 10
    evals_result = {}
    mean_qwk_score = 0
    for try_time in range(random_try):
        for i, (train_index, test_index) in enumerate(stratified_group_k_fold(X_train, y_train, groups, k=n_folds, seed=12)):
            print("Fold "+str(i+1))
            X_train2 = X_train.iloc[train_index,:]
            y_train2 = y_train.iloc[train_index]
            X_train2.drop("installation_id", axis=1, inplace=True)

            X_test2 = X_train.iloc[test_index,:]
            y_test2 = y_train.iloc[test_index]
            
            X_test2, idx_val = get_random_assessment(X_test2)
            X_test2.drop('installation_id', inplace=True, axis=1)
            y_test2 = y_test2.loc[idx_val]

            lgb_train = lgb.Dataset(X_train2, y_train2)
            lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
            clf = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
                num_boost_round=10000,early_stopping_rounds=100,verbose_eval = 500, categorical_feature = categoricals)
            valid = np.array(clf.predict(X_test2, num_iteration = clf.best_iteration))
            real = np.array(y_test2)

            models.append(clf)
            rmse = np.sqrt(mean_squared_error(real, valid))
            print("RMSE = {}".format(rmse))
            
            # threshold optimization --------------
            best_score = 0
            for i in range(20):
                optR = OptimizedRounder_reg()
                optR.fit(np.array(valid).reshape(-1,), real, random_flg=True)
                coefficients = optR.coefficients()
                final_valid_pred = optR.predict(np.array(valid).reshape(-1,), coefficients)
                score = qwk(real, final_valid_pred)
                print(i, np.sort(coefficients), score)
                if score > best_score:
                    best_score = score
                    best_coefficients = coefficients
            mean_qwk_score += best_score / (random_try * n_folds)
            if try_time == 0:
                final_coefficients = np.sort(best_coefficients) / (random_try * n_folds)
            else:
                final_coefficients += np.sort(best_coefficients) / (random_try * n_folds)

    print("mean QWK = {}".format(mean_qwk_score))
    pred_value = np.zeros(X_test.shape[0])
    for model in models:
        pred_value += model.predict(X_test, num_iteration = model.best_iteration) / len(models)
    return mean_qwk_score, pred_value, final_coefficients
    
#qwk_sgk1, pred_value, final_coefficients = modelling_sgk_reg(new_train, new_test, feat, False)
#final_test_pred = pd.cut(np.array(pred_value).reshape(-1,), [-np.inf] + list(np.sort(final_coefficients)) + [np.inf], labels = [0, 1, 2, 3])
#final_test_pred = pd.cut(np.array(pred_value).reshape(-1,), [-np.inf] + list(np.sort([0.32229148, 0.51887455, 0.77529457])) + [np.inf], labels = [0, 1, 2, 3])

#sample_submission["accuracy_group"] = final_test_pred.astype(int)
#sample_submission.to_csv('submission.csv', index=False)
#sample_submission["accuracy_group"].value_counts(normalize = True)

In [19]:
def my_hyperopt(X, Y):
    def para_tuning_obj(params):
        params = {
        'boosting_type': 'gbdt', 
        'metric': "auc", 
        'objective': 'binary', 
        'eval_metric': 'cappa', 
        "tree_learner": "serial",
        'max_depth': int(params['max_depth']),
        'bagging_freq': int(params['bagging_freq']),
        'bagging_fraction': float(params['bagging_fraction']),
        'num_leaves': int(params['num_leaves']),
        'learning_rate': float(params['learning_rate']),
        'min_data_in_leaf': int(params['min_data_in_leaf']),
        'min_sum_hessian_in_leaf': int(params['min_sum_hessian_in_leaf']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
}
    
        real = np.array([])
        pred = np.array([])
        skf = GroupKFold(n_splits=10)
        for trn_idx, val_idx in skf.split(X, Y, X["installation_id"]):
            x_train, x_val = X.iloc[trn_idx, :], X.iloc[val_idx, :]
            y_train, y_val = Y.iloc[trn_idx], Y.iloc[val_idx]
            x_val['accuracy_group'] = y_val
            np.random.seed(0)
            x_val_mod = x_val.groupby('installation_id').agg(np.random.choice).reset_index(drop=False)
            y_val_mod = x_val_mod.accuracy_group.copy()
            x_train.drop('installation_id', inplace = True, axis = 1)
            x_val_mod.drop(['installation_id', "accuracy_group"], inplace = True, axis = 1)
            train_set = lgb.Dataset(x_train, y_train, categorical_feature = ['session_title'])
            val_set = lgb.Dataset(x_val_mod, y_val_mod, categorical_feature = ['session_title'])
        
            clf = lgb.train(params, train_set, num_boost_round = 100000, early_stopping_rounds = 100, 
                         valid_sets = [train_set, val_set], verbose_eval = 300)
            pred = np.concatenate((pred, np.array(clf.predict(x_val_mod, num_iteration = clf.best_iteration))), axis=0) 
            real = np.concatenate((real, np.array(y_val_mod)), axis=0) 
        score = roc_auc_score(real, pred)
    
        return - score

    trials = Trials()

    space ={
        'max_depth': hp.quniform('max_depth', 1, 15, 1),
        'bagging_freq': hp.quniform('bagging_freq', 1, 10, 1),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.2, 1.0),
        'num_leaves': hp.quniform('num_leaves', 8, 64, 1),
        'learning_rate': hp.uniform('learning_rate', 0.001, 0.1),
        'min_data_in_leaf': hp.quniform('min_data_in_leaf', 8, 64, 1),
        'min_sum_hessian_in_leaf': hp.quniform('min_sum_hessian_in_leaf', 5, 30, 1),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0)
    }

    best = fmin(para_tuning_obj, space = space, algo=tpe.suggest, max_evals=10, trials=trials, verbose=1)

    best_params = space_eval(space, best)
    return best_params

#X_train = new_train.drop(["accuracy_group"], axis=1).copy()
#Y = new_train.accuracy_group.copy()
#Y.loc[Y <=1] = 0
#Y.loc[Y >=2] = 1
#lbl = preprocessing.LabelEncoder()
#lbl.fit(list(X_train["installation_id"]))
#X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
#remove_features = [i for i in X_train.columns if "_4235" in i or i == "world_"+str(activities_world["NONE"])
#                      or i in to_exclude]
#for i in X_train.columns:
#    if X_train[i].std() == 0 and i not in remove_features:
#        remove_features.append(i)
#X_train = X_train.drop(remove_features, axis=1)
#X_train = X_train[sorted(X_train.columns.tolist())]

#random_state = 42
#my_hyperopt(X_train, Y)