- add linear model and nn model

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from scipy import stats
import lightgbm as lgb
from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import tensorflow as tf
from tensorflow.keras import backend as K
import gc
import json
pd.set_option('display.max_columns', 1000)
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
import random

In [2]:
def read_data():
    print('Reading train.csv file....')
    train = pd.read_csv('/kaggle/input/data-science-bowl-2019/train.csv')
    print('Training.csv file have {} rows and {} columns'.format(train.shape[0], train.shape[1]))

    print('Reading test.csv file....')
    test = pd.read_csv('/kaggle/input/data-science-bowl-2019/test.csv')
    print('Test.csv file have {} rows and {} columns'.format(test.shape[0], test.shape[1]))

    print('Reading train_labels.csv file....')
    train_labels = pd.read_csv('/kaggle/input/data-science-bowl-2019/train_labels.csv')
    print('Train_labels.csv file have {} rows and {} columns'.format(train_labels.shape[0], train_labels.shape[1]))

    print('Reading specs.csv file....')
    specs = pd.read_csv('/kaggle/input/data-science-bowl-2019/specs.csv')
    print('Specs.csv file have {} rows and {} columns'.format(specs.shape[0], specs.shape[1]))

    print('Reading sample_submission.csv file....')
    sample_submission = pd.read_csv('/kaggle/input/data-science-bowl-2019/sample_submission.csv')
    print('Sample_submission.csv file have {} rows and {} columns'.format(sample_submission.shape[0], sample_submission.shape[1]))
    return train, test, train_labels, specs, sample_submission

In [3]:
def encode_title(train, test, train_labels):
    # encode title
    train['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), train['title'], train['event_code']))
    test['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), test['title'], test['event_code']))
    all_title_event_code = list(set(train["title_event_code"].unique()).union(test["title_event_code"].unique()))
    # make a list with all the unique 'titles' from the train and test set
    list_of_user_activities = list(set(train['title'].unique()).union(set(test['title'].unique())))
    # make a list with all the unique 'event_code' from the train and test set
    list_of_event_code = list(set(train['event_code'].unique()).union(set(test['event_code'].unique())))
    list_of_event_id = list(set(train['event_id'].unique()).union(set(test['event_id'].unique())))
    # make a list with all the unique worlds from the train and test set
    list_of_worlds = list(set(train['world'].unique()).union(set(test['world'].unique())))
    # create a dictionary numerating the titles
    activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
    activities_labels = dict(zip(np.arange(len(list_of_user_activities)), list_of_user_activities))
    activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))
    assess_titles = list(set(train[train['type'] == 'Assessment']['title'].value_counts().index).union(set(test[test['type'] == 'Assessment']['title'].value_counts().index)))
    # replace the text titles with the number titles from the dict
    train['title'] = train['title'].map(activities_map)
    test['title'] = test['title'].map(activities_map)
    train['world'] = train['world'].map(activities_world)
    test['world'] = test['world'].map(activities_world)
    train_labels['title'] = train_labels['title'].map(activities_map)
    win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
    # then, it set one element, the 'Bird Measurer (Assessment)' as 4110, 10 more than the rest
    win_code[activities_map['Bird Measurer (Assessment)']] = 4110
    # convert text into datetime
    train['timestamp'] = pd.to_datetime(train['timestamp'])
    test['timestamp'] = pd.to_datetime(test['timestamp'])
    train["misses"] = train["event_data"].apply(lambda x: json.loads(x)["misses"] if "\"misses\"" in x else np.nan)
    test["misses"] = test["event_data"].apply(lambda x: json.loads(x)["misses"] if "\"misses\"" in x else np.nan)
    return train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code, activities_world

In [4]:
def get_data(user_sample, test_set=False):
    '''
    The user_sample is a DataFrame from train or test where the only one 
    installation_id is filtered
    And the test_set parameter is related with the labels processing, that is only requered
    if test_set=False
    '''
    # Constants and parameters declaration
    last_activity = 0
    
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    
    # new features: time spent in each activity
    last_session_time_sec = 0
    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    all_assessments = []
    accumulated_accuracy_group = 0
    accumulated_accuracy = 0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0
    accumulated_actions = 0
    counter = 0
    time_first_activity = float(user_sample['timestamp'].values[0])
    durations = []
    durations_game = []
    durations_activity = []
    last_accuracy_title = {'acc_' + title: -1 for title in assess_titles}
    last_game_time_title = {'lgt_' + title: 0 for title in assess_titles}
    ac_game_time_title = {'agt_' + title: 0 for title in assess_titles}
    ac_true_attempts_title = {'ata_' + title: 0 for title in assess_titles}
    ac_false_attempts_title = {'afa_' + title: 0 for title in assess_titles}
    event_code_count: Dict[str, int] = {ev: 0 for ev in list_of_event_code}
    event_id_count: Dict[str, int] = {eve: 0 for eve in list_of_event_id}
    title_count: Dict[str, int] = {eve: 0 for eve in activities_labels.values()} 
    title_event_code_count: Dict[str, int] = {t_eve: 0 for t_eve in all_title_event_code}
    session_count = 0
    miss = 0
    crys_game_true = 0; crys_game_false = 0
    tree_game_true = 0; tree_game_false = 0
    magma_game_true = 0; magma_game_false = 0
    crys_game_acc = []; tree_game_acc = []; magma_game_acc = []
    crys_act_true = 0; crys_act_false = 0
    tree_act_true = 0; tree_act_false = 0
    magma_act_true = 0; magma_act_false = 0
    crys_act_acc = []; tree_act_acc = []; magma_act_acc = []
    
    # itarates through each session of one instalation_id
    for i, session in user_sample.groupby('game_session', sort=False):
        # i = game_session_id
        # session is a DataFrame that contain only one game_session
        
        # get some sessions information
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_title_text = activities_labels[session_title]  
        session_world = session["world"].iloc[0]
            
        # for each assessment, and only this kind off session, the features below are processed
        # and a register are generated      
        
        if (session_type == 'Assessment') & (test_set or len(session)>1):
            # search for event_code 4100, that represents the assessments trial
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            # then, check the numbers of wins and the number of losses
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()
            # copy a dict to use as feature template, it's initialized with some itens: 
            # {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
            features = user_activities_count.copy()
            features.update(last_accuracy_title.copy())
            features.update(event_code_count.copy())
            features.update(title_count.copy())
            features.update(event_id_count.copy())
            features.update(title_event_code_count.copy())
            features.update(last_game_time_title.copy())
            features.update(ac_game_time_title.copy())
            features.update(ac_true_attempts_title.copy())
            features.update(ac_false_attempts_title.copy())
            features['installation_session_count'] = session_count
            
            variety_features = [('var_event_code', event_code_count), 
                                ('var_event_id', event_id_count), 
                                ('var_title', title_count), 
                                ('var_title_event_code', title_event_code_count)]
            
            for name, dict_counts in variety_features:
                arr = np.array(list(dict_counts.values()))
                features[name] = np.count_nonzero(arr)
                
            # get installation_id for aggregated features
            features['installation_id'] = session['installation_id'].iloc[-1]
            # add title as feature, remembering that title represents the name of the game
            features['session_title'] = session['title'].iloc[0]
            # the 4 lines below add the feature of the history of the trials of this player
            # this is based on the all time attempts so far, at the moment of this assessment
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
            
            # ----------------------------------------------
            ac_true_attempts_title['ata_' + session_title_text] += true_attempts
            ac_false_attempts_title['afa_' + session_title_text] += false_attempts
            
            
            last_game_time_title['lgt_' + session_title_text] = session['game_time'].iloc[-1]
            ac_game_time_title['agt_' + session_title_text] += session['game_time'].iloc[-1]
            # ----------------------------------------------
            
            # the time spent in the app so far
            if durations == []:
                features['duration_mean'] = 0
                features['duration_std'] = 0
                features['last_duration'] = 0
                features['duration_max'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
                features['duration_std'] = np.std(durations)
                features['last_duration'] = durations[-1]
                features['duration_max'] = np.max(durations)
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            
            if durations_game == []:
                features['duration_game_mean'] = 0
                features['duration_game_std'] = 0
                features['game_last_duration'] = 0
                features['game_max_duration'] = 0
            else:
                features['duration_game_mean'] = np.mean(durations_game)
                features['duration_game_std'] = np.std(durations_game)
                features['game_last_duration'] = durations_game[-1]
                features['game_max_duration'] = np.max(durations_game)
                
            if durations_activity == []:
                features['duration_activity_mean'] = 0
                features['duration_activity_std'] = 0
                features['game_activity_duration'] = 0
                features['game_activity_max'] = 0
            else:
                features['duration_activity_mean'] = np.mean(durations_activity)
                features['duration_activity_std'] = np.std(durations_activity)
                features['game_activity_duration'] = durations_activity[-1]
                features['game_activity_max'] = np.max(durations_activity)
                
            features["misses"] = miss
            if session_world == activities_world["CRYSTALCAVES"]:
                features["game_true"] = crys_game_true
                features["game_false"] = crys_game_false
                features['game_accuracy'] = crys_game_true / (crys_game_true + crys_game_false) if (crys_game_true + crys_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(crys_game_acc) if len(crys_game_acc) >=1 else 0
                features["last_game_acc"] = crys_game_acc[-1] if len(crys_game_acc) >=1 else 0
                features["act_true"] = crys_act_true
                features["act_false"] = crys_act_false
                features['act_accuracy'] = crys_act_true / (crys_act_true + crys_act_false) if (crys_act_true + crys_act_false) != 0 else 0
                features["act_accuracy_std"] = np.std(crys_act_acc) if len(crys_act_acc) >=1 else 0
                features["last_act_acc"] = crys_act_acc[-1] if len(crys_act_acc) >=1 else 0
            elif session_world == activities_world["TREETOPCITY"]:
                features["game_true"] = tree_game_true
                features["game_false"] = tree_game_false
                features['game_accuracy'] = tree_game_true / (tree_game_true + tree_game_false) if (tree_game_true + tree_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(tree_game_acc) if len(tree_game_acc) >=1 else 0
                features["last_game_acc"] = tree_game_acc[-1] if len(tree_game_acc) >=1 else 0
                features["act_true"] = tree_act_true
                features["act_false"] = tree_act_false
                features['act_accuracy'] = tree_act_true / (tree_act_true + tree_act_false) if (tree_act_true + tree_act_false) != 0 else 0
                features["act_accuracy_std"] = np.std(tree_act_acc) if len(tree_act_acc) >=1 else 0
                features["last_act_acc"] = tree_act_acc[-1] if len(tree_act_acc) >=1 else 0
            elif session_world == activities_world["MAGMAPEAK"]:
                features["game_true"] = magma_game_true
                features["game_false"] = magma_game_false
                features['game_accuracy'] = magma_game_true / (magma_game_true + magma_game_false) if (magma_game_true + magma_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(magma_game_acc) if len(magma_game_acc) >=1 else 0
                features["last_game_acc"] = magma_game_acc[-1] if len(magma_game_acc) >=1 else 0
                features["act_true"] = magma_act_true
                features["act_false"] = magma_act_false
                features['act_accuracy'] = magma_act_true / (magma_act_true + magma_act_false) if (magma_act_true + magma_act_false) != 0 else 0
                features["act_accuracy_std"] = np.std(magma_act_acc) if len(magma_act_acc) >=1 else 0
                features["last_act_acc"] = magma_act_acc[-1] if len(magma_act_acc) >=1 else 0
            
            # the accuracy is the all time wins divided by the all time attempts
            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            accumulated_accuracy += accuracy
            last_accuracy_title['acc_' + session_title_text] = accuracy
            # a feature of the current accuracy categorized
            # it is a counter of how many times this player was in each accuracy group
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            features.update(accuracy_groups)
            accuracy_groups[features['accuracy_group']] += 1
            # mean of the all accuracy groups of this player
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            # how many actions the player has done so far, it is initialized as 0 and updated some lines below
            features['accumulated_actions'] = accumulated_actions
            
            # there are some conditions to allow this features to be inserted in the datasets
            # if it's a test set, all sessions belong to the final dataset
            # it it's a train, needs to be passed throught this clausule: session.query(f'event_code == {win_code[session_title]}')
            # that means, must exist an event_code 4100 or 4110
            if test_set:
                all_assessments.append(features)
            elif true_attempts+false_attempts > 0:
                all_assessments.append(features)
                
            counter += 1
            
        if session_type == 'Game':
            durations_game.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            true = session['event_data'].str.contains('true').sum()
            false = session['event_data'].str.contains('false').sum() 
            durations_game.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            if session_world == activities_world["CRYSTALCAVES"]:
                crys_game_true += true
                crys_game_false += false
                crys_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
            elif session_world == activities_world["TREETOPCITY"]:
                tree_game_true += true
                tree_game_false += false
                tree_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
            elif session_world == activities_world["MAGMAPEAK"]:
                magma_game_true += true
                magma_game_false += false
                magma_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
            else:
                pass
            
        if session_type == 'Activity':
            durations_activity.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            true = session['event_data'].str.contains('true').sum()
            false = session['event_data'].str.contains('false').sum() 
            durations_activity.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            if session_world == activities_world["CRYSTALCAVES"]:
                crys_act_true += true
                crys_act_false += false
                crys_act_acc.append(true / (true + false) if (true + false) != 0 else 0)
            elif session_world == activities_world["TREETOPCITY"]:
                tree_act_true += true
                tree_act_false += false
                tree_act_acc.append(true / (true + false) if (true + false) != 0 else 0)
            elif session_world == activities_world["MAGMAPEAK"]:
                magma_act_true += true
                magma_act_false += false
                magma_act_acc.append(true / (true + false) if (true + false) != 0 else 0)
            else:
                pass    
        
        session_count += 1
        # this piece counts how many actions was made in each event_code so far
        def update_counters(counter: dict, col: str):
                num_of_session_count = Counter(session[col])
                for k in num_of_session_count.keys():
                    x = k
                    if col == 'title':
                        x = activities_labels[k]
                    counter[x] += num_of_session_count[k]
                return counter
            
        event_code_count = update_counters(event_code_count, "event_code")
        event_id_count = update_counters(event_id_count, "event_id")
        title_count = update_counters(title_count, 'title')
        title_event_code_count = update_counters(title_event_code_count, 'title_event_code')

        # counts how many actions the player has done so far, used in the feature of the same name
        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activitiy = session_type 
                        
    # if it't the test_set, only the last assessment must be predicted, the previous are scraped
    if test_set:
        return all_assessments[-1]
    # in the train_set, all assessments goes to the dataset
    return all_assessments

In [5]:
def get_train_and_test(train, test):
    compiled_train = []
    compiled_test = []
    compiled_test_his = []
    for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort = False)), total = 17000):
        compiled_train += get_data(user_sample)
    for ins_id, user_sample in tqdm(test.groupby('installation_id', sort = False), total = 1000):
        test_data = get_data(user_sample, test_set = True)
        compiled_test.append(test_data)
    for i, (ins_id, user_sample) in tqdm(enumerate(test.groupby('installation_id', sort = False)), total = 1000):
        compiled_test_his += get_data(user_sample)
    reduce_train = pd.DataFrame(compiled_train)
    reduce_test = pd.DataFrame(compiled_test)
    reduce_test_his = pd.DataFrame(compiled_test_his)
    
    return reduce_train, reduce_test, reduce_test_his

In [6]:
# thank to Bruno
def eval_qwk_lgb_regr(y_pred, train_t):
    """
    Fast cappa eval function for lgb.
    """
    dist = Counter(train_t['accuracy_group'])
    for k in dist:
        dist[k] /= len(train_t)
    
    acum = 0
    bound = {}
    for i in range(3):
        acum += dist[i]
        bound[i] = np.percentile(y_pred, acum * 100)

    def classify(x):
        if x <= bound[0]:
            return 0
        elif x <= bound[1]:
            return 1
        elif x <= bound[2]:
            return 2
        else:
            return 3

    y_pred = np.array(list(map(classify, y_pred)))
    
    return y_pred

def predict(sample_submission, y_pred):
    sample_submission['accuracy_group'] = y_pred
    sample_submission['accuracy_group'] = sample_submission['accuracy_group'].astype(int)
    sample_submission.to_csv('submission.csv', index = False)
    print(sample_submission['accuracy_group'].value_counts(normalize = True))

In [7]:
def get_random_assessment(reduce_train):
    used_idx = []
    for iid in tqdm(set(reduce_train['installation_id'])):
        list_ = list(reduce_train[reduce_train['installation_id']==iid].index)
        cur = random.choices(list_, k = 1)[0]
        used_idx.append(cur)
    reduce_train_t = reduce_train.loc[used_idx]
    return reduce_train_t, used_idx

In [8]:
# for each validation fold extract one random observation for each installation id to simulate the test set
def run_lgb(reduce_train, reduce_test, features):
    # features found in initial bayesian optimization
    params = {'boosting_type': 'gbdt', 
              'metric': 'rmse', 
              'objective': 'regression', 
              'eval_metric': 'cappa', 
              'n_jobs': -1, 
              'seed': 42, 
              'num_leaves': 26, 
              'learning_rate': 0.077439684887749, 
              'max_depth': 33, 
              'lambda_l1': 3.27791989030057, 
              'lambda_l2': 1.3047627805931334, 
              'bagging_fraction': 0.896924978584253, 
              'bagging_freq': 1, 
              'colsample_bytree': 0.8710772167017853}

    kf = GroupKFold(n_splits = 5)
    target = 'accuracy_group'
    oof_pred = np.zeros(len(reduce_train))
    y_pred = np.zeros(len(reduce_test))
    ind = []

    for fold, (tr_ind, val_ind) in enumerate(kf.split(reduce_train, groups = reduce_train['installation_id'])):
        print('Fold:', fold + 1)
        x_train, x_val = reduce_train[features].iloc[tr_ind], reduce_train[features].iloc[val_ind]
        y_train, y_val = reduce_train[target][tr_ind], reduce_train[target][val_ind]
        x_train.drop('installation_id', inplace = True, axis = 1)
        train_set = lgb.Dataset(x_train, y_train, categorical_feature = ['session_title'])


        x_val, idx_val = get_random_assessment(x_val)
        ind.extend(idx_val)
        x_val.drop('installation_id', inplace = True, axis = 1)
        y_val = y_val.loc[idx_val]
        val_set = lgb.Dataset(x_val, y_val, categorical_feature = ['session_title'])

        model = lgb.train(params, train_set, num_boost_round = 100000, early_stopping_rounds = 100, 
                         valid_sets = [train_set, val_set], verbose_eval = 100)

        oof_pred[idx_val] = model.predict(x_val)
        y_pred += model.predict(reduce_test[[x for x in features if x not in ['installation_id']]]) / kf.n_splits
    oof_rmse_score = np.sqrt(mean_squared_error(reduce_train[target][ind], oof_pred[ind]))
    oof_cohen_score = cohen_kappa_score(reduce_train[target][ind], eval_qwk_lgb_regr(oof_pred[ind], reduce_train), weights = 'quadratic')
    print('Our oof rmse score is:', oof_rmse_score)
    print('Our oof cohen kappa score is:', oof_cohen_score)
    return y_pred, oof_rmse_score, oof_cohen_score

In [9]:
# for each validation fold extract one random observation for each installation id to simulate the test set
def run_linear_model(reduce_train, reduce_test, features):
    kf = GroupKFold(n_splits = 5)
    target = 'accuracy_group'
    oof_pred = np.zeros(len(reduce_train))
    y_pred = np.zeros(len(reduce_test))
    ind = []

    for fold, (tr_ind, val_ind) in enumerate(kf.split(reduce_train, groups = reduce_train['installation_id'])):
        print('Fold:', fold + 1)
        x_train, x_val = reduce_train[features].iloc[tr_ind], reduce_train[features].iloc[val_ind]
        y_train, y_val = reduce_train[target][tr_ind], reduce_train[target][val_ind]
        x_train.drop('installation_id', inplace = True, axis = 1)

        x_val, idx_val = get_random_assessment(x_val)
        ind.extend(idx_val)
        x_val.drop('installation_id', inplace = True, axis = 1)
        y_val = y_val.loc[idx_val]

        model = LinearRegression()
        model.fit(x_train, y_train)  

        oof_pred[idx_val] = model.predict(x_val)
        y_pred += model.predict(reduce_test[[x for x in features if x not in ['installation_id']]]) / kf.n_splits
    oof_rmse_score = np.sqrt(mean_squared_error(reduce_train[target][ind], oof_pred[ind]))
    oof_cohen_score = cohen_kappa_score(reduce_train[target][ind], eval_qwk_lgb_regr(oof_pred[ind], reduce_train), weights = 'quadratic')
    print('Our oof rmse score is:', oof_rmse_score)
    print('Our oof cohen kappa score is:', oof_cohen_score)
    return y_pred, oof_rmse_score, oof_cohen_score

In [10]:
def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true))) 

def run_nn(reduce_train, reduce_test, features):
    kf = GroupKFold(n_splits = 5)
    target = 'accuracy_group'
    oof_pred = np.zeros(len(reduce_train))
    y_pred = np.zeros(len(reduce_test))
    ind = []

    for fold, (tr_ind, val_ind) in enumerate(kf.split(reduce_train, groups = reduce_train['installation_id'])):
        print('Fold:', fold + 1)
        x_train, x_val = reduce_train[features].iloc[tr_ind], reduce_train[features].iloc[val_ind]
        y_train, y_val = reduce_train[target][tr_ind], reduce_train[target][val_ind]
        x_train.drop('installation_id', inplace = True, axis = 1)

        x_val, idx_val = get_random_assessment(x_val)
        ind.extend(idx_val)
        x_val.drop('installation_id', inplace = True, axis = 1)
        y_val = y_val.loc[idx_val]
        
        verbosity = 100
        model = tf.keras.models.Sequential([
            tf.keras.layers.Input(shape=(x_train.shape[1],)),
            tf.keras.layers.Dense(200, activation='relu'), #, kernel_regularizer=tf.keras.regularizers.l2(0.001)
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(100, activation='tanh'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.3),
            #tf.keras.layers.Dense(50, activation='relu'),
            #tf.keras.layers.LayerNormalization(),
            #tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(25, activation='relu'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(1, activation='relu')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), loss='mse')
        #print(model.summary())
        save_best = tf.keras.callbacks.ModelCheckpoint('./nn_model.w8', save_weights_only=True, save_best_only=True, verbose=1)
        early_stop = tf.keras.callbacks.EarlyStopping(patience=10)
        
        model.fit(x_train, 
                y_train, 
                validation_data=(x_val, y_val),
                epochs=100,
                 callbacks=[save_best, early_stop])
        model.load_weights('./nn_model.w8')
        
        oof_pred[idx_val] = model.predict(x_val).reshape(x_val.shape[0],)
        y_pred += model.predict(reduce_test[[x for x in features if x not in ['installation_id']]]).reshape(reduce_test.shape[0],) / kf.n_splits
    oof_rmse_score = np.sqrt(mean_squared_error(reduce_train[target][ind], oof_pred[ind]))
    oof_cohen_score = cohen_kappa_score(reduce_train[target][ind], eval_qwk_lgb_regr(oof_pred[ind], reduce_train), weights = 'quadratic')
    print('Our oof rmse score is:', oof_rmse_score)
    print('Our oof cohen kappa score is:', oof_cohen_score)
    return y_pred, oof_rmse_score, oof_cohen_score

In [11]:
def standardize_data(reduce_train, reduce_test):
    features = [i for i in reduce_train.columns if i not in ["installation_id", "accuracy_group"]]
    categoricals = ['session_title']
    features = features.copy()
    new_train = reduce_train.copy()
    new_test = reduce_test.copy()
    if len(categoricals) > 0:
        for cat in categoricals:
            enc = OneHotEncoder()
            train_cats = enc.fit_transform(new_train[[cat]])
            test_cats = enc.transform(new_test[[cat]])
            cat_cols = ['{}_{}'.format(cat, str(col)) for col in enc.active_features_]
            features += cat_cols
            train_cats = pd.DataFrame(train_cats.toarray(), columns=cat_cols)
            test_cats = pd.DataFrame(test_cats.toarray(), columns=cat_cols)
            new_train = pd.concat([new_train, train_cats], axis=1)
            new_test = pd.concat([new_test, test_cats], axis=1)
        scalar = MinMaxScaler()
        new_train[features] = scalar.fit_transform(new_train[features])
        new_test[features] = scalar.transform(new_test[features])
    return new_train, new_test

# installation

In [12]:
train, test, train_labels, specs, sample_submission = read_data()
train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code, activities_world = encode_title(train, test, train_labels)
reduce_train, reduce_test, reduce_his_test = get_train_and_test(train, test)
del train, test

Reading train.csv file....
Training.csv file have 11341042 rows and 11 columns
Reading test.csv file....
Test.csv file have 1156414 rows and 11 columns
Reading train_labels.csv file....
Train_labels.csv file have 17690 rows and 7 columns
Reading specs.csv file....
Specs.csv file have 386 rows and 3 columns
Reading sample_submission.csv file....
Sample_submission.csv file have 1000 rows and 2 columns


HBox(children=(IntProgress(value=0, max=17000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

# feature selection

In [13]:
# extracted from feature elimination first round script
old_features = list(reduce_train.columns[0:95]) + list(reduce_train.columns[882:])
el_features = ['accuracy_group', 'accuracy', 'installation_id']
old_features = [col for col in old_features if col not in el_features]
event_id_features = list_of_event_id #list(reduce_train.columns[95:479])
title_event_code_cross = all_title_event_code #list(reduce_train.columns[479:882])
features = old_features + event_id_features + title_event_code_cross

In [14]:
def remove_correlated_features(reduce_train):
    counter = 0
    to_remove = []
    for feat_a in features:
        for feat_b in features:
            if feat_a != feat_b and feat_a not in to_remove and feat_b not in to_remove:
                c = np.corrcoef(reduce_train[feat_a], reduce_train[feat_b])[0][1]
                if c > 0.995:
                    counter += 1
                    to_remove.append(feat_b)
                    print('{}: FEAT_A: {} FEAT_B: {} - Correlation: {}'.format(counter, feat_a, feat_b, c))
    return to_remove
to_remove = remove_correlated_features(reduce_train)
features = [col for col in features if col not in to_remove]
features = [col for col in features if col not in ['Heavy, Heavier, Heaviest_2000', 'Heavy, Heavier, Heaviest']]
features.append('installation_id')
print('Training with {} features'.format(len(features)))

1: FEAT_A: Clip FEAT_B: 27253bdc - Correlation: 0.9999999999999999
2: FEAT_A: 2050 FEAT_B: 2040 - Correlation: 0.9965259434878118
3: FEAT_A: 2050 FEAT_B: dcaede90 - Correlation: 0.9965259434878118
4: FEAT_A: 2050 FEAT_B: 37c53127 - Correlation: 1.0
5: FEAT_A: 2050 FEAT_B: 08fd73f3 - Correlation: 0.9966123918733654
6: FEAT_A: 2050 FEAT_B: 26fd2d99 - Correlation: 0.9965084543995759
7: FEAT_A: 2050 FEAT_B: 73757a5e - Correlation: 0.9998050146713992
8: FEAT_A: 2050 FEAT_B: 2b9272f4 - Correlation: 0.9999839030068793
9: FEAT_A: 2050 FEAT_B: Scrub-A-Dub_2030 - Correlation: 0.9966123918733654
10: FEAT_A: 2050 FEAT_B: Scrub-A-Dub_2040 - Correlation: 0.9965259434878118
11: FEAT_A: 2050 FEAT_B: Scrub-A-Dub_2020 - Correlation: 0.9965084543995759
12: FEAT_A: 2050 FEAT_B: Scrub-A-Dub_3021 - Correlation: 0.9998050146713992
13: FEAT_A: 2050 FEAT_B: Scrub-A-Dub_3121 - Correlation: 0.9999839030068793
14: FEAT_A: 2050 FEAT_B: Scrub-A-Dub_2050 - Correlation: 1.0
15: FEAT_A: 4230 FEAT_B: 4235 - Correlation

In [15]:
# function to exclude columns from the train and test set if the mean is different, also adjust test column by a factor to simulate the same distribution
def exclude(reduce_train, reduce_test, features):
    to_exclude = [] 
    ajusted_test = reduce_test.copy()
    for feature in features:
        if feature not in ['accuracy_group', 'installation_id', 'session_title']:
            data = reduce_train[feature]
            train_mean = data.mean()
            data = ajusted_test[feature] 
            test_mean = data.mean()
            try:
                ajust_factor = train_mean / test_mean
                if ajust_factor > 10 or ajust_factor < 0.1:# or error > 0.01:
                    to_exclude.append(feature)
                    print(feature)
                else:
                    ajusted_test[feature] *= ajust_factor
            except:
                to_exclude.append(feature)
                print(feature)
    return to_exclude, ajusted_test

to_exclude, ajusted_test = exclude(reduce_train, reduce_test, features)
features = [col for col in features if col not in to_exclude]

acc_Cart Balancer (Assessment)
misses
a8cc6fec
29a42aea
119b5b02
4074bac2
13f56524
bfc77bd6
611485c5
dcb1663e
01ca3a3c
eb2c19cd
5dc079d8
ecc6157f
6aeafed4
e4d32835
003cd2ee
2ec694de
0ce40006
1b54d27f
17ca3959
ab4ec3a4
7fd1ac25
Sandcastle Builder (Activity)_2010
Scrub-A-Dub_4080
Bottle Filler (Activity)_2010
Air Show_4080
Crystals Rule_2010
Pan Balance_2010


# modelling and prediction

In [16]:
# train 5 times because the evaluation and training data change with the randomness
y_pred_1, oof_rmse_score_1, oof_cohen_score_1 = run_lgb(reduce_train, ajusted_test, features)
y_pred_2, oof_rmse_score_2, oof_cohen_score_2 = run_lgb(reduce_train, ajusted_test, features)
y_pred_3, oof_rmse_score_3, oof_cohen_score_3 = run_lgb(reduce_train, ajusted_test, features)
y_pred_4, oof_rmse_score_4, oof_cohen_score_4 = run_lgb(reduce_train, ajusted_test, features)
y_pred_5, oof_rmse_score_5, oof_cohen_score_5 = run_lgb(reduce_train, ajusted_test, features)
mean_rmse = (oof_rmse_score_1 + oof_rmse_score_2 + oof_rmse_score_3 + oof_rmse_score_4 + oof_rmse_score_5) / 5
mean_cohen_kappa = (oof_cohen_score_1 + oof_cohen_score_2 + oof_cohen_score_3 + oof_cohen_score_4 + oof_cohen_score_5) / 5
print('Our mean rmse score is: ', mean_rmse)
print('Our mean cohen kappa score is: ', mean_cohen_kappa)
y_final = (y_pred_1 + y_pred_2 + y_pred_3 + y_pred_4 + y_pred_5) / 5
y_final = eval_qwk_lgb_regr(y_final, reduce_train)
predict(sample_submission, y_final)

Fold: 1


HBox(children=(IntProgress(value=0, max=721), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.869983	valid_1's rmse: 1.03434
[200]	training's rmse: 0.790919	valid_1's rmse: 1.02993
[300]	training's rmse: 0.728671	valid_1's rmse: 1.02886
[400]	training's rmse: 0.677269	valid_1's rmse: 1.02668
Early stopping, best iteration is:
[392]	training's rmse: 0.680927	valid_1's rmse: 1.02598
Fold: 2


HBox(children=(IntProgress(value=0, max=722), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.870738	valid_1's rmse: 1.01303
[200]	training's rmse: 0.793541	valid_1's rmse: 1.00937
[300]	training's rmse: 0.732789	valid_1's rmse: 1.00838
[400]	training's rmse: 0.682437	valid_1's rmse: 1.00918
Early stopping, best iteration is:
[364]	training's rmse: 0.700036	valid_1's rmse: 1.00743
Fold: 3


HBox(children=(IntProgress(value=0, max=723), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.869114	valid_1's rmse: 0.980867
[200]	training's rmse: 0.791041	valid_1's rmse: 0.980475
Early stopping, best iteration is:
[170]	training's rmse: 0.811814	valid_1's rmse: 0.978892
Fold: 4


HBox(children=(IntProgress(value=0, max=724), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.864984	valid_1's rmse: 1.03843
[200]	training's rmse: 0.786722	valid_1's rmse: 1.0322
Early stopping, best iteration is:
[197]	training's rmse: 0.788872	valid_1's rmse: 1.03119
Fold: 5


HBox(children=(IntProgress(value=0, max=724), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.860545	valid_1's rmse: 1.03946
[200]	training's rmse: 0.781019	valid_1's rmse: 1.03929
Early stopping, best iteration is:
[135]	training's rmse: 0.830095	valid_1's rmse: 1.0376
Our oof rmse score is: 1.0164463325579465
Our oof cohen kappa score is: 0.5695989834872628
Fold: 1


HBox(children=(IntProgress(value=0, max=721), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.869983	valid_1's rmse: 1.06205
[200]	training's rmse: 0.790919	valid_1's rmse: 1.06094
[300]	training's rmse: 0.728671	valid_1's rmse: 1.06098
[400]	training's rmse: 0.677269	valid_1's rmse: 1.05797
[500]	training's rmse: 0.634081	valid_1's rmse: 1.06198
Early stopping, best iteration is:
[409]	training's rmse: 0.673309	valid_1's rmse: 1.05729
Fold: 2


HBox(children=(IntProgress(value=0, max=722), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.870738	valid_1's rmse: 1.05665
[200]	training's rmse: 0.793541	valid_1's rmse: 1.05129
[300]	training's rmse: 0.732789	valid_1's rmse: 1.051
Early stopping, best iteration is:
[216]	training's rmse: 0.78268	valid_1's rmse: 1.04814
Fold: 3


HBox(children=(IntProgress(value=0, max=723), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.869114	valid_1's rmse: 1.00436
[200]	training's rmse: 0.791041	valid_1's rmse: 1.00395
[300]	training's rmse: 0.731619	valid_1's rmse: 1.00828
Early stopping, best iteration is:
[200]	training's rmse: 0.791041	valid_1's rmse: 1.00395
Fold: 4


HBox(children=(IntProgress(value=0, max=724), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.864984	valid_1's rmse: 1.03336
[200]	training's rmse: 0.786722	valid_1's rmse: 1.03277
Early stopping, best iteration is:
[162]	training's rmse: 0.813246	valid_1's rmse: 1.03133
Fold: 5


HBox(children=(IntProgress(value=0, max=724), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.860545	valid_1's rmse: 1.05231
[200]	training's rmse: 0.781019	valid_1's rmse: 1.05334
Early stopping, best iteration is:
[125]	training's rmse: 0.838177	valid_1's rmse: 1.05028
Our oof rmse score is: 1.038362793887349
Our oof cohen kappa score is: 0.5462144637225613
Fold: 1


HBox(children=(IntProgress(value=0, max=721), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.869983	valid_1's rmse: 1.04725
[200]	training's rmse: 0.790919	valid_1's rmse: 1.049
Early stopping, best iteration is:
[110]	training's rmse: 0.860602	valid_1's rmse: 1.04581
Fold: 2


HBox(children=(IntProgress(value=0, max=722), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.870738	valid_1's rmse: 1.02847
[200]	training's rmse: 0.793541	valid_1's rmse: 1.02572
Early stopping, best iteration is:
[168]	training's rmse: 0.815946	valid_1's rmse: 1.02389
Fold: 3


HBox(children=(IntProgress(value=0, max=723), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.869114	valid_1's rmse: 0.990467
[200]	training's rmse: 0.791041	valid_1's rmse: 0.986945
Early stopping, best iteration is:
[173]	training's rmse: 0.809734	valid_1's rmse: 0.984119
Fold: 4


HBox(children=(IntProgress(value=0, max=724), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.864984	valid_1's rmse: 1.06579
[200]	training's rmse: 0.786722	valid_1's rmse: 1.06629
Early stopping, best iteration is:
[162]	training's rmse: 0.813246	valid_1's rmse: 1.0642
Fold: 5


HBox(children=(IntProgress(value=0, max=724), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.860545	valid_1's rmse: 1.05063
[200]	training's rmse: 0.781019	valid_1's rmse: 1.05309
Early stopping, best iteration is:
[135]	training's rmse: 0.830095	valid_1's rmse: 1.04911
Our oof rmse score is: 1.033807958780482
Our oof cohen kappa score is: 0.5536720295091166
Fold: 1


HBox(children=(IntProgress(value=0, max=721), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.869983	valid_1's rmse: 1.07738
[200]	training's rmse: 0.790919	valid_1's rmse: 1.07518
Early stopping, best iteration is:
[187]	training's rmse: 0.80034	valid_1's rmse: 1.07335
Fold: 2


HBox(children=(IntProgress(value=0, max=722), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.870738	valid_1's rmse: 1.01339
[200]	training's rmse: 0.793541	valid_1's rmse: 1.01288
[300]	training's rmse: 0.732789	valid_1's rmse: 1.01417
Early stopping, best iteration is:
[220]	training's rmse: 0.7801	valid_1's rmse: 1.01097
Fold: 3


HBox(children=(IntProgress(value=0, max=723), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.869114	valid_1's rmse: 0.984239
[200]	training's rmse: 0.791041	valid_1's rmse: 0.987293
Early stopping, best iteration is:
[107]	training's rmse: 0.862569	valid_1's rmse: 0.98365
Fold: 4


HBox(children=(IntProgress(value=0, max=724), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.864984	valid_1's rmse: 1.09724
[200]	training's rmse: 0.786722	valid_1's rmse: 1.09719
Early stopping, best iteration is:
[150]	training's rmse: 0.822241	valid_1's rmse: 1.09579
Fold: 5


HBox(children=(IntProgress(value=0, max=724), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.860545	valid_1's rmse: 1.0461
[200]	training's rmse: 0.781019	valid_1's rmse: 1.04747
Early stopping, best iteration is:
[121]	training's rmse: 0.841697	valid_1's rmse: 1.04423
Our oof rmse score is: 1.0423950259230446
Our oof cohen kappa score is: 0.5413993274821325
Fold: 1


HBox(children=(IntProgress(value=0, max=721), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.869983	valid_1's rmse: 1.05001
[200]	training's rmse: 0.790919	valid_1's rmse: 1.05043
[300]	training's rmse: 0.728671	valid_1's rmse: 1.04645
[400]	training's rmse: 0.677269	valid_1's rmse: 1.046
Early stopping, best iteration is:
[388]	training's rmse: 0.68282	valid_1's rmse: 1.04497
Fold: 2


HBox(children=(IntProgress(value=0, max=722), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.870738	valid_1's rmse: 1.00898
[200]	training's rmse: 0.793541	valid_1's rmse: 1.01129
Early stopping, best iteration is:
[142]	training's rmse: 0.835594	valid_1's rmse: 1.00774
Fold: 3


HBox(children=(IntProgress(value=0, max=723), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.869114	valid_1's rmse: 0.979359
[200]	training's rmse: 0.791041	valid_1's rmse: 0.979996
Early stopping, best iteration is:
[117]	training's rmse: 0.854396	valid_1's rmse: 0.978288
Fold: 4


HBox(children=(IntProgress(value=0, max=724), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.864984	valid_1's rmse: 1.08149
[200]	training's rmse: 0.786722	valid_1's rmse: 1.08189
Early stopping, best iteration is:
[104]	training's rmse: 0.860955	valid_1's rmse: 1.08075
Fold: 5


HBox(children=(IntProgress(value=0, max=724), HTML(value='')))


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.860545	valid_1's rmse: 1.02997
[200]	training's rmse: 0.781019	valid_1's rmse: 1.03059
Early stopping, best iteration is:
[135]	training's rmse: 0.830095	valid_1's rmse: 1.02752
Our oof rmse score is: 1.0284432667332495
Our oof cohen kappa score is: 0.5601989697951992
Our mean rmse score is:  1.0318910755764144
Our mean cohen kappa score is:  0.5542167547992545
3    0.500
0    0.239
1    0.136
2    0.125
Name: accuracy_group, dtype: float64


In [17]:
train_std, test_std = standardize_data(reduce_train, ajusted_test)
tmp_feat = [i for i in features if i != "session_title"]

In [18]:
y_pred_1_lr, oof_rmse_score_1_lr, oof_cohen_score_1_lr = run_nn(train_std, test_std, tmp_feat)
y_pred_2_lr, oof_rmse_score_2_lr, oof_cohen_score_2_lr = run_nn(train_std, test_std, tmp_feat)
y_pred_3_lr, oof_rmse_score_3_lr, oof_cohen_score_3_lr = run_nn(train_std, test_std, tmp_feat)
y_pred_4_lr, oof_rmse_score_4_lr, oof_cohen_score_4_lr = run_nn(train_std, test_std, tmp_feat)
y_pred_5_lr, oof_rmse_score_5_lr, oof_cohen_score_5_lr = run_nn(train_std, test_std, tmp_feat)
mean_rmse_lr = (oof_rmse_score_1_lr + oof_rmse_score_2_lr + oof_rmse_score_3_lr + oof_rmse_score_4_lr + oof_rmse_score_5_lr) / 5
mean_cohen_kappa_lr = (oof_cohen_score_1_lr + oof_cohen_score_2_lr + oof_cohen_score_3_lr + oof_cohen_score_4_lr + oof_cohen_score_5_lr) / 5
print('Our mean rmse score is: ', mean_rmse_lr)
print('Our mean cohen kappa score is: ', mean_cohen_kappa_lr)

Fold: 1


HBox(children=(IntProgress(value=0, max=721), HTML(value='')))


Train on 14152 samples, validate on 721 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.44261, saving model to ./nn_model.w8
Epoch 2/100
Epoch 00002: val_loss improved from 1.44261 to 1.39621, saving model to ./nn_model.w8
Epoch 3/100
Epoch 00003: val_loss improved from 1.39621 to 1.38277, saving model to ./nn_model.w8
Epoch 4/100
Epoch 00004: val_loss did not improve from 1.38277
Epoch 5/100
Epoch 00005: val_loss improved from 1.38277 to 1.35310, saving model to ./nn_model.w8
Epoch 6/100
Epoch 00006: val_loss improved from 1.35310 to 1.32466, saving model to ./nn_model.w8
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.32466
Epoch 8/100
Epoch 00008: val_loss did not improve from 1.32466
Epoch 9/100
Epoch 00009: val_loss did not improve from 1.32466
Epoch 10/100
Epoch 00010: val_loss did not improve from 1.32466
Epoch 11/100
Epoch 00011: val_loss did not improve from 1.32466
Epoch 12/100
Epoch 00012: val_loss did not improve from 1.32466
Epoch 13/100
Epoch 00

HBox(children=(IntProgress(value=0, max=722), HTML(value='')))


Train on 14152 samples, validate on 722 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.67145, saving model to ./nn_model.w8
Epoch 2/100
Epoch 00002: val_loss improved from 1.67145 to 1.38709, saving model to ./nn_model.w8
Epoch 3/100
Epoch 00003: val_loss did not improve from 1.38709
Epoch 4/100
Epoch 00004: val_loss did not improve from 1.38709
Epoch 5/100
Epoch 00005: val_loss improved from 1.38709 to 1.38656, saving model to ./nn_model.w8
Epoch 6/100
Epoch 00006: val_loss improved from 1.38656 to 1.37150, saving model to ./nn_model.w8
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.37150
Epoch 8/100
Epoch 00008: val_loss did not improve from 1.37150
Epoch 9/100
Epoch 00009: val_loss did not improve from 1.37150
Epoch 10/100
Epoch 00010: val_loss did not improve from 1.37150
Epoch 11/100
Epoch 00011: val_loss did not improve from 1.37150
Epoch 12/100
Epoch 00012: val_loss improved from 1.37150 to 1.36412, saving model to ./nn_model.w8
Epoch 13/100
Epoch 00

HBox(children=(IntProgress(value=0, max=723), HTML(value='')))


Train on 14152 samples, validate on 723 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.34986, saving model to ./nn_model.w8
Epoch 2/100
Epoch 00002: val_loss improved from 1.34986 to 1.33457, saving model to ./nn_model.w8
Epoch 3/100
Epoch 00003: val_loss did not improve from 1.33457
Epoch 4/100
Epoch 00004: val_loss improved from 1.33457 to 1.29983, saving model to ./nn_model.w8
Epoch 5/100
Epoch 00005: val_loss did not improve from 1.29983
Epoch 6/100
Epoch 00006: val_loss did not improve from 1.29983
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.29983
Epoch 8/100
Epoch 00008: val_loss did not improve from 1.29983
Epoch 9/100
Epoch 00009: val_loss did not improve from 1.29983
Epoch 10/100
Epoch 00010: val_loss did not improve from 1.29983
Epoch 11/100
Epoch 00011: val_loss did not improve from 1.29983
Epoch 12/100
Epoch 00012: val_loss did not improve from 1.29983
Epoch 13/100
Epoch 00013: val_loss did not improve from 1.29983
Epoch 14/100
Epoch 00014: v

HBox(children=(IntProgress(value=0, max=724), HTML(value='')))


Train on 14152 samples, validate on 724 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.41369, saving model to ./nn_model.w8
Epoch 2/100
Epoch 00002: val_loss improved from 1.41369 to 1.40430, saving model to ./nn_model.w8
Epoch 3/100
Epoch 00003: val_loss improved from 1.40430 to 1.38823, saving model to ./nn_model.w8
Epoch 4/100
Epoch 00004: val_loss improved from 1.38823 to 1.37398, saving model to ./nn_model.w8
Epoch 5/100
Epoch 00005: val_loss did not improve from 1.37398
Epoch 6/100
Epoch 00006: val_loss did not improve from 1.37398
Epoch 7/100
Epoch 00007: val_loss improved from 1.37398 to 1.35179, saving model to ./nn_model.w8
Epoch 8/100
Epoch 00008: val_loss did not improve from 1.35179
Epoch 9/100
Epoch 00009: val_loss did not improve from 1.35179
Epoch 10/100
Epoch 00010: val_loss did not improve from 1.35179
Epoch 11/100
Epoch 00011: val_loss did not improve from 1.35179
Epoch 12/100
Epoch 00012: val_loss did not improve from 1.35179
Epoch 13/100
Epoch 00

HBox(children=(IntProgress(value=0, max=724), HTML(value='')))


Train on 14152 samples, validate on 724 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.45082, saving model to ./nn_model.w8
Epoch 2/100
Epoch 00002: val_loss improved from 1.45082 to 1.38189, saving model to ./nn_model.w8
Epoch 3/100
Epoch 00003: val_loss improved from 1.38189 to 1.34868, saving model to ./nn_model.w8
Epoch 4/100
Epoch 00004: val_loss did not improve from 1.34868
Epoch 5/100
Epoch 00005: val_loss improved from 1.34868 to 1.32337, saving model to ./nn_model.w8
Epoch 6/100
Epoch 00006: val_loss improved from 1.32337 to 1.32033, saving model to ./nn_model.w8
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.32033
Epoch 8/100
Epoch 00008: val_loss did not improve from 1.32033
Epoch 9/100
Epoch 00009: val_loss did not improve from 1.32033
Epoch 10/100
Epoch 00010: val_loss did not improve from 1.32033
Epoch 11/100
Epoch 00011: val_loss did not improve from 1.32033
Epoch 12/100
Epoch 00012: val_loss did not improve from 1.32033
Epoch 13/100
Epoch 00

HBox(children=(IntProgress(value=0, max=721), HTML(value='')))


Train on 14152 samples, validate on 721 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.43817, saving model to ./nn_model.w8
Epoch 2/100
Epoch 00002: val_loss improved from 1.43817 to 1.38742, saving model to ./nn_model.w8
Epoch 3/100
Epoch 00003: val_loss improved from 1.38742 to 1.36793, saving model to ./nn_model.w8
Epoch 4/100
Epoch 00004: val_loss improved from 1.36793 to 1.34587, saving model to ./nn_model.w8
Epoch 5/100
Epoch 00005: val_loss did not improve from 1.34587
Epoch 6/100
Epoch 00006: val_loss improved from 1.34587 to 1.34125, saving model to ./nn_model.w8
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.34125
Epoch 8/100
Epoch 00008: val_loss improved from 1.34125 to 1.32993, saving model to ./nn_model.w8
Epoch 9/100
Epoch 00009: val_loss improved from 1.32993 to 1.32857, saving model to ./nn_model.w8
Epoch 10/100
Epoch 00010: val_loss did not improve from 1.32857
Epoch 11/100
Epoch 00011: val_loss improved from 1.32857 to 1.32503, saving mod

HBox(children=(IntProgress(value=0, max=722), HTML(value='')))


Train on 14152 samples, validate on 722 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.45303, saving model to ./nn_model.w8
Epoch 2/100
Epoch 00002: val_loss improved from 1.45303 to 1.40303, saving model to ./nn_model.w8
Epoch 3/100
Epoch 00003: val_loss did not improve from 1.40303
Epoch 4/100
Epoch 00004: val_loss improved from 1.40303 to 1.39597, saving model to ./nn_model.w8
Epoch 5/100
Epoch 00005: val_loss did not improve from 1.39597
Epoch 6/100
Epoch 00006: val_loss improved from 1.39597 to 1.37634, saving model to ./nn_model.w8
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.37634
Epoch 8/100
Epoch 00008: val_loss improved from 1.37634 to 1.36385, saving model to ./nn_model.w8
Epoch 9/100
Epoch 00009: val_loss did not improve from 1.36385
Epoch 10/100
Epoch 00010: val_loss did not improve from 1.36385
Epoch 11/100
Epoch 00011: val_loss did not improve from 1.36385
Epoch 12/100
Epoch 00012: val_loss did not improve from 1.36385
Epoch 13/100
Epoch 00

HBox(children=(IntProgress(value=0, max=723), HTML(value='')))


Train on 14152 samples, validate on 723 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.39951, saving model to ./nn_model.w8
Epoch 2/100
Epoch 00002: val_loss improved from 1.39951 to 1.37034, saving model to ./nn_model.w8
Epoch 3/100
Epoch 00003: val_loss improved from 1.37034 to 1.36454, saving model to ./nn_model.w8
Epoch 4/100
Epoch 00004: val_loss improved from 1.36454 to 1.31760, saving model to ./nn_model.w8
Epoch 5/100
Epoch 00005: val_loss improved from 1.31760 to 1.30966, saving model to ./nn_model.w8
Epoch 6/100
Epoch 00006: val_loss improved from 1.30966 to 1.29692, saving model to ./nn_model.w8
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.29692
Epoch 8/100
Epoch 00008: val_loss improved from 1.29692 to 1.27489, saving model to ./nn_model.w8
Epoch 9/100
Epoch 00009: val_loss did not improve from 1.27489
Epoch 10/100
Epoch 00010: val_loss did not improve from 1.27489
Epoch 11/100
Epoch 00011: val_loss did not improve from 1.27489
Epoch 12/100
Ep

HBox(children=(IntProgress(value=0, max=724), HTML(value='')))


Train on 14152 samples, validate on 724 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.41717, saving model to ./nn_model.w8
Epoch 2/100
Epoch 00002: val_loss improved from 1.41717 to 1.40180, saving model to ./nn_model.w8
Epoch 3/100
Epoch 00003: val_loss improved from 1.40180 to 1.39665, saving model to ./nn_model.w8
Epoch 4/100
Epoch 00004: val_loss improved from 1.39665 to 1.37106, saving model to ./nn_model.w8
Epoch 5/100
Epoch 00005: val_loss improved from 1.37106 to 1.34359, saving model to ./nn_model.w8
Epoch 6/100
Epoch 00006: val_loss did not improve from 1.34359
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.34359
Epoch 8/100
Epoch 00008: val_loss improved from 1.34359 to 1.33980, saving model to ./nn_model.w8
Epoch 9/100
Epoch 00009: val_loss improved from 1.33980 to 1.33212, saving model to ./nn_model.w8
Epoch 10/100
Epoch 00010: val_loss did not improve from 1.33212
Epoch 11/100
Epoch 00011: val_loss did not improve from 1.33212
Epoch 12/100
Ep

HBox(children=(IntProgress(value=0, max=724), HTML(value='')))


Train on 14152 samples, validate on 724 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.39467, saving model to ./nn_model.w8
Epoch 2/100
Epoch 00002: val_loss did not improve from 1.39467
Epoch 3/100
Epoch 00003: val_loss improved from 1.39467 to 1.32670, saving model to ./nn_model.w8
Epoch 4/100
Epoch 00004: val_loss did not improve from 1.32670
Epoch 5/100
Epoch 00005: val_loss did not improve from 1.32670
Epoch 6/100
Epoch 00006: val_loss improved from 1.32670 to 1.32483, saving model to ./nn_model.w8
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.32483
Epoch 8/100
Epoch 00008: val_loss did not improve from 1.32483
Epoch 9/100
Epoch 00009: val_loss did not improve from 1.32483
Epoch 10/100
Epoch 00010: val_loss did not improve from 1.32483
Epoch 11/100
Epoch 00011: val_loss did not improve from 1.32483
Epoch 12/100
Epoch 00012: val_loss did not improve from 1.32483
Epoch 13/100
Epoch 00013: val_loss did not improve from 1.32483
Epoch 14/100
Epoch 00014: v

HBox(children=(IntProgress(value=0, max=721), HTML(value='')))


Train on 14152 samples, validate on 721 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.40273, saving model to ./nn_model.w8
Epoch 2/100
Epoch 00002: val_loss improved from 1.40273 to 1.33686, saving model to ./nn_model.w8
Epoch 3/100
Epoch 00003: val_loss did not improve from 1.33686
Epoch 4/100
Epoch 00004: val_loss did not improve from 1.33686
Epoch 5/100
Epoch 00005: val_loss improved from 1.33686 to 1.32291, saving model to ./nn_model.w8
Epoch 6/100
Epoch 00006: val_loss did not improve from 1.32291
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.32291
Epoch 8/100
Epoch 00008: val_loss improved from 1.32291 to 1.30109, saving model to ./nn_model.w8
Epoch 9/100
Epoch 00009: val_loss did not improve from 1.30109
Epoch 10/100
Epoch 00010: val_loss improved from 1.30109 to 1.29614, saving model to ./nn_model.w8
Epoch 11/100
Epoch 00011: val_loss did not improve from 1.29614
Epoch 12/100
Epoch 00012: val_loss did not improve from 1.29614
Epoch 13/100
Epoch 00

HBox(children=(IntProgress(value=0, max=722), HTML(value='')))


Train on 14152 samples, validate on 722 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.42814, saving model to ./nn_model.w8
Epoch 2/100
Epoch 00002: val_loss improved from 1.42814 to 1.42364, saving model to ./nn_model.w8
Epoch 3/100
Epoch 00003: val_loss improved from 1.42364 to 1.38648, saving model to ./nn_model.w8
Epoch 4/100
Epoch 00004: val_loss improved from 1.38648 to 1.38308, saving model to ./nn_model.w8
Epoch 5/100
Epoch 00005: val_loss did not improve from 1.38308
Epoch 6/100
Epoch 00006: val_loss did not improve from 1.38308
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.38308
Epoch 8/100
Epoch 00008: val_loss improved from 1.38308 to 1.35152, saving model to ./nn_model.w8
Epoch 9/100
Epoch 00009: val_loss improved from 1.35152 to 1.33411, saving model to ./nn_model.w8
Epoch 10/100
Epoch 00010: val_loss did not improve from 1.33411
Epoch 11/100
Epoch 00011: val_loss did not improve from 1.33411
Epoch 12/100
Epoch 00012: val_loss did not improve

HBox(children=(IntProgress(value=0, max=723), HTML(value='')))


Train on 14152 samples, validate on 723 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.36180, saving model to ./nn_model.w8
Epoch 2/100
Epoch 00002: val_loss improved from 1.36180 to 1.35598, saving model to ./nn_model.w8
Epoch 3/100
Epoch 00003: val_loss improved from 1.35598 to 1.29279, saving model to ./nn_model.w8
Epoch 4/100
Epoch 00004: val_loss did not improve from 1.29279
Epoch 5/100
Epoch 00005: val_loss did not improve from 1.29279
Epoch 6/100
Epoch 00006: val_loss improved from 1.29279 to 1.26437, saving model to ./nn_model.w8
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.26437
Epoch 8/100
Epoch 00008: val_loss did not improve from 1.26437
Epoch 9/100
Epoch 00009: val_loss improved from 1.26437 to 1.22296, saving model to ./nn_model.w8
Epoch 10/100
Epoch 00010: val_loss did not improve from 1.22296
Epoch 11/100
Epoch 00011: val_loss did not improve from 1.22296
Epoch 12/100
Epoch 00012: val_loss did not improve from 1.22296
Epoch 13/100
Epoch 00

HBox(children=(IntProgress(value=0, max=724), HTML(value='')))


Train on 14152 samples, validate on 724 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.41927, saving model to ./nn_model.w8
Epoch 2/100
Epoch 00002: val_loss improved from 1.41927 to 1.36747, saving model to ./nn_model.w8
Epoch 3/100
Epoch 00003: val_loss improved from 1.36747 to 1.35029, saving model to ./nn_model.w8
Epoch 4/100
Epoch 00004: val_loss did not improve from 1.35029
Epoch 5/100
Epoch 00005: val_loss improved from 1.35029 to 1.34257, saving model to ./nn_model.w8
Epoch 6/100
Epoch 00006: val_loss improved from 1.34257 to 1.30335, saving model to ./nn_model.w8
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.30335
Epoch 8/100
Epoch 00008: val_loss did not improve from 1.30335
Epoch 9/100
Epoch 00009: val_loss did not improve from 1.30335
Epoch 10/100
Epoch 00010: val_loss did not improve from 1.30335
Epoch 11/100
Epoch 00011: val_loss did not improve from 1.30335
Epoch 12/100
Epoch 00012: val_loss did not improve from 1.30335
Epoch 13/100
Epoch 00

HBox(children=(IntProgress(value=0, max=724), HTML(value='')))


Train on 14152 samples, validate on 724 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.41535, saving model to ./nn_model.w8
Epoch 2/100
Epoch 00002: val_loss did not improve from 1.41535
Epoch 3/100
Epoch 00003: val_loss improved from 1.41535 to 1.39467, saving model to ./nn_model.w8
Epoch 4/100
Epoch 00004: val_loss did not improve from 1.39467
Epoch 5/100
Epoch 00005: val_loss improved from 1.39467 to 1.35452, saving model to ./nn_model.w8
Epoch 6/100
Epoch 00006: val_loss improved from 1.35452 to 1.34495, saving model to ./nn_model.w8
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.34495
Epoch 8/100
Epoch 00008: val_loss did not improve from 1.34495
Epoch 9/100
Epoch 00009: val_loss did not improve from 1.34495
Epoch 10/100
Epoch 00010: val_loss did not improve from 1.34495
Epoch 11/100
Epoch 00011: val_loss did not improve from 1.34495
Epoch 12/100
   32/14152 [..............................] - ETA: 2s - loss: 1.3866