- improve linear model and nn model

In [1]:
import numpy as np
import pandas as pd
import os
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from scipy import stats
import lightgbm as lgb
from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import tensorflow as tf
from tensorflow.keras import backend as K
import gc
import json
pd.set_option('display.max_columns', 1000)
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
import random

In [2]:
def read_data():
    print('Reading train.csv file....')
    train = pd.read_csv('/kaggle/input/data-science-bowl-2019/train.csv')

    print('Reading test.csv file....')
    test = pd.read_csv('/kaggle/input/data-science-bowl-2019/test.csv')

    print('Reading train_labels.csv file....')
    train_labels = pd.read_csv('/kaggle/input/data-science-bowl-2019/train_labels.csv')

    print('Reading specs.csv file....')
    specs = pd.read_csv('/kaggle/input/data-science-bowl-2019/specs.csv')

    print('Reading sample_submission.csv file....')
    sample_submission = pd.read_csv('/kaggle/input/data-science-bowl-2019/sample_submission.csv')
    return train, test, train_labels, specs, sample_submission

In [3]:
def encode_title(train, test, train_labels):
    # encode title
    train['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), train['title'], train['event_code']))
    test['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), test['title'], test['event_code']))
    all_title_event_code = list(set(train["title_event_code"].unique()).union(test["title_event_code"].unique()))
    # make a list with all the unique 'titles' from the train and test set
    list_of_user_activities = list(set(train['title'].unique()).union(set(test['title'].unique())))
    # make a list with all the unique 'event_code' from the train and test set
    list_of_event_code = list(set(train['event_code'].unique()).union(set(test['event_code'].unique())))
    list_of_event_id = list(set(train['event_id'].unique()).union(set(test['event_id'].unique())))
    # make a list with all the unique worlds from the train and test set
    list_of_worlds = list(set(train['world'].unique()).union(set(test['world'].unique())))
    # create a dictionary numerating the titles
    activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
    activities_labels = dict(zip(np.arange(len(list_of_user_activities)), list_of_user_activities))
    activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))
    assess_titles = list(set(train[train['type'] == 'Assessment']['title'].value_counts().index).union(set(test[test['type'] == 'Assessment']['title'].value_counts().index)))
    # replace the text titles with the number titles from the dict
    train['title'] = train['title'].map(activities_map)
    test['title'] = test['title'].map(activities_map)
    train['world'] = train['world'].map(activities_world)
    test['world'] = test['world'].map(activities_world)
    train_labels['title'] = train_labels['title'].map(activities_map)
    win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
    # then, it set one element, the 'Bird Measurer (Assessment)' as 4110, 10 more than the rest
    win_code[activities_map['Bird Measurer (Assessment)']] = 4110
    # convert text into datetime
    train['timestamp'] = pd.to_datetime(train['timestamp'])
    test['timestamp'] = pd.to_datetime(test['timestamp'])
    train["misses"] = train["event_data"].apply(lambda x: json.loads(x)["misses"] if "\"misses\"" in x else np.nan)
    test["misses"] = test["event_data"].apply(lambda x: json.loads(x)["misses"] if "\"misses\"" in x else np.nan)
    return train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code, activities_world

In [4]:
def get_data(user_sample, test_set=False):
    '''
    The user_sample is a DataFrame from train or test where the only one 
    installation_id is filtered
    And the test_set parameter is related with the labels processing, that is only requered
    if test_set=False
    '''
    # Constants and parameters declaration
    last_activity = 0
    
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    
    # new features: time spent in each activity
    last_session_time_sec = 0
    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    all_assessments = []
    accumulated_accuracy_group = 0
    accumulated_accuracy = 0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0
    accumulated_actions = 0
    counter = 0
    time_first_activity = float(user_sample['timestamp'].values[0])
    durations = []
    durations_game = []
    durations_activity = []
    last_accuracy_title = {'acc_' + title: -1 for title in assess_titles}
    last_game_time_title = {'lgt_' + title: 0 for title in assess_titles}
    ac_game_time_title = {'agt_' + title: 0 for title in assess_titles}
    ac_true_attempts_title = {'ata_' + title: 0 for title in assess_titles}
    ac_false_attempts_title = {'afa_' + title: 0 for title in assess_titles}
    event_code_count: Dict[str, int] = {ev: 0 for ev in list_of_event_code}
    event_id_count: Dict[str, int] = {eve: 0 for eve in list_of_event_id}
    title_count: Dict[str, int] = {eve: 0 for eve in activities_labels.values()} 
    title_event_code_count: Dict[str, int] = {t_eve: 0 for t_eve in all_title_event_code}
    session_count = 0
    miss = 0
    crys_game_true = 0; crys_game_false = 0
    tree_game_true = 0; tree_game_false = 0
    magma_game_true = 0; magma_game_false = 0
    crys_game_acc = []; tree_game_acc = []; magma_game_acc = []
    crys_act_true = 0; crys_act_false = 0
    tree_act_true = 0; tree_act_false = 0
    magma_act_true = 0; magma_act_false = 0
    crys_act_acc = []; tree_act_acc = []; magma_act_acc = []
    
    # itarates through each session of one instalation_id
    for i, session in user_sample.groupby('game_session', sort=False):
        # i = game_session_id
        # session is a DataFrame that contain only one game_session
        
        # get some sessions information
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_title_text = activities_labels[session_title]  
        session_world = session["world"].iloc[0]
            
        # for each assessment, and only this kind off session, the features below are processed
        # and a register are generated      
        
        if (session_type == 'Assessment') & (test_set or len(session)>1):
            # search for event_code 4100, that represents the assessments trial
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            # then, check the numbers of wins and the number of losses
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()
            # copy a dict to use as feature template, it's initialized with some itens: 
            # {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
            features = user_activities_count.copy()
            features.update(last_accuracy_title.copy())
            features.update(event_code_count.copy())
            features.update(title_count.copy())
            features.update(event_id_count.copy())
            features.update(title_event_code_count.copy())
            features.update(last_game_time_title.copy())
            features.update(ac_game_time_title.copy())
            features.update(ac_true_attempts_title.copy())
            features.update(ac_false_attempts_title.copy())
            features['installation_session_count'] = session_count
            
            variety_features = [('var_event_code', event_code_count), 
                                ('var_event_id', event_id_count), 
                                ('var_title', title_count), 
                                ('var_title_event_code', title_event_code_count)]
            
            for name, dict_counts in variety_features:
                arr = np.array(list(dict_counts.values()))
                features[name] = np.count_nonzero(arr)
                
            # get installation_id for aggregated features
            features['installation_id'] = session['installation_id'].iloc[-1]
            # add title as feature, remembering that title represents the name of the game
            features['session_title'] = session['title'].iloc[0]
            # the 4 lines below add the feature of the history of the trials of this player
            # this is based on the all time attempts so far, at the moment of this assessment
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
            
            # ----------------------------------------------
            ac_true_attempts_title['ata_' + session_title_text] += true_attempts
            ac_false_attempts_title['afa_' + session_title_text] += false_attempts
            
            
            last_game_time_title['lgt_' + session_title_text] = session['game_time'].iloc[-1]
            ac_game_time_title['agt_' + session_title_text] += session['game_time'].iloc[-1]
            # ----------------------------------------------
            
            # the time spent in the app so far
            if durations == []:
                features['duration_mean'] = 0
                features['duration_std'] = 0
                features['last_duration'] = 0
                features['duration_max'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
                features['duration_std'] = np.std(durations)
                features['last_duration'] = durations[-1]
                features['duration_max'] = np.max(durations)
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            
            if durations_game == []:
                features['duration_game_mean'] = 0
                features['duration_game_std'] = 0
                features['game_last_duration'] = 0
                features['game_max_duration'] = 0
            else:
                features['duration_game_mean'] = np.mean(durations_game)
                features['duration_game_std'] = np.std(durations_game)
                features['game_last_duration'] = durations_game[-1]
                features['game_max_duration'] = np.max(durations_game)
                
            if durations_activity == []:
                features['duration_activity_mean'] = 0
                features['duration_activity_std'] = 0
                features['game_activity_duration'] = 0
                features['game_activity_max'] = 0
            else:
                features['duration_activity_mean'] = np.mean(durations_activity)
                features['duration_activity_std'] = np.std(durations_activity)
                features['game_activity_duration'] = durations_activity[-1]
                features['game_activity_max'] = np.max(durations_activity)
                
            features["misses"] = miss
            if session_world == activities_world["CRYSTALCAVES"]:
                features["game_true"] = crys_game_true
                features["game_false"] = crys_game_false
                features['game_accuracy'] = crys_game_true / (crys_game_true + crys_game_false) if (crys_game_true + crys_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(crys_game_acc) if len(crys_game_acc) >=1 else 0
                features["last_game_acc"] = crys_game_acc[-1] if len(crys_game_acc) >=1 else 0
                features["act_true"] = crys_act_true
                features["act_false"] = crys_act_false
                features['act_accuracy'] = crys_act_true / (crys_act_true + crys_act_false) if (crys_act_true + crys_act_false) != 0 else 0
                features["act_accuracy_std"] = np.std(crys_act_acc) if len(crys_act_acc) >=1 else 0
                features["last_act_acc"] = crys_act_acc[-1] if len(crys_act_acc) >=1 else 0
            elif session_world == activities_world["TREETOPCITY"]:
                features["game_true"] = tree_game_true
                features["game_false"] = tree_game_false
                features['game_accuracy'] = tree_game_true / (tree_game_true + tree_game_false) if (tree_game_true + tree_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(tree_game_acc) if len(tree_game_acc) >=1 else 0
                features["last_game_acc"] = tree_game_acc[-1] if len(tree_game_acc) >=1 else 0
                features["act_true"] = tree_act_true
                features["act_false"] = tree_act_false
                features['act_accuracy'] = tree_act_true / (tree_act_true + tree_act_false) if (tree_act_true + tree_act_false) != 0 else 0
                features["act_accuracy_std"] = np.std(tree_act_acc) if len(tree_act_acc) >=1 else 0
                features["last_act_acc"] = tree_act_acc[-1] if len(tree_act_acc) >=1 else 0
            elif session_world == activities_world["MAGMAPEAK"]:
                features["game_true"] = magma_game_true
                features["game_false"] = magma_game_false
                features['game_accuracy'] = magma_game_true / (magma_game_true + magma_game_false) if (magma_game_true + magma_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(magma_game_acc) if len(magma_game_acc) >=1 else 0
                features["last_game_acc"] = magma_game_acc[-1] if len(magma_game_acc) >=1 else 0
                features["act_true"] = magma_act_true
                features["act_false"] = magma_act_false
                features['act_accuracy'] = magma_act_true / (magma_act_true + magma_act_false) if (magma_act_true + magma_act_false) != 0 else 0
                features["act_accuracy_std"] = np.std(magma_act_acc) if len(magma_act_acc) >=1 else 0
                features["last_act_acc"] = magma_act_acc[-1] if len(magma_act_acc) >=1 else 0
            
            # the accuracy is the all time wins divided by the all time attempts
            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            accumulated_accuracy += accuracy
            last_accuracy_title['acc_' + session_title_text] = accuracy
            # a feature of the current accuracy categorized
            # it is a counter of how many times this player was in each accuracy group
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            features.update(accuracy_groups)
            accuracy_groups[features['accuracy_group']] += 1
            # mean of the all accuracy groups of this player
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            # how many actions the player has done so far, it is initialized as 0 and updated some lines below
            features['accumulated_actions'] = accumulated_actions
            
            # there are some conditions to allow this features to be inserted in the datasets
            # if it's a test set, all sessions belong to the final dataset
            # it it's a train, needs to be passed throught this clausule: session.query(f'event_code == {win_code[session_title]}')
            # that means, must exist an event_code 4100 or 4110
            if test_set:
                all_assessments.append(features)
            elif true_attempts+false_attempts > 0:
                all_assessments.append(features)
                
            counter += 1
            
        if session_type == 'Game':
            durations_game.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            true = session['event_data'].str.contains('true').sum()
            false = session['event_data'].str.contains('false').sum() 
            durations_game.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            if session_world == activities_world["CRYSTALCAVES"]:
                crys_game_true += true
                crys_game_false += false
                crys_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
            elif session_world == activities_world["TREETOPCITY"]:
                tree_game_true += true
                tree_game_false += false
                tree_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
            elif session_world == activities_world["MAGMAPEAK"]:
                magma_game_true += true
                magma_game_false += false
                magma_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
            else:
                pass
            
        if session_type == 'Activity':
            durations_activity.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            true = session['event_data'].str.contains('true').sum()
            false = session['event_data'].str.contains('false').sum() 
            durations_activity.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            if session_world == activities_world["CRYSTALCAVES"]:
                crys_act_true += true
                crys_act_false += false
                crys_act_acc.append(true / (true + false) if (true + false) != 0 else 0)
            elif session_world == activities_world["TREETOPCITY"]:
                tree_act_true += true
                tree_act_false += false
                tree_act_acc.append(true / (true + false) if (true + false) != 0 else 0)
            elif session_world == activities_world["MAGMAPEAK"]:
                magma_act_true += true
                magma_act_false += false
                magma_act_acc.append(true / (true + false) if (true + false) != 0 else 0)
            else:
                pass    
        
        session_count += 1
        # this piece counts how many actions was made in each event_code so far
        def update_counters(counter: dict, col: str):
                num_of_session_count = Counter(session[col])
                for k in num_of_session_count.keys():
                    x = k
                    if col == 'title':
                        x = activities_labels[k]
                    counter[x] += num_of_session_count[k]
                return counter
            
        event_code_count = update_counters(event_code_count, "event_code")
        event_id_count = update_counters(event_id_count, "event_id")
        title_count = update_counters(title_count, 'title')
        title_event_code_count = update_counters(title_event_code_count, 'title_event_code')

        # counts how many actions the player has done so far, used in the feature of the same name
        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activitiy = session_type 
                        
    # if it't the test_set, only the last assessment must be predicted, the previous are scraped
    if test_set:
        return all_assessments[-1]
    # in the train_set, all assessments goes to the dataset
    return all_assessments

In [5]:
def get_train_and_test(train, test):
    compiled_train = []
    compiled_test = []
    compiled_test_his = []
    for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort = False)), total = 17000):
        compiled_train += get_data(user_sample)
    for ins_id, user_sample in tqdm(test.groupby('installation_id', sort = False), total = 1000):
        test_data = get_data(user_sample, test_set = True)
        compiled_test.append(test_data)
    for i, (ins_id, user_sample) in tqdm(enumerate(test.groupby('installation_id', sort = False)), total = 1000):
        compiled_test_his += get_data(user_sample)
    reduce_train = pd.DataFrame(compiled_train)
    reduce_test = pd.DataFrame(compiled_test)
    reduce_test_his = pd.DataFrame(compiled_test_his)
    
    return reduce_train, reduce_test, reduce_test_his

In [6]:
# thank to Bruno
def eval_qwk_lgb_regr(y_pred, train_t):
    """
    Fast cappa eval function for lgb.
    """
    dist = Counter(train_t['accuracy_group'])
    for k in dist:
        dist[k] /= len(train_t)
    
    acum = 0
    bound = {}
    for i in range(3):
        acum += dist[i]
        bound[i] = np.percentile(y_pred, acum * 100)

    def classify(x):
        if x <= bound[0]:
            return 0
        elif x <= bound[1]:
            return 1
        elif x <= bound[2]:
            return 2
        else:
            return 3

    y_pred = np.array(list(map(classify, y_pred)))
    
    return y_pred

def predict(sample_submission, y_pred):
    sample_submission['accuracy_group'] = y_pred
    sample_submission['accuracy_group'] = sample_submission['accuracy_group'].astype(int)
    sample_submission.to_csv('submission.csv', index = False)
    print(sample_submission['accuracy_group'].value_counts(normalize = True))

In [7]:
def get_random_assessment(reduce_train):
    used_idx = []
    for iid in set(reduce_train['installation_id']):
        list_ = list(reduce_train[reduce_train['installation_id']==iid].index)
        cur = random.choices(list_, k = 1)[0]
        used_idx.append(cur)
    reduce_train_t = reduce_train.loc[used_idx]
    print("used validation data: ", len(used_idx))
    return reduce_train_t, used_idx

In [8]:
# for each validation fold extract one random observation for each installation id to simulate the test set
def run_lgb(reduce_train, reduce_test, features):
    # features found in initial bayesian optimization
    params = {'boosting_type': 'gbdt', 
              'metric': 'rmse', 
              'objective': 'regression', 
              'eval_metric': 'cappa', 
              'n_jobs': -1, 
              'seed': 42, 
              'num_leaves': 26, 
              'learning_rate': 0.077439684887749, 
              'max_depth': 33, 
              'lambda_l1': 3.27791989030057, 
              'lambda_l2': 1.3047627805931334, 
              'bagging_fraction': 0.896924978584253, 
              'bagging_freq': 1, 
              'colsample_bytree': 0.8710772167017853}

    kf = GroupKFold(n_splits = 5)
    target = 'accuracy_group'
    oof_pred = np.zeros(len(reduce_train))
    y_pred = np.zeros(len(reduce_test))
    ind = []

    for fold, (tr_ind, val_ind) in enumerate(kf.split(reduce_train, groups = reduce_train['installation_id'])):
        print('Fold:', fold + 1)
        x_train, x_val = reduce_train[features].iloc[tr_ind], reduce_train[features].iloc[val_ind]
        y_train, y_val = reduce_train[target][tr_ind], reduce_train[target][val_ind]
        x_train.drop('installation_id', inplace = True, axis = 1)
        train_set = lgb.Dataset(x_train, y_train, categorical_feature = ['session_title'])


        x_val, idx_val = get_random_assessment(x_val)
        ind.extend(idx_val)
        x_val.drop('installation_id', inplace = True, axis = 1)
        y_val = y_val.loc[idx_val]
        val_set = lgb.Dataset(x_val, y_val, categorical_feature = ['session_title'])

        model = lgb.train(params, train_set, num_boost_round = 100000, early_stopping_rounds = 100, 
                         valid_sets = [train_set, val_set], verbose_eval = 100)

        oof_pred[idx_val] = model.predict(x_val)
        y_pred += model.predict(reduce_test[[x for x in features if x not in ['installation_id']]]) / kf.n_splits
    oof_rmse_score = np.sqrt(mean_squared_error(reduce_train[target][ind], oof_pred[ind]))
    oof_cohen_score = cohen_kappa_score(reduce_train[target][ind], eval_qwk_lgb_regr(oof_pred[ind], reduce_train), weights = 'quadratic')
    print('Our oof rmse score is:', oof_rmse_score)
    print('Our oof cohen kappa score is:', oof_cohen_score)
    return y_pred, oof_rmse_score, oof_cohen_score

In [9]:
# for each validation fold extract one random observation for each installation id to simulate the test set
def run_lr(reduce_train, reduce_test, features):
    kf = GroupKFold(n_splits = 5)
    target = 'accuracy_group'
    oof_pred = np.zeros(len(reduce_train))
    y_pred = np.zeros(len(reduce_test))
    ind = []

    for fold, (tr_ind, val_ind) in enumerate(kf.split(reduce_train, groups = reduce_train['installation_id'])):
        print('Fold:', fold + 1)
        x_train, x_val = reduce_train[features].iloc[tr_ind], reduce_train[features].iloc[val_ind]
        y_train, y_val = reduce_train[target][tr_ind], reduce_train[target][val_ind]
        x_train.drop('installation_id', inplace = True, axis = 1)

        x_val, idx_val = get_random_assessment(x_val)
        ind.extend(idx_val)
        x_val.drop('installation_id', inplace = True, axis = 1)
        y_val = y_val.loc[idx_val]

        model = LinearRegression()
        model.fit(x_train, y_train)  

        oof_pred[idx_val] = model.predict(x_val)
        y_pred += model.predict(reduce_test[[x for x in features if x not in ['installation_id']]]) / kf.n_splits
    oof_rmse_score = np.sqrt(mean_squared_error(reduce_train[target][ind], oof_pred[ind]))
    oof_cohen_score = cohen_kappa_score(reduce_train[target][ind], eval_qwk_lgb_regr(oof_pred[ind], reduce_train), weights = 'quadratic')
    print('Our oof rmse score is:', oof_rmse_score)
    print('Our oof cohen kappa score is:', oof_cohen_score)
    return y_pred, oof_rmse_score, oof_cohen_score

In [10]:
def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true))) 

def run_nn(reduce_train, reduce_test, features):
    kf = GroupKFold(n_splits = 5)
    target = 'accuracy_group'
    oof_pred = np.zeros(len(reduce_train))
    y_pred = np.zeros(len(reduce_test))
    ind = []

    for fold, (tr_ind, val_ind) in enumerate(kf.split(reduce_train, groups = reduce_train['installation_id'])):
        print('Fold:', fold + 1)
        x_train, x_val = reduce_train[features].iloc[tr_ind], reduce_train[features].iloc[val_ind]
        y_train, y_val = reduce_train[target][tr_ind], reduce_train[target][val_ind]
        x_train.drop('installation_id', inplace = True, axis = 1)

        x_val, idx_val = get_random_assessment(x_val)
        ind.extend(idx_val)
        x_val.drop('installation_id', inplace = True, axis = 1)
        y_val = y_val.loc[idx_val]
        
        verbosity = 100
        model = tf.keras.models.Sequential([
            tf.keras.layers.Input(shape=(x_train.shape[1],)),
            tf.keras.layers.Dense(200, activation='relu'), #, kernel_regularizer=tf.keras.regularizers.l2(0.001)
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(100, activation='tanh'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(25, activation='relu'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(1, activation='relu')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), loss='mse')
        #print(model.summary())
        save_best = tf.keras.callbacks.ModelCheckpoint('./nn_model.w8', save_weights_only=True, save_best_only=True, verbose=1)
        early_stop = tf.keras.callbacks.EarlyStopping(patience=10)
        
        model.fit(x_train, 
                y_train, 
                validation_data=(x_val, y_val),
                epochs=100,
                 callbacks=[save_best, early_stop])
        model.load_weights('./nn_model.w8')
        
        oof_pred[idx_val] = model.predict(x_val).reshape(x_val.shape[0],)
        y_pred += model.predict(reduce_test[[x for x in features if x not in ['installation_id']]]).reshape(reduce_test.shape[0],) / kf.n_splits
    oof_rmse_score = np.sqrt(mean_squared_error(reduce_train[target][ind], oof_pred[ind]))
    oof_cohen_score = cohen_kappa_score(reduce_train[target][ind], eval_qwk_lgb_regr(oof_pred[ind], reduce_train), weights = 'quadratic')
    print('Our oof rmse score is:', oof_rmse_score)
    print('Our oof cohen kappa score is:', oof_cohen_score)
    return y_pred, oof_rmse_score, oof_cohen_score

In [11]:
def standardize_data(reduce_train, reduce_test):
    features = [i for i in reduce_train.columns if i not in ["installation_id", "accuracy_group"]]
    categoricals = ['session_title']
    features = features.copy()
    new_train = reduce_train.copy()
    new_test = reduce_test.copy()
    if len(categoricals) > 0:
        for cat in categoricals:
            enc = OneHotEncoder()
            train_cats = enc.fit_transform(new_train[[cat]])
            test_cats = enc.transform(new_test[[cat]])
            cat_cols = ['{}_{}'.format(cat, str(col)) for col in enc.active_features_]
            features += cat_cols
            train_cats = pd.DataFrame(train_cats.toarray(), columns=cat_cols)
            test_cats = pd.DataFrame(test_cats.toarray(), columns=cat_cols)
            new_train = pd.concat([new_train, train_cats], axis=1)
            new_test = pd.concat([new_test, test_cats], axis=1)
        scalar = MinMaxScaler()
        new_train[features] = scalar.fit_transform(new_train[features])
        new_test[features] = scalar.transform(new_test[features])
    new_train = new_train.drop(["session_title"], axis=1)
    new_test = new_test.drop(["session_title"], axis=1)
    return new_train, new_test

In [12]:
def remove_correlated_features(reduce_train):
    counter = 0
    to_remove = []
    for feat_a in features:
        for feat_b in features:
            if feat_a != feat_b and feat_a not in to_remove and feat_b not in to_remove:
                c = np.corrcoef(reduce_train[feat_a], reduce_train[feat_b])[0][1]
                if c > 0.995:
                    counter += 1
                    to_remove.append(feat_b)
                    print('{}: FEAT_A: {} FEAT_B: {} - Correlation: {}'.format(counter, feat_a, feat_b, c))
    return to_remove

In [13]:
# function to exclude columns from the train and test set if the mean is different, also adjust test column by a factor to simulate the same distribution
def exclude(reduce_train, reduce_test, features):
    to_exclude = [] 
    ajusted_test = reduce_test.copy()
    for feature in features:
        if feature not in ['accuracy_group', 'installation_id', 'session_title']:
            data = reduce_train[feature]
            train_mean = data.mean()
            data = ajusted_test[feature] 
            test_mean = data.mean()
            try:
                ajust_factor = train_mean / test_mean
                if ajust_factor > 10 or ajust_factor < 0.1:# or error > 0.01:
                    to_exclude.append(feature)
                    print(feature)
                else:
                    ajusted_test[feature] *= ajust_factor
            except:
                to_exclude.append(feature)
                print(feature)
    return to_exclude, ajusted_test

# installation

In [14]:
train, test, train_labels, specs, sample_submission = read_data()
train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code, activities_world = encode_title(train, test, train_labels)
reduce_train, reduce_test, reduce_his_test = get_train_and_test(train, test)
del train, test

Reading train.csv file....
Reading test.csv file....
Reading train_labels.csv file....
Reading specs.csv file....
Reading sample_submission.csv file....


HBox(children=(IntProgress(value=0, max=17000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




# feature selection

In [15]:
# extracted from feature elimination first round script
old_features = list(reduce_train.columns[0:95]) + list(reduce_train.columns[882:])
el_features = ['accuracy_group', 'accuracy', 'installation_id']
old_features = [col for col in old_features if col not in el_features]
event_id_features = list_of_event_id #list(reduce_train.columns[95:479])
title_event_code_cross = all_title_event_code #list(reduce_train.columns[479:882])
features = old_features + event_id_features + title_event_code_cross
lr_features = features.copy()
nn_features = features.copy()

In [16]:
to_remove = remove_correlated_features(reduce_train)

1: FEAT_A: Clip FEAT_B: 27253bdc - Correlation: 0.9999999999999999
2: FEAT_A: 2050 FEAT_B: 2040 - Correlation: 0.9965259434878118
3: FEAT_A: 2050 FEAT_B: 37c53127 - Correlation: 1.0
4: FEAT_A: 2050 FEAT_B: 2b9272f4 - Correlation: 0.9999839030068793
5: FEAT_A: 2050 FEAT_B: 08fd73f3 - Correlation: 0.9966123918733654
6: FEAT_A: 2050 FEAT_B: 73757a5e - Correlation: 0.9998050146713992
7: FEAT_A: 2050 FEAT_B: dcaede90 - Correlation: 0.9965259434878118
8: FEAT_A: 2050 FEAT_B: 26fd2d99 - Correlation: 0.9965084543995759
9: FEAT_A: 2050 FEAT_B: Scrub-A-Dub_2040 - Correlation: 0.9965259434878118
10: FEAT_A: 2050 FEAT_B: Scrub-A-Dub_2050 - Correlation: 1.0
11: FEAT_A: 2050 FEAT_B: Scrub-A-Dub_3021 - Correlation: 0.9998050146713992
12: FEAT_A: 2050 FEAT_B: Scrub-A-Dub_2020 - Correlation: 0.9965084543995759
13: FEAT_A: 2050 FEAT_B: Scrub-A-Dub_2030 - Correlation: 0.9966123918733654
14: FEAT_A: 2050 FEAT_B: Scrub-A-Dub_3121 - Correlation: 0.9999839030068793
15: FEAT_A: 4230 FEAT_B: 4235 - Correlation

In [17]:
features = [col for col in features if col not in to_remove]
features = [col for col in features if col not in ['Heavy, Heavier, Heaviest_2000', 'Heavy, Heavier, Heaviest']]
features.append('installation_id')
print('Training with {} features in LGBM'.format(len(features)))

lr_features = [col for col in lr_features if col not in to_remove]
lr_features = [col for col in lr_features if col not in ['Heavy, Heavier, Heaviest_2000', 'Heavy, Heavier, Heaviest', "session_title"]]
nn_features = [col for col in nn_features if col not in to_remove]
nn_features = [col for col in nn_features if col not in ['Heavy, Heavier, Heaviest_2000', 'Heavy, Heavier, Heaviest', "session_title"]]
lr_features.append('installation_id')
nn_features.append('installation_id')
print('Training with {} features in NN and LR'.format(len(lr_features)))

Training with 426 features in LGBM
Training with 425 features in NN and LR


In [18]:
to_exclude, ajusted_test = exclude(reduce_train, reduce_test, features)
features = [col for col in features if col not in to_exclude]

acc_Cart Balancer (Assessment)
misses
6aeafed4
ab4ec3a4
a8cc6fec
5dc079d8
003cd2ee
119b5b02
ecc6157f
29a42aea
0ce40006
dcb1663e
17ca3959
611485c5
eb2c19cd
e4d32835
01ca3a3c
4074bac2
bfc77bd6
7fd1ac25
13f56524
2ec694de
1b54d27f
Crystals Rule_2010
Air Show_4080
Pan Balance_2010
Sandcastle Builder (Activity)_2010
Bottle Filler (Activity)_2010
Scrub-A-Dub_4080


# modelling and prediction

## LightGBM

In [19]:
# train 5 times because the evaluation and training data change with the randomness
y_pred_1, oof_rmse_score_1, oof_cohen_score_1 = run_lgb(reduce_train, ajusted_test, features)
y_pred_2, oof_rmse_score_2, oof_cohen_score_2 = run_lgb(reduce_train, ajusted_test, features)
y_pred_3, oof_rmse_score_3, oof_cohen_score_3 = run_lgb(reduce_train, ajusted_test, features)
y_pred_4, oof_rmse_score_4, oof_cohen_score_4 = run_lgb(reduce_train, ajusted_test, features)
y_pred_5, oof_rmse_score_5, oof_cohen_score_5 = run_lgb(reduce_train, ajusted_test, features)
mean_rmse = (oof_rmse_score_1 + oof_rmse_score_2 + oof_rmse_score_3 + oof_rmse_score_4 + oof_rmse_score_5) / 5
mean_cohen_kappa = (oof_cohen_score_1 + oof_cohen_score_2 + oof_cohen_score_3 + oof_cohen_score_4 + oof_cohen_score_5) / 5
print('Our mean rmse score is: ', mean_rmse)
print('Our mean cohen kappa score is: ', mean_cohen_kappa)

Fold: 1
used validation data:  721
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.870295	valid_1's rmse: 1.02686
[200]	training's rmse: 0.791164	valid_1's rmse: 1.02195
Early stopping, best iteration is:
[185]	training's rmse: 0.801493	valid_1's rmse: 1.02076
Fold: 2
used validation data:  722
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.871566	valid_1's rmse: 1.03133
[200]	training's rmse: 0.793145	valid_1's rmse: 1.03554
Early stopping, best iteration is:
[125]	training's rmse: 0.848612	valid_1's rmse: 1.03045
Fold: 3
used validation data:  723
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.870242	valid_1's rmse: 1.02202
[200]	training's rmse: 0.790982	valid_1's rmse: 1.02794
Early stopping, best iteration is:
[115]	training's rmse: 0.856993	valid_1's rmse: 1.02061
Fold: 4
used validation data:  724
Training until validation scores don't improve for 100 rounds
[10

## linear regression and neutal network

In [20]:
train_std, test_std = standardize_data(reduce_train, ajusted_test)

In [21]:
for i in train_std.columns:
    if "session_title" in str(i):
        lr_features.append(i)
y_pred_1_lr, oof_rmse_score_1_lr, oof_cohen_score_1_lr = run_lr(train_std, test_std, nn_features)
y_pred_2_lr, oof_rmse_score_2_lr, oof_cohen_score_2_lr = run_lr(train_std, test_std, nn_features)
y_pred_3_lr, oof_rmse_score_3_lr, oof_cohen_score_3_lr = run_lr(train_std, test_std, nn_features)
y_pred_4_lr, oof_rmse_score_4_lr, oof_cohen_score_4_lr = run_lr(train_std, test_std, nn_features)
y_pred_5_lr, oof_rmse_score_5_lr, oof_cohen_score_5_lr = run_lr(train_std, test_std, nn_features)
mean_rmse_lr = (oof_rmse_score_1_lr + oof_rmse_score_2_lr + oof_rmse_score_3_lr + oof_rmse_score_4_lr + oof_rmse_score_5_lr) / 5
mean_cohen_kappa_lr = (oof_cohen_score_1_lr + oof_cohen_score_2_lr + oof_cohen_score_3_lr + oof_cohen_score_4_lr + oof_cohen_score_5_lr) / 5
print('Our mean rmse score is: ', mean_rmse_lr)
print('Our mean cohen kappa score is: ', mean_cohen_kappa_lr)

Fold: 1
used validation data:  721
Fold: 2
used validation data:  722
Fold: 3
used validation data:  723
Fold: 4
used validation data:  724
Fold: 5
used validation data:  724
Our oof rmse score is: 1.323370624263243
Our oof cohen kappa score is: 0.3621713527205296
Fold: 1
used validation data:  721
Fold: 2
used validation data:  722
Fold: 3
used validation data:  723
Fold: 4
used validation data:  724
Fold: 5
used validation data:  724
Our oof rmse score is: 1.286606344347764
Our oof cohen kappa score is: 0.37888036094989275
Fold: 1
used validation data:  721
Fold: 2
used validation data:  722
Fold: 3
used validation data:  723
Fold: 4
used validation data:  724
Fold: 5
used validation data:  724
Our oof rmse score is: 1.3199924706619068
Our oof cohen kappa score is: 0.3594178222380865
Fold: 1
used validation data:  721
Fold: 2
used validation data:  722
Fold: 3
used validation data:  723
Fold: 4
used validation data:  724
Fold: 5
used validation data:  724
Our oof rmse score is: 1.322

In [22]:
for i in train_std.columns:
    if "session_title" in str(i):
        nn_features.append(i)
y_pred_1_nn, oof_rmse_score_1_nn, oof_cohen_score_1_nn = run_nn(train_std, test_std, nn_features)
y_pred_2_nn, oof_rmse_score_2_nn, oof_cohen_score_2_nn = run_nn(train_std, test_std, nn_features)
y_pred_3_nn, oof_rmse_score_3_nn, oof_cohen_score_3_nn = run_nn(train_std, test_std, nn_features)
y_pred_4_nn, oof_rmse_score_4_nn, oof_cohen_score_4_nn = run_nn(train_std, test_std, nn_features)
y_pred_5_nn, oof_rmse_score_5_nn, oof_cohen_score_5_nn = run_nn(train_std, test_std, nn_features)
mean_rmse_nn = (oof_rmse_score_1_nn + oof_rmse_score_2_nn + oof_rmse_score_3_nn + oof_rmse_score_4_nn + oof_rmse_score_5_nn) / 5
mean_cohen_kappa_nn = (oof_cohen_score_1_nn + oof_cohen_score_2_nn + oof_cohen_score_3_nn + oof_cohen_score_4_nn + oof_cohen_score_5_nn) / 5
print('Our mean rmse score is: ', mean_rmse_nn)
print('Our mean cohen kappa score is: ', mean_cohen_kappa_nn)

Fold: 1
used validation data:  721
Train on 14152 samples, validate on 721 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.32660, saving model to ./nn_model.w8
Epoch 2/100
Epoch 00002: val_loss improved from 1.32660 to 1.26210, saving model to ./nn_model.w8
Epoch 3/100
Epoch 00003: val_loss improved from 1.26210 to 1.24879, saving model to ./nn_model.w8
Epoch 4/100
Epoch 00004: val_loss did not improve from 1.24879
Epoch 5/100
Epoch 00005: val_loss improved from 1.24879 to 1.23899, saving model to ./nn_model.w8
Epoch 6/100
Epoch 00006: val_loss did not improve from 1.23899
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.23899
Epoch 8/100
Epoch 00008: val_loss did not improve from 1.23899
Epoch 9/100
Epoch 00009: val_loss did not improve from 1.23899
Epoch 10/100
Epoch 00010: val_loss did not improve from 1.23899
Epoch 11/100
Epoch 00011: val_loss improved from 1.23899 to 1.18783, saving model to ./nn_model.w8
Epoch 12/100
Epoch 00012: val_loss did not improve 

# prediction

In [23]:
y_final_lgb = (y_pred_1 + y_pred_2 + y_pred_3 + y_pred_4 + y_pred_5) / 5
y_final_lr = (y_pred_1_lr + y_pred_2_lr + y_pred_3_lr + y_pred_4_lr + y_pred_5_lr) / 5
y_final_nn = (y_pred_1_nn + y_pred_2_nn + y_pred_3_nn + y_pred_4_nn + y_pred_5_nn) / 5
y_final = y_final_lgb * 0.6 + y_final_nn * 0.3 + y_final_lr * 0.1
y_final = eval_qwk_lgb_regr(y_final, reduce_train)
predict(sample_submission, y_final)

3    0.495
0    0.239
1    0.136
2    0.130
Name: accuracy_group, dtype: float64
