In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import datetime
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from sklearn import preprocessing
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, cohen_kappa_score, mean_squared_error
from functools import partial
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
import lightgbm as lgb
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import json
import copy
import time
import seaborn as sns
import scipy as sp
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",1000)
np.set_printoptions(precision=8)
warnings.filterwarnings("ignore")

In [2]:
def qwk(a1, a2):
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)
    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))
    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)
    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)
    e = e / a1.shape[0]
    return np.round(1 - o / e, 8)

In [3]:
class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])
        return -qwk(y, X_p)
        
    def fit(self, X, y,random_flg=False):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        if random_flg:
            initial_coef = [np.random.uniform(0.5,0.6), np.random.uniform(0.6,0.7), np.random.uniform(0.8,0.9)]
        else:
            initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')
        
    def predict(self, X, coef):
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

    def coefficients(self):
        return self.coef_['x']

In [4]:
def stract_hists(feature, train, test, adjust=False, plot=False):
    n_bins = 10
    train_data = train[feature]
    test_data = test[feature]
    if adjust:
        test_data *= train_data.mean() / test_data.mean()
    perc_90 = np.percentile(train_data, 95)
    train_data = np.clip(train_data, 0, perc_90)
    test_data = np.clip(test_data, 0, perc_90)
    train_hist = np.histogram(train_data, bins=n_bins)[0] / len(train_data)
    test_hist = np.histogram(test_data, bins=n_bins)[0] / len(test_data)
    msre = mean_squared_error(train_hist, test_hist)
    if plot:
        print(msre)
        plt.bar(range(n_bins), train_hist, color='blue', alpha=0.5, label = "train")
        plt.bar(range(n_bins), test_hist, color='red', alpha=0.5, label = "test")
        plt.show()
    return msre

In [5]:
def eval_qwk_lgb_regr(y_pred, train_t):
    dist = Counter(train_t['accuracy_group'])
    for k in dist:
        dist[k] /= len(train_t)
    
    acum = 0
    bound = {}
    for i in range(3):
        acum += dist[i]
        bound[i] = np.percentile(y_pred, acum * 100)

    def classify(x):
        if x <= bound[0]:
            return 0
        elif x <= bound[1]:
            return 1
        elif x <= bound[2]:
            return 2
        else:
            return 3

    y_pred = np.array(list(map(classify, y_pred)))
    
    return y_pred

In [6]:
def read_data():
    train = pd.read_csv('../input/data-science-bowl-2019/train.csv')
    train_labels = pd.read_csv('../input/data-science-bowl-2019/train_labels.csv')
    test = pd.read_csv('../input/data-science-bowl-2019/test.csv')
    #specs = pd.read_csv('../input/data-science-bowl-2019/specs.csv')
    sample_submission = pd.read_csv('../input/data-science-bowl-2019/sample_submission.csv')
    print("Finish reading")
    return train, train_labels, test, sample_submission

In [7]:
def remove_data(train, train_labels):
    keep_id = train[train.type == "Assessment"][['installation_id']].drop_duplicates()
    train = pd.merge(train, keep_id, on="installation_id", how="inner")
    train = train[train.installation_id.isin(train_labels.installation_id.unique())]
    assess_title = ['Mushroom Sorter (Assessment)', 'Bird Measurer (Assessment)',
       'Cauldron Filler (Assessment)', 'Cart Balancer (Assessment)', 'Chest Sorter (Assessment)']
    additional_remove_index = []
    for i, session in train.groupby('installation_id', sort=False):
        last_row = session.index[-1]
        session = session[session.title.isin(assess_title)]
        first_row = session.index[-1] + 1
        for j in range(first_row, last_row+1):
            additional_remove_index.append(j)                
    train = train[~train.index.isin(additional_remove_index)]
    return train

In [8]:
def encode_title(train, test):
    train['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), train['title'], train['event_code']))
    test['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), test['title'], test['event_code']))
    list_of_title_eventcode = list(set(train['title_event_code'].unique()).union(set(test['title_event_code'].unique())))
    
    list_of_eventid = list(set(train['event_id'].unique()).union(set(test['event_id'].unique())))

    list_of_user_activities = list(set(train['title'].unique()).union(set(test['title'].unique())))
    list_of_event_code = list(set(train['event_code'].unique()).union(set(test['event_code'].unique())))
    list_of_worlds = list(set(train['world'].unique()).union(set(test['world'].unique())))
    activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
    activities_labels = dict(zip(np.arange(len(list_of_user_activities)), list_of_user_activities))
    activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))
    assess_titles = list(set(train[train['type'] == 'Assessment']['title'].value_counts().index).union(set(test[test['type'] == 'Assessment']['title'].value_counts().index)))

    train['title'] = train['title'].map(activities_map)
    test['title'] = test['title'].map(activities_map)
    train['world'] = train['world'].map(activities_world)
    test['world'] = test['world'].map(activities_world)

    win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
    win_code[activities_map['Bird Measurer (Assessment)']] = 4110
    
    train['timestamp'] = pd.to_datetime(train['timestamp'])
    test['timestamp'] = pd.to_datetime(test['timestamp'])
    
    train["misses"] = train["event_data"].apply(lambda x: json.loads(x)["misses"] if "\"misses\"" in x else np.nan)
    test["misses"] = test["event_data"].apply(lambda x: json.loads(x)["misses"] if "\"misses\"" in x else np.nan)
    
    train["level"] = train["event_data"].apply(lambda x: json.loads(x)["level"] if "\"level\"" in x else np.nan)
    test["level"] = test["event_data"].apply(lambda x: json.loads(x)["level"] if "\"level\"" in x else np.nan)
    
    train["round"] = train["event_data"].apply(lambda x: json.loads(x)["round"] if "\"round\"" in x else np.nan)
    test["round"] = test["event_data"].apply(lambda x: json.loads(x)["round"] if "\"round\"" in x else np.nan)         
        
    return train, test, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, activities_world, list_of_title_eventcode, list_of_eventid

In [9]:
def get_data(user_sample, test_set=False):
    last_activity = 0
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    time_spent_each_act = {actv: 0 for actv in list_of_user_activities}
    title_eventcode_count = {str(ele): 0 for ele in list_of_title_eventcode}
    eventid_count = {str(ele): 0 for ele in list_of_eventid}
    user_world_count = {"world_"+str(wor) : 0 for wor in activities_world.values()}
    
    last_session_time_sec = 0
    all_assessments = []
    accuracy_groups = {"0":0, "1":0, "2":0, "3":0}
    accumulated_accuracy_group = 0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0 
    accumulated_actions = 0
    counter = 0
    time_first_activity = float(user_sample['timestamp'].values[0])
    durations = []
    miss = 0
    crys_game_true = 0; crys_game_false = 0
    tree_game_true = 0; tree_game_false = 0
    magma_game_true = 0; magma_game_false = 0
    crys_game_acc = []; tree_game_acc = []; magma_game_acc = []
    crys_game_level = np.array([]); tree_game_level = np.array([]); magma_game_level = np.array([])
    crys_game_round = np.array([]); tree_game_round = np.array([]); magma_game_round = np.array([])
    crys_act_true = 0; crys_act_false = 0
    tree_act_true = 0; tree_act_false = 0
    magma_act_true = 0; magma_act_false = 0
    crys_act_acc = []; tree_act_acc = []; magma_act_acc = []
    durations_game = []; durations_activity = []
    
    for i, session in user_sample.groupby('game_session', sort=False):  
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_title_text = activities_labels[session_title]
        session_world = session["world"].iloc[0]
        
        if session_type != 'Assessment':
            time_spent = int(session['game_time'].iloc[-1] / 1000)
            time_spent_each_act[activities_labels[session_title]] += time_spent   
            
            if session_type == "Game":
                true = session['event_data'].str.contains('true').sum()
                false = session['event_data'].str.contains('false').sum() 
                durations_game.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
                if session_world == activities_world["CRYSTALCAVES"]:
                    crys_game_true += true
                    crys_game_false += false
                    crys_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
                    crys_game_level = np.concatenate([crys_game_level, session["level"]], axis=0)
                    crys_game_round = np.concatenate([crys_game_round, session["round"]], axis=0)
                elif session_world == activities_world["TREETOPCITY"]:
                    tree_game_true += true
                    tree_game_false += false
                    tree_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
                    tree_game_level = np.concatenate([tree_game_level, session["level"]], axis=0)
                    tree_game_round = np.concatenate([tree_game_round, session["round"]], axis=0)
                elif session_world == activities_world["MAGMAPEAK"]:
                    magma_game_true += true
                    magma_game_false += false
                    magma_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
                    magma_game_level = np.concatenate([magma_game_level, session["level"]], axis=0)
                    magma_game_round = np.concatenate([magma_game_round, session["round"]], axis=0)
                else:
                    pass
                
            if session_type == "Activity":
                true = session['event_data'].str.contains('true').sum()
                false = session['event_data'].str.contains('false').sum() 
                durations_activity.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
                if session_world == activities_world["CRYSTALCAVES"]:
                    crys_act_true += true
                    crys_act_false += false
                    crys_act_acc.append(true / (true + false) if (true + false) != 0 else 0)
                elif session_world == activities_world["TREETOPCITY"]:
                    tree_act_true += true
                    tree_act_false += false
                    tree_act_acc.append(true / (true + false) if (true + false) != 0 else 0)
                elif session_world == activities_world["MAGMAPEAK"]:
                    magma_act_true += true
                    magma_act_false += false
                    magma_act_acc.append(true / (true + false) if (true + false) != 0 else 0)
                else:
                    pass

        if (session_type == 'Assessment') & (test_set or len(session)>1): # test set or session in train_label
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            true_attempts = all_attempts['event_data'].str.contains('true').sum() # true in target assess
            false_attempts = all_attempts['event_data'].str.contains('false').sum() # false in target assessment
            
            # from start of installation_id to the start of target assessment ------------------------
            features = user_activities_count.copy() # appearance of each type without duplicates
            features.update(time_spent_each_act.copy()) # cumulative gameplay time in each title
            features.update(title_eventcode_count.copy()) # apperance of combi of title and event_code
            features.update(eventid_count.copy()) # apperance of eventid
            features.update(user_world_count.copy()) # appearance of world with duplicates
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
            features["misses"] = miss
            features['accumulated_actions'] = accumulated_actions
 
            if session_world == activities_world["CRYSTALCAVES"]:
                features["game_true"] = crys_game_true
                features["game_false"] = crys_game_false
                features['game_accuracy'] = crys_game_true / (crys_game_true + crys_game_false) if (crys_game_true + crys_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(crys_game_acc) if len(crys_game_acc) >=1 else 0
                features["last_game_acc"] = crys_game_acc[-1] if len(crys_game_acc) >=1 else 0
                features["act_true"] = crys_act_true
                features["act_false"] = crys_act_false
                features['act_accuracy'] = crys_act_true / (crys_act_true + crys_act_false) if (crys_act_true + crys_act_false) != 0 else 0
                features["act_accuracy_std"] = np.std(crys_act_acc) if len(crys_act_acc) >=1 else 0
                features["last_act_acc"] = crys_act_acc[-1] if len(crys_act_acc) >=1 else 0
                features["hightest_level"] = np.nanmax(crys_game_level) if len(crys_game_level[~np.isnan(crys_game_level)]) >=1 else 0
                features["hightest_round"] = np.nanmax(crys_game_round) if len(crys_game_round[~np.isnan(crys_game_round)]) >=1 else 0
            elif session_world == activities_world["TREETOPCITY"]:
                features["game_true"] = tree_game_true
                features["game_false"] = tree_game_false
                features['game_accuracy'] = tree_game_true / (tree_game_true + tree_game_false) if (tree_game_true + tree_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(tree_game_acc) if len(tree_game_acc) >=1 else 0
                features["last_game_acc"] = tree_game_acc[-1] if len(tree_game_acc) >=1 else 0
                features["act_true"] = tree_act_true
                features["act_false"] = tree_act_false
                features['act_accuracy'] = tree_act_true / (tree_act_true + tree_act_false) if (tree_act_true + tree_act_false) != 0 else 0
                features["act_accuracy_std"] = np.std(tree_act_acc) if len(tree_act_acc) >=1 else 0
                features["last_act_acc"] = tree_act_acc[-1] if len(tree_act_acc) >=1 else 0
                features["hightest_level"] = np.nanmax(tree_game_level) if len(tree_game_level[~np.isnan(tree_game_level)]) >=1 else 0
                features["hightest_round"] = np.nanmax(tree_game_round) if len(tree_game_round[~np.isnan(tree_game_round)]) >=1 else 0
            elif session_world == activities_world["MAGMAPEAK"]:
                features["game_true"] = magma_game_true
                features["game_false"] = magma_game_false
                features['game_accuracy'] = magma_game_true / (magma_game_true + magma_game_false) if (magma_game_true + magma_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(magma_game_acc) if len(magma_game_acc) >=1 else 0
                features["last_game_acc"] = magma_game_acc[-1] if len(magma_game_acc) >=1 else 0
                features["act_true"] = magma_act_true
                features["act_false"] = magma_act_false
                features['act_accuracy'] = magma_act_true / (magma_act_true + magma_act_false) if (magma_act_true + magma_act_false) != 0 else 0
                features["act_accuracy_std"] = np.std(magma_act_acc) if len(magma_act_acc) >=1 else 0
                features["last_act_acc"] = magma_act_acc[-1] if len(magma_act_acc) >=1 else 0
                features["hightest_level"] = np.nanmax(magma_game_level) if len(magma_game_level[~np.isnan(magma_game_level)]) >=1 else 0
                features["hightest_round"] = np.nanmax(magma_game_round) if len(magma_game_round[~np.isnan(magma_game_round)]) >=1 else 0
            
            if durations_game == []:
                features['duration_game_mean'] = 0
                features['duration_game_std'] = 0
                features['game_last_duration'] = 0
                features['game_max_duration'] = 0
            else:
                features['duration_game_mean'] = np.mean(durations_game)
                features['duration_game_std'] = np.std(durations_game)
                features['game_last_duration'] = durations_game[-1]
                features['game_max_duration'] = np.max(durations_game)
                
            if durations_activity == []:
                features['duration_activity_mean'] = 0
                features['duration_activity_std'] = 0
                features['activity_last_duration'] = 0
                features['activity_max_duration'] = 0
            else:
                features['duration_activity_mean'] = np.mean(durations_activity)
                features['duration_activity_std'] = np.std(durations_activity)
                features['activity_last_duration'] = durations_activity[-1]
                features['activity_max_duration'] = np.max(durations_activity)
            
            # unique type --------------------------------------------------------
            features['installation_id'] = session['installation_id'].iloc[-1]
            features['session_title'] = session_title
            
            # nums in target assessment data ------------------------------------------
            if durations == []: #span of timestamp in target assessment
                features['duration_mean'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2]).seconds) 
            
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            features.update(accuracy_groups)
            accuracy_groups[str(features['accuracy_group'])] += 1
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            
            if test_set:
                all_assessments.append(features)
            elif true_attempts+false_attempts > 0:
                all_assessments.append(features)
                
            counter += 1
                        
        n_of_title_eventcode = Counter(session['title_event_code']) 
        for key in n_of_title_eventcode.keys():
            title_eventcode_count[str(key)] += n_of_title_eventcode[key]
            
        miss += np.sum(session["misses"])
        
        n_of_eventid = Counter(session['event_id']) 
        for key in n_of_eventid.keys():
            eventid_count[str(key)] += n_of_eventid[key]
                        
        user_world_count["world_"+str(session_world)] += session.shape[0]

        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activitiy = session_type
    if test_set:
        return all_assessments[-1]
    return all_assessments

In [10]:
def make_data(train, test):
    new_train = []
    for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort=False)), total=train.installation_id.nunique(), desc='Installation_id', position=0):
        new_train += get_data(user_sample)
    new_train = pd.DataFrame(new_train)
    print(new_train.shape)
    del train
    
    new_test = []
    for ins_id, user_sample in tqdm(test.groupby('installation_id', sort=False), total=test.installation_id.nunique(), desc='Installation_id', position=0):
        a = get_data(user_sample, test_set=True)
        new_test.append(a)   
    new_test = pd.DataFrame(new_test)
    print(new_test.shape)
    del test
    
    return new_train, new_test 

In [11]:
def post_process(new_train, new_test, stand_flg = False):
    X_train = new_train.copy()
    X_test = new_test.copy()
    y_train = new_train.accuracy_group
    
    if stand_flg == True:
        features = [i for i in new_train.columns if i not in ["installation_id", "accuracy_group"]]
        categoricals = ['session_title']
        features = features.copy()
        if len(categoricals) > 0:
            for cat in categoricals:
                enc = OneHotEncoder()
                train_cats = enc.fit_transform(X_train[[cat]])
                test_cats = enc.transform(X_test[[cat]])
                cat_cols = ['{}_{}'.format(cat, str(col)) for col in enc.active_features_]
                features += cat_cols
                train_cats = pd.DataFrame(train_cats.toarray(), columns=cat_cols)
                test_cats = pd.DataFrame(test_cats.toarray(), columns=cat_cols)
                X_train = pd.concat([X_train, train_cats], axis=1)
                X_test = pd.concat([X_test, test_cats], axis=1)
            scalar = MinMaxScaler()
            X_train[features] = scalar.fit_transform(X_train[features])
            X_test[features] = scalar.transform(X_test[features])
        X_train = X_train.drop(["session_title"], axis=1)
        X_test = X_test.drop(["session_title"], axis=1)
    
    X_train = X_train.drop(['accuracy_group'],axis=1) 
    X_test = X_test.drop(["installation_id", "accuracy_group"], axis=1)
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(X_train["installation_id"]))
    X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
    remove_features = []
    for i in X_train.columns:
        if X_train[i].std() == 0 and i not in remove_features:
            remove_features.append(i)
    #for i in high_corr_features:
    #    if i not in remove_features:
    #        remove_features.append(i)  
    X_train = X_train.drop(remove_features, axis=1)
    X_train = X_train[sorted(X_train.columns.tolist())]
    X_test = X_test.drop(remove_features, axis=1)
    X_test = X_test[sorted(X_test.columns.tolist())]
    print("train: ", X_train.shape)
    print("test: ", X_test.shape)
    return X_train, y_train, X_test

# Install

In [12]:
train, train_labels, test, sample_submission = read_data()
train = remove_data(train, train_labels)

Finish reading


# Preprocess and feature engineering

In [13]:
train, test, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, activities_world, list_of_title_eventcode, list_of_eventid = encode_title(train, test)

In [14]:
new_train, new_test = make_data(train, test)

HBox(children=(IntProgress(value=0, description='Installation_id', max=3614, style=ProgressStyle(description_w…


(17690, 860)


HBox(children=(IntProgress(value=0, description='Installation_id', max=1000, style=ProgressStyle(description_w…


(1000, 860)


# Feature selection

In [15]:
correlations = new_train.corr().abs()
correlations = correlations.mask(np.tril(np.ones(correlations.shape)).astype(np.bool))
correlations = correlations.stack().reset_index()
corr_columns = ["level_0", "level_1", "value"]
correlations.columns = corr_columns
correlations = correlations.sort_values("value", ascending=False).reset_index(drop=True)

high_corr = correlations[correlations["value"] >= 0.995]

high_corr_features = []
for i in range(high_corr.shape[0]):
    if high_corr.iloc[i]["level_0"] not in high_corr_features and high_corr.iloc[i]["level_1"] not in high_corr_features:
        high_corr_features.append(high_corr.iloc[i]["level_0"])

In [16]:
#ajusted_test = new_test.copy()
#for feature in ajusted_test.columns:
#    if feature not in ['accuracy_group', 'installation_id', 'accuracy_group', 'session_title'] and i not in remove_features:
#        data = new_train[feature]
#        train_mean = data.mean()
#        data = ajusted_test[feature] 
#        test_mean = data.mean()
#        try:
#            error = stract_hists(feature, new_train, new_test, adjust=True)
#            ajust_factor = train_mean / test_mean
#            if ajust_factor > 10 or ajust_factor < 0.1:# or error > 0.01:
#                remove_features.append(feature)
#                print("try", feature, train_mean, test_mean, error)
#            else:
#                ajusted_test[feature] *= ajust_factor
#        except:
#            remove_features.append(feature)
#            print("except", feature, train_mean, test_mean)

In [17]:
X_train_lgb, y_train_lgb, X_test_lgb = post_process(new_train, new_test)
X_train_lr, y_train_lr, X_test_lr = post_process(new_train, new_test, stand_flg = True)

train:  (17690, 834)
test:  (1000, 833)
train:  (17690, 838)
test:  (1000, 837)


In [18]:
X_train_lgb.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_train_lgb.columns]
X_train_lr.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_train_lr.columns]
X_test_lgb.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_test_lgb.columns]
X_test_lr.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_test_lr.columns]

# modelling and prediction

In [19]:
def models(model_name, X_tr, y_tr, X_te):
    n_folds=5
    skf=StratifiedKFold(n_splits = n_folds)
    coefficients = []
    train_qwk_scores = []; test_qwk_scores = []; train_qwk_dist = []; test_qwk_dist = []
    pred_value = np.zeros([X_te.shape[0]])

    lgbm_params = {
        "objective" : "regression",
        "metric" : "rmse",
        "tree_learner": "serial",
        "max_depth" : 4,
        "boosting": 'gbdt',
        "num_leaves" : 13,
        "learning_rate" : 0.01,
        }

    for i , (train_index, test_index) in enumerate(skf.split(X_tr, y_tr)):
        optR = OptimizedRounder()
        X_train2 = X_tr.iloc[train_index,:]
        y_train2 = y_tr.iloc[train_index]
        X_train2 = X_train2.drop(['installation_id'],axis=1)
    
        X_test2 = X_tr.iloc[test_index,:]
        y_test2 = y_tr.iloc[test_index]
        test2 = pd.concat([X_test2, y_test2], axis=1)
        test2 = test2.groupby('installation_id').apply(lambda x: x.sample(1, random_state=1223)).reset_index(drop=True)
        X_test2 = test2.drop(["accuracy_group", "installation_id"], axis=1)
        y_test2 = test2["accuracy_group"]
    
        if model_name == "lgb":
            lgb_train = lgb.Dataset(X_train2, y_train2)
            lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
    
            clf = lgb.train(
                lgbm_params, lgb_train,
                valid_sets=[lgb_train, lgb_eval],
                num_boost_round=100000,
                early_stopping_rounds=10,
        )
            train_predict = clf.predict(X_train2, num_iteration = clf.best_iteration)
            valid_predict = clf.predict(X_test2, num_iteration = clf.best_iteration)
            pred_value += clf.predict(X_te, num_iteration = clf.best_iteration) / n_folds
        
        elif model_name == "lr":    
            clf = LinearRegression()
            clf.fit(X_train2, y_train2) 
            train_predict = clf.predict(X_train2)
            valid_predict = clf.predict(X_test2)
            pred_value += clf.predict(X_te) / n_folds
        
        elif model_name == "nn":
            verbosity = 100
            clf = tf.keras.models.Sequential([
                tf.keras.layers.Input(shape=(X_train2.shape[1],)),
                tf.keras.layers.Dense(200, activation='relu'),
                tf.keras.layers.LayerNormalization(),
                tf.keras.layers.Dropout(0.3),
                tf.keras.layers.Dense(100, activation='tanh'),
                tf.keras.layers.LayerNormalization(),
                tf.keras.layers.Dropout(0.3),
                #tf.keras.layers.Dense(50, activation='relu'),
                #tf.keras.layers.LayerNormalization(),
                #tf.keras.layers.Dropout(0.3),
                tf.keras.layers.Dense(25, activation='relu'),
                tf.keras.layers.LayerNormalization(),
                tf.keras.layers.Dropout(0.3),
                tf.keras.layers.Dense(1, activation='relu')
            ])
            clf.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=4e-4), loss='mse')
            save_best = tf.keras.callbacks.ModelCheckpoint('./nn_model.w8', save_weights_only=True, save_best_only=True, verbose=1)
            early_stop = tf.keras.callbacks.EarlyStopping(patience=10)
            
            clf.fit(X_train2, 
                y_train2, 
                validation_data=(X_test2, y_test2),
                epochs=100,
                 callbacks=[save_best, early_stop])
            clf.load_weights('./nn_model.w8')
            train_predict = clf.predict(X_train2)
            valid_predict = clf.predict(X_test2)
            test_coefficients = np.mean(coefficients, axis=0)
            pred_value += clf.predict(X_te).reshape(X_te.shape[0],) / n_folds
    
        optR.fit(valid_predict.reshape(-1,), y_test2)
        tmp_coefficients = optR.coefficients()
        print("fold_"+str(i)+" coefficients: ", tmp_coefficients)
        opt_train_preds = optR.predict(train_predict.reshape(-1, ), tmp_coefficients)
        train_qwk_score = qwk(y_train2, opt_train_preds)
        opt_test_preds = optR.predict(valid_predict.reshape(-1, ), tmp_coefficients)
        test_qwk_score = qwk(y_test2, opt_test_preds)
        train_qwk_scores.append(train_qwk_score)
        test_qwk_scores.append(test_qwk_score)
        coefficients.append(tmp_coefficients)
    
        train_qwk_d = qwk(y_train2, eval_qwk_lgb_regr(train_predict, new_train))
        test_qwk_d = qwk(y_test2, eval_qwk_lgb_regr(valid_predict, new_train))
        train_qwk_dist.append(train_qwk_d)
        test_qwk_dist.append(test_qwk_d)
        
    print("training qwk     : ", train_qwk_scores, np.mean(train_qwk_scores))
    print("validation qwk   : ", test_qwk_scores, np.mean(test_qwk_scores))
    print("train qwk by dist: ", train_qwk_dist, np.mean(train_qwk_dist))
    print("valid qwk by dist: ", test_qwk_dist, np.mean(test_qwk_dist))

    return pred_value

In [20]:
lgb_preds = models("lgb", X_train_lgb, y_train_lgb, X_test_lgb)
lr_preds = models("lr", X_train_lr, y_train_lr, X_test_lr)
nn_preds = models("nn", X_train_lr, y_train_lr, X_test_lr)

[1]	training's rmse: 1.25383	valid_1's rmse: 1.27372
Training until validation scores don't improve for 10 rounds
[2]	training's rmse: 1.25095	valid_1's rmse: 1.27099
[3]	training's rmse: 1.24812	valid_1's rmse: 1.26832
[4]	training's rmse: 1.24534	valid_1's rmse: 1.26569
[5]	training's rmse: 1.24259	valid_1's rmse: 1.2631
[6]	training's rmse: 1.23987	valid_1's rmse: 1.26065
[7]	training's rmse: 1.23722	valid_1's rmse: 1.25814
[8]	training's rmse: 1.23459	valid_1's rmse: 1.25576
[9]	training's rmse: 1.23203	valid_1's rmse: 1.25339
[10]	training's rmse: 1.22937	valid_1's rmse: 1.25091
[11]	training's rmse: 1.22675	valid_1's rmse: 1.24858
[12]	training's rmse: 1.2242	valid_1's rmse: 1.2462
[13]	training's rmse: 1.22168	valid_1's rmse: 1.24395
[14]	training's rmse: 1.2192	valid_1's rmse: 1.24162
[15]	training's rmse: 1.21678	valid_1's rmse: 1.23944
[16]	training's rmse: 1.21438	valid_1's rmse: 1.23722
[17]	training's rmse: 1.21203	valid_1's rmse: 1.23507
[18]	training's rmse: 1.20973	vali

In [21]:
test_pred_class = eval_qwk_lgb_regr(lgb_preds * 0.4 + nn_preds * 0.45 + lr_preds * 0.15, new_train) # threshold by distribution
sample_submission["accuracy_group"] = test_pred_class
sample_submission.to_csv('submission.csv', index=False)
sample_submission["accuracy_group"].value_counts(normalize = True)

3    0.500
0    0.239
1    0.136
2    0.125
Name: accuracy_group, dtype: float64