- version 47: went back to stratifiedkfold with truncation by average and add highest level and round in the game

In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import datetime
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from sklearn import preprocessing
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, cohen_kappa_score, mean_squared_error
import lightgbm as lgb
from functools import partial
import json
import copy
import time
import seaborn as sns
import scipy as sp
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",1000)
np.set_printoptions(precision=8)
warnings.filterwarnings("ignore")

In [2]:
def qwk(a1, a2):
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)
    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))
    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)
    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)
    e = e / a1.shape[0]
    return np.round(1 - o / e, 8)

In [3]:
class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])
        return -qwk(y, X_p)
        
    def fit(self, X, y,random_flg=False):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        if random_flg:
            initial_coef = [np.random.uniform(0.5,0.6), np.random.uniform(0.6,0.7), np.random.uniform(0.8,0.9)]
        else:
            initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')
        
    def predict(self, X, coef):
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

    def coefficients(self):
        return self.coef_['x']

In [4]:
def stract_hists(feature, train, test, adjust=False, plot=False):
    n_bins = 10
    train_data = train[feature]
    test_data = test[feature]
    if adjust:
        test_data *= train_data.mean() / test_data.mean()
    perc_90 = np.percentile(train_data, 95)
    train_data = np.clip(train_data, 0, perc_90)
    test_data = np.clip(test_data, 0, perc_90)
    train_hist = np.histogram(train_data, bins=n_bins)[0] / len(train_data)
    test_hist = np.histogram(test_data, bins=n_bins)[0] / len(test_data)
    msre = mean_squared_error(train_hist, test_hist)
    if plot:
        print(msre)
        plt.bar(range(n_bins), train_hist, color='blue', alpha=0.5, label = "train")
        plt.bar(range(n_bins), test_hist, color='red', alpha=0.5, label = "test")
        plt.show()
    return msre

# install

In [5]:
%%time
train = pd.read_csv('../input/data-science-bowl-2019/train.csv')
train_labels = pd.read_csv('../input/data-science-bowl-2019/train_labels.csv')
test = pd.read_csv('../input/data-science-bowl-2019/test.csv')
#specs = pd.read_csv('../input/data-science-bowl-2019/specs.csv')
sample_submission = pd.read_csv('../input/data-science-bowl-2019/sample_submission.csv')

CPU times: user 1min 12s, sys: 10.5 s, total: 1min 22s
Wall time: 1min 22s


In [6]:
keep_id = train[train.type == "Assessment"][['installation_id']].drop_duplicates()
train = pd.merge(train, keep_id, on="installation_id", how="inner")
#dummy_id = list(train[~train.installation_id.isin(train_labels.installation_id.unique())]["installation_id"].unique())
train = train[train.installation_id.isin(train_labels.installation_id.unique())]

In [7]:
assess_title = ['Mushroom Sorter (Assessment)', 'Bird Measurer (Assessment)',
       'Cauldron Filler (Assessment)', 'Cart Balancer (Assessment)', 'Chest Sorter (Assessment)']
def remove_index_calc(df):
    additional_remove_index = []
    for i, session in df.groupby('installation_id', sort=False):
        last_row = session.index[-1]
        session = session[session.title.isin(assess_title)]
        first_row = session.index[-1] + 1
        for j in range(first_row, last_row+1):
            additional_remove_index.append(j)                
    return additional_remove_index
additional_remove_index = remove_index_calc(train)
train = train[~train.index.isin(additional_remove_index)]

# Preprocess and Feature engineering

In [8]:
%%time
def encode_title(train, test):
    train['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), train['title'], train['event_code']))
    test['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), test['title'], test['event_code']))
    list_of_title_eventcode = list(set(train['title_event_code'].unique()).union(set(test['title_event_code'].unique())))
    
    list_of_eventid = list(set(train['event_id'].unique()).union(set(test['event_id'].unique())))

    list_of_user_activities = list(set(train['title'].unique()).union(set(test['title'].unique())))
    list_of_event_code = list(set(train['event_code'].unique()).union(set(test['event_code'].unique())))
    list_of_worlds = list(set(train['world'].unique()).union(set(test['world'].unique())))
    activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
    activities_labels = dict(zip(np.arange(len(list_of_user_activities)), list_of_user_activities))
    activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))
    assess_titles = list(set(train[train['type'] == 'Assessment']['title'].value_counts().index).union(set(test[test['type'] == 'Assessment']['title'].value_counts().index)))

    train['title'] = train['title'].map(activities_map)
    test['title'] = test['title'].map(activities_map)
    train['world'] = train['world'].map(activities_world)
    test['world'] = test['world'].map(activities_world)

    win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
    win_code[activities_map['Bird Measurer (Assessment)']] = 4110
    
    train['timestamp'] = pd.to_datetime(train['timestamp'])
    test['timestamp'] = pd.to_datetime(test['timestamp'])
    
    train["misses"] = train["event_data"].apply(lambda x: json.loads(x)["misses"] if "\"misses\"" in x else np.nan)
    test["misses"] = test["event_data"].apply(lambda x: json.loads(x)["misses"] if "\"misses\"" in x else np.nan)
    
    tmp = train["title"].copy().shift(1)
    tmp[0] = 0
    tmp = tmp.astype(int)
    trans = tmp.apply(lambda x: str(x)) + str("-") + train["title"].copy().apply(lambda x: str(x))
    trans[0] = np.nan
    train["trans"] = trans
    tmp = test["title"].copy().shift(1)
    tmp[0] = 0
    tmp = tmp.astype(int)
    trans = tmp.apply(lambda x: str(x)) + str("-") + test["title"].copy().apply(lambda x: str(x))
    trans[0] = np.nan
    test["trans"] = trans
    list_of_trans = list(set(train.iloc[1:]['trans'].unique()).union(set(test.iloc[1:]['trans'].unique())))
    activities_trans = dict(zip(list_of_trans, np.arange(len(list_of_trans))))
    train['trans'] = train['trans'].map(activities_trans)
    test['trans'] = test['trans'].map(activities_trans)
    
    train["level"] = train["event_data"].apply(lambda x: json.loads(x)["level"] if "\"level\"" in x else np.nan)
    test["level"] = test["event_data"].apply(lambda x: json.loads(x)["level"] if "\"level\"" in x else np.nan)
    
    train["round"] = train["event_data"].apply(lambda x: json.loads(x)["round"] if "\"round\"" in x else np.nan)
    test["round"] = test["event_data"].apply(lambda x: json.loads(x)["round"] if "\"round\"" in x else np.nan)
                
    return train, test, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, activities_world, list_of_title_eventcode, list_of_eventid, list_of_trans, activities_trans

train, test, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, activities_world, list_of_title_eventcode, list_of_eventid, list_of_trans, activities_trans = encode_title(train, test)

CPU times: user 1min 57s, sys: 5.56 s, total: 2min 3s
Wall time: 2min 2s


In [9]:
def get_data(user_sample, test_set=False):
    last_activity = 0
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    time_spent_each_act = {actv: 0 for actv in list_of_user_activities}
    title_eventcode_count = {str(ele): 0 for ele in list_of_title_eventcode}
    eventid_count = {str(ele): 0 for ele in list_of_eventid}
    user_world_count = {"world_"+str(wor) : 0 for wor in activities_world.values()}
    #trans_count = {"trans"+str(ele) : 0 for ele in activities_trans.values()}
    
    last_session_time_sec = 0
    all_assessments = []
    accuracy_groups = {"0":0, "1":0, "2":0, "3":0}
    accumulated_accuracy_group = 0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0 
    accumulated_actions = 0
    counter = 0
    time_first_activity = float(user_sample['timestamp'].values[0])
    durations = []
    miss = 0
    crys_game_true = 0; crys_game_false = 0
    tree_game_true = 0; tree_game_false = 0
    magma_game_true = 0; magma_game_false = 0
    crys_game_acc = []; tree_game_acc = []; magma_game_acc = []
    crys_game_level = np.array([]); tree_game_level = np.array([]); magma_game_level = np.array([])
    crys_game_round = np.array([]); tree_game_round = np.array([]); magma_game_round = np.array([])
    crys_act_true = 0; crys_act_false = 0
    tree_act_true = 0; tree_act_false = 0
    magma_act_true = 0; magma_act_false = 0
    crys_act_acc = []; tree_act_acc = []; magma_act_acc = []
    #x = np.array([]); y = np.array([])
        
    for i, session in user_sample.groupby('game_session', sort=False):  
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_title_text = activities_labels[session_title]
        session_world = session["world"].iloc[0]
        
        # get current session time in seconds
        if session_type != 'Assessment':
            time_spent = int(session['game_time'].iloc[-1] / 1000)
            time_spent_each_act[activities_labels[session_title]] += time_spent   
            
            if session_type == "Game":
                true = session['event_data'].str.contains('true').sum()
                false = session['event_data'].str.contains('false').sum() 
                if session_world == activities_world["CRYSTALCAVES"]:
                    crys_game_true += true
                    crys_game_false += false
                    crys_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
                    crys_game_level = np.concatenate([crys_game_level, session["level"]], axis=0)
                    crys_game_round = np.concatenate([crys_game_round, session["round"]], axis=0)
                elif session_world == activities_world["TREETOPCITY"]:
                    tree_game_true += true
                    tree_game_false += false
                    tree_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
                    tree_game_level = np.concatenate([tree_game_level, session["level"]], axis=0)
                    tree_game_round = np.concatenate([tree_game_round, session["round"]], axis=0)
                elif session_world == activities_world["MAGMAPEAK"]:
                    magma_game_true += true
                    magma_game_false += false
                    magma_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
                    magma_game_level = np.concatenate([magma_game_level, session["level"]], axis=0)
                    magma_game_round = np.concatenate([magma_game_round, session["round"]], axis=0)
                else:
                    pass
                
            if session_type == "Activity":
                true = session['event_data'].str.contains('true').sum()
                false = session['event_data'].str.contains('false').sum() 
                if session_world == activities_world["CRYSTALCAVES"]:
                    crys_act_true += true
                    crys_act_false += false
                    crys_act_acc.append(true / (true + false) if (true + false) != 0 else 0)
                elif session_world == activities_world["TREETOPCITY"]:
                    tree_act_true += true
                    tree_act_false += false
                    tree_act_acc.append(true / (true + false) if (true + false) != 0 else 0)
                elif session_world == activities_world["MAGMAPEAK"]:
                    magma_act_true += true
                    magma_act_false += false
                    magma_act_acc.append(true / (true + false) if (true + false) != 0 else 0)
                else:
                    pass

        if (session_type == 'Assessment') & (test_set or len(session)>1): # test set or session in train_label
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            true_attempts = all_attempts['event_data'].str.contains('true').sum() # true in target assess
            false_attempts = all_attempts['event_data'].str.contains('false').sum() # false in target assessment
            
            # from start of installation_id to the start of target assessment ------------------------
            features = user_activities_count.copy() # appearance of each type without duplicates
            features.update(time_spent_each_act.copy()) # cumulative gameplay time in each title
            features.update(title_eventcode_count.copy()) # apperance of combi of title and event_code
            features.update(eventid_count.copy()) # apperance of eventid
            features.update(user_world_count.copy()) # appearance of world with duplicates
            #features.update(trans_count.copy()) # appearance of world with duplicates
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
            features["misses"] = miss
            features['accumulated_actions'] = accumulated_actions
 
            if session_world == activities_world["CRYSTALCAVES"]:
                features["game_true"] = crys_game_true
                features["game_false"] = crys_game_false
                features['game_accuracy'] = crys_game_true / (crys_game_true + crys_game_false) if (crys_game_true + crys_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(crys_game_acc) if len(crys_game_acc) >=1 else 0
                features["last_game_acc"] = crys_game_acc[-1] if len(crys_game_acc) >=1 else 0
                features["act_true"] = crys_act_true
                features["act_false"] = crys_act_false
                features['act_accuracy'] = crys_act_true / (crys_act_true + crys_act_false) if (crys_act_true + crys_act_false) != 0 else 0
                features["act_accuracy_std"] = np.std(crys_act_acc) if len(crys_act_acc) >=1 else 0
                features["last_act_acc"] = crys_act_acc[-1] if len(crys_act_acc) >=1 else 0
                features["hightest_level"] = np.nanmax(crys_game_level) if len(crys_game_level[~np.isnan(crys_game_level)]) >=1 else 0
                features["hightest_round"] = np.nanmax(crys_game_round) if len(crys_game_round[~np.isnan(crys_game_round)]) >=1 else 0
            elif session_world == activities_world["TREETOPCITY"]:
                features["game_true"] = tree_game_true
                features["game_false"] = tree_game_false
                features['game_accuracy'] = tree_game_true / (tree_game_true + tree_game_false) if (tree_game_true + tree_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(tree_game_acc) if len(tree_game_acc) >=1 else 0
                features["last_game_acc"] = tree_game_acc[-1] if len(tree_game_acc) >=1 else 0
                features["act_true"] = tree_act_true
                features["act_false"] = tree_act_false
                features['act_accuracy'] = tree_act_true / (tree_act_true + tree_act_false) if (tree_act_true + tree_act_false) != 0 else 0
                features["act_accuracy_std"] = np.std(tree_act_acc) if len(tree_act_acc) >=1 else 0
                features["last_act_acc"] = tree_act_acc[-1] if len(tree_act_acc) >=1 else 0
                features["hightest_level"] = np.nanmax(tree_game_level) if len(tree_game_level[~np.isnan(tree_game_level)]) >=1 else 0
                features["hightest_round"] = np.nanmax(tree_game_round) if len(tree_game_round[~np.isnan(tree_game_round)]) >=1 else 0
            elif session_world == activities_world["MAGMAPEAK"]:
                features["game_true"] = magma_game_true
                features["game_false"] = magma_game_false
                features['game_accuracy'] = magma_game_true / (magma_game_true + magma_game_false) if (magma_game_true + magma_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(magma_game_acc) if len(magma_game_acc) >=1 else 0
                features["last_game_acc"] = magma_game_acc[-1] if len(magma_game_acc) >=1 else 0
                features["act_true"] = magma_act_true
                features["act_false"] = magma_act_false
                features['act_accuracy'] = magma_act_true / (magma_act_true + magma_act_false) if (magma_act_true + magma_act_false) != 0 else 0
                features["act_accuracy_std"] = np.std(magma_act_acc) if len(magma_act_acc) >=1 else 0
                features["last_act_acc"] = magma_act_acc[-1] if len(magma_act_acc) >=1 else 0
                features["hightest_level"] = np.nanmax(magma_game_level) if len(magma_game_level[~np.isnan(magma_game_level)]) >=1 else 0
                features["hightest_round"] = np.nanmax(magma_game_round) if len(magma_game_round[~np.isnan(magma_game_round)]) >=1 else 0

            #if len(x[~np.isnan(x)]) >=2:
            #    features["xstd"] = np.nanstd(x)
            #    features["ystd"] = np.nanstd(y)
            #    features["xrange"] = np.nanmax(x) - np.nanmin(x)
            #    features["yrange"] = np.nanmax(y) - np.nanmin(y)
            #else:
            #    features["xstd"] = 0
            #    features["ystd"] = 0
            #    features["xrange"] = 0
            #    features["yrange"] = 0
            #features["touch_range"] = features["xrange"] * features["yrange"] 
                        
            # unique type --------------------------------------------------------
            features['installation_id'] = session['installation_id'].iloc[-1]
            features['session_title'] = session_title
            
            # nums in target assessment data ------------------------------------------
            if durations == []: #span of timestamp in target assessment
                features['duration_mean'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2]).seconds) 
            
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            features.update(accuracy_groups)
            accuracy_groups[str(features['accuracy_group'])] += 1
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            
            if test_set:
                all_assessments.append(features)
            elif true_attempts+false_attempts > 0:
                all_assessments.append(features)
                
            counter += 1
                        
        n_of_title_eventcode = Counter(session['title_event_code']) 
        for key in n_of_title_eventcode.keys():
            title_eventcode_count[str(key)] += n_of_title_eventcode[key]
            
        miss += np.sum(session["misses"])
        
        n_of_eventid = Counter(session['event_id']) 
        for key in n_of_eventid.keys():
            eventid_count[str(key)] += n_of_eventid[key]
            
        user_world_count["world_"+str(session_world)] += session.shape[0]
        
        #n_of_trans = Counter(session.iloc[1:]['trans']) 
        #for key in n_of_trans.keys():
        #    trans_count["trans"+str(key)] += n_of_eventid[key]
        
        #x = np.concatenate([x, session["x"]], axis=0)
        #y = np.concatenate([y, session["y"]], axis=0)

        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activitiy = session_type
    if test_set:
        return all_assessments[-1]
    return all_assessments

In [10]:
new_train = []
for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort=False)), total=train.installation_id.nunique(), desc='Installation_id', position=0):
    new_train += get_data(user_sample)
new_train = pd.DataFrame(new_train)
print(new_train.shape)

HBox(children=(IntProgress(value=0, description='Installation_id', max=3614, style=ProgressStyle(description_w…


(17690, 852)


In [11]:
del train

In [12]:
new_test = []
for ins_id, user_sample in tqdm(test.groupby('installation_id', sort=False), total=test.installation_id.nunique(), desc='Installation_id', position=0):
    a = get_data(user_sample, test_set=True)
    new_test.append(a)   
new_test = pd.DataFrame(new_test)
print(new_test.shape)

HBox(children=(IntProgress(value=0, description='Installation_id', max=1000, style=ProgressStyle(description_w…


(1000, 852)


In [13]:
del test

# Feature selection and modelling

In [14]:
#correlations = new_train.corr().abs()
#correlations = correlations.mask(np.tril(np.ones(correlations.shape)).astype(np.bool))
#correlations = correlations.stack().reset_index()
#corr_columns = ["level_0", "level_1", "value"]
#correlations.columns = corr_columns
#correlations = correlations.sort_values("value", ascending=False).reset_index(drop=True)

#high_corr = correlations[correlations["value"] == 1]

#high_corr_features = []
#for i in range(high_corr.shape[0]):
#    if high_corr.iloc[i]["level_0"] not in high_corr_features and high_corr.iloc[i]["level_1"] not in high_corr_features:
#        high_corr_features.append(high_corr.iloc[i]["level_0"])
#    elif high_corr.iloc[i]["level_0"] in high_corr_features and high_corr.iloc[i]["level_1"] not in high_corr_features:
#        high_corr_features.append(high_corr.iloc[i]["level_1"])
#    elif high_corr.iloc[i]["level_0"] not in high_corr_features and high_corr.iloc[i]["level_1"] in high_corr_features:
#        high_corr_features.append(high_corr.iloc[i]["level_0"])

In [15]:
X_train = new_train.drop(['accuracy_group'],axis=1) 
lbl = preprocessing.LabelEncoder()
lbl.fit(list(X_train["installation_id"]))
X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
remove_features = []
for i in X_train.columns:
    if X_train[i].std() == 0 and i not in remove_features:
        remove_features.append(i)

In [16]:
#ajusted_test = new_test.copy()
#for feature in ajusted_test.columns:
#    if feature not in ['accuracy_group', 'installation_id', 'accuracy_group', 'session_title'] and i not in remove_features:
#        data = new_train[feature]
#        train_mean = data.mean()
#        data = ajusted_test[feature] 
#        test_mean = data.mean()
#        try:
#            error = stract_hists(feature, new_train, new_test, adjust=True)
#            ajust_factor = train_mean / test_mean
#            if ajust_factor > 10 or ajust_factor < 0.1:# or error > 0.01:
#                remove_features.append(feature)
#                print("try", feature, train_mean, test_mean, error)
#            else:
#                ajusted_test[feature] *= ajust_factor
#        except:
#            remove_features.append(feature)
#            print("except", feature, train_mean, test_mean)

In [17]:
X_train = X_train.drop(remove_features, axis=1)
X_train = X_train[sorted(X_train.columns.tolist())]
y_train = new_train.accuracy_group
print(X_train.shape)

(17690, 826)


In [18]:
n_folds=5
skf=StratifiedKFold(n_splits = n_folds)
coefficients = []
models = []
train_qwk_scores = []
test_qwk_scores = []
lgbm_params = {
    "objective" : "regression",
    "metric" : "rmse",
    "tree_learner": "serial",
    "max_depth" : 4,
    "boosting": 'gbdt',
    "num_leaves" : 13,
    "learning_rate" : 0.01,
}

features_list = [i for i in X_train.columns if i != "installation_id"]
feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    optR = OptimizedRounder()
    X_train2 = X_train.iloc[train_index,:]
    y_train2 = y_train.iloc[train_index]
    X_train2 = X_train2.drop(['installation_id'],axis=1)
    
    X_test2 = X_train.iloc[test_index,:]
    y_test2 = y_train.iloc[test_index]
    test2 = pd.concat([X_test2, y_test2], axis=1)
    test2 = test2.groupby('installation_id').apply(lambda x: x.sample(1, random_state=1223)).reset_index(drop=True)
    X_test2 = test2.drop(["accuracy_group", "installation_id"], axis=1)
    y_test2 = test2["accuracy_group"]
    
    lgb_train = lgb.Dataset(X_train2, y_train2)
    lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
    
    clf = lgb.train(
        lgbm_params, lgb_train,
        valid_sets=[lgb_train, lgb_eval],
        num_boost_round=100000,
        early_stopping_rounds=10,
    )
    
    models.append(clf)
    train_predict = clf.predict(X_train2, num_iteration = clf.best_iteration)
    test_predict = clf.predict(X_test2, num_iteration = clf.best_iteration)
    
    optR.fit(test_predict.reshape(-1,), y_test2)
    tmp_coefficients = optR.coefficients()
    print("fold_"+str(i)+" coefficients: ", tmp_coefficients)
    opt_train_preds = optR.predict(train_predict.reshape(-1, ), tmp_coefficients)
    train_qwk_score = qwk(y_train2, opt_train_preds)
    print("training qwk: ", train_qwk_score)
    opt_test_preds = optR.predict(test_predict.reshape(-1, ), tmp_coefficients)
    test_qwk_score = qwk(y_test2, opt_test_preds)
    print("validation qwk: ", test_qwk_score)
    train_qwk_scores.append(train_qwk_score)
    test_qwk_scores.append(test_qwk_score)
    coefficients.append(tmp_coefficients)
    
    feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()
    
feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]

print("                             ")
print("-----------------------------")
print('coefficients: ', coefficients)
print('train qwk list:', train_qwk_scores)
print('train qwk average score:',np.mean(train_qwk_scores))
print('valid qwk list: ', test_qwk_scores)
print('valid qwk average score:',np.mean(test_qwk_scores))

[1]	training's rmse: 1.25285	valid_1's rmse: 1.27294
Training until validation scores don't improve for 10 rounds
[2]	training's rmse: 1.249	valid_1's rmse: 1.26946
[3]	training's rmse: 1.24522	valid_1's rmse: 1.26593
[4]	training's rmse: 1.24149	valid_1's rmse: 1.26252
[5]	training's rmse: 1.23783	valid_1's rmse: 1.25911
[6]	training's rmse: 1.23423	valid_1's rmse: 1.2559
[7]	training's rmse: 1.23069	valid_1's rmse: 1.25269
[8]	training's rmse: 1.22721	valid_1's rmse: 1.2494
[9]	training's rmse: 1.22375	valid_1's rmse: 1.24633
[10]	training's rmse: 1.22035	valid_1's rmse: 1.24324
[11]	training's rmse: 1.21704	valid_1's rmse: 1.2402
[12]	training's rmse: 1.21374	valid_1's rmse: 1.23711
[13]	training's rmse: 1.21054	valid_1's rmse: 1.23426
[14]	training's rmse: 1.20736	valid_1's rmse: 1.23138
[15]	training's rmse: 1.20423	valid_1's rmse: 1.22857
[16]	training's rmse: 1.20122	valid_1's rmse: 1.22573
[17]	training's rmse: 1.19819	valid_1's rmse: 1.22302
[18]	training's rmse: 1.19525	valid

In [19]:
feature_importance_df.sort_values("Average", ascending=False).reset_index(drop=True).head(500)

Unnamed: 0,Feature,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,Average,Std,Cv
0,session_title,625,554,583,730,599,618.2,60.456265,0.097794
1,accumulated_accuracy_group,461,450,467,665,473,503.2,81.256138,0.161479
2,27253bdc,306,316,248,375,314,311.8,40.300868,0.129252
3,misses,172,206,199,231,253,212.2,27.737339,0.130713
4,7372e1a5,255,169,162,206,146,187.6,39.02102,0.208001
5,3afb49e6,206,190,143,181,167,177.4,21.359775,0.120405
6,game_accuracy,163,106,139,247,220,175.0,51.78803,0.295932
7,last_game_acc,182,146,148,235,153,172.8,33.701039,0.195029
8,act_true,181,127,146,127,190,154.2,26.633813,0.172723
9,duration_mean,100,147,136,195,177,151.0,33.027261,0.218724


In [20]:
print(X_train.shape)
print("")
print(", ".join('%s'%x for x in X_train.columns))

(17690, 826)

0, 0086365d, 00c73085, 01ca3a3c, 022b4259, 02a42007, 0330ab6a, 0413e89d, 04df9b66, 05ad839b, 06372577, 070a5291, 08fd73f3, 08ff79ad, 0a08139c, 0ce40006, 0d18d96c, 0d1da71f, 0db6d71d, 1, 119b5b02, 12 Monkeys_2000, 1325467d, 1340b8d7, 1375ccb7, 13f56524, 14de4c5d, 155f62a4, 1575e76c, 15a43e5b, 15ba1109, 15eb4a7d, 15f99afc, 160654fd, 16667cc5, 16dffff1, 17113b36, 19967db1, 1996c610, 1af8be29, 1b54d27f, 1bb5fbdb, 1beb320a, 1c178d24, 1cc7cfca, 1cf54632, 1f19558b, 2, 222660ff, 2230fab4, 250513af, 25fa8af4, 262136f4, 26a5a3dd, 26fd2d99, 27253bdc, 28520915, 28a4eb9a, 28ed704e, 28f975ea, 29a42aea, 29bdd9ba, 29f54413, 2a444e03, 2a512369, 2b058fe3, 2b9272f4, 2c4e6db0, 2dc29e21, 2dcad279, 2ec694de, 2fb91ec1, 3, 30614231, 30df3273, 31973d56, 3323d7e9, 33505eae, 3393b68b, 363c86c9, 363d3849, 36fa3ebe, 37937459, 37c53127, 37db1c2f, 37ee8496, 38074c54, 392e14df, 3a4be871, 3afb49e6, 3afde5dd, 3b2048ee, 3babcb9b, 3bb91ced, 3bb91dda, 3bf1cf26, 3bfd1a65, 3ccd3f02, 3d0b9317, 3d63345e, 3d8c61b

In [21]:
#import eli5
#from eli5.sklearn import PermutationImportance
#my_model = lgb.LGBMRegressor(**lgbm_params).fit(X_train2, y_train2)
#perm = PermutationImportance(my_model, random_state=1).fit(X_test2, y_test2)
#eli5.show_weights(perm, feature_names = X_test2.columns.tolist(), top = 500)

# prediction

In [22]:
X_test = new_test.drop(["installation_id", "accuracy_group"], axis=1)
X_test = X_test.drop(remove_features, axis=1)
X_test = X_test[sorted(X_test.columns.tolist())]
pred_value = np.zeros([X_test.shape[0]])
test_coefficients = np.mean(coefficients, axis=0)
for model in models:
    pred_value += model.predict(X_test, num_iteration = model.best_iteration) /n_folds
test_pred_class= optR.predict(pred_value.reshape(-1, ), test_coefficients)
sample_submission["accuracy_group"] = test_pred_class
sample_submission.to_csv('submission.csv', index=False)

In [23]:
sample_submission["accuracy_group"].value_counts(normalize = True)

3    0.432
2    0.322
1    0.186
0    0.060
Name: accuracy_group, dtype: float64