In [1]:
import pandas as pd
import numpy as np
import calendar
import warnings
from numba import jit 
import datetime
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from sklearn import preprocessing
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, cohen_kappa_score
from typing import Any
import lightgbm as lgb
import json
from sklearn import metrics
from itertools import product
import copy
import time
from functools import partial
import scipy as sp
from scipy import stats
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",1000)
np.set_printoptions(precision=8)
warnings.filterwarnings("ignore")

In [2]:
def qwk(a1, a2):
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)

    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)

    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)

    e = e / a1.shape[0]

    return np.round(1 - o / e, 8)

In [3]:
class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])
        return -qwk(y, X_p)
    
    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')
        
    def predict(self, X, coef):
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

    def coefficients(self):
        return self.coef_['x']

# install

In [4]:
%%time
train = pd.read_csv('../input/data-science-bowl-2019/train.csv')
train_labels = pd.read_csv('../input/data-science-bowl-2019/train_labels.csv')
test = pd.read_csv('../input/data-science-bowl-2019/test.csv')
#specs = pd.read_csv('../input/data-science-bowl-2019/specs.csv')
sample_submission = pd.read_csv('../input/data-science-bowl-2019/sample_submission.csv')

CPU times: user 1min 11s, sys: 9.5 s, total: 1min 20s
Wall time: 1min 20s


In [5]:
keep_id = train[train.type == "Assessment"][['installation_id']].drop_duplicates()
train = pd.merge(train, keep_id, on="installation_id", how="inner")
train = train[train.installation_id.isin(train_labels.installation_id.unique())]

# Preprocess and Feature engineering

In [6]:
def num_count(x):
    num = [str(i) for i in range(10)]
    judge = [1 for i in x if i in num]
    return sum(judge)

def encode_title(train, test, train_labels):
    train['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), train['title'], train['event_code']))
    test['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), test['title'], test['event_code']))
    all_title_event_code = list(set(train["title_event_code"].unique()).union(test["title_event_code"].unique()))
    # make a list with all the unique 'titles' from the train and test set
    list_of_user_activities = list(set(train['title'].unique()).union(set(test['title'].unique())))
    # make a list with all the unique 'event_code' from the train and test set
    list_of_event_code = list(set(train['event_code'].unique()).union(set(test['event_code'].unique())))
    list_of_event_id = list(set(train['event_id'].unique()).union(set(test['event_id'].unique())))
    # make a list with all the unique worlds from the train and test set
    list_of_worlds = list(set(train['world'].unique()).union(set(test['world'].unique())))
    # create a dictionary numerating the titles
    activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
    activities_labels = dict(zip(np.arange(len(list_of_user_activities)), list_of_user_activities))
    activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))
    assess_titles = list(set(train[train['type'] == 'Assessment']['title'].value_counts().index).union(set(test[test['type'] == 'Assessment']['title'].value_counts().index)))
    # replace the text titles with the number titles from the dict
    train['title'] = train['title'].map(activities_map)
    test['title'] = test['title'].map(activities_map)
    train['world'] = train['world'].map(activities_world)
    test['world'] = test['world'].map(activities_world)
    train_labels['title'] = train_labels['title'].map(activities_map)
    win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))

    win_code[activities_map['Bird Measurer (Assessment)']] = 4110
    
    train['timestamp'] = pd.to_datetime(train['timestamp'])
    test['timestamp'] = pd.to_datetime(test['timestamp'])
    
    # identifier
    #train["identifier"] = train["event_data"].apply(lambda x: json.loads(x)["identifier"] if "identifier" in x else "None")
    #test["identifier"] = test["event_data"].apply(lambda x: json.loads(x)["identifier"] if "identifier" in x else "None")
    
    # description
    #train["description"] = train["event_data"].apply(lambda x: 1 if "\"description\"" in x else 0)
    #test["description"] = test["event_data"].apply(lambda x: 1 if "\"description\"" in x else 0)
    
    # x, y
    train["x"] = train["event_data"].apply(lambda x: json.loads(x)["coordinates"]["x"] if "\"coordinates\"" in x else -999)
    train["y"] = train["event_data"].apply(lambda x: json.loads(x)["coordinates"]["y"] if "\"coordinates\"" in x else -999)
    test["x"] = test["event_data"].apply(lambda x: json.loads(x)["coordinates"]["x"] if "\"coordinates\"" in x else -999)
    test["y"] = test["event_data"].apply(lambda x: json.loads(x)["coordinates"]["y"] if "\"coordinates\"" in x else -999)
        
    # level
    train["round"] = train["event_data"].apply(lambda x: json.loads(x)["round"] if "\"level\"" in x else -999)
    test["round"] = test["event_data"].apply(lambda x: json.loads(x)["round"] if "\"level\"" in x else -999)
    
    train["num_in_id"] = train["installation_id"].apply(lambda x: num_count(x))
    test["num_in_id"] = test["installation_id"].apply(lambda x: num_count(x))
    
    return train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code, activities_world

# get usefull dict with maping encode
train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code, activities_world = encode_title(train, test, train_labels)
categoricals = ['session_title']

In [7]:
def get_data(user_sample, test_set=False):
    '''
    The user_sample is a DataFrame with only one installation_id from train or test is filtered
    And the test_set parameter is related with the labels processing, that is only requered
    if test_set=False
    '''
    # Constants and parameters declaration
    last_activity = 0
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    user_world_count = {'CRYSTALCAVES': 0, 'TREETOPCITY': 0, 'MAGMAPEAK': 0}
    
    # news features: time spent in each activity
    time_spent_each_act = {actv: 0 for actv in list_of_user_activities}
    event_code_count = {eve: 0 for eve in list_of_event_code}
    last_session_time_sec = 0
    
    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    all_assessments = []
    accumulated_accuracy_group = 0
    accumulated_accuracy=0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0 
    accumulated_actions = 0
    accumulated_stumps = 0 
    accumulated_cataps = 0
    click_counts = 0
    accumulated_clicks = 0
    audio_counts = 0
    accumulated_audio_counts = 0
    animation_counts = 0
    accumulated_anime_counts = 0
    level = 0
    rounds = 0
    accumulated_rounds = 0
    bottles_num = 0
    accumulated_bottles = 0
    counter = 0
    cumulative_gametime = 0
    cumulative_description = 0
    xrange = 0
    yrange = 0 
    play_game = []
    event_count = 0
    time_first_activity = float(user_sample['timestamp'].values[0])
    durations = []
    previous_title = -999
    
    # itarates through each session of one instalation_id
    for i, session in user_sample.groupby('game_session', sort=False):        
        # get some sessions information
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_title_text = activities_labels[session_title]
        session_world = session["world"].iloc[0]
        num_in_id = session["num_in_id"].iloc[0]
        
        # get current session time in seconds
        if session_type != 'Assessment':
            time_spent = int(session['game_time'].iloc[-1] / 1000)
            time_spent_each_act[activities_labels[session_title]] += time_spent
            
            tmp_click_counts = session["event_data"].apply(lambda x: 1 if "\"coordinates\"" in x else 0)
            click_counts += np.sum(tmp_click_counts)
            
            tmp_media = session["event_data"].apply(lambda x: json.loads(x)["media_type"] if "\"media_type\"" in x else "None").value_counts()
            if "audio" in tmp_media.index:
                audio_counts += tmp_media["audio"]
            if "animation" in tmp_media.index:
                animation_counts += tmp_media["animation"]
            
            cumulative_gametime += session.iloc[-1]["game_time"]
            
            if session_title not in play_game:
                play_game.append(session_title)
            
            if session_world == 1:
                user_world_count['CRYSTALCAVES'] += session.shape[0]
            elif session_world == 2:
                user_world_count['TREETOPCITY'] += session.shape[0]
            elif session_world == 3:
                user_world_count['MAGMAPEAK'] += session.shape[0]
            else:
                pass
            
            #cumulative_description += np.sum(session["description"])
            
            tmp_round = np.max(session["round"])
            rounds = np.max([rounds, tmp_round])
            
            if session[session.x != -999].shape[0] >= 2:
                tmp_xrange = session["x"].max() - session[session.x != -999]["x"].min()
                tmp_yrange = session["y"].max() - session[session.x != -999]["y"].min()       
            else:
                tmp_xrange = 0
                tmp_yrange = 0
            xrange = np.max([tmp_xrange, xrange])
            yrange = np.max([tmp_yrange, yrange])
                        
            #event_count += session.iloc[-1]["event_count"]
            
        # for each assessment, and only this kind off session, the features below are processed
        # and a register are generated
        if (session_type == 'Assessment') & (test_set or len(session)>1):    
            #session_weekday = session["weekday"].iloc[0]
            # search for event_code 4100, that represents the assessments trial
            # all_attempts doesn't include media_type, coordinate infomation in event_data, may be empty
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()
            
            tmp_stump_counts = all_attempts["event_data"].apply(lambda x: np.sum(json.loads(x)["stumps"]) if "\"stumps\"" in x else 0)
            stump_counts = np.sum(tmp_stump_counts)
            #tmp_catap_counts = all_attempts["event_data"].apply(lambda x: np.sum(json.loads(x)["caterpillars"]) if "\"caterpillars\"" in x else 0)
            #catap_counts = np.sum(tmp_catap_counts)

            # copy a dict to use as feature template, it's initialized with some itens: 
            # {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
            features = user_activities_count.copy()
            features.update(time_spent_each_act.copy())
            features.update(event_code_count.copy())
            
            features['installation_id'] = session['installation_id'].iloc[-1]
            features['session_title'] = session['title'].iloc[0] 
            # the 4 lines below add the feature of the history of the trials of this player
            # this is based on the all time attempts so far, at the moment of this assessment
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
            
            features['accumulated_stumps'] = accumulated_stumps
            accumulated_stumps += stump_counts
            #features['accumulated_cataps'] = accumulated_cataps
            #accumulated_stumps += catap_counts
            accumulated_clicks += click_counts
            features['accumulated_clicks'] = accumulated_clicks
            click_counts = 0
            
            accumulated_audio_counts += audio_counts
            features['accumulated_audios'] = accumulated_audio_counts
            audio_counts = 0
            accumulated_anime_counts += animation_counts
            features['accumulated_animes'] = accumulated_anime_counts
            animation_counts = 0
            
            features["total_gametime_before_assess"] = cumulative_gametime
            cumulative_gametime = 0
            
            features["no_playgames_before_assess"] = len(play_game)
            play_game.clear()
            
            features["MAGMAPEAK_play_before_assess"] = user_world_count["MAGMAPEAK"]
            features["TREETOPCITY_play_before_assess"] = user_world_count["TREETOPCITY"]
            features["CRYSTALCAVES_play_before_assess"] = user_world_count["CRYSTALCAVES"]
            user_world_count = {'CRYSTALCAVES': 0, 'TREETOPCITY': 0, 'MAGMAPEAK': 0}
            
            #features["total_desc_before_assess"] = cumulative_description
            #cumulative_description = 0
            
            features["max_xrange_before_assess"] = xrange
            features["max_yrange_before_assess"] = yrange
            xrange = 0
            yrange = 0
            features["touch_range"] = features["max_xrange_before_assess"] * features["max_yrange_before_assess"]
                  
            features["highest_round_before_assess"] = rounds
            rounds = 0    
            
            features["count_num_in_id"] = num_in_id
                
            #features["total_event_counts"] = event_count
            #event_count = 0
            #features["gametime/event_counts"] = features["total_gametime_before_assess"] / features["total_event_counts"]
                        
            # the time spent in the app so far
            if durations == []:
                features['duration_mean'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2]).seconds)
            # the accurace is the all time wins divided by the all time attempts
            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            accumulated_accuracy += accuracy
            # a feature of the current accuracy categorized
            # it is a counter of how many times this player was in each accuracy group
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            features.update(accuracy_groups)
            accuracy_groups[features['accuracy_group']] += 1
            # mean of the all accuracy groups of this player
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            # how many actions the player has done so far, it is initialized as 0 and updated some lines below
            features['accumulated_actions'] = accumulated_actions
            
            # there are some conditions to allow this features to be inserted in the datasets
            # if it's a test set, all sessions belong to the final dataset
            # it it's a train, needs to be passed throught this clausule: session.query(f'event_code == {win_code[session_title]}')
            # that means, must exist an event_code 4100 or 4110
            if test_set:
                all_assessments.append(features)
            elif true_attempts+false_attempts > 0:
                all_assessments.append(features)
                
            counter += 1
        
        # this piece counts how many actions was made in each event_code so far
        n_of_event_codes = Counter(session['event_code'])
        
        for key in n_of_event_codes.keys():
            event_code_count[key] += n_of_event_codes[key]

        # counts how many actions the player has done so far, used in the feature of the same name
        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activitiy = session_type
    # if test_set=True, only the last assessment must be predicted, the previous are scraped
    # in train_set, all assessments are kept
    if test_set:
        return all_assessments[-1]
    return all_assessments

In [8]:
new_train = []
for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort=False)), total=train.installation_id.nunique(), desc='Installation_id', position=0):
    new_train += get_data(user_sample)
new_train = pd.DataFrame(new_train)
print(new_train.shape)
del train

HBox(children=(IntProgress(value=0, description='Installation_id', max=3614, style=ProgressStyle(description_w…


(17690, 117)


In [9]:
new_test = []
for ins_id, user_sample in tqdm(test.groupby('installation_id', sort=False), total=test.installation_id.nunique(), desc='Installation_id', position=0):
    a = get_data(user_sample, test_set=True)
    new_test.append(a)   
new_test = pd.DataFrame(new_test)
print(new_test.shape)
del test

HBox(children=(IntProgress(value=0, description='Installation_id', max=1000, style=ProgressStyle(description_w…


(1000, 117)


In [10]:
X_train = new_train.drop(['accuracy_group'],axis=1) 
lbl = preprocessing.LabelEncoder()
lbl.fit(list(X_train["installation_id"]))
X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
remove_features = [4235, 2070]
for i in X_train.columns:
    if X_train[i].std() == 0:
        remove_features.append(i)
X_train = X_train.drop(remove_features ,axis=1)
y_train = new_train.accuracy_group

# Modelling

In [11]:
# my method
n_folds=5
groups = np.array(X_train.installation_id)
gkf=GroupKFold(n_splits = n_folds)
coefficients = []
models = []
train_qwk_scores = []
test_qwk_scores = []
lgbm_params = {
    "objective" : "regression",
    "metric" : "rmse",
    "tree_learner": "serial",
    "max_depth" : 5,
    "boosting": 'gbdt',
    "num_leaves" : 13,
    "learning_rate" : 0.01,
}
#evals_result = {}
features_list = [i for i in X_train.columns if i != "installation_id"]
feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
for i , (train_index, test_index) in enumerate(gkf.split(X_train, y_train, groups)):
    optR = OptimizedRounder()
    X_train2 = X_train.iloc[train_index,:]
    y_train2 = y_train.iloc[train_index]
    X_test2 = X_train.iloc[test_index,:]
    y_test2 = y_train.iloc[test_index]
    X_train2 = X_train2.drop(['installation_id'],axis=1) 
    X_test2 = X_test2.drop(['installation_id'],axis=1) 
    lgb_train = lgb.Dataset(X_train2, y_train2)
    lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
    
    clf = lgb.train(
        lgbm_params, lgb_train,
        valid_sets=lgb_eval,
        num_boost_round=100000,
        early_stopping_rounds=10,
        #evals_result=evals_result,
        #feval=quadratic_weighted_kappa,
    )
    
    models.append(clf)
    train_predict = clf.predict(X_train2, num_iteration = clf.best_iteration)
    test_predict = clf.predict(X_test2, num_iteration = clf.best_iteration)
    
    optR.fit(train_predict.reshape(-1,), y_train2)
    tmp_coefficients = optR.coefficients()
    print("fold_"+str(i)+" coefficients: ", tmp_coefficients)
    opt_train_preds = optR.predict(train_predict.reshape(-1, ), tmp_coefficients)
    train_qwk_score = qwk(y_train2, opt_train_preds)
    print("training qwk: ", train_qwk_score)
    opt_test_preds = optR.predict(test_predict.reshape(-1, ), tmp_coefficients)
    test_qwk_score = qwk(y_test2, opt_test_preds)
    print("validation qwk: ", test_qwk_score)
    train_qwk_scores.append(train_qwk_score)
    test_qwk_scores.append(test_qwk_score)
    coefficients.append(tmp_coefficients)
    
    feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()
    
feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]

print("                             ")
print("-----------------------------")
print('coefficients: ', coefficients)
print('train qwk list:', train_qwk_scores)
print('train qwk average score:',np.mean(train_qwk_scores))
print('valid qwk list: ', test_qwk_scores)
print('valid qwk average score:',np.mean(test_qwk_scores))

[1]	valid_0's rmse: 1.25667
Training until validation scores don't improve for 10 rounds
[2]	valid_0's rmse: 1.25295
[3]	valid_0's rmse: 1.24931
[4]	valid_0's rmse: 1.24572
[5]	valid_0's rmse: 1.24218
[6]	valid_0's rmse: 1.23871
[7]	valid_0's rmse: 1.2353
[8]	valid_0's rmse: 1.23199
[9]	valid_0's rmse: 1.22868
[10]	valid_0's rmse: 1.22548
[11]	valid_0's rmse: 1.22229
[12]	valid_0's rmse: 1.21925
[13]	valid_0's rmse: 1.21615
[14]	valid_0's rmse: 1.21321
[15]	valid_0's rmse: 1.21023
[16]	valid_0's rmse: 1.2074
[17]	valid_0's rmse: 1.20449
[18]	valid_0's rmse: 1.20162
[19]	valid_0's rmse: 1.19887
[20]	valid_0's rmse: 1.19619
[21]	valid_0's rmse: 1.19346
[22]	valid_0's rmse: 1.19085
[23]	valid_0's rmse: 1.18828
[24]	valid_0's rmse: 1.1857
[25]	valid_0's rmse: 1.18325
[26]	valid_0's rmse: 1.18067
[27]	valid_0's rmse: 1.17826
[28]	valid_0's rmse: 1.17578
[29]	valid_0's rmse: 1.17352
[30]	valid_0's rmse: 1.17115
[31]	valid_0's rmse: 1.16888
[32]	valid_0's rmse: 1.16658
[33]	valid_0's rmse: 1.

- train qwk list: [0.61123127, 0.58476354, 0.59588894, 0.6303266, 0.60107489]
- train qwk average score: 0.604657048
- valid qwk list:  [0.5827259, 0.53003928, 0.52046055, 0.56694015, 0.50757124]
- valid qwk average score: 0.541547424
- 
- after adding num_in_id and highest_round
- train qwk list: [0.61291619, 0.63146046, 0.62326739, 0.64149355, 0.60333902]
- train qwk average score: 0.622495322
- valid qwk list:  [0.5925536, 0.57804037, 0.54227903, 0.56509709, 0.50079162]
- valid qwk average score: 0.5557523420000001

In [12]:
list(X_train.columns)

['Clip',
 'Activity',
 'Assessment',
 'Game',
 'Dino Dive',
 'Crystals Rule',
 'Flower Waterer (Activity)',
 'Leaf Leader',
 'Chicken Balancer (Activity)',
 'Sandcastle Builder (Activity)',
 'Pan Balance',
 'Dino Drink',
 'Chow Time',
 'Bug Measurer (Activity)',
 'Bottle Filler (Activity)',
 'All Star Sorting',
 'Fireworks (Activity)',
 'Egg Dropper (Activity)',
 'Scrub-A-Dub',
 'Watering Hole (Activity)',
 'Air Show',
 'Bubble Bath',
 'Happy Camel',
 2050,
 4100,
 4230,
 5000,
 2060,
 4110,
 5010,
 2075,
 2080,
 2081,
 2083,
 3110,
 4010,
 3120,
 3121,
 4020,
 4021,
 4022,
 4025,
 4030,
 4031,
 3010,
 4035,
 4040,
 3020,
 3021,
 4045,
 2000,
 4050,
 2010,
 2020,
 4070,
 2025,
 2030,
 4080,
 2035,
 2040,
 4090,
 4220,
 4095,
 'installation_id',
 'session_title',
 'accumulated_correct_attempts',
 'accumulated_uncorrect_attempts',
 'accumulated_stumps',
 'accumulated_clicks',
 'accumulated_audios',
 'accumulated_animes',
 'total_gametime_before_assess',
 'no_playgames_before_assess',
 'M

In [13]:
feature_importance_df.sort_values("Average", ascending=False).reset_index(drop=True).head(100)

Unnamed: 0,Feature,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,Average,Std,Cv
0,session_title,665,746,779,778,736,740.8,41.566333,0.05611
1,4070,391,363,442,505,483,436.8,53.585073,0.122676
2,accumulated_accuracy,324,447,270,417,265,344.6,74.904205,0.217366
3,Clip,206,312,343,349,370,316.0,58.051701,0.183708
4,accumulated_accuracy_group,324,201,351,288,355,303.8,56.700617,0.186638
5,3020,244,252,258,350,283,277.4,38.572529,0.13905
6,CRYSTALCAVES_play_before_assess,210,280,212,261,226,237.8,27.91702,0.117397
7,3121,170,266,175,306,253,234.0,53.190225,0.227309
8,2010,165,201,125,177,216,176.8,31.447734,0.177872
9,2000,182,164,167,184,183,176.0,8.648699,0.04914


# prediction

In [14]:
X_test = new_test.drop(["installation_id", "accuracy_group"], axis=1)
X_test = X_test.drop(remove_features, axis=1)
pred_value = np.zeros([X_test.shape[0]])
test_coefficients = np.mean(coefficients, axis=0)
for model in models:
    pred_value += model.predict(X_test, num_iteration = model.best_iteration) /n_folds
test_pred_class= optR.predict(pred_value.reshape(-1, ), test_coefficients)
sample_submission["accuracy_group"] = test_pred_class
sample_submission.to_csv('submission.csv', index=False)

In [15]:
sample_submission["accuracy_group"].value_counts()

3    455
2    260
1    156
0    129
Name: accuracy_group, dtype: int64