- predict num_correct and num_incorrect

In [1]:
import pandas as pd
import numpy as np
import warnings
import datetime
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from sklearn import preprocessing
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, cohen_kappa_score
import lightgbm as lgb
from functools import partial
import json
import copy
import time
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",1000)
np.set_printoptions(precision=8)
warnings.filterwarnings("ignore")
import random

In [2]:
def qwk(a1, a2):
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)
    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))
    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)
    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)
    e = e / a1.shape[0]
    return np.round(1 - o / e, 8)

In [3]:
class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])
        return -qwk(y, X_p)
    
    def fit(self, X, y, random_flg = False):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        if random_flg:
            initial_coef = [np.random.uniform(0.4,0.5), np.random.uniform(0.5,0.6), np.random.uniform(0.6,0.7)]
        else:
            initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead') #Powell
        
    def predict(self, X, coef):
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

    def coefficients(self):
        return self.coef_['x']

# install

In [4]:
%%time
train = pd.read_csv('../input/data-science-bowl-2019/train.csv')
train_labels = pd.read_csv('../input/data-science-bowl-2019/train_labels.csv')
test = pd.read_csv('../input/data-science-bowl-2019/test.csv')
#specs = pd.read_csv('../input/data-science-bowl-2019/specs.csv')
sample_submission = pd.read_csv('../input/data-science-bowl-2019/sample_submission.csv')

CPU times: user 1min 24s, sys: 11.5 s, total: 1min 35s
Wall time: 1min 35s


In [5]:
keep_id = train[train.type == "Assessment"][['installation_id']].drop_duplicates()
train = pd.merge(train, keep_id, on="installation_id", how="inner")
train = train[train.installation_id.isin(train_labels.installation_id.unique())]

In [6]:
assess_title = ['Mushroom Sorter (Assessment)', 'Bird Measurer (Assessment)',
       'Cauldron Filler (Assessment)', 'Cart Balancer (Assessment)', 'Chest Sorter (Assessment)']
def remove_index_calc(df):
    additional_remove_index = []
    for i, session in df.groupby('installation_id', sort=False):
        last_row = session.index[-1]
        session = session[session.title.isin(assess_title)]
        first_row = session.index[-1] + 1
        for j in range(first_row, last_row+1):
            additional_remove_index.append(j)                
    return additional_remove_index
additional_remove_index = remove_index_calc(train)
train = train[~train.index.isin(additional_remove_index)]

# Preprocess and Feature engineering

In [7]:
%%time
def encode_title(train, test):
    train['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), train['title'], train['event_code']))
    test['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), test['title'], test['event_code']))
    list_of_title_eventcode = sorted(list(set(train['title_event_code'].unique()).union(set(test['title_event_code'].unique()))))

    list_of_user_activities = sorted(list(set(train['title'].unique()).union(set(test['title'].unique()))))
    list_of_event_code = sorted(list(set(train['event_code'].unique()).union(set(test['event_code'].unique()))))
    list_of_worlds = sorted(list(set(train['world'].unique()).union(set(test['world'].unique()))))
    activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
    activities_labels = dict(zip(np.arange(len(list_of_user_activities)), list_of_user_activities))
    activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))
    assess_titles = sorted(list(set(train[train['type'] == 'Assessment']['title'].value_counts().index).union(set(test[test['type'] == 'Assessment']['title'].value_counts().index))))

    train['title'] = train['title'].map(activities_map)
    test['title'] = test['title'].map(activities_map)
    train['world'] = train['world'].map(activities_world)
    test['world'] = test['world'].map(activities_world)

    win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
    win_code[activities_map['Bird Measurer (Assessment)']] = 4110
    
    train['timestamp'] = pd.to_datetime(train['timestamp'])
    test['timestamp'] = pd.to_datetime(test['timestamp'])
    
    train["misses"] = train["event_data"].apply(lambda x: json.loads(x)["misses"] if "\"misses\"" in x else np.nan)
    test["misses"] = test["event_data"].apply(lambda x: json.loads(x)["misses"] if "\"misses\"" in x else np.nan)
    
    #train["level"] = train["event_data"].apply(lambda x: json.loads(x)["level"] if "\"level\"" in x else np.nan)
    #test["level"] = test["event_data"].apply(lambda x: json.loads(x)["level"] if "\"level\"" in x else np.nan)
    
    #train["round"] = train["event_data"].apply(lambda x: json.loads(x)["round"] if "\"round\"" in x else np.nan)
    #test["round"] = test["event_data"].apply(lambda x: json.loads(x)["round"] if "\"round\"" in x else np.nan)
    
    train["true"] = train["event_data"].apply(lambda x: 1 if "true" in x and "correct" in x else 0)
    test["true"] = test["event_data"].apply(lambda x: 1 if "true" in x and "correct" in x else 0)

    train["false"] = train["event_data"].apply(lambda x: 1 if "false" in x and "correct" in x else 0)
    test["false"] = test["event_data"].apply(lambda x: 1 if "false" in x and "correct" in x else 0)
    
    train['hour'] = train['timestamp'].dt.hour
    test['hour'] = test['timestamp'].dt.hour    
    train["morning"] = train["hour"].apply(lambda x: 1 if x>=5 and x <=10 else 0)
    test["morning"] = test["hour"].apply(lambda x: 1 if x>=5 and x <=10 else 0)
                
    return train, test, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, activities_world, list_of_title_eventcode

train, test, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, activities_world, list_of_title_eventcode = encode_title(train, test)

CPU times: user 1min 1s, sys: 4.74 s, total: 1min 6s
Wall time: 1min 6s


In [8]:
def get_data(user_sample, test_set=False):
    last_activity = 0
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    title_eventcode_count = {str(ele): 0 for ele in list_of_title_eventcode}
    user_world_count = {"world_"+str(wor) : 0 for wor in activities_world.values()}
    event_code_count = {str(ev): 0 for ev in list_of_event_code}
    title_count = {actv: 0 for actv in list_of_user_activities}
    morning_play = 0
    
    last_session_time_sec = 0
    all_assessments = []
    accuracy_groups = {"0":0, "1":0, "2":0, "3":0}
    accumulated_accuracy_group = 0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0 
    accumulated_actions = 0
    counter = 0
    time_first_activity = float(user_sample['timestamp'].values[0])
    miss = 0
    crys_game_true = 0; crys_game_false = 0; crys_game_true_bet = 0; crys_game_false_bet = 0
    tree_game_true = 0; tree_game_false = 0; tree_game_true_bet = 0; tree_game_false_bet = 0
    magma_game_true = 0; magma_game_false = 0; magma_game_true_bet = 0; magma_game_false_bet = 0
    crys_game_acc = []; tree_game_acc = []; magma_game_acc = []
    durations = []
    prev_assess_title = -999
    assess_count = 1
    last_accuracy = -999
    
    for i, session in user_sample.groupby('game_session', sort=False):      
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_title_text = activities_labels[session_title]
        session_world = session["world"].iloc[0]
        game_session = session["game_session"].iloc[0]
        
        if session_type != 'Assessment':
            if session_type == "Game":
                true = session['true'].sum()
                false = session['false'].sum() 
                if session_world == activities_world["CRYSTALCAVES"]:
                    crys_game_true += true
                    crys_game_false += false
                    crys_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
                elif session_world == activities_world["TREETOPCITY"]:
                    tree_game_true += true
                    tree_game_false += false
                    tree_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
                elif session_world == activities_world["MAGMAPEAK"]:
                    magma_game_true += true
                    magma_game_false += false
                    magma_game_acc.append(true / (true + false) if (true + false) != 0 else 0)
                else:
                    pass
                
        if (session_type == 'Assessment') & (test_set or len(session)>1): # test set or session in train_label
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            true_attempts = all_attempts['event_data'].str.contains('true').sum() # true in target assess
            false_attempts = all_attempts['event_data'].str.contains('false').sum() # false in target assessment
            
            # from start of installation_id to the start of target assessment ------------------------
            features = user_activities_count.copy() # appearance of each type without duplicates
            features.update(title_eventcode_count.copy()) # apperance of combi of title and event_code
            features.update(user_world_count.copy()) # appearance of world with duplicates
            features.update(event_code_count.copy())
            features.update(title_count.copy())
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
            features["misses"] = miss
            features['accumulated_actions'] = accumulated_actions
            features["morning_play"] = morning_play

            if session_world == activities_world["CRYSTALCAVES"]:
                features["game_true"] = crys_game_true
                features["game_false"] = crys_game_false
                features['game_accuracy'] = crys_game_true / (crys_game_true + crys_game_false) if (crys_game_true + crys_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(crys_game_acc) if len(crys_game_acc) >=1 else 0
                features["last_game_acc"] = crys_game_acc[-1] if len(crys_game_acc) >=1 else 0
            elif session_world == activities_world["TREETOPCITY"]:
                features["game_true"] = tree_game_true
                features["game_false"] = tree_game_false
                features['game_accuracy'] = tree_game_true / (tree_game_true + tree_game_false) if (tree_game_true + tree_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(tree_game_acc) if len(tree_game_acc) >=1 else 0
                features["last_game_acc"] = tree_game_acc[-1] if len(tree_game_acc) >=1 else 0
            elif session_world == activities_world["MAGMAPEAK"]:
                features["game_true"] = magma_game_true
                features["game_false"] = magma_game_false
                features['game_accuracy'] = magma_game_true / (magma_game_true + magma_game_false) if (magma_game_true + magma_game_false) != 0 else 0
                features["game_accuracy_std"] = np.std(magma_game_acc) if len(magma_game_acc) >=1 else 0
                features["last_game_acc"] = magma_game_acc[-1] if len(magma_game_acc) >=1 else 0
            
            features['installation_id'] = session['installation_id'].iloc[-1]
            features['session_title'] = session_title
            features['game_session'] = game_session
            features["prev_assess_title"] = prev_assess_title
            prev_assess_title = session_title
            features["first_assessment"] = 1 if assess_count == 1 else 0
            assess_count += 1
            
            if durations == []: #span of timestamp in target assessment
                features['duration_mean'] = 0
                features['duration_std'] = 0
                features['duration_max'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
                features['duration_std'] = np.std(durations)
                features['duration_max'] = np.max(durations)
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2]).seconds) 
            
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            features['last_assess_acc'] = last_accuracy
            last_accuracy = accuracy
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            features.update(accuracy_groups)
            accuracy_groups[str(features['accuracy_group'])] += 1
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            
            if test_set:
                all_assessments.append(features)
            elif true_attempts+false_attempts > 0:
                all_assessments.append(features)
                
            counter += 1
            
        n_of_title = Counter(session['title']) 
        for key in n_of_title.keys():
            title_count[activities_labels[key]] += n_of_title[key]
            
        n_of_eventcode = Counter(session['event_code']) 
        for key in n_of_eventcode.keys():
            event_code_count[str(key)] += n_of_eventcode[key]
                        
        n_of_title_eventcode = Counter(session['title_event_code']) 
        for key in n_of_title_eventcode.keys():
            title_eventcode_count[str(key)] += n_of_title_eventcode[key]
        miss += np.sum(session["misses"])
        morning_play += np.sum(session["morning"])  
        user_world_count["world_"+str(session_world)] += session.shape[0]

        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activitiy = session_type
    if test_set:
        return all_assessments[-1], all_assessments[:-1]
    return all_assessments

In [9]:
def get_train_and_test(train, test):
    compiled_train = []
    compiled_test = []
    compiled_val = []

    for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort=False)), total=train.installation_id.nunique(), desc='Installation_id', position=0):
        compiled_train += get_data(user_sample)
    del train
    for ins_id, user_sample in tqdm(test.groupby('installation_id', sort=False), total=test.installation_id.nunique(), desc='Installation_id', position=0):
        test_data, val_data = get_data(user_sample, test_set=True)

        compiled_test.append(test_data)
        compiled_val += val_data
    del test
    reduce_train = pd.DataFrame(compiled_train)
    reduce_test = pd.DataFrame(compiled_test)
    reduce_val = pd.DataFrame(compiled_val)

    categoricals = ['session_title']
    return reduce_train, reduce_test, reduce_val, categoricals

In [10]:
new_train, new_test, new_val, categoricals = get_train_and_test(train, test)

HBox(children=(IntProgress(value=0, description='Installation_id', max=3614, style=ProgressStyle(description_w…




HBox(children=(IntProgress(value=0, description='Installation_id', max=1000, style=ProgressStyle(description_w…




# Feature selection

In [11]:
def exclude(reduce_train, reduce_test, features):
    to_exclude = [] 
    ajusted_test = reduce_test.copy()
    for feature in features:
        if feature not in ['accuracy_group', 'installation_id', 'session_title']:
            data = reduce_train[feature]
            train_mean = data.mean()
            data = ajusted_test[feature] 
            test_mean = data.mean()
            try:
                ajust_factor = train_mean / test_mean
                if ajust_factor > 10 or ajust_factor < 0.1:# or error > 0.01:
                    to_exclude.append(feature)
                    print(feature)
                else:
                    ajusted_test[feature] *= ajust_factor
            except:
                to_exclude.append(feature)
                print(feature)
    return to_exclude, ajusted_test
#features = [i for i in new_train.columns if i not in ["game_session"]]
#to_exclude, ajusted_test = exclude(new_train, new_test, features)

In [12]:
#X_train = new_train.drop(['accuracy_group', 'game_session'],axis=1) 
#X_train = pd.concat([X_train, magic_feat["myfea_0"]], axis=1)
#lbl = preprocessing.LabelEncoder()
#lbl.fit(list(X_train["installation_id"]))
#X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
#remove_features = [i for i in X_train.columns if "_4235" in i or i == "world_"+str(activities_world["NONE"])]
#for i in X_train.columns:
#    if X_train[i].std() == 0 and i not in remove_features:
#        remove_features.append(i)
#X_train = X_train.drop(remove_features, axis=1)
#X_train = X_train[sorted(X_train.columns.tolist())]
#y_train = new_train.accuracy_group
#print(X_train.shape)

#X_test = new_test.drop(["installation_id","accuracy_group", "game_session"], axis=1)
#X_test = X_test.drop(remove_features, axis=1)
#X_test = X_test[sorted(X_test.columns.tolist())]

#X_val = new_val.drop(["installation_id", "accuracy_group", "game_session"], axis=1)
#X_val = X_val.drop(remove_features, axis=1)
#X_val = X_val[sorted(X_val.columns.tolist())]
#y_val = new_val["accuracy_group"]

In [13]:
#mod_test = ajusted_test.drop(["installation_id","accuracy_group", "game_session"], axis=1)
#mod_test = mod_test.drop(remove_features, axis=1)
#mod_test = mod_test.drop(to_exclude, axis=1)
#mod_test = mod_test[sorted(mod_test.columns.tolist())]
#mod_train = X_train.drop(to_exclude, axis=1)

# modelling and prediction

In [14]:
def num_correct_calc(new_train, new_test):
    X_train = new_train.drop(['accuracy_group'],axis=1) 
    X_train = pd.merge(X_train, train_labels[["game_session", "num_correct", "num_incorrect"]], on ="game_session")
    y_train = X_train.num_correct
    X_train = X_train.drop(['game_session', "num_correct", "num_incorrect"],axis=1) 
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(X_train["installation_id"]))
    X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
    remove_features = [i for i in X_train.columns if "_4235" in i or i == "world_"+str(activities_world["NONE"]) 
                       or i == "morning_play"]
    for i in X_train.columns:
        if X_train[i].std() == 0 and i not in remove_features:
            remove_features.append(i)
    X_train = X_train.drop(remove_features, axis=1)
    #X_train = pd.concat([X_train[features], X_train["installation_id"]],axis=1)
    X_train = X_train[sorted(X_train.columns.tolist())]
    print(X_train.shape)

    X_test = new_test.drop(["installation_id","accuracy_group", "game_session"], axis=1)
    X_test = X_test.drop(remove_features, axis=1)
    #X_test = X_test[features]
    X_test = X_test[sorted(X_test.columns.tolist())]

    n_folds=5
    skf=GroupKFold(n_splits = n_folds)
    coefficients = []
    models = []
    lgbm_params = {
    'objective': 'binary','eval_metric': 'auc', 
    "max_depth" : 5, "boosting": 'gbdt', "num_leaves" : 20, "learning_rate" : 0.01,
}

    valid_correct_num = pd.DataFrame(np.zeros([X_train.shape[0]]))
    features_list = [i for i in X_train.columns if i != "installation_id"]
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
    for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train, X_train["installation_id"])):
        print("Fold "+str(i+1))
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]
        X_train2 = X_train2.drop(['installation_id'],axis=1)
    
        X_test2 = X_train.iloc[test_index,:]
        y_test2 = y_train.iloc[test_index]
        
        test2 = pd.concat([X_test2, y_test2], axis=1)
        X_test2 = test2.drop(["num_correct", "installation_id"], axis=1)
        y_test2 = test2["num_correct"]
    
        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
        clf = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
            num_boost_round=10000,early_stopping_rounds=100,verbose_eval = 500,)
        train_predict = clf.predict(X_train2, num_iteration = clf.best_iteration)
        test_predict = clf.predict(X_test2, num_iteration = clf.best_iteration)
            
        models.append(clf)
        feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()
        valid_correct_num.iloc[test_index] = test_predict.reshape(X_test2.shape[0], 1)
        
    feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]
        
    print('Accuracy score = \t {}'.format(accuracy_score(y_train, np.round(valid_correct_num))))
    print('Precision score = \t {}'.format(precision_score(y_train, np.round(valid_correct_num))))
    print('Recall score =   \t {}'.format(recall_score(y_train, np.round(valid_correct_num))))
    print('F1 score =      \t {}'.format(f1_score(y_train, np.round(valid_correct_num))))
    print(confusion_matrix(y_train, np.round(valid_correct_num)))
    pred_value = np.zeros([X_test.shape[0]])
    for model in models:
        pred_value += model.predict(X_test, num_iteration = model.best_iteration) / len(models)
    return pred_value, valid_correct_num, feature_importance_df
pred_value, valid_correct_num, feat_df1 = num_correct_calc(new_train, new_test)

(17690, 511)
Fold 1
Training until validation scores don't improve for 100 rounds
[500]	training's binary_logloss: 0.335722	valid_1's binary_logloss: 0.385536
Early stopping, best iteration is:
[875]	training's binary_logloss: 0.303661	valid_1's binary_logloss: 0.379566
Fold 2
Training until validation scores don't improve for 100 rounds
[500]	training's binary_logloss: 0.33639	valid_1's binary_logloss: 0.39281
[1000]	training's binary_logloss: 0.296714	valid_1's binary_logloss: 0.387364
Early stopping, best iteration is:
[1110]	training's binary_logloss: 0.28941	valid_1's binary_logloss: 0.386689
Fold 3
Training until validation scores don't improve for 100 rounds
[500]	training's binary_logloss: 0.336269	valid_1's binary_logloss: 0.375651
Early stopping, best iteration is:
[725]	training's binary_logloss: 0.315628	valid_1's binary_logloss: 0.374022
Fold 4
Training until validation scores don't improve for 100 rounds
[500]	training's binary_logloss: 0.331606	valid_1's binary_logloss: 

Accuracy score = 	 0.8370830977953646
Precision score = 	 0.8586344836938098
Recall score =   	 0.9407919173909813
F1 score =      	 0.897837646224743
[[ 2144  2085]
 [  797 12664]]

In [15]:
feat_df1.sort_values("Average", ascending=False).reset_index(drop=True)

Unnamed: 0,Feature,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,Average,Std,Cv
0,session_title,1655,1946,1629,1912,1858,1800.0,132.280006,0.073489
1,accumulated_accuracy_group,503,464,511,678,474,526.0,77.982049,0.148255
2,game_accuracy,408,390,393,583,381,431.0,76.495752,0.177484
3,4070,313,368,191,422,341,327.0,76.933738,0.235271
4,Clip,283,226,255,296,358,283.6,44.328772,0.156307
5,last_game_acc,279,295,230,329,221,270.8,40.459362,0.149407
6,2000,223,245,224,294,161,229.4,42.814016,0.186635
7,game_accuracy_std,166,277,156,248,287,226.8,55.322328,0.243926
8,Chow Time_4070,246,169,138,279,232,212.8,51.704545,0.242972
9,duration_mean,173,230,143,276,132,190.8,54.506513,0.285674


In [16]:
def num_incorrect_calc(new_train, new_test): # consider only people who included correct answers
    X_train = new_train.drop(['accuracy_group'],axis=1) 
    X_train = pd.merge(X_train, train_labels[["game_session", "num_correct", "num_incorrect"]], on ="game_session")
    #X_train = X_train[X_train.num_correct != 0] # remove data with num_correct = 0
    y_train = X_train.num_incorrect
    y_train.loc[y_train >=3] = 3
    X_train = X_train.drop(['game_session', "num_correct", "num_incorrect"],axis=1) 
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(X_train["installation_id"]))
    X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
    remove_features = [i for i in X_train.columns if "_4235" in i or i == "world_"+str(activities_world["NONE"]) 
                       or i == "morning_play"]
    for i in X_train.columns:
        if X_train[i].std() == 0 and i not in remove_features:
            remove_features.append(i)
    X_train = X_train.drop(remove_features, axis=1)
    X_train = X_train[sorted(X_train.columns.tolist())]
    print(X_train.shape)

    X_test = new_test.drop(["installation_id","accuracy_group", "game_session"], axis=1)
    X_test = X_test.drop(remove_features, axis=1)
    X_test = X_test[sorted(X_test.columns.tolist())]

    n_folds=5
    skf=GroupKFold(n_splits = n_folds)
    coefficients = []
    models = []
    lgbm_params = {
        'objective': 'regression','metric': 'rmse',"tree_learner": "serial", 
    "max_depth" : 5, "boosting": 'gbdt', "num_leaves" : 13, "learning_rate" : 0.01,
    }

    valid_incorrect_num = pd.DataFrame(np.zeros([X_train.shape[0]]))
    features_list = [i for i in X_train.columns if i != "installation_id"]
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
    for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train, X_train["installation_id"])):
        print("Fold "+str(i+1))
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]
        X_train2 = X_train2.drop(['installation_id'],axis=1)
    
        X_test2 = X_train.iloc[test_index,:]
        y_test2 = y_train.iloc[test_index]
        
        test2 = pd.concat([X_test2, y_test2], axis=1)
        X_test2 = test2.drop(["num_incorrect", "installation_id"], axis=1)
        y_test2 = test2["num_incorrect"]
    
        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
        clf = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
            num_boost_round=10000,early_stopping_rounds=100,verbose_eval = 500,)
        train_predict = clf.predict(X_train2, num_iteration = clf.best_iteration)
        test_predict = clf.predict(X_test2, num_iteration = clf.best_iteration)
            
        models.append(clf)
        feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()
        valid_incorrect_num.iloc[test_index] = test_predict.reshape(X_test2.shape[0], 1)
    
    feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]
    
    #print('Accuracy score = \t {}'.format(accuracy_score(y_train, np.round(valid_incorrect_num))))
    #print('Precision score = \t {}'.format(precision_score(y_train, np.round(valid_incorrect_num))))
    #print('Recall score =   \t {}'.format(recall_score(y_train, np.round(valid_incorrect_num))))
    #print('F1 score =      \t {}'.format(f1_score(y_train, np.round(valid_incorrect_num))))
    #print(confusion_matrix(y_train, np.round(valid_incorrect_num)))
    
    pred_value_incorrect = np.zeros([X_test.shape[0]])
    for model in models:
        pred_value_incorrect += model.predict(X_test, num_iteration = model.best_iteration) / len(models)
    return pred_value_incorrect, valid_incorrect_num, feature_importance_df
pred_value_incorrect, valid_incorrect_num, feat_df2 = num_incorrect_calc(new_train, new_test)

(17690, 511)
Fold 1
Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 0.960277	valid_1's rmse: 0.989301
[1000]	training's rmse: 0.925308	valid_1's rmse: 0.984539
Early stopping, best iteration is:
[991]	training's rmse: 0.925883	valid_1's rmse: 0.984485
Fold 2
Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 0.959714	valid_1's rmse: 0.990386
Early stopping, best iteration is:
[584]	training's rmse: 0.952435	valid_1's rmse: 0.989609
Fold 3
Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 0.957596	valid_1's rmse: 0.986523
Early stopping, best iteration is:
[821]	training's rmse: 0.933869	valid_1's rmse: 0.984566
Fold 4
Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 0.952389	valid_1's rmse: 1.01152
[1000]	training's rmse: 0.916081	valid_1's rmse: 1.00654
Early stopping, best iteration is:
[1303]	training's rmse: 0.898647	valid_1's rmse: 1.00569
F

In [17]:
feat_df2.sort_values("Average", ascending=False).reset_index(drop=True)

Unnamed: 0,Feature,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,Average,Std,Cv
0,session_title,953,831,938,998,959,935.8,56.019282,0.059862
1,accumulated_accuracy_group,457,364,409,576,389,439.0,74.990666,0.170822
2,game_accuracy,410,249,347,477,310,358.6,78.954671,0.220175
3,4070,244,123,163,264,127,184.2,59.00983,0.320357
4,last_game_acc,196,119,146,234,158,170.6,40.227354,0.235799
5,misses,145,116,103,233,136,146.6,45.644715,0.311355
6,accumulated_uncorrect_attempts,118,77,129,187,133,128.8,35.227262,0.273504
7,Clip,145,92,118,135,137,125.4,18.874321,0.150513
8,Chow Time_4070,180,49,80,147,93,109.8,47.283824,0.430636
9,Bug Measurer (Activity)_4070,125,65,140,95,119,108.8,26.263282,0.24139


In [18]:
train_exp_accuracy = valid_correct_num / (valid_correct_num + valid_incorrect_num)
test_exp_accuracy = pred_value / (pred_value + pred_value_incorrect)
best_score = 0
for i in range(10):
    optR = OptimizedRounder()
    optR.fit(np.array(train_exp_accuracy).reshape(-1,), new_train.accuracy_group, random_flg=True)
    coefficients = optR.coefficients()
    final_valid_pred = optR.predict(np.array(train_exp_accuracy).reshape(-1,), coefficients)
    score = qwk(new_train.accuracy_group, final_valid_pred)
    print(coefficients, score)
    if score > best_score:
        best_score = score
        best_coefficients = coefficients
#final_test_pred = pd.cut(np.array(test_exp_accuracy).reshape(-1,), [-np.inf] + list(np.sort(best_coefficients)) + [np.inf], labels = [0, 1, 2, 3])
#sample_submission["accuracy_group"] = final_test_pred.astype(int)
#sample_submission.to_csv('submission.csv', index=False)
#sample_submission["accuracy_group"].value_counts(normalize = True)

[0.37893937 0.25822934 0.52620022] 0.59234866
[0.54731571 0.27147812 0.38143139] 0.59097442
[0.38207003 0.2697439  0.525777  ] 0.59280533
[0.3822795  0.2575359  0.52569192] 0.59238971
[0.26908416 0.36893293 0.52591049] 0.5917555
[0.38118532 0.26974533 0.53079678] 0.5923002
[0.52475385 0.26909861 0.38209732] 0.59278342
[0.25770265 0.38119267 0.52505012] 0.59248029
[0.3821188  0.26944158 0.52505214] 0.59277221
[0.52612311 0.25820378 0.38093433] 0.59247671


In [19]:
final_test_pred = pd.cut(np.array(test_exp_accuracy).reshape(-1,), [-np.inf] + list(np.sort([0.38199133, 0.2697444, 0.52564235])) + [np.inf], labels = [0, 1, 2, 3])
sample_submission["accuracy_group"] = final_test_pred.astype(int)
sample_submission.to_csv('submission.csv', index=False)
sample_submission["accuracy_group"].value_counts(normalize = True)

3    0.501
2    0.236
0    0.173
1    0.090
Name: accuracy_group, dtype: float64