In [1]:
import pandas as pd
import numpy as np
import calendar
import warnings
from numba import jit 
import datetime
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from scipy import stats
import json
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, cohen_kappa_score
from typing import Any
import lightgbm as lgb
import xgboost as xgb
import json
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn import metrics
from itertools import product
import copy
import time
from functools import partial
import scipy as sp
from scipy import stats
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",1000)
np.set_printoptions(precision=8)
warnings.filterwarnings("ignore")

In [2]:
def quadratic_weighted_kappa(y_pred, data):
    y_pred = np.transpose(y_pred.reshape(4,-1))
    y_pred = np.argmax(y_pred, axis=1)
    y_true = np.array(data.get_label()).astype("int")
    c_matrix = confusion_matrix(y_true, y_pred, labels=[0,1,2,3])

    numer = 0.0
    denom = 0.0
    
    for i in range(c_matrix.shape[0]):
        for j in range(c_matrix.shape[1]):
            n = c_matrix.shape[0]
            wij = ((i-j)**2.0)
            oij = c_matrix[i, j]
            eij = c_matrix[i, :].sum() * c_matrix[:, j].sum() / c_matrix.sum()
            numer += wij * oij
            denom += wij * eij
    return "Quadratic weighted Kappa", 1 - numer / denom, True

In [3]:
def qwk(a1, a2):
    """
    Source: https://www.kaggle.com/c/data-science-bowl-2019/discussion/114133#latest-660168

    :param a1:
    :param a2:
    :param max_rat:
    :return:
    """
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)

    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)

    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)

    e = e / a1.shape[0]

    return np.round(1 - o / e, 8)

In [4]:
class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])
        return -qwk(y, X_p)
    
    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')
        
    def predict(self, X, coef):
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

    def coefficients(self):
        return self.coef_['x']

# install

In [5]:
%%time
train = pd.read_csv('../input/data-science-bowl-2019/train.csv')
train_labels = pd.read_csv('../input/data-science-bowl-2019/train_labels.csv')
test = pd.read_csv('../input/data-science-bowl-2019/test.csv')
specs = pd.read_csv('../input/data-science-bowl-2019/specs.csv')
sample_submission = pd.read_csv('../input/data-science-bowl-2019/sample_submission.csv')

CPU times: user 1min 12s, sys: 9.59 s, total: 1min 22s
Wall time: 1min 22s


In [6]:
keep_id = train[train.type == "Assessment"][['installation_id']].drop_duplicates()
train = pd.merge(train, keep_id, on="installation_id", how="inner")

As we can not train on those installation_id's anyway, I am taking them out of the train set. This reduces our train set further from 8.3 million rows to 7.7 million.

In [7]:
train = train[train.installation_id.isin(train_labels.installation_id.unique())]
train.shape

(7734558, 11)

# Preprocess and Feature engineering

In [8]:
#Credits go to Andrew Lukyanenko

def encode_title(train, test, train_labels):
    # encode title
    train['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), train['title'], train['event_code']))
    test['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), test['title'], test['event_code']))
    all_title_event_code = list(set(train["title_event_code"].unique()).union(test["title_event_code"].unique()))
    # make a list with all the unique 'titles' from the train and test set
    list_of_user_activities = list(set(train['title'].unique()).union(set(test['title'].unique())))
    # make a list with all the unique 'event_code' from the train and test set
    list_of_event_code = list(set(train['event_code'].unique()).union(set(test['event_code'].unique())))
    list_of_event_id = list(set(train['event_id'].unique()).union(set(test['event_id'].unique())))
    # make a list with all the unique worlds from the train and test set
    list_of_worlds = list(set(train['world'].unique()).union(set(test['world'].unique())))
    # create a dictionary numerating the titles
    activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
    activities_labels = dict(zip(np.arange(len(list_of_user_activities)), list_of_user_activities))
    activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))
    assess_titles = list(set(train[train['type'] == 'Assessment']['title'].value_counts().index).union(set(test[test['type'] == 'Assessment']['title'].value_counts().index)))
    # replace the text titles with the number titles from the dict
    train['title'] = train['title'].map(activities_map)
    test['title'] = test['title'].map(activities_map)
    train['world'] = train['world'].map(activities_world)
    test['world'] = test['world'].map(activities_world)
    train_labels['title'] = train_labels['title'].map(activities_map)
    win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
    # then, it set one element, the 'Bird Measurer (Assessment)' as 4110, 10 more than the rest
    win_code[activities_map['Bird Measurer (Assessment)']] = 4110
    # convert text into datetime
    train['timestamp'] = pd.to_datetime(train['timestamp'])
    test['timestamp'] = pd.to_datetime(test['timestamp'])
    
    
    return train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code

# get usefull dict with maping encode
train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code = encode_title(train, test, train_labels)

categoricals = ['session_title']

In [9]:
#Credits go to Massoud Hosseinali

def get_data(user_sample, test_set=False):
    '''
    The user_sample is a DataFrame from train or test where the only one 
    installation_id is filtered
    And the test_set parameter is related with the labels processing, that is only requered
    if test_set=False
    '''
    # Constants and parameters declaration
    last_activity = 0
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    
    # news features: time spent in each activity
    time_spent_each_act = {actv: 0 for actv in list_of_user_activities}
    event_code_count = {eve: 0 for eve in list_of_event_code}
    last_session_time_sec = 0
    
    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    all_assessments = []
    accumulated_accuracy_group = 0
    accumulated_accuracy=0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0 
    accumulated_actions = 0
    counter = 0
    time_first_activity = float(user_sample['timestamp'].values[0])
    durations = []
    
    # itarates through each session of one instalation_id
    for i, session in user_sample.groupby('game_session', sort=False):
        # i = game_session_id
        # session is a DataFrame that contain only one game_session
        
        # get some sessions information
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_title_text = activities_labels[session_title] #from Andrew
        
        # get current session time in seconds
        if session_type != 'Assessment':
            time_spent = int(session['game_time'].iloc[-1] / 1000)
            time_spent_each_act[activities_labels[session_title]] += time_spent
        
        # for each assessment, and only this kind off session, the features below are processed
        # and a register are generated
        if (session_type == 'Assessment') & (test_set or len(session)>1):
            # search for event_code 4100, that represents the assessments trial
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            # then, check the numbers of wins and the number of losses
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()
            # copy a dict to use as feature template, it's initialized with some itens: 
            # {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
            features = user_activities_count.copy()
            features.update(time_spent_each_act.copy())
            features.update(event_code_count.copy())
            # get installation_id for aggregated features
            features['installation_id'] = session['installation_id'].iloc[-1] #from Andrew
            # add title as feature, remembering that title represents the name of the game
            features['session_title'] = session['title'].iloc[0] 
            # the 4 lines below add the feature of the history of the trials of this player
            # this is based on the all time attempts so far, at the moment of this assessment
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
            # the time spent in the app so far
            if durations == []:
                features['duration_mean'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            # the accurace is the all time wins divided by the all time attempts
            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            accumulated_accuracy += accuracy
            # a feature of the current accuracy categorized
            # it is a counter of how many times this player was in each accuracy group
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            features.update(accuracy_groups)
            accuracy_groups[features['accuracy_group']] += 1
            # mean of the all accuracy groups of this player
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            # how many actions the player has done so far, it is initialized as 0 and updated some lines below
            features['accumulated_actions'] = accumulated_actions
            
            # there are some conditions to allow this features to be inserted in the datasets
            # if it's a test set, all sessions belong to the final dataset
            # it it's a train, needs to be passed throught this clausule: session.query(f'event_code == {win_code[session_title]}')
            # that means, must exist an event_code 4100 or 4110
            if test_set:
                all_assessments.append(features)
            elif true_attempts+false_attempts > 0:
                all_assessments.append(features)
                
            counter += 1
        
        # this piece counts how many actions was made in each event_code so far
        n_of_event_codes = Counter(session['event_code'])
        
        for key in n_of_event_codes.keys():
            event_code_count[key] += n_of_event_codes[key]

        # counts how many actions the player has done so far, used in the feature of the same name
        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activitiy = session_type
    # if test_set=True, only the last assessment must be predicted, the previous are scraped
    if test_set:
        return all_assessments[-1]
    # in train_set, all assessments are kept
    return all_assessments

In [10]:
#Credits go to Massoud Hosseinali
#The get_data function is applied to each installation_id and added to the compile_data list
new_train = []
for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort=False)), total=train.installation_id.nunique(), desc='Installation_id', position=0):
    # user_sample is a DataFrame that contains only one installation_id
    new_train += get_data(user_sample)
new_train = pd.DataFrame(new_train)
new_train.shape

HBox(children=(IntProgress(value=0, description='Installation_id', max=3614, style=ProgressStyle(description_w…




(17690, 103)

In [11]:
new_test = []
for ins_id, user_sample in tqdm(test.groupby('installation_id', sort=False), total=test.installation_id.nunique(), desc='Installation_id', position=0):
    a = get_data(user_sample, test_set=True)
    new_test.append(a)
    
new_test = pd.DataFrame(new_test)

HBox(children=(IntProgress(value=0, description='Installation_id', max=1000, style=ProgressStyle(description_w…




In [12]:
def preprocess(reduce_train, reduce_test):
    for df in [reduce_train, reduce_test]:
        df['installation_session_count'] = df.groupby(['installation_id'])['Clip'].transform('count')
        df['installation_duration_mean'] = df.groupby(['installation_id'])['duration_mean'].transform('mean')
        #df['installation_duration_std'] = df.groupby(['installation_id'])['duration_mean'].transform('std')
        df['installation_title_nunique'] = df.groupby(['installation_id'])['session_title'].transform('nunique')
        
        df['sum_event_code_count'] = df[[2050, 4100, 4230, 5000, 4235, 2060, 4110, 5010, 2070, 2075, 2080, 2081, 2083, 3110, 4010, 3120, 3121, 4020, 4021, 
                                        4022, 4025, 4030, 4031, 3010, 4035, 4040, 3020, 3021, 4045, 2000, 4050, 2010, 2020, 4070, 2025, 2030, 4080, 2035, 
                                        2040, 4090, 4220, 4095]].sum(axis = 1)
        
        df['installation_event_code_count_mean'] = df.groupby(['installation_id'])['sum_event_code_count'].transform('mean')
        #df['installation_event_code_count_std'] = df.groupby(['installation_id'])['sum_event_code_count'].transform('std')
        
    features = reduce_train.loc[(reduce_train.sum(axis=1) != 0), (reduce_train.sum(axis=0) != 0)].columns # delete useless columns
    features = [x for x in features if x not in ['accuracy_group', 'installation_id']] + ['acc_' + title for title in assess_titles]
   
    return reduce_train, reduce_test, features
# call feature engineering function
new_train, new_test, features = preprocess(new_train, new_test)

In [13]:
X_train = new_train.drop(['accuracy_group', 'installation_id'],axis=1)
remove_features = [4235, "sum_event_code_count"]
for i in X_train.columns:
    if X_train[i].std() == 0:
        remove_features.append(i)
        
X_train = X_train.drop(remove_features ,axis=1)
y_train = new_train.accuracy_group

# Modelling

In [14]:
# my method
n_folds=5
skf=StratifiedKFold(n_splits = n_folds)
coefficients = []
models = []
train_qwk_scores = []
test_qwk_scores = []
lgbm_params = {
    "objective" : "regression",
    "metric" : "rmse", # if set None, only custom metric is considered.
    "tree_learner": "serial",
    "max_depth" : 5,
    "boosting": 'gbdt',
    #"num_class": 4,
    "num_leaves" : 13,
    "learning_rate" : 0.01,
}
#evals_result = {}
feature_importance_df = pd.DataFrame(list(X_train.columns), columns=["Feature"])
for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    optR = OptimizedRounder()
    X_train2 = X_train.iloc[train_index,:]
    y_train2 = y_train.iloc[train_index]
    X_test2 = X_train.iloc[test_index,:]
    y_test2 = y_train.iloc[test_index]
    lgb_train = lgb.Dataset(X_train2, y_train2)
    lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
    
    clf = lgb.train(
        lgbm_params, lgb_train,
        valid_sets=lgb_eval,
        num_boost_round=100000,
        early_stopping_rounds=10,
        #evals_result=evals_result,
        #feval=quadratic_weighted_kappa,
    )
    
    models.append(clf)
    train_predict = clf.predict(X_train2, num_iteration = clf.best_iteration)
    test_predict = clf.predict(X_test2, num_iteration = clf.best_iteration)
    
    optR.fit(test_predict.reshape(-1,), y_test2)
    tmp_coefficients = optR.coefficients()
    print("fold_"+str(i)+" coefficients: ", tmp_coefficients)
    opt_train_preds = optR.predict(train_predict.reshape(-1, ), tmp_coefficients)
    train_qwk_score = qwk(y_train2, opt_train_preds)
    print("training qwk: ", train_qwk_score)
    opt_test_preds = optR.predict(test_predict.reshape(-1, ), tmp_coefficients)
    test_qwk_score = qwk(y_test2, opt_test_preds)
    print("validation qwk: ", test_qwk_score)
    train_qwk_scores.append(train_qwk_score)
    test_qwk_scores.append(test_qwk_score)
    coefficients.append(tmp_coefficients)
    
    feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()
    
feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]

print("                             ")
print("-----------------------------")
print('coefficients: ', coefficients)
print('train qwk list         :', train_qwk_scores)
print('train qwk average score:',np.mean(train_qwk_scores))
print('valid qwk list         : ', test_qwk_scores)
print('valid qwk average score:',np.mean(test_qwk_scores))

[1]	valid_0's rmse: 1.2531
Training until validation scores don't improve for 10 rounds
[2]	valid_0's rmse: 1.24969
[3]	valid_0's rmse: 1.24609
[4]	valid_0's rmse: 1.24274
[5]	valid_0's rmse: 1.23934
[6]	valid_0's rmse: 1.23598
[7]	valid_0's rmse: 1.2327
[8]	valid_0's rmse: 1.22948
[9]	valid_0's rmse: 1.22637
[10]	valid_0's rmse: 1.22327
[11]	valid_0's rmse: 1.22033
[12]	valid_0's rmse: 1.21721
[13]	valid_0's rmse: 1.21437
[14]	valid_0's rmse: 1.21136
[15]	valid_0's rmse: 1.20861
[16]	valid_0's rmse: 1.20573
[17]	valid_0's rmse: 1.20286
[18]	valid_0's rmse: 1.20016
[19]	valid_0's rmse: 1.1974
[20]	valid_0's rmse: 1.1948
[21]	valid_0's rmse: 1.19209
[22]	valid_0's rmse: 1.18954
[23]	valid_0's rmse: 1.18697
[24]	valid_0's rmse: 1.18452
[25]	valid_0's rmse: 1.18201
[26]	valid_0's rmse: 1.17961
[27]	valid_0's rmse: 1.17728
[28]	valid_0's rmse: 1.17492
[29]	valid_0's rmse: 1.1725
[30]	valid_0's rmse: 1.1703
[31]	valid_0's rmse: 1.16817
[32]	valid_0's rmse: 1.16608
[33]	valid_0's rmse: 1.163

- train 基準
- train qwk list         : [0.60493095, 0.62992208, 0.61592005, 0.64217875, 0.58921787]
- train qwk average score: 0.6164339400000001
- test qwk list          :  [0.5456311, 0.56214594, 0.5468586, 0.56905943, 0.54125862]
- test qwk average score : 0.5529907379999999
- 
- valid 基準
- 

In [15]:
list(X_train.columns)

['Clip',
 'Activity',
 'Assessment',
 'Game',
 'Dino Drink',
 'Happy Camel',
 'Dino Dive',
 'Bug Measurer (Activity)',
 'Egg Dropper (Activity)',
 'Chow Time',
 'Chicken Balancer (Activity)',
 'Flower Waterer (Activity)',
 'All Star Sorting',
 'Bottle Filler (Activity)',
 'Watering Hole (Activity)',
 'Fireworks (Activity)',
 'Pan Balance',
 'Air Show',
 'Sandcastle Builder (Activity)',
 'Leaf Leader',
 'Scrub-A-Dub',
 'Crystals Rule',
 'Bubble Bath',
 2050,
 4100,
 4230,
 5000,
 2060,
 4110,
 5010,
 2070,
 2075,
 2080,
 2081,
 2083,
 3110,
 4010,
 3120,
 3121,
 4020,
 4021,
 4022,
 4025,
 4030,
 4031,
 3010,
 4035,
 4040,
 3020,
 3021,
 4045,
 2000,
 4050,
 2010,
 2020,
 4070,
 2025,
 2030,
 4080,
 2035,
 2040,
 4090,
 4220,
 4095,
 'session_title',
 'accumulated_correct_attempts',
 'accumulated_uncorrect_attempts',
 'duration_mean',
 'accumulated_accuracy',
 0,
 1,
 2,
 3,
 'accumulated_accuracy_group',
 'accumulated_actions',
 'installation_session_count',
 'installation_duration_mea

In [16]:
feature_importance_df.sort_values("Average", ascending=False).reset_index(drop=True).head(100)

Unnamed: 0,Feature,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,Average,Std,Cv
0,session_title,949,998,868,1005,954,954.8,48.897444,0.051212
1,installation_duration_mean,495,551,376,620,556,519.6,81.996585,0.157807
2,4070,397,470,312,537,419,427.0,75.017331,0.175685
3,Clip,402,359,308,352,346,353.4,30.03731,0.084995
4,accumulated_accuracy_group,328,310,349,324,320,326.2,12.874782,0.039469
5,accumulated_accuracy,288,377,254,417,281,323.4,62.451901,0.19311
6,3020,336,322,155,316,291,284.0,66.124126,0.232831
7,installation_event_code_count_mean,209,247,151,294,236,227.4,47.051461,0.206911
8,3121,220,280,151,208,244,220.6,42.612674,0.193167
9,installation_session_count,228,286,161,206,210,218.2,40.449475,0.185378


# prediction

In [17]:
# pattern 1: average regression outputs and predict class by avearge coefficients
#X_test = new_test.drop(["installation_id"], axis=1)
#X_test = X_test.drop(remove_features, axis=1)
#pred_value = np.zeros([X_test.shape[0]])
#test_coefficients = np.mean(coefficients, axis=0)
#for model in models:
#    pred_value += model.predict(X_test, num_iteration = model.best_iteration) /n_folds
#test_pred_class= optR.predict(pred_value.reshape(-1, ), tmp_coefficients)
#sample_submission["accuracy_group"] = test_pred_class
#sample_submission.to_csv('submission.csv', index=False)

In [18]:
# pattern 2
X_test = new_test.drop(["installation_id"], axis=1)
X_test = X_test.drop(remove_features, axis=1)
pred_value = np.zeros([X_test.shape[0], 5])
for i, model in enumerate(models):
    tmp_pred_value=  model.predict(X_test, num_iteration = model.best_iteration)
    pred_value[:,i] = optR.predict(tmp_pred_value.reshape(-1, ), coefficients[i])
test_pred_class = pd.DataFrame(stats.mode(pred_value, axis=1)[0]).astype("int")
sample_submission["accuracy_group"] = test_pred_class
sample_submission.to_csv('submission.csv', index=False)