In [59]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200
import datetime
from numba import jit
from catboost import CatBoostClassifier
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from sklearn.metrics import confusion_matrix

In [2]:
# # this function is the quadratic weighted kappa (the metric used for the competition submission)
# def qwk(act,pred,n=4,hist_range=(0,3)):
    
#     # Calculate the percent each class was tagged each label
#     O = confusion_matrix(act, pred)
#     # normalize to sum 1
#     O = np.divide(O,np.sum(O))
    
#     # create a new matrix of zeroes that match the size of the confusion matrix
#     # this matriz looks as a weight matrix that give more weight to the corrects
#     W = np.zeros((n,n))
#     for i in range(n):
#         for j in range(n):
#             # makes a weird matrix that is bigger in the corners top-right and botton-left (= 1)
#             W[i][j] = ((i-j)**2)/((n-1)**2)
            
#     # make two histograms of the categories real X prediction
#     act_hist = np.histogram(act,bins=n,range=hist_range)[0]
#     prd_hist = np.histogram(pred,bins=n,range=hist_range)[0]
    
#     # multiply the two histograms using outer product
#     E = np.outer(act_hist,prd_hist)
#     E = np.divide(E,np.sum(E)) # normalize to sum 1
    
#     # apply the weights to the confusion matrix
#     num = np.sum(np.multiply(W,O))
#     # apply the weights to the histograms
#     den = np.sum(np.multiply(W,E))
    
#     return 1-np.divide(num,den)

[量表信度的测量kappa统计量之简介](https://max.book118.com/html/2018/0212/152788389.shtm)

In [5]:
@jit
def qwk(y_true: [np.ndarray, list],
        y_pred: [np.ndarray, list],
        max_rat: int = 3) -> float:
    y_true_ = np.asarray(y_true, dtype=int)
    y_pred_ = np.asarray(y_pred, dtype=int)

    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    numerator = 0
    for k in range(y_true_.shape[0]):
        i, j = y_true_[k], y_pred_[k]
        hist1[i] += 1
        hist2[j] += 1
        numerator += (i - j) * (i - j)

    denominator = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            denominator += hist1[i] * hist2[j] * (i - j) * (i - j)

    denominator /= y_true_.shape[0]
    return 1 - numerator / denominator


def calc_metric(y_true: [np.ndarray, list],
                y_pred: [np.ndarray, list]) -> float:
    return qwk(y_true, y_pred)

def eval_qwk_lgb(y_true, y_pred):
    """
    Fast cappa eval function for lgb.
    """

    y_pred = y_pred.reshape(len(np.unique(y_true)), -1).argmax(axis=0)
    return 'cappa', qwk(y_true, y_pred), True


In [61]:
train = pd.read_csv('./data//train.csv')
train_labels = pd.read_csv('./data/train_labels.csv')
test = pd.read_csv('./data/test.csv')
specs = pd.read_csv('./data/specs.csv')
submission = pd.read_csv('./data/sample_submission.csv')

In [62]:
train.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK


In [63]:
## encode
# make a list with all the unique 'titles' from the train and test set
list_of_user_activities = list(set(train['title'].value_counts().index).union(set(test['title'].value_counts().index)))
# make a list with all the unique 'event_code' from the train and test set
list_of_event_code = list(set(train['event_code'].value_counts().index).union(set(test['event_code'].value_counts().index)))
# make a list with all the unique worlds from the train and test set
list_of_worlds = list(set(train['world'].unique()).union(set(test['world'].unique())))

# create a dictionary numerating the titles
activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
activities_labels = dict(zip(np.arange(len(list_of_user_activities)), list_of_user_activities))
activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))


In [67]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(["paris", "paris", "tokyo", "amsterdam"])

LabelEncoder()

In [68]:
le.classes_

array(['amsterdam', 'paris', 'tokyo'], dtype='<U9')

In [69]:
le.transform(["tokyo", "tokyo", "paris"]) 

array([2, 2, 1])

In [70]:
list(le.inverse_transform([2, 2, 1]))

  if diff:


['tokyo', 'tokyo', 'paris']

In [71]:
# replace the text titles with the number titles from the dict
train['title'] = train['title'].map(activities_map)
test['title'] = test['title'].map(activities_map)
train['world'] = train['world'].map(activities_world)
test['world'] = test['world'].map(activities_world)
train_labels['title'] = train_labels['title'].map(activities_map)

In [73]:
# Assessment attempts are captured in event_code 4100 for all assessments 
win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
# except for Bird Measurer, which uses event_code 4110.
win_code[activities_map['Bird Measurer (Assessment)']] = 4110

In [76]:
# convert text into datetime
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

In [None]:

9点评估  9点之前

10点评估  10点之前

对所有的installation_id
- installation_id 出现次数：一个用户总操作次数，操作次数多比较熟练
- 操作时间：所有看video时间
- 每次操作消耗时间的统计值：最大值？最小值？平均值？
- 分箱后的离群值：这个值/均值
- 

对 game_session 分别取上述特征

对 type 分别提取

event_code 的统计

type顺序

不同world的分布

观看了 clip acitvity  Game ----> Assessment
clip --- > Assessment

训练集
一个installation_id：
acitvity Game -----> Assessment
acitvity Game -----> Assessment | Game ------> Assessment 
acitvity Game -----> Assessment | Game ------> Assessment | clip acitvity ---> Assessment   
训练集有3个

测试集
一个installation_id：
acitvity Game -----> Assessment | Game ------> Assessment | clip acitvity ---> Assessment   
预测最后一次评估

In [79]:
train['type'].unique()

array(['Clip', 'Activity', 'Game', 'Assessment'], dtype=object)

In [None]:
event_code

- **`user_activities_count`**: counts how many actions(`Clip`, `Activity`, `Assessment`, `Game`) the player has done so far
- `time_spent_each_act`: counts spending-time(seconds) in each `title`
- **`event_code_count`**: counts how many actions was made in each event_code so far
- `accumulated_correct_attempts`, `accumulated_uncorrect_attempts`: the history of the trials of this player
- `accumulated_accuracy`: the accurace is the all time wins divided by the all time attempts
- `accuracy_groups`: counts how many times this player was in each accuracy group
- `duration_mean`: the time spent in the app so far
- `accumulated_accuracy_group`: mean of the all accuracy groups of this player
- **`accumulated_actions`**: how many actions the player has done so far

In [84]:
train[train.installation_id=='0001e90f']

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06 17:53:46.937000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,35,Clip,2
1,27253bdc,17eeb7f223665f53,2019-09-06 17:54:17.519000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,9,Clip,1
2,77261ab5,0848ef14a8dc6892,2019-09-06 17:54:56.302000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,37,Activity,1
3,b2dba42b,0848ef14a8dc6892,2019-09-06 17:54:56.387000+00:00,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,37,Activity,1
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06 17:55:03.253000+00:00,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,37,Activity,1
...,...,...,...,...,...,...,...,...,...,...,...
1352,84538528,ca8b415f34d12873,2019-09-06 18:17:50.487000+00:00,"{""size"":1,""castles_placed"":[{""size"":1,""positio...",0001e90f,107,4020,85402,37,Activity,1
1353,b2dba42b,ca8b415f34d12873,2019-09-06 18:17:50.489000+00:00,"{""description"":""So cool!"",""identifier"":""Dot_So...",0001e90f,108,3010,85402,37,Activity,1
1354,1bb5fbdb,ca8b415f34d12873,2019-09-06 18:17:51.703000+00:00,"{""description"":""So cool!"",""identifier"":""Dot_So...",0001e90f,109,3110,86619,37,Activity,1
1355,27253bdc,3fce4f09769ff0b7,2019-09-06 18:18:11.250000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,2,Clip,3


In [85]:
train[(train.installation_id=='0001e90f')&(train.game_session=='0848ef14a8dc6892')]

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
2,77261ab5,0848ef14a8dc6892,2019-09-06 17:54:56.302000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,37,Activity,1
3,b2dba42b,0848ef14a8dc6892,2019-09-06 17:54:56.387000+00:00,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,37,Activity,1
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06 17:55:03.253000+00:00,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,37,Activity,1
5,1325467d,0848ef14a8dc6892,2019-09-06 17:55:06.279000+00:00,"{""coordinates"":{""x"":583,""y"":605,""stage_width"":...",0001e90f,4,4070,9991,37,Activity,1
6,1325467d,0848ef14a8dc6892,2019-09-06 17:55:06.913000+00:00,"{""coordinates"":{""x"":601,""y"":570,""stage_width"":...",0001e90f,5,4070,10622,37,Activity,1
...,...,...,...,...,...,...,...,...,...,...,...
264,84538528,0848ef14a8dc6892,2019-09-06 17:58:05.792000+00:00,"{""size"":3,""castles_placed"":[{""size"":2,""positio...",0001e90f,263,4020,189518,37,Activity,1
265,b2dba42b,0848ef14a8dc6892,2019-09-06 17:58:05.793000+00:00,"{""description"":""So cool!"",""identifier"":""Dot_So...",0001e90f,264,3010,189518,37,Activity,1
266,1bb5fbdb,0848ef14a8dc6892,2019-09-06 17:58:07.009000+00:00,"{""description"":""So cool!"",""identifier"":""Dot_So...",0001e90f,265,3110,190735,37,Activity,1
267,5e812b27,0848ef14a8dc6892,2019-09-06 17:58:07.423000+00:00,"{""size"":0,""coordinates"":{""x"":782,""y"":207,""stag...",0001e90f,266,4030,191135,37,Activity,1


In [19]:
# this is the function that convert the raw data into processed features
# Note that all of them are prior to each event.
def get_data(user_sample, test_set=False):
    '''
    The user_sample is a DataFrame from train or test where the only one installation_id is filtered
            user_sample = train[train.installation_id=='XX']
    And the test_set parameter is related with the labels processing, that is only requered if test_set=False
    '''
    # Constants and parameters declaration
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    
    # news features: time spent in each activity
    time_spent_each_act = {actv: 0 for actv in list_of_user_activities}
    event_code_count = {eve: 0 for eve in list_of_event_code}
    last_session_time_sec = 0
    
    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    all_assessments = []
    accumulated_accuracy_group = 0
    accumulated_accuracy=0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0 
    accumulated_actions = 0
    counter = 0
    time_first_activity = float(user_sample['timestamp'].values[0])
    durations = []
    
    # itarates through each session of one instalation_id
    for i, session in user_sample.groupby('game_session', sort=False):
        # i = game_session_id
        # session is a DataFrame that contain only one game_session
        ### train[(train.installation_id=='0001e90f')&(train.game_session=='0848ef14a8dc6892')]
        
        # get some sessions information
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        
        # get current session time in seconds
        if session_type != 'Assessment':
            time_spent = int(session['game_time'].iloc[-1] / 1000)
            time_spent_each_act[activities_labels[session_title]] += time_spent
        
            ### 提取特征
            # type 时间
            
        
        # for each assessment, and only this kind off session, the features below are processed
        # and a register are generated
        if (session_type == 'Assessment') & (test_set or len(session)>1):
            # search for event_code 4100, that represents the assessments trial  4100/4110
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            # then, check the numbers of wins and the number of losses
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()
            # copy a dict to use as feature template, it's initialized with some itens: 
            # {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
            features = user_activities_count.copy()
            features.update(time_spent_each_act.copy())
            features.update(event_code_count.copy())
            # add title as feature, remembering that title represents the name of the game
            features['session_title'] = session_title
            # the 4 lines below add the feature of the history of the trials of this player
            # this is based on the all time attempts so far, at the moment of this assessment
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
            # the time spent in the app so far
            if durations == []:
                features['duration_mean'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            # the accurace is the all time wins divided by the all time attempts
            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            accumulated_accuracy += accuracy
            # a feature of the current accuracy categorized
            # it is a counter of how many times this player was in each accuracy group
            # train_label
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            features.update(accuracy_groups)
            accuracy_groups[features['accuracy_group']] += 1
            # mean of the all accuracy groups of this player
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            # how many actions the player has done so far, it is initialized as 0 and updated some lines below
            features['accumulated_actions'] = accumulated_actions
            
            # there are some conditions to allow this features to be inserted in the datasets
            # if it's a test set, all sessions belong to the final dataset
            # it it's a train, needs to be passed throught this clausule: session.query(f'event_code == {win_code[session_title]}')
            # that means, must exist an event_code 4100 or 4110
            if test_set:
                all_assessments.append(features)
            elif true_attempts+false_attempts > 0:
                all_assessments.append(features)
                
            counter += 1
        
        # this piece counts how many actions was made in each event_code so far
        n_of_event_codes = Counter(session['event_code'])  
        
        for key in n_of_event_codes.keys():
            event_code_count[key] += n_of_event_codes[key]

        # counts how many actions the player has done so far, used in the feature of the same name
        accumulated_actions += len(session)
        user_activities_count[session_type] += 1
    # if it't the test_set, only the last assessment must be predicted, the previous are scraped
    if test_set:
        return all_assessments[-1]
    # in the train_set, all assessments goes to the dataset
    return all_assessments


In [None]:
一个installation_id：
acitvity Game -----> Assessment | Game ------> Assessment | clip acitvity ---> Assessment   
训练集有3个

测试集：
acitvity Game -----> Assessment | Game ------> Assessment
预测最后一次

In [94]:
test[(test.installation_id=='017c5718')]

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
3587,27253bdc,d1706431c69d0f17,2019-08-02 23:24:03.145000+00:00,"{""event_code"": 2000, ""event_count"": 1}",017c5718,1,2000,0,35,Clip,2
3588,27253bdc,7f8e671b050cfc16,2019-09-21 11:23:14.319000+00:00,"{""event_code"": 2000, ""event_count"": 1}",017c5718,1,2000,0,35,Clip,2
3589,27253bdc,9cbc7871cb68348e,2019-09-21 11:23:49.822000+00:00,"{""event_code"": 2000, ""event_count"": 1}",017c5718,1,2000,0,35,Clip,2
3590,27253bdc,dbe0b9903177b7ab,2019-09-21 11:24:14.904000+00:00,"{""event_code"": 2000, ""event_count"": 1}",017c5718,1,2000,0,35,Clip,2
3591,27253bdc,bada8e54f3bb8b3e,2019-09-21 11:24:34.545000+00:00,"{""event_code"": 2000, ""event_count"": 1}",017c5718,1,2000,0,2,Clip,3
3592,4901243f,804275af3b58a38e,2019-09-21 11:25:28.440000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",017c5718,1,2000,0,23,Activity,3
3593,beb0a7b9,804275af3b58a38e,2019-09-21 11:25:28.450000+00:00,"{""description"":""Let's set off these fireworks....",017c5718,2,3010,0,23,Activity,3
3594,02a42007,804275af3b58a38e,2019-09-21 11:25:30.441000+00:00,"{""rocket"":1,""coordinates"":{""x"":128,""y"":614,""st...",017c5718,3,4030,2081,23,Activity,3
3595,b88f38da,804275af3b58a38e,2019-09-21 11:25:32.355000+00:00,"{""description"":""Let's set off these fireworks....",017c5718,4,3110,4014,23,Activity,3
3596,e694a35b,804275af3b58a38e,2019-09-21 11:25:34.296000+00:00,"{""rocket"":1,""height"":562,""duration"":3866,""coor...",017c5718,5,4020,5947,23,Activity,3


在评估之前的行为-->当前评估结果

在评估之前的行为+之前评估结果-->当前评估结果

In [106]:
test[(test.installation_id=='0754f13b')]

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
22750,27253bdc,335b388484fc09e1,2019-08-23 14:20:33.385000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0754f13b,1,2000,0,35,Clip,2
22751,27253bdc,e6485ce85fd7bc65,2019-08-23 14:21:08.180000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0754f13b,1,2000,0,9,Clip,1
22752,77261ab5,978e34ee81fa2dd2,2019-08-23 14:21:45.125000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0754f13b,1,2000,0,37,Activity,1
22753,b2dba42b,978e34ee81fa2dd2,2019-08-23 14:21:45.678000+00:00,"{""description"":""Let's build a sandcastle! Firs...",0754f13b,2,3010,418,37,Activity,1
22754,1325467d,978e34ee81fa2dd2,2019-08-23 14:21:48.430000+00:00,"{""coordinates"":{""x"":187,""y"":156,""stage_width"":...",0754f13b,3,4070,3498,37,Activity,1
...,...,...,...,...,...,...,...,...,...,...,...
23434,bcceccc6,49b562480efd07e1,2019-10-04 13:13:36.977000+00:00,"{""coordinates"":{""x"":882,""y"":99,""stage_width"":1...",0754f13b,84,4070,86371,33,Game,3
23435,bcceccc6,49b562480efd07e1,2019-10-04 13:13:38.049000+00:00,"{""coordinates"":{""x"":696,""y"":193,""stage_width"":...",0754f13b,85,4070,87443,33,Game,3
23436,bcceccc6,49b562480efd07e1,2019-10-04 13:13:38.345000+00:00,"{""coordinates"":{""x"":638,""y"":233,""stage_width"":...",0754f13b,86,4070,87741,33,Game,3
23437,bcceccc6,49b562480efd07e1,2019-10-04 13:13:38.994000+00:00,"{""coordinates"":{""x"":715,""y"":221,""stage_width"":...",0754f13b,87,4070,88388,33,Game,3


之前有评估，但是评估结果未知

训练集有评估结果占比

训练集与测试集分布不一致：测试集中没有的特征，训练集要删除

不删除线下验证会很高

In [None]:
测试集 Assessment 无评估结果
训练集 Assessment 都有评估结果
所有使用到评估结果的特征删除

In [105]:
test[(test.installation_id=='0754f13b')&(test.type=='Assessment')]

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
22846,90d848e0,48c3a986e046db0d,2019-08-23 14:26:50.713000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0754f13b,1,2000,0,5,Assessment,1
22847,923afab1,48c3a986e046db0d,2019-08-23 14:26:50.924000+00:00,"{""description"":""Put the right sized bucket und...",0754f13b,2,3010,0,5,Assessment,1
22848,532a2afb,48c3a986e046db0d,2019-08-23 14:26:50.927000+00:00,"{""buckets"":[1,2,3],""target_bucket"":0,""mode"":""p...",0754f13b,3,2020,0,5,Assessment,1
22849,2dcad279,48c3a986e046db0d,2019-08-23 14:26:55.752000+00:00,"{""description"":""Put the right sized bucket und...",0754f13b,4,3110,5340,5,Assessment,1
22850,90d848e0,f1f7fcb341885ad6,2019-08-23 14:27:19.963000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0754f13b,1,2000,0,5,Assessment,1
22851,923afab1,f1f7fcb341885ad6,2019-08-23 14:27:20.101000+00:00,"{""description"":""Put the right sized bucket und...",0754f13b,2,3010,0,5,Assessment,1
22852,532a2afb,f1f7fcb341885ad6,2019-08-23 14:27:20.102000+00:00,"{""buckets"":[1,3,2],""target_bucket"":0,""mode"":""p...",0754f13b,3,2020,0,5,Assessment,1
22853,3ee399c3,f1f7fcb341885ad6,2019-08-23 14:27:22.719000+00:00,"{""coordinates"":{""x"":96,""y"":40,""stage_width"":10...",0754f13b,4,4070,3075,5,Assessment,1
23438,3bfd1a65,98b243b461457589,2019-10-04 13:13:55.319000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0754f13b,1,2000,0,16,Assessment,3


In [103]:
test[test.type=='Assessment'].groupby('installation_id',as_index=False)['event_id'].count().head(100)

Unnamed: 0,installation_id,event_id
0,00abaee7,27
1,01242218,245
2,017c5718,1
3,01a44906,1
4,01bc6cb6,1
5,02256298,16
6,0267757a,1
7,027e7ce5,346
8,02a29f99,201
9,0300c576,1


In [99]:
test['installation_id'].nunique()

1000

In [87]:
test.groupby('installation_id').count()

Unnamed: 0_level_0,event_id,game_session,timestamp,event_data,event_count,event_code,game_time,title,type,world
installation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
00abaee7,868,868,868,868,868,868,868,868,868,868
01242218,2719,2719,2719,2719,2719,2719,2719,2719,2719,2719
017c5718,150,150,150,150,150,150,150,150,150,150
01a44906,234,234,234,234,234,234,234,234,234,234
01bc6cb6,952,952,952,952,952,952,952,952,952,952
...,...,...,...,...,...,...,...,...,...,...
fee254cf,212,212,212,212,212,212,212,212,212,212
ff57e602,303,303,303,303,303,303,303,303,303,303
ffc73fb2,526,526,526,526,526,526,526,526,526,526
ffe00ca8,259,259,259,259,259,259,259,259,259,259


In [81]:
train['installation_id'].nunique()

17000

In [None]:
user_sample = train[train.installation_id=='0001e90f']

In [20]:
# here the get_data function is applyed to each installation_id and added to the compile_data list
compiled_data = []
# tqdm is the library that draws the status bar below
for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort=False)), total=17000):
    # user_sample is a DataFrame that contains only one installation_id
    compiled_data += get_data(user_sample)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(IntProgress(value=0, max=17000), HTML(value='')))




In [21]:
# the compiled_data is converted to DataFrame and deleted to save memmory
new_train = pd.DataFrame(compiled_data)
del compiled_data
new_train.shape

(17690, 102)

In [86]:
new_train.head()

Unnamed: 0,Clip,Activity,Assessment,Game,Bug Measurer (Activity),Chest Sorter (Assessment),Tree Top City - Level 1,Tree Top City - Level 2,Happy Camel,Cauldron Filler (Assessment),Leaf Leader,Watering Hole (Activity),Lifting Heavy Things,Magma Peak - Level 1,Bubble Bath,Chicken Balancer (Activity),Pan Balance,Costume Box,Magma Peak - Level 2,Balancing Act,Mushroom Sorter (Assessment),Pirate's Tale,Rulers,Ordering Spheres,Honey Cake,Dino Drink,Treasure Map,Fireworks (Activity),Flower Waterer (Activity),Chow Time,Crystals Rule,Scrub-A-Dub,Dino Dive,Bottle Filler (Activity),Bird Measurer (Assessment),Crystal Caves - Level 3,Egg Dropper (Activity),Air Show,Crystal Caves - Level 2,Welcome to Lost Lagoon!,Crystal Caves - Level 1,Sandcastle Builder (Activity),Tree Top City - Level 3,Cart Balancer (Assessment),"Heavy, Heavier, Heaviest",12 Monkeys,Slop Problem,All Star Sorting,2050,4100,4230,5000,4235,2060,4110,5010,2070,2075,2080,2081,2083,3110,4010,3120,3121,4020,4021,4022,4025,4030,4031,3010,4035,4040,3020,3021,4045,2000,4050,2010,2020,4070,2025,2030,4080,2035,2040,4090,4220,4095,session_title,accumulated_correct_attempts,accumulated_uncorrect_attempts,duration_mean,accumulated_accuracy,accuracy_group,0,1,2,3,accumulated_accuracy_group,accumulated_actions
0,11,3,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,91,164,0,0,115,0,0,0,0,0,0,0,0,0,89,0,0,0,0,0,163,6,0,0,0,0,0,0,0,0,0,4,1,2,77,4,7,9,92,14,31,19,121,0,79,1,0,7,9,0,18,0,0,20,94,4,18,0,0,6,4,0,0,16,0,0,0.0,0.0,3,0,0,0,0,0.0,647
1,14,4,1,6,104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,91,164,0,78,115,0,0,0,0,0,193,0,0,0,89,0,0,0,0,0,163,6,5,0,0,0,1,2,0,1,0,4,1,2,223,6,11,16,127,14,31,37,149,0,226,6,2,11,16,0,25,0,1,26,156,5,22,0,1,6,4,0,0,30,1,0,39.0,1.0,0,0,0,0,1,3.0,1143
2,14,4,2,6,104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,91,164,0,78,115,0,0,0,0,0,193,0,0,0,89,0,0,0,0,0,163,6,5,0,0,0,1,13,0,1,0,4,1,2,225,6,22,16,127,14,31,59,171,0,228,6,2,22,16,0,26,0,1,27,160,5,22,0,1,6,4,0,0,16,1,11,65.5,0.5,3,1,0,0,1,1.5,1230
3,24,9,4,10,104,0,0,0,0,0,0,80,0,0,133,0,0,0,0,0,0,0,0,0,0,110,0,1611,254,0,78,195,0,165,0,0,0,193,0,0,0,126,0,0,0,0,0,229,9,6,0,5,0,2,13,5,2,0,8,2,5,336,10,25,40,243,29,45,93,314,6,341,14,9,25,40,2,47,0,2,52,348,9,43,0,5,10,4,9,1,16,2,11,41.25,0.5,2,2,0,0,2,1.5,2159
4,28,10,5,13,184,0,0,0,0,0,0,80,0,0,133,0,0,0,0,0,0,0,0,0,0,110,0,1611,254,0,310,195,0,165,0,0,0,336,0,0,0,126,0,0,0,0,0,229,9,12,0,5,0,3,13,5,2,1,8,2,5,457,12,30,53,277,29,45,105,331,6,463,15,10,30,53,2,56,0,3,64,387,10,53,0,6,10,4,9,1,30,3,12,39.2,0.5,3,2,0,1,2,1.6,2586


In [57]:
# this list comprehension create the list of features that will be used on the input dataset X
# all but accuracy_group, that is the label y
all_features = [x for x in new_train.columns if x not in ['accuracy_group']]
# this cat_feature must be declared to pass later as parameter to fit the model
cat_features = ['session_title']
# here the dataset select the features and split the input ant the labels
X, y = new_train[all_features], new_train['accuracy_group']
del train
X.shape

(17690, 101)

In [58]:
def make_classifier(iterations=6000):
    clf = CatBoostClassifier(
                               loss_function='MultiClass',
                                eval_metric="WKappa",
                               task_type="CPU",
                               #learning_rate=0.01,
                               iterations=iterations,
                               od_type="Iter",
                                #depth=4,
                               early_stopping_rounds=500,
                                #l2_leaf_reg=10,
                                #border_count=96,
                               random_seed=42,
                                #use_best_model=use_best_model
                              )
        
    return clf

In [None]:
# CV
from sklearn.model_selection import KFold
# oof is an zeroed array of the same size of the input dataset
oof = np.zeros(len(X))
NFOLDS = 5
# here the KFold class is used to split the dataset in 5 diferents training and validation sets
# this technique is used to assure that the model isn't overfitting and can performs aswell in 
# unseen data. More the number of splits/folds, less the test will be impacted by randomness
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=2019)
training_start_time = time()
models = []
for fold, (trn_idx, test_idx) in enumerate(folds.split(X, y)):
    # each iteration of folds.split returns an array of indexes of the new training data and validation data
    start_time = time()
    print(f'Training on fold {fold+1}')
    # creates the model
    clf = make_classifier()
    # fits the model using .loc at the full dataset to select the splits indexes and features used
    clf.fit(X.loc[trn_idx, all_features], y.loc[trn_idx], eval_set=(X.loc[test_idx, all_features], y.loc[test_idx]),
                          use_best_model=True, verbose=500, cat_features=cat_features)
    
    # then, the predictions of each split is inserted into the oof array
    oof[test_idx] = clf.predict(X.loc[test_idx, all_features]).reshape(len(test_idx))
    models.append(clf)
    print('Fold {} finished in {}'.format(fold + 1, str(datetime.timedelta(seconds=time() - start_time))))
    print('____________________________________________________________________________________________\n')
    #break
    
print('-' * 30)
# and here, the complete oof is tested against the real data using que metric (quadratic weighted kappa)
print('OOF QWK:', qwk(y, oof))
print('-' * 30)

In [None]:
# process test set, the same that was done with the train set
new_test = []
for ins_id, user_sample in tqdm(test.groupby('installation_id', sort=False), total=1000):
    a = get_data(user_sample, test_set=True)
    new_test.append(a)
    
X_test = pd.DataFrame(new_test)
del test

In [None]:
predictions = []
for model in models:
    predictions.append(model.predict(X_test))
predictions = np.concatenate(predictions, axis=1)
print(predictions.shape)
predictions = stats.mode(predictions, axis=1)[0].reshape(-1)
print(predictions.shape)
#del X_test

In [None]:
submission['accuracy_group'] = np.round(predictions).astype('int')
submission.to_csv('submission.csv', index=None)
submission.head()

In [None]:
submission['accuracy_group'].plot(kind='hist')

In [None]:
train_labels['accuracy_group'].plot(kind='hist')