In [1]:
import pandas as pd
pd.set_option('display.max_columns', 56)
import numpy as np
import seaborn as sns
import xgboost as xgb

import datetime

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

### Config

In [58]:
# DATA PATHS
TRAIN_PATH = 'data/data_training.csv'
PROV_PATH = 'data/data_provisional.csv'
SAMPLE_PATH = 'data/sample-data.csv'
# OUTPUT PATHS
OUTPUT_PATH = 'submissions/output/'

# Sample Data Paths
S_TRAIN = 'sample-submission/code/data/training.csv'
S_TEST = 'sample-submission/code/data/testing.csv'
S_SOL = 'sample-submission/solution/solution.csv'

# category & headers
CATEGORY = {'low':0, 'medium':1, 'high':2, 'baseline':3, 'channelized':4, 'surprise':5}
REV_CATEGORY = { v:k for k,v in CATEGORY.items()}
OUTPUT_HEADER = ['timestamp', 'test_suite', 'predicted_induced_state',
       'three_sec_predicted_induced_state',
       'predicted_induced_state_confidence',
       'three_sec_predicted_induced_state_confidence', 'top_three_features']

#

### The difference btw sensordata mean values of `baseline` & `channelized` statet.  Put into ratio to std

In [69]:
(single_channelized.describe().loc['mean'] - single_baseline.describe().loc['mean'])/single_baseline.describe().loc['std']

time                         20.963801
tlx_score                          inf
E4_BVP                        0.045512
E4_GSR                        0.060257
LooxidLink_EEG_A3            36.275706
LooxidLink_EEG_A4            45.836544
LooxidLink_EEG_FP1           35.861761
LooxidLink_EEG_FP2           28.080833
LooxidLink_EEG_A7            11.396929
LooxidLink_EEG_A8            -0.831278
Muse_EEG_TP9                  0.000000
Muse_EEG_AF7                  0.000000
Muse_EEG_AF8                  0.000000
Muse_EEG_TP10                 0.000000
Muse_PPG_0                    0.000000
Muse_PPG_1                    0.000000
Muse_PPG_2                    0.000000
Myo_GYR_X                     0.000269
Myo_GYR_Y                     0.092695
Myo_GYR_Z                     0.018316
Myo_EMG_0                     0.015856
Myo_EMG_1                     0.015963
Myo_EMG_2                     0.015858
Myo_EMG_3                     0.015655
Myo_EMG_4                     0.015556
Myo_EMG_5                

## Create true future label on sample data

### Preprocess

In [4]:
def preprocess(df):
    # replace  'induced_state'
    df["induced_state"] = df["induced_state"].replace(CATEGORY)
    # set time as index
    df.set_index('time', inplace=True)

    # define drop cols 
    drop_cols = ['test_suite', 'induced_state'] #, 'tlx_score']

    # split data into Train & Test
    X = df.loc[:, [ c not in drop_cols for c in df.columns]]
    Y = df['induced_state']
    
    return X, Y

## Process the data in chunks to get a new training data

##### Helpers

In [3]:
def round_time(time_serie):
    time_serie = pd.to_numeric(time_serie)
    time_serie = time_serie.apply(lambda x: datetime.datetime.fromtimestamp(x/1000000))
    time_serie = time_serie.dt.round('1s')
    time_serie = time_serie.apply(lambda x: int(datetime.datetime.timestamp(x)*1000000))
#     time_serie = time_serie.drop_duplicates()
    
    return time_serie

# return the mode(most comm) element in a series
def mode(series):
    return series.value_counts().index[0]

# return the mean value excluding -9999.9 the default value, if there's normal values
def normal_mean(series):
    if series.nunique() > 1:
        return series[series > -9999.9].mean()
    return series.unique()

##### Processers

In [275]:
def data_preprocess(df, full_df):
    # map  'induced_state' for training data
    if('induced_state' in df.columns):
        df["induced_state"] = df["induced_state"].replace(CATEGORY)
    
    # round time to whole seconds
    df['time'] = round_time(df['time'])    

    # aggregate the data by 'time' & 'test_suite' 
    col_merger = dict(zip(df.columns[2:], [normal_mean]*len(df.columns[2:])))
    if('induced_state' in col_merger):
        col_merger['induced_state'] = mode
    df = df.groupby(['time', 'test_suite']).agg(col_merger)
    
    return full_df.append(df)

In [397]:
def load_data_in_chunks(path, chunksize=500000): 
    
    df = pd.read_csv(path, chunksize=chunksize)
    
    # read the header cols
    with open(path) as f:
        header = f.readline().strip().split(',')

    full_df = pd.DataFrame(columns=header).set_index(['time', 'test_suite'])
    for i, c in enumerate(df):
        full_df = data_preprocess(c, full_df)

        print(f'{(i+1)*chunksize} done....')
#         #show as percent of total train size(46010963 for test data)
#         print(f"{format((i+1)*chunksize/49048652, '.3f')} done....") 
        
    # futher aggregate the duplicates generated from different chunks
    col_merger = dict(zip(full_df.columns, [normal_mean]*len(full_df.columns)))
    if('induced_state' in col_merger):
        col_merger['induced_state'] = mode
    full_df = full_df.groupby(['time', 'test_suite']).agg(col_merger)
    
    return full_df

In [173]:
def train_preprocess(df, test_size=0.3):
    
    # enforce dtypes
    df['induced_state'] = df['induced_state'].astype(int)
    df['tlx_score'] = df['tlx_score'].astype(int)
    
    drop_cols = ['induced_state'] #, 'tlx_score']
    X = df.loc[:, [ c not in drop_cols for c in df.columns]]
    Y = df['induced_state']
    
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=2)

    xg_train = xgb.DMatrix(X_train, label=y_train)
    xg_test = xgb.DMatrix( X_test, label=y_test)
    
    return xg_train, xg_test, X_train, X_test, y_train, y_test

### Train full data

In [364]:
drop_cols = ['induced_state'] #, 'tlx_score']
X = train_df.loc[:, [ c not in drop_cols for c in train_df.columns]]
Y = train_df['induced_state']

In [365]:
xg_train = xgb.DMatrix(X, label=Y)

train the full train set with 1800 rounds

In [366]:
param = {
    'booster': 'gbtree',
    'objective': 'multi:softprob',  # 多分类的问题
    'num_class': 6,               # 类别数，与 multisoftmax 并用
    'gamma': 0.1,                  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。
    'max_depth': 12,               # 构建树的深度，越大越容易过拟合
    'lambda': 2,                   # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
    'subsample': 0.7,              # 随机采样训练样本
    'colsample_bytree': 0.7,       # 生成树时进行的列采样
    'min_child_weight': 3,
#     'silent': 1,                   # 设置成1则没有运行信息输出，最好是设置为0.
    'eta': 0.007,                  # 如同学习率
    'seed': 1000,
#     'nthread': 4,                  # cpu 线程数
}

In [367]:
watchlist = [ (xg_train,'train')]
num_round = 2000
bst = xgb.train(param, xg_train, num_round, watchlist, early_stopping_rounds=10 );

[0]	train-mlogloss:1.77749
[1]	train-mlogloss:1.76230
[2]	train-mlogloss:1.74642
[3]	train-mlogloss:1.73285
[4]	train-mlogloss:1.71749
[5]	train-mlogloss:1.70360
[6]	train-mlogloss:1.68949
[7]	train-mlogloss:1.67501
[8]	train-mlogloss:1.66128
[9]	train-mlogloss:1.64723
[10]	train-mlogloss:1.63305
[11]	train-mlogloss:1.62151
[12]	train-mlogloss:1.60863
[13]	train-mlogloss:1.59588
[14]	train-mlogloss:1.58237
[15]	train-mlogloss:1.56987
[16]	train-mlogloss:1.55731
[17]	train-mlogloss:1.54510
[18]	train-mlogloss:1.53247
[19]	train-mlogloss:1.52036
[20]	train-mlogloss:1.50806
[21]	train-mlogloss:1.49568
[22]	train-mlogloss:1.48405
[23]	train-mlogloss:1.47205
[24]	train-mlogloss:1.46215
[25]	train-mlogloss:1.45131
[26]	train-mlogloss:1.43955
[27]	train-mlogloss:1.42799
[28]	train-mlogloss:1.41776
[29]	train-mlogloss:1.40677
[30]	train-mlogloss:1.39606
[31]	train-mlogloss:1.38654
[32]	train-mlogloss:1.37620
[33]	train-mlogloss:1.36617
[34]	train-mlogloss:1.35668
[35]	train-mlogloss:1.34638
[3

[278]	train-mlogloss:0.33863
[279]	train-mlogloss:0.33702
[280]	train-mlogloss:0.33556
[281]	train-mlogloss:0.33390
[282]	train-mlogloss:0.33236
[283]	train-mlogloss:0.33080
[284]	train-mlogloss:0.32915
[285]	train-mlogloss:0.32745
[286]	train-mlogloss:0.32601
[287]	train-mlogloss:0.32447
[288]	train-mlogloss:0.32304
[289]	train-mlogloss:0.32155
[290]	train-mlogloss:0.31997
[291]	train-mlogloss:0.31838
[292]	train-mlogloss:0.31702
[293]	train-mlogloss:0.31570
[294]	train-mlogloss:0.31424
[295]	train-mlogloss:0.31290
[296]	train-mlogloss:0.31137
[297]	train-mlogloss:0.30992
[298]	train-mlogloss:0.30856
[299]	train-mlogloss:0.30718
[300]	train-mlogloss:0.30564
[301]	train-mlogloss:0.30416
[302]	train-mlogloss:0.30266
[303]	train-mlogloss:0.30124
[304]	train-mlogloss:0.29974
[305]	train-mlogloss:0.29833
[306]	train-mlogloss:0.29685
[307]	train-mlogloss:0.29539
[308]	train-mlogloss:0.29401
[309]	train-mlogloss:0.29265
[310]	train-mlogloss:0.29118
[311]	train-mlogloss:0.28982
[312]	train-ml

[561]	train-mlogloss:0.10151
[562]	train-mlogloss:0.10113
[563]	train-mlogloss:0.10073
[564]	train-mlogloss:0.10036
[565]	train-mlogloss:0.10001
[566]	train-mlogloss:0.09962
[567]	train-mlogloss:0.09927
[568]	train-mlogloss:0.09893
[569]	train-mlogloss:0.09855
[570]	train-mlogloss:0.09823
[571]	train-mlogloss:0.09788
[572]	train-mlogloss:0.09755
[573]	train-mlogloss:0.09723
[574]	train-mlogloss:0.09684
[575]	train-mlogloss:0.09651
[576]	train-mlogloss:0.09611
[577]	train-mlogloss:0.09572
[578]	train-mlogloss:0.09535
[579]	train-mlogloss:0.09502
[580]	train-mlogloss:0.09468
[581]	train-mlogloss:0.09440
[582]	train-mlogloss:0.09408
[583]	train-mlogloss:0.09375
[584]	train-mlogloss:0.09348
[585]	train-mlogloss:0.09315
[586]	train-mlogloss:0.09284
[587]	train-mlogloss:0.09250
[588]	train-mlogloss:0.09216
[589]	train-mlogloss:0.09188
[590]	train-mlogloss:0.09153
[591]	train-mlogloss:0.09119
[592]	train-mlogloss:0.09088
[593]	train-mlogloss:0.09057
[594]	train-mlogloss:0.09023
[595]	train-ml

[844]	train-mlogloss:0.04284
[845]	train-mlogloss:0.04274
[846]	train-mlogloss:0.04263
[847]	train-mlogloss:0.04254
[848]	train-mlogloss:0.04242
[849]	train-mlogloss:0.04232
[850]	train-mlogloss:0.04221
[851]	train-mlogloss:0.04210
[852]	train-mlogloss:0.04201
[853]	train-mlogloss:0.04191
[854]	train-mlogloss:0.04181
[855]	train-mlogloss:0.04171
[856]	train-mlogloss:0.04161
[857]	train-mlogloss:0.04152
[858]	train-mlogloss:0.04142
[859]	train-mlogloss:0.04133
[860]	train-mlogloss:0.04123
[861]	train-mlogloss:0.04114
[862]	train-mlogloss:0.04105
[863]	train-mlogloss:0.04094
[864]	train-mlogloss:0.04084
[865]	train-mlogloss:0.04074
[866]	train-mlogloss:0.04065
[867]	train-mlogloss:0.04054
[868]	train-mlogloss:0.04044
[869]	train-mlogloss:0.04035
[870]	train-mlogloss:0.04025
[871]	train-mlogloss:0.04015
[872]	train-mlogloss:0.04007
[873]	train-mlogloss:0.03997
[874]	train-mlogloss:0.03988
[875]	train-mlogloss:0.03978
[876]	train-mlogloss:0.03969
[877]	train-mlogloss:0.03960
[878]	train-ml

[1123]	train-mlogloss:0.02472
[1124]	train-mlogloss:0.02469
[1125]	train-mlogloss:0.02465
[1126]	train-mlogloss:0.02461
[1127]	train-mlogloss:0.02456
[1128]	train-mlogloss:0.02453
[1129]	train-mlogloss:0.02448
[1130]	train-mlogloss:0.02445
[1131]	train-mlogloss:0.02440
[1132]	train-mlogloss:0.02436
[1133]	train-mlogloss:0.02432
[1134]	train-mlogloss:0.02429
[1135]	train-mlogloss:0.02425
[1136]	train-mlogloss:0.02421
[1137]	train-mlogloss:0.02417
[1138]	train-mlogloss:0.02413
[1139]	train-mlogloss:0.02409
[1140]	train-mlogloss:0.02405
[1141]	train-mlogloss:0.02402
[1142]	train-mlogloss:0.02398
[1143]	train-mlogloss:0.02395
[1144]	train-mlogloss:0.02391
[1145]	train-mlogloss:0.02388
[1146]	train-mlogloss:0.02384
[1147]	train-mlogloss:0.02380
[1148]	train-mlogloss:0.02377
[1149]	train-mlogloss:0.02373
[1150]	train-mlogloss:0.02370
[1151]	train-mlogloss:0.02366
[1152]	train-mlogloss:0.02363
[1153]	train-mlogloss:0.02360
[1154]	train-mlogloss:0.02356
[1155]	train-mlogloss:0.02352
[1156]	tra

[1397]	train-mlogloss:0.01708
[1398]	train-mlogloss:0.01707
[1399]	train-mlogloss:0.01705
[1400]	train-mlogloss:0.01703
[1401]	train-mlogloss:0.01700
[1402]	train-mlogloss:0.01698
[1403]	train-mlogloss:0.01696
[1404]	train-mlogloss:0.01695
[1405]	train-mlogloss:0.01693
[1406]	train-mlogloss:0.01691
[1407]	train-mlogloss:0.01689
[1408]	train-mlogloss:0.01687
[1409]	train-mlogloss:0.01685
[1410]	train-mlogloss:0.01683
[1411]	train-mlogloss:0.01682
[1412]	train-mlogloss:0.01680
[1413]	train-mlogloss:0.01678
[1414]	train-mlogloss:0.01676
[1415]	train-mlogloss:0.01674
[1416]	train-mlogloss:0.01672
[1417]	train-mlogloss:0.01670
[1418]	train-mlogloss:0.01668
[1419]	train-mlogloss:0.01666
[1420]	train-mlogloss:0.01665
[1421]	train-mlogloss:0.01663
[1422]	train-mlogloss:0.01661
[1423]	train-mlogloss:0.01659
[1424]	train-mlogloss:0.01657
[1425]	train-mlogloss:0.01656
[1426]	train-mlogloss:0.01654
[1427]	train-mlogloss:0.01652
[1428]	train-mlogloss:0.01650
[1429]	train-mlogloss:0.01648
[1430]	tra

[1671]	train-mlogloss:0.01305
[1672]	train-mlogloss:0.01304
[1673]	train-mlogloss:0.01303
[1674]	train-mlogloss:0.01301
[1675]	train-mlogloss:0.01300
[1676]	train-mlogloss:0.01299
[1677]	train-mlogloss:0.01298
[1678]	train-mlogloss:0.01297
[1679]	train-mlogloss:0.01296
[1680]	train-mlogloss:0.01294
[1681]	train-mlogloss:0.01293
[1682]	train-mlogloss:0.01292
[1683]	train-mlogloss:0.01291
[1684]	train-mlogloss:0.01290
[1685]	train-mlogloss:0.01289
[1686]	train-mlogloss:0.01288
[1687]	train-mlogloss:0.01287
[1688]	train-mlogloss:0.01286
[1689]	train-mlogloss:0.01285
[1690]	train-mlogloss:0.01284
[1691]	train-mlogloss:0.01283
[1692]	train-mlogloss:0.01282
[1693]	train-mlogloss:0.01281
[1694]	train-mlogloss:0.01280
[1695]	train-mlogloss:0.01279
[1696]	train-mlogloss:0.01277
[1697]	train-mlogloss:0.01276
[1698]	train-mlogloss:0.01275
[1699]	train-mlogloss:0.01274
[1700]	train-mlogloss:0.01273
[1701]	train-mlogloss:0.01272
[1702]	train-mlogloss:0.01271
[1703]	train-mlogloss:0.01270
[1704]	tra

[1945]	train-mlogloss:0.01060
[1946]	train-mlogloss:0.01059
[1947]	train-mlogloss:0.01058
[1948]	train-mlogloss:0.01058
[1949]	train-mlogloss:0.01057
[1950]	train-mlogloss:0.01056
[1951]	train-mlogloss:0.01055
[1952]	train-mlogloss:0.01055
[1953]	train-mlogloss:0.01054
[1954]	train-mlogloss:0.01054
[1955]	train-mlogloss:0.01053
[1956]	train-mlogloss:0.01052
[1957]	train-mlogloss:0.01051
[1958]	train-mlogloss:0.01051
[1959]	train-mlogloss:0.01050
[1960]	train-mlogloss:0.01050
[1961]	train-mlogloss:0.01049
[1962]	train-mlogloss:0.01048
[1963]	train-mlogloss:0.01048
[1964]	train-mlogloss:0.01047
[1965]	train-mlogloss:0.01046
[1966]	train-mlogloss:0.01046
[1967]	train-mlogloss:0.01045
[1968]	train-mlogloss:0.01044
[1969]	train-mlogloss:0.01043
[1970]	train-mlogloss:0.01043
[1971]	train-mlogloss:0.01042
[1972]	train-mlogloss:0.01041
[1973]	train-mlogloss:0.01041
[1974]	train-mlogloss:0.01040
[1975]	train-mlogloss:0.01039
[1976]	train-mlogloss:0.01039
[1977]	train-mlogloss:0.01038
[1978]	tra

In [368]:
bst.save_model(OUTPUT_PATH + 'full_merged_train_2000_model.json')

# bst = xgb.Booster()
# bst.load_model(OUTPUT_PATH + "merged_train_1500_model.json")

In [156]:
def test_postprocess(c_time, c_suite, c_prob, full_pred):
    
    # create a empty dataframe for chunk data
    c_df = pd.DataFrame(columns=output_header)
    
    # setting time & test_suite
    c_df[output_header[0]] = c_time
    c_df[output_header[1]] = c_suite
    
    # process predicted probabilties
    # trim prob into 3 decimal places
    c_prob = np.vectorize(lambda x: format(x, '.3f'))(c_prob).tolist()
    # find the pred(highest prob) index for each row 
    c_pred = [r.index(max(r)) for r in c_prob]
    # map the pred index into string instance
    c_pis = [rev_category[p] for p in c_pred]
    
    # TODO: 
    c_tpis = c_pis
    c_tpis_prob = c_prob
#     c_ttf = 

    c_df[output_header[2]] = c_pis
    c_df[output_header[3]] = c_tpis
    c_df[output_header[4]] = [str(r).replace(",", "").replace("\'", "") for r in c_prob]
    c_df[output_header[5]] = [str(r).replace(",", "").replace("\'", "") for r in c_tpis_prob]
    
    return full_pred.append(c_df)

## Load & transform data

In [None]:
# call function and load data by chunksize
df = load_data_in_chunks(TRAIN_PATH)#, chunksize=500000)
# df = pd.read_csv(OUTPUT_PATH+"trim_train_merged.csv")
xg_train, xg_test, _ = train_preprocess(df)#, 0.3)


# # load data as a whole
# df = pd.read_csv(TRAIN_PATH)
# df_temp = pd.DataFrame(columns=df.columns).set_index(['time', 'test_suite'])
# df = data_preprocess(df, df_temp)

In [178]:
xg_train, xg_test, X_train, X_test, y_train, y_test  = train_preprocess(trim_full_df, 0.3)

### Train model

In [151]:
param = {
    'booster': 'gbtree',
    'objective': 'multi:softprob',  # 多分类的问题
    'num_class': 6,               # 类别数，与 multisoftmax 并用
    'gamma': 0.1,                  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。
    'max_depth': 12,               # 构建树的深度，越大越容易过拟合
    'lambda': 2,                   # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
    'subsample': 0.7,              # 随机采样训练样本
    'colsample_bytree': 0.7,       # 生成树时进行的列采样
    'min_child_weight': 3,
#     'silent': 1,                   # 设置成1则没有运行信息输出，最好是设置为0.
    'eta': 0.007,                  # 如同学习率
    'seed': 1000,
#     'nthread': 4,                  # cpu 线程数
}

In [152]:
watchlist = [ (xg_train,'train'), (xg_test, 'test') ]
num_round = 1500
bst = xgb.train(param, xg_train, num_round, watchlist, early_stopping_rounds=10 );

[0]	train-mlogloss:1.77823	test-mlogloss:1.77898
[1]	train-mlogloss:1.76370	test-mlogloss:1.76508
[2]	train-mlogloss:1.74821	test-mlogloss:1.75014
[3]	train-mlogloss:1.73426	test-mlogloss:1.73666
[4]	train-mlogloss:1.71958	test-mlogloss:1.72247
[5]	train-mlogloss:1.70564	test-mlogloss:1.70911
[6]	train-mlogloss:1.69071	test-mlogloss:1.69448
[7]	train-mlogloss:1.67651	test-mlogloss:1.68075
[8]	train-mlogloss:1.66359	test-mlogloss:1.66824
[9]	train-mlogloss:1.64978	test-mlogloss:1.65501
[10]	train-mlogloss:1.63629	test-mlogloss:1.64200
[11]	train-mlogloss:1.62420	test-mlogloss:1.63048
[12]	train-mlogloss:1.61093	test-mlogloss:1.61761
[13]	train-mlogloss:1.59846	test-mlogloss:1.60557
[14]	train-mlogloss:1.58719	test-mlogloss:1.59479
[15]	train-mlogloss:1.57494	test-mlogloss:1.58289
[16]	train-mlogloss:1.56412	test-mlogloss:1.57248
[17]	train-mlogloss:1.55203	test-mlogloss:1.56088
[18]	train-mlogloss:1.54058	test-mlogloss:1.54990
[19]	train-mlogloss:1.53028	test-mlogloss:1.54002
[20]	train

[158]	train-mlogloss:0.64871	test-mlogloss:0.69236
[159]	train-mlogloss:0.64519	test-mlogloss:0.68895
[160]	train-mlogloss:0.64147	test-mlogloss:0.68530
[161]	train-mlogloss:0.63809	test-mlogloss:0.68204
[162]	train-mlogloss:0.63488	test-mlogloss:0.67896
[163]	train-mlogloss:0.63143	test-mlogloss:0.67563
[164]	train-mlogloss:0.62831	test-mlogloss:0.67262
[165]	train-mlogloss:0.62517	test-mlogloss:0.66958
[166]	train-mlogloss:0.62157	test-mlogloss:0.66606
[167]	train-mlogloss:0.61860	test-mlogloss:0.66321
[168]	train-mlogloss:0.61582	test-mlogloss:0.66056
[169]	train-mlogloss:0.61230	test-mlogloss:0.65714
[170]	train-mlogloss:0.60889	test-mlogloss:0.65379
[171]	train-mlogloss:0.60554	test-mlogloss:0.65054
[172]	train-mlogloss:0.60233	test-mlogloss:0.64738
[173]	train-mlogloss:0.59892	test-mlogloss:0.64404
[174]	train-mlogloss:0.59596	test-mlogloss:0.64125
[175]	train-mlogloss:0.59275	test-mlogloss:0.63810
[176]	train-mlogloss:0.58971	test-mlogloss:0.63511
[177]	train-mlogloss:0.58634	te

[319]	train-mlogloss:0.29335	test-mlogloss:0.34369
[320]	train-mlogloss:0.29211	test-mlogloss:0.34244
[321]	train-mlogloss:0.29077	test-mlogloss:0.34108
[322]	train-mlogloss:0.28939	test-mlogloss:0.33971
[323]	train-mlogloss:0.28798	test-mlogloss:0.33833
[324]	train-mlogloss:0.28663	test-mlogloss:0.33701
[325]	train-mlogloss:0.28545	test-mlogloss:0.33585
[326]	train-mlogloss:0.28419	test-mlogloss:0.33460
[327]	train-mlogloss:0.28289	test-mlogloss:0.33329
[328]	train-mlogloss:0.28174	test-mlogloss:0.33217
[329]	train-mlogloss:0.28054	test-mlogloss:0.33096
[330]	train-mlogloss:0.27924	test-mlogloss:0.32965
[331]	train-mlogloss:0.27795	test-mlogloss:0.32838
[332]	train-mlogloss:0.27683	test-mlogloss:0.32728
[333]	train-mlogloss:0.27561	test-mlogloss:0.32606
[334]	train-mlogloss:0.27442	test-mlogloss:0.32486
[335]	train-mlogloss:0.27306	test-mlogloss:0.32351
[336]	train-mlogloss:0.27187	test-mlogloss:0.32236
[337]	train-mlogloss:0.27071	test-mlogloss:0.32120
[338]	train-mlogloss:0.26947	te

[480]	train-mlogloss:0.15023	test-mlogloss:0.19970
[481]	train-mlogloss:0.14973	test-mlogloss:0.19919
[482]	train-mlogloss:0.14918	test-mlogloss:0.19863
[483]	train-mlogloss:0.14855	test-mlogloss:0.19798
[484]	train-mlogloss:0.14793	test-mlogloss:0.19734
[485]	train-mlogloss:0.14739	test-mlogloss:0.19680
[486]	train-mlogloss:0.14690	test-mlogloss:0.19630
[487]	train-mlogloss:0.14626	test-mlogloss:0.19564
[488]	train-mlogloss:0.14568	test-mlogloss:0.19503
[489]	train-mlogloss:0.14510	test-mlogloss:0.19445
[490]	train-mlogloss:0.14458	test-mlogloss:0.19393
[491]	train-mlogloss:0.14397	test-mlogloss:0.19329
[492]	train-mlogloss:0.14337	test-mlogloss:0.19267
[493]	train-mlogloss:0.14278	test-mlogloss:0.19205
[494]	train-mlogloss:0.14223	test-mlogloss:0.19145
[495]	train-mlogloss:0.14171	test-mlogloss:0.19092
[496]	train-mlogloss:0.14114	test-mlogloss:0.19034
[497]	train-mlogloss:0.14062	test-mlogloss:0.18981
[498]	train-mlogloss:0.14016	test-mlogloss:0.18934
[499]	train-mlogloss:0.13959	te

[641]	train-mlogloss:0.08679	test-mlogloss:0.13413
[642]	train-mlogloss:0.08652	test-mlogloss:0.13386
[643]	train-mlogloss:0.08623	test-mlogloss:0.13353
[644]	train-mlogloss:0.08595	test-mlogloss:0.13324
[645]	train-mlogloss:0.08571	test-mlogloss:0.13298
[646]	train-mlogloss:0.08544	test-mlogloss:0.13269
[647]	train-mlogloss:0.08517	test-mlogloss:0.13238
[648]	train-mlogloss:0.08492	test-mlogloss:0.13212
[649]	train-mlogloss:0.08467	test-mlogloss:0.13186
[650]	train-mlogloss:0.08440	test-mlogloss:0.13156
[651]	train-mlogloss:0.08414	test-mlogloss:0.13130
[652]	train-mlogloss:0.08385	test-mlogloss:0.13098
[653]	train-mlogloss:0.08364	test-mlogloss:0.13076
[654]	train-mlogloss:0.08338	test-mlogloss:0.13048
[655]	train-mlogloss:0.08310	test-mlogloss:0.13017
[656]	train-mlogloss:0.08285	test-mlogloss:0.12992
[657]	train-mlogloss:0.08258	test-mlogloss:0.12964
[658]	train-mlogloss:0.08235	test-mlogloss:0.12939
[659]	train-mlogloss:0.08211	test-mlogloss:0.12914
[660]	train-mlogloss:0.08189	te

[802]	train-mlogloss:0.05565	test-mlogloss:0.10068
[803]	train-mlogloss:0.05551	test-mlogloss:0.10053
[804]	train-mlogloss:0.05538	test-mlogloss:0.10038
[805]	train-mlogloss:0.05523	test-mlogloss:0.10022
[806]	train-mlogloss:0.05510	test-mlogloss:0.10008
[807]	train-mlogloss:0.05500	test-mlogloss:0.09998
[808]	train-mlogloss:0.05487	test-mlogloss:0.09983
[809]	train-mlogloss:0.05474	test-mlogloss:0.09967
[810]	train-mlogloss:0.05460	test-mlogloss:0.09952
[811]	train-mlogloss:0.05451	test-mlogloss:0.09943
[812]	train-mlogloss:0.05439	test-mlogloss:0.09929
[813]	train-mlogloss:0.05426	test-mlogloss:0.09915
[814]	train-mlogloss:0.05412	test-mlogloss:0.09898
[815]	train-mlogloss:0.05398	test-mlogloss:0.09883
[816]	train-mlogloss:0.05385	test-mlogloss:0.09869
[817]	train-mlogloss:0.05371	test-mlogloss:0.09853
[818]	train-mlogloss:0.05359	test-mlogloss:0.09839
[819]	train-mlogloss:0.05347	test-mlogloss:0.09826
[820]	train-mlogloss:0.05335	test-mlogloss:0.09811
[821]	train-mlogloss:0.05324	te

[963]	train-mlogloss:0.03963	test-mlogloss:0.08294
[964]	train-mlogloss:0.03957	test-mlogloss:0.08286
[965]	train-mlogloss:0.03949	test-mlogloss:0.08279
[966]	train-mlogloss:0.03942	test-mlogloss:0.08271
[967]	train-mlogloss:0.03935	test-mlogloss:0.08264
[968]	train-mlogloss:0.03927	test-mlogloss:0.08256
[969]	train-mlogloss:0.03921	test-mlogloss:0.08249
[970]	train-mlogloss:0.03914	test-mlogloss:0.08241
[971]	train-mlogloss:0.03907	test-mlogloss:0.08235
[972]	train-mlogloss:0.03901	test-mlogloss:0.08228
[973]	train-mlogloss:0.03894	test-mlogloss:0.08221
[974]	train-mlogloss:0.03887	test-mlogloss:0.08212
[975]	train-mlogloss:0.03879	test-mlogloss:0.08202
[976]	train-mlogloss:0.03872	test-mlogloss:0.08194
[977]	train-mlogloss:0.03864	test-mlogloss:0.08185
[978]	train-mlogloss:0.03857	test-mlogloss:0.08178
[979]	train-mlogloss:0.03850	test-mlogloss:0.08171
[980]	train-mlogloss:0.03843	test-mlogloss:0.08162
[981]	train-mlogloss:0.03835	test-mlogloss:0.08153
[982]	train-mlogloss:0.03828	te

[1122]	train-mlogloss:0.03032	test-mlogloss:0.07239
[1123]	train-mlogloss:0.03027	test-mlogloss:0.07234
[1124]	train-mlogloss:0.03023	test-mlogloss:0.07228
[1125]	train-mlogloss:0.03018	test-mlogloss:0.07222
[1126]	train-mlogloss:0.03014	test-mlogloss:0.07218
[1127]	train-mlogloss:0.03009	test-mlogloss:0.07213
[1128]	train-mlogloss:0.03004	test-mlogloss:0.07207
[1129]	train-mlogloss:0.02999	test-mlogloss:0.07202
[1130]	train-mlogloss:0.02995	test-mlogloss:0.07197
[1131]	train-mlogloss:0.02990	test-mlogloss:0.07191
[1132]	train-mlogloss:0.02986	test-mlogloss:0.07186
[1133]	train-mlogloss:0.02981	test-mlogloss:0.07181
[1134]	train-mlogloss:0.02976	test-mlogloss:0.07175
[1135]	train-mlogloss:0.02972	test-mlogloss:0.07171
[1136]	train-mlogloss:0.02968	test-mlogloss:0.07166
[1137]	train-mlogloss:0.02963	test-mlogloss:0.07161
[1138]	train-mlogloss:0.02959	test-mlogloss:0.07157
[1139]	train-mlogloss:0.02955	test-mlogloss:0.07153
[1140]	train-mlogloss:0.02951	test-mlogloss:0.07147
[1141]	train

[1280]	train-mlogloss:0.02446	test-mlogloss:0.06565
[1281]	train-mlogloss:0.02443	test-mlogloss:0.06562
[1282]	train-mlogloss:0.02441	test-mlogloss:0.06559
[1283]	train-mlogloss:0.02438	test-mlogloss:0.06555
[1284]	train-mlogloss:0.02434	test-mlogloss:0.06551
[1285]	train-mlogloss:0.02431	test-mlogloss:0.06549
[1286]	train-mlogloss:0.02429	test-mlogloss:0.06546
[1287]	train-mlogloss:0.02426	test-mlogloss:0.06542
[1288]	train-mlogloss:0.02423	test-mlogloss:0.06537
[1289]	train-mlogloss:0.02419	test-mlogloss:0.06533
[1290]	train-mlogloss:0.02416	test-mlogloss:0.06529
[1291]	train-mlogloss:0.02414	test-mlogloss:0.06526
[1292]	train-mlogloss:0.02411	test-mlogloss:0.06524
[1293]	train-mlogloss:0.02409	test-mlogloss:0.06522
[1294]	train-mlogloss:0.02407	test-mlogloss:0.06520
[1295]	train-mlogloss:0.02403	test-mlogloss:0.06515
[1296]	train-mlogloss:0.02400	test-mlogloss:0.06511
[1297]	train-mlogloss:0.02397	test-mlogloss:0.06508
[1298]	train-mlogloss:0.02395	test-mlogloss:0.06505
[1299]	train

[1438]	train-mlogloss:0.02046	test-mlogloss:0.06086
[1439]	train-mlogloss:0.02044	test-mlogloss:0.06082
[1440]	train-mlogloss:0.02042	test-mlogloss:0.06080
[1441]	train-mlogloss:0.02040	test-mlogloss:0.06077
[1442]	train-mlogloss:0.02037	test-mlogloss:0.06074
[1443]	train-mlogloss:0.02035	test-mlogloss:0.06072
[1444]	train-mlogloss:0.02033	test-mlogloss:0.06069
[1445]	train-mlogloss:0.02031	test-mlogloss:0.06067
[1446]	train-mlogloss:0.02029	test-mlogloss:0.06065
[1447]	train-mlogloss:0.02027	test-mlogloss:0.06062
[1448]	train-mlogloss:0.02024	test-mlogloss:0.06059
[1449]	train-mlogloss:0.02022	test-mlogloss:0.06056
[1450]	train-mlogloss:0.02020	test-mlogloss:0.06054
[1451]	train-mlogloss:0.02018	test-mlogloss:0.06051
[1452]	train-mlogloss:0.02016	test-mlogloss:0.06048
[1453]	train-mlogloss:0.02014	test-mlogloss:0.06045
[1454]	train-mlogloss:0.02012	test-mlogloss:0.06044
[1455]	train-mlogloss:0.02010	test-mlogloss:0.06042
[1456]	train-mlogloss:0.02008	test-mlogloss:0.06039
[1457]	train

In [153]:
bst.save_model(OUTPUT_PATH + 'merged_train_1500_model.json')

# bst = xgb.Booster()
# bst.load_model(OUTPUT_PATH + "merged_train_1500_model.json")

In [179]:
c_prob = bst.predict(xg_test)

In [180]:
c_prob = np.vectorize(lambda x: format(x, '.3f'))(c_prob).tolist()
pred = [r.index(max(r)) for r in c_prob]

In [183]:
print (f'predicting, classification error={sum(pred!=y_test)/len(y_test)}')

predicting, classification error=0.012423690692942059


### Process test data and make a prediction

In [398]:
test_df = load_data_in_chunks(PROV_PATH)#, chunksize=10000)

500000 done....
1000000 done....
1500000 done....
2000000 done....
2500000 done....
3000000 done....
3500000 done....
4000000 done....
4500000 done....
5000000 done....
5500000 done....
6000000 done....
6500000 done....
7000000 done....
7500000 done....
8000000 done....
8500000 done....
9000000 done....
9500000 done....
10000000 done....
10500000 done....
11000000 done....
11500000 done....
12000000 done....
12500000 done....
13000000 done....
13500000 done....
14000000 done....
14500000 done....
15000000 done....
15500000 done....
16000000 done....
16500000 done....
17000000 done....
17500000 done....
18000000 done....
18500000 done....
19000000 done....
19500000 done....
20000000 done....
20500000 done....
21000000 done....
21500000 done....
22000000 done....
22500000 done....
23000000 done....
23500000 done....
24000000 done....
24500000 done....
25000000 done....
25500000 done....
26000000 done....
26500000 done....
27000000 done....
27500000 done....
28000000 done....
28500000 don

In [399]:
test_df

Unnamed: 0_level_0,Unnamed: 1_level_0,tlx_score,E4_BVP,E4_GSR,LooxidLink_EEG_A3,LooxidLink_EEG_A4,LooxidLink_EEG_FP1,LooxidLink_EEG_FP2,LooxidLink_EEG_A7,LooxidLink_EEG_A8,Muse_EEG_TP9,Muse_EEG_AF7,Muse_EEG_AF8,Muse_EEG_TP10,Muse_PPG_0,Muse_PPG_1,Muse_PPG_2,Myo_GYR_X,Myo_GYR_Y,Myo_GYR_Z,Myo_EMG_0,Myo_EMG_1,Myo_EMG_2,Myo_EMG_3,Myo_EMG_4,Myo_EMG_5,Myo_EMG_6,Myo_EMG_7,PICARD_fnirs_0,PICARD_fnirs_1,Polar_bpm,Polar_hrv,ViveEye_eyeOpenness_L,ViveEye_pupilDiameter_L,ViveEye_pupilPos_L_X,ViveEye_pupilPos_L_Y,ViveEye_gazeOrigin_L_X,ViveEye_gazeOrigin_L_Y,ViveEye_gazeOrigin_L_Z,ViveEye_gazeDirection_L_X,ViveEye_gazeDirection_L_Y,ViveEye_gazeDirection_L_Z,ViveEye_eyeOpenness_R,ViveEye_pupilDiameter_R,ViveEye_pupilPos_R_X,ViveEye_pupilPos_R_Y,ViveEye_gazeOrigin_R_X,ViveEye_gazeOrigin_R_Y,ViveEye_gazeOrigin_R_Z,ViveEye_gazeDirection_R_X,ViveEye_gazeDirection_R_Y,ViveEye_gazeDirection_R_Z,Zephyr_HR,Zephyr_HRV
time,test_suite,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1
1626275616000000,Provisional-Test-025,10,-5.963139,6.189713,-0.004490,-0.055343,0.031845,0.015960,0.076930,0.053462,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-1.377454,-5.584286,-6.539270,-0.773515,-0.872525,-0.793317,-0.719059,-0.818069,-0.662129,-0.809406,-0.754950,4508.465347,16315.501238,104.362624,13.988494,0.243812,-1.000000,-1.000000,-1.000000,34.706300,-8.031110,-33.964000,0.000000,0.000000,0.000000,0.245050,0.794312,-0.613469,-0.414832,-26.945283,-6.553989,-39.167521,0.080973,0.058833,0.294683,105.000000,20.000000
1626275617000000,Provisional-Test-025,10,-10.508233,6.188197,-0.004475,-0.055123,0.031918,0.016070,0.076997,0.053382,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-0.810629,-6.025725,-12.061783,-0.728900,-0.822890,-0.941816,-0.844629,-0.936061,-0.819693,-0.742327,-0.827366,4468.099105,16644.011509,105.000000,13.995829,0.065217,-0.837267,-0.965659,-0.953110,34.740494,-9.187108,-32.550247,0.002554,0.008333,0.026746,0.067775,0.483618,-0.700705,-0.549523,-27.129371,-6.133292,-40.426029,0.058470,0.040036,0.216456,105.632992,20.000000
1626275618000000,Provisional-Test-025,10,4.518973,6.165090,-0.004395,-0.054962,0.032020,0.016209,0.077103,0.053337,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-1.316816,-6.760485,-12.564540,-0.764364,-0.871530,-0.900581,-0.826985,-0.963848,-0.827631,-0.766301,-0.872821,4502.553260,16988.775339,105.000000,13.963872,0.222079,-0.606978,-0.903856,-0.869618,34.743384,-10.456669,-30.656103,0.001576,0.012551,0.079667,0.222724,0.417486,-0.539806,-0.301788,-27.029024,-6.566190,-39.331745,0.064454,0.044937,0.241218,106.632021,20.000000
1626275619000000,Provisional-Test-025,10,0.307316,6.140932,-0.004433,-0.054819,0.032059,0.016276,0.077128,0.053268,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-0.985918,-7.263985,-13.482886,-0.739603,-0.968010,-0.815739,-0.968010,-0.875240,-0.843250,-0.627639,-0.710813,4512.445298,16887.747921,105.000000,13.839393,0.558541,-1.000000,-1.000000,-1.000000,34.757500,-10.559600,-30.435900,0.000000,0.000000,0.000000,0.555982,-0.615887,-0.841598,-0.757784,-26.772080,-6.380961,-39.936577,0.011780,0.010336,0.059056,107.632118,19.367242
1626275620000000,Provisional-Test-025,10,-0.659421,6.114869,-0.004478,-0.054677,0.032091,0.016332,0.077125,0.053254,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-9999.900000,-0.578997,-7.209074,-13.418881,-0.647625,-0.896662,-0.872272,-0.779204,-0.890244,-0.931964,-0.833761,-0.899230,4568.049422,16779.560334,105.227856,13.797723,0.415276,-1.000000,-1.000000,-1.000000,34.757500,-10.559600,-30.435900,0.000000,0.000000,0.000000,0.416560,0.024326,-0.781922,-0.665319,-26.600265,-6.607612,-39.657989,0.057813,0.034124,0.154533,108.000000,19.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1637166574000000,Provisional-Test-024,50,-5.729387,0.376112,-0.012558,0.031851,0.102590,0.052097,0.126179,0.149397,-118.749752,-90.944535,-110.971760,-104.218492,422.068388,17799.767575,6050.962219,-1.070261,-3.708376,-2.638669,-0.209469,0.363941,-0.775227,-0.741750,-0.780966,-0.867049,-0.812530,-0.360115,3869.467241,13030.632233,62.126255,117.262045,0.821299,3.254705,0.582884,0.558388,31.117945,-0.143978,-29.691910,-0.189779,0.151447,0.850395,0.857098,3.814255,0.449021,0.716364,-29.486398,-5.253870,-27.540207,0.029584,0.189860,0.912288,65.662841,114.000000
1637166575000000,Provisional-Test-024,50,-8.080046,0.374995,-0.013937,0.032910,0.103945,0.051203,0.127363,0.146320,-58.448761,-89.102002,-53.885869,-62.542879,421.075326,17923.309512,5376.151618,8.258744,5.701056,-2.205815,-1.084500,0.898600,-0.313858,-0.694833,-0.379527,-0.565427,-1.142443,-1.679865,3945.387735,10687.343312,62.000000,120.196630,1.000000,3.157802,0.576277,0.583242,31.302857,-1.377852,-29.495364,-0.220516,0.297400,0.925105,1.000000,3.261818,0.455855,0.726163,-29.516643,-6.010548,-28.628548,-0.030175,0.335836,0.936728,67.537904,114.000000
1637166576000000,Provisional-Test-024,50,11.283027,0.374685,-0.014244,0.033610,0.106456,0.051314,0.128452,0.146327,-30.268264,-41.776178,-19.320205,-25.802531,420.959342,18009.249758,5092.874153,3.216796,2.979499,-2.168115,-2.406099,-0.688771,-0.990803,-0.800097,-0.319458,-1.044046,-1.457890,-4.764763,3810.203291,10211.834947,63.744434,121.792656,1.000000,3.083697,0.524736,0.553231,32.210517,-0.811314,-29.542615,-0.098845,0.365362,0.925048,1.000000,3.292520,0.405367,0.685544,-28.707183,-5.147968,-28.818925,0.107693,0.411644,0.904305,67.471442,113.471926
1637166577000000,Provisional-Test-024,50,0.109242,0.373814,-0.014152,0.033835,0.107685,0.051267,0.129148,0.146913,-141.779287,-140.053483,-138.131707,-115.711822,421.504249,18498.599150,5167.471671,0.542068,-2.791540,-4.705493,0.081209,-0.769122,-0.931067,-0.741737,-1.339943,-1.033050,-1.274788,-1.686969,4122.300755,11786.622285,64.000000,11.690519,1.000000,3.186979,0.531673,0.540319,32.019878,-0.217111,-29.481992,-0.102678,0.319976,0.941664,1.000000,3.385421,0.414528,0.671154,-28.909694,-4.499255,-28.805830,0.096804,0.361825,0.927024,65.873466,113.000000


In [400]:
test_df.to_csv(OUTPUT_PATH+"merged_test.csv")

In [265]:
# test_df = pd.read_csv(OUTPUT_PATH+"merged_test.csv")

In [429]:
xg_pred_test = xgb.DMatrix(test_df)
# df['tlx_score'] = df['tlx_score'].astype(int)
c_prob = bst.predict(xg_pred_test)
c_time = test_df.reset_index()['time']
c_suite = test_df.reset_index()['test_suite']

In [430]:
full_pred = pd.DataFrame(columns=OUTPUT_HEADER)
full_pred = test_postprocess(c_time, c_suite, c_prob, full_pred) #TODO

### Output Pred

In [433]:
s_sol = pd.read_csv(S_SOL)
full_pred['top_three_features'] = s_sol['top_three_features']

In [434]:
full_pred.to_csv(OUTPUT_PATH+"solution_full_train_2000.csv", index=False)

In [428]:
test_df.reset_index()['test_suite'][0]

'Provisional-Test-025'

#### Previous draft data processing

In [5]:
chunksize=500000
df = pd.read_csv(TRAIN_PATH, chunksize=chunksize)

full_df = pd.DataFrame(columns=input_header).set_index(['time', 'test_suite'])
for i, c in enumerate(df):
    full_df = data_preprocess(c, full_df)
    
#     print(f'{(i+1)*chunksize} done....')
    print(f"{format((i+1)*chunksize/46010963, '.3f')} done....")

0.011 done....
0.022 done....
0.033 done....
0.043 done....
0.054 done....
0.065 done....
0.076 done....
0.087 done....
0.098 done....
0.109 done....
0.120 done....
0.130 done....
0.141 done....
0.152 done....
0.163 done....
0.174 done....
0.185 done....
0.196 done....
0.206 done....
0.217 done....
0.228 done....
0.239 done....
0.250 done....
0.261 done....
0.272 done....
0.283 done....
0.293 done....
0.304 done....
0.315 done....
0.326 done....
0.337 done....
0.348 done....
0.359 done....
0.369 done....
0.380 done....
0.391 done....
0.402 done....
0.413 done....
0.424 done....
0.435 done....
0.446 done....
0.456 done....
0.467 done....
0.478 done....
0.489 done....
0.500 done....
0.511 done....
0.522 done....
0.532 done....
0.543 done....
0.554 done....
0.565 done....
0.576 done....
0.587 done....
0.598 done....
0.609 done....
0.619 done....
0.630 done....
0.641 done....
0.652 done....
0.663 done....
0.674 done....
0.685 done....
0.695 done....
0.706 done....
0.717 done....
0.728 done

In [39]:
# aggregation method for each col
col_merger = dict(zip(full_df.columns, [normal_mean]*len(full_df.columns)))
col_merger['induced_state'] = mode

In [40]:
trim_full_df = full_df.groupby(full_df.index).agg(col_merger)

In [41]:
trim_full_df.shape

(31123, 54)

In [42]:
trim_full_df.to_csv(OUTPUT_PATH+"trim_train_merged.csv")

In [7]:
full_df.to_csv(OUTPUT_PATH+"train_merged.csv")

In [8]:
full_df.index.duplicated().sum()

98