In [1]:
import pandas as pd
pd.set_option('display.max_columns', 56)
import numpy as np
import seaborn as sns
import xgboost as xgb

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

### Config

In [2]:
# DATA PATHS
TRAIN_PATH = 'data/data_training.csv'
PROV_PATH = 'data/data_provisional.csv'
SAMPLE_PATH = 'data/sample-data.csv'

category = {'low':0, 'medium':1, 'high':2, 'baseline':3, 'channelized':4, 'surprise':5}

### Loading data

In [3]:
# Load sample data
sample_data = pd.read_csv(SAMPLE_PATH)

# # load the first 10^6 rows of train data
# train = pd.read_csv(TRAIN_PATH, nrows=1000000)
# df = train[train.test_suite == 'Training-Test-038']

## pd.read_csv

filtered with 1/3 of the rows, and only object cols ['time', 'test_suite', 'induced_state', 'tlx_score']

In [11]:
df = pd.read_csv(TRAIN_PATH, skiprows=lambda x: x%3!=0, usecols=['time', 'test_suite', 'induced_state', 'tlx_score'])

In [12]:
df.shape

(16349550, 4)

In [14]:
df.test_suite.value_counts()

Training-Test-009    610356
Training-Test-014    517142
Training-Test-035    513450
Training-Test-052    486133
Training-Test-080    466040
                      ...  
Training-Test-072     30865
Training-Test-026     30449
Training-Test-078     23756
Training-Test-074      9322
Training-Test-011      3580
Name: test_suite, Length: 85, dtype: int64

In [33]:
df.induced_state.value_counts()

high           8263
medium         6915
low            6386
baseline       6358
channelized    3120
surprise         81
Name: induced_state, dtype: int64

In [16]:
df.groupby('induced_state')['tlx_score'].agg(['mean', 'median']).sort_values('mean')

Unnamed: 0_level_0,mean,median
induced_state,Unnamed: 1_level_1,Unnamed: 2_level_1
baseline,10.107636,10
surprise,13.464468,10
low,27.025233,20
medium,34.292709,40
channelized,34.800915,35
high,42.675216,50


In [29]:
t_delta = df.time.max() - df.time.min()
print(t_delta)

10891021139143


In [26]:
df.time[0] - t_delta

1615384519219251

In [27]:
df.time[0]

1626275540358394

In [10]:
sample_data.columns

Index(['time', 'test_suite', 'induced_state', 'tlx_score', 'E4_BVP', 'E4_GSR',
       'LooxidLink_EEG_A3', 'LooxidLink_EEG_A4', 'LooxidLink_EEG_FP1',
       'LooxidLink_EEG_FP2', 'LooxidLink_EEG_A7', 'LooxidLink_EEG_A8',
       'Muse_EEG_TP9', 'Muse_EEG_AF7', 'Muse_EEG_AF8', 'Muse_EEG_TP10',
       'Muse_PPG_0', 'Muse_PPG_1', 'Muse_PPG_2', 'Myo_GYR_X', 'Myo_GYR_Y',
       'Myo_GYR_Z', 'Myo_EMG_0', 'Myo_EMG_1', 'Myo_EMG_2', 'Myo_EMG_3',
       'Myo_EMG_4', 'Myo_EMG_5', 'Myo_EMG_6', 'Myo_EMG_7', 'PICARD_fnirs_0',
       'PICARD_fnirs_1', 'Polar_bpm', 'Polar_hrv', 'ViveEye_eyeOpenness_L',
       'ViveEye_pupilDiameter_L', 'ViveEye_pupilPos_L_X',
       'ViveEye_pupilPos_L_Y', 'ViveEye_gazeOrigin_L_X',
       'ViveEye_gazeOrigin_L_Y', 'ViveEye_gazeOrigin_L_Z',
       'ViveEye_gazeDirection_L_X', 'ViveEye_gazeDirection_L_Y',
       'ViveEye_gazeDirection_L_Z', 'ViveEye_eyeOpenness_R',
       'ViveEye_pupilDiameter_R', 'ViveEye_pupilPos_R_X',
       'ViveEye_pupilPos_R_Y', 'ViveEye_gazeOrigi

In [34]:
state_df = pd.read_csv(TRAIN_PATH, usecols=['induced_state'])

In [37]:
# value counts for all states in training data
state_df.value_counts()/state_df.shape[0]

induced_state
high             0.272787
medium           0.220932
low              0.202724
baseline         0.194880
channelized      0.106701
surprise         0.001975
dtype: float64

### Preprocessing

In [4]:
def preprocess(df):
    # replace  'induced_state'
    df["induced_state"] = df["induced_state"].replace(category)
    # set time as index
    df.set_index('time', inplace=True)

    # define drop cols 
    drop_cols = ['test_suite', 'induced_state'] #, 'tlx_score']

    # split data into Train & Test
    X = df.loc[:, [ c not in drop_cols for c in df.columns]]
    Y = df['induced_state']
    
    return X, Y

In [5]:
X, Y = preprocess(sample_data)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix( X_test, label=y_test)

## Training

In [6]:
param = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',  # 多分类的问题
    'num_class': 6,               # 类别数，与 multisoftmax 并用
    'gamma': 0.1,                  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。
    'max_depth': 12,               # 构建树的深度，越大越容易过拟合
    'lambda': 2,                   # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
    'subsample': 0.7,              # 随机采样训练样本
    'colsample_bytree': 0.7,       # 生成树时进行的列采样
    'min_child_weight': 3,
#     'silent': 1,                   # 设置成1则没有运行信息输出，最好是设置为0.
    'eta': 0.007,                  # 如同学习率
    'seed': 1000,
#     'nthread': 4,                  # cpu 线程数
}

In [7]:
watchlist = [ (xg_train,'train'), (xg_test, 'test') ]
num_round = 500
bst = xgb.train(param, xg_train, num_round, watchlist, early_stopping_rounds=10 );

[0]	train-mlogloss:1.77816	test-mlogloss:1.77888
[1]	train-mlogloss:1.76406	test-mlogloss:1.76538
[2]	train-mlogloss:1.74875	test-mlogloss:1.75048
[3]	train-mlogloss:1.73429	test-mlogloss:1.73663
[4]	train-mlogloss:1.71955	test-mlogloss:1.72232
[5]	train-mlogloss:1.70553	test-mlogloss:1.70875
[6]	train-mlogloss:1.69078	test-mlogloss:1.69435
[7]	train-mlogloss:1.67680	test-mlogloss:1.68072
[8]	train-mlogloss:1.66402	test-mlogloss:1.66832
[9]	train-mlogloss:1.65042	test-mlogloss:1.65516
[10]	train-mlogloss:1.63681	test-mlogloss:1.64190
[11]	train-mlogloss:1.62574	test-mlogloss:1.63144
[12]	train-mlogloss:1.61225	test-mlogloss:1.61827
[13]	train-mlogloss:1.60051	test-mlogloss:1.60696
[14]	train-mlogloss:1.58854	test-mlogloss:1.59536
[15]	train-mlogloss:1.57662	test-mlogloss:1.58387
[16]	train-mlogloss:1.56580	test-mlogloss:1.57367
[17]	train-mlogloss:1.55404	test-mlogloss:1.56227
[18]	train-mlogloss:1.54250	test-mlogloss:1.55112
[19]	train-mlogloss:1.53159	test-mlogloss:1.54075
[20]	train

[158]	train-mlogloss:0.66360	test-mlogloss:0.70425
[159]	train-mlogloss:0.65999	test-mlogloss:0.70070
[160]	train-mlogloss:0.65636	test-mlogloss:0.69714
[161]	train-mlogloss:0.65298	test-mlogloss:0.69391
[162]	train-mlogloss:0.64971	test-mlogloss:0.69079
[163]	train-mlogloss:0.64633	test-mlogloss:0.68750
[164]	train-mlogloss:0.64331	test-mlogloss:0.68456
[165]	train-mlogloss:0.63997	test-mlogloss:0.68131
[166]	train-mlogloss:0.63647	test-mlogloss:0.67788
[167]	train-mlogloss:0.63326	test-mlogloss:0.67481
[168]	train-mlogloss:0.63043	test-mlogloss:0.67215
[169]	train-mlogloss:0.62698	test-mlogloss:0.66878
[170]	train-mlogloss:0.62355	test-mlogloss:0.66546
[171]	train-mlogloss:0.62023	test-mlogloss:0.66223
[172]	train-mlogloss:0.61703	test-mlogloss:0.65912
[173]	train-mlogloss:0.61355	test-mlogloss:0.65572
[174]	train-mlogloss:0.61087	test-mlogloss:0.65319
[175]	train-mlogloss:0.60778	test-mlogloss:0.65021
[176]	train-mlogloss:0.60488	test-mlogloss:0.64740
[177]	train-mlogloss:0.60162	te

[319]	train-mlogloss:0.31100	test-mlogloss:0.35997
[320]	train-mlogloss:0.30976	test-mlogloss:0.35879
[321]	train-mlogloss:0.30848	test-mlogloss:0.35750
[322]	train-mlogloss:0.30714	test-mlogloss:0.35618
[323]	train-mlogloss:0.30576	test-mlogloss:0.35482
[324]	train-mlogloss:0.30434	test-mlogloss:0.35341
[325]	train-mlogloss:0.30318	test-mlogloss:0.35229
[326]	train-mlogloss:0.30193	test-mlogloss:0.35106
[327]	train-mlogloss:0.30055	test-mlogloss:0.34967
[328]	train-mlogloss:0.29930	test-mlogloss:0.34843
[329]	train-mlogloss:0.29810	test-mlogloss:0.34724
[330]	train-mlogloss:0.29679	test-mlogloss:0.34591
[331]	train-mlogloss:0.29551	test-mlogloss:0.34462
[332]	train-mlogloss:0.29437	test-mlogloss:0.34350
[333]	train-mlogloss:0.29325	test-mlogloss:0.34239
[334]	train-mlogloss:0.29202	test-mlogloss:0.34114
[335]	train-mlogloss:0.29069	test-mlogloss:0.33981
[336]	train-mlogloss:0.28953	test-mlogloss:0.33865
[337]	train-mlogloss:0.28834	test-mlogloss:0.33748
[338]	train-mlogloss:0.28706	te

[480]	train-mlogloss:0.16974	test-mlogloss:0.21946
[481]	train-mlogloss:0.16923	test-mlogloss:0.21895
[482]	train-mlogloss:0.16868	test-mlogloss:0.21842
[483]	train-mlogloss:0.16805	test-mlogloss:0.21777
[484]	train-mlogloss:0.16747	test-mlogloss:0.21719
[485]	train-mlogloss:0.16694	test-mlogloss:0.21668
[486]	train-mlogloss:0.16644	test-mlogloss:0.21619
[487]	train-mlogloss:0.16585	test-mlogloss:0.21560
[488]	train-mlogloss:0.16530	test-mlogloss:0.21503
[489]	train-mlogloss:0.16473	test-mlogloss:0.21447
[490]	train-mlogloss:0.16420	test-mlogloss:0.21393
[491]	train-mlogloss:0.16364	test-mlogloss:0.21335
[492]	train-mlogloss:0.16309	test-mlogloss:0.21280
[493]	train-mlogloss:0.16252	test-mlogloss:0.21221
[494]	train-mlogloss:0.16197	test-mlogloss:0.21166
[495]	train-mlogloss:0.16146	test-mlogloss:0.21117
[496]	train-mlogloss:0.16095	test-mlogloss:0.21065
[497]	train-mlogloss:0.16041	test-mlogloss:0.21011
[498]	train-mlogloss:0.15995	test-mlogloss:0.20965
[499]	train-mlogloss:0.15944	te

In [9]:
pred = bst.predict( xg_test );

In [10]:
print (f'predicting, classification error={sum(pred!=y_test)/len(y_test)}')

predicting, classification error=0.03138052907786227


### Predict on train data filtered (1/50)

In [13]:
train_filtered_51 = pd.read_csv(TRAIN_PATH, skiprows=lambda x: x%51!=0) #, usecols=['time', 'test_suite', 'induced_state', 'tlx_score']))

In [14]:
train_X_51, train_Y_51 = preprocess(train_filtered_51)

In [15]:
xg_train_51 = xgb.DMatrix(train_X_51)

In [16]:
pred_51 = bst.predict( xg_train_51 );

In [17]:
print (f'predicting, classification error={sum(pred_51!=train_Y_51)/len(train_Y_51)}')

predicting, classification error=0.009884188833133347


In [18]:
sample_data.shape

(31123, 55)

In [19]:
train_filtered_51.shape

(961738, 55)

In [42]:
error_df = train_filtered_51[(pred_51!=train_Y_51).values]

In [48]:
error_df.induced_state.value_counts()

0    4833
1    2461
5    1605
4     473
2      93
3      41
Name: induced_state, dtype: int64

In [46]:
train_filtered_51.induced_state.value_counts()

2    262347
1    212471
0    194971
3    187428
4    102618
5      1903
Name: induced_state, dtype: int64

In [49]:
# error rate by induced_state
error_df.induced_state.value_counts() / train_filtered_51.induced_state.value_counts()

0    0.024788
1    0.011583
2    0.000354
3    0.000219
4    0.004609
5    0.843405
Name: induced_state, dtype: float64