In [1]:
import pandas as pd
pd.set_option('display.max_columns', 56)
import numpy as np
import seaborn as sns
import xgboost as xgb

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

### Config

In [2]:
# DATA PATHS
TRAIN_PATH = 'data/data_training.csv'
PROV_PATH = 'data/data_provisional.csv'
SAMPLE_PATH = 'data/sample-data.csv'

category = {'low':0, 'medium':1, 'high':2, 'baseline':3, 'channelized':4, 'surprise':5}

### Loading data

In [35]:
# Load sample data
sample_data = pd.read_csv(SAMPLE_PATH)

# # load the first 10^6 rows of train data
# train = pd.read_csv(TRAIN_PATH, nrows=1000000)
# df = train[train.test_suite == 'Training-Test-038']

In [12]:
chunksize = 10000

data = pd.read_csv(SAMPLE_PATH, chunksize=chunksize)

# c = next(data)

In [14]:
xg_data = xgb.DMatrix(next(data))

ValueError: DataFrame.dtypes for data must be int, float, bool or category.  When
categorical type is supplied, DMatrix parameter `enable_categorical` must
be set to `True`. Invalid columns:test_suite, induced_state

In [25]:
# for chunk in data:
#     print(chunk.shape)

In [29]:
c.test_suite.value_counts()

Training-Test-038    990214
Training-Test-067      9786
Name: test_suite, dtype: int64

In [30]:
single = c[c.test_suite == 'Training-Test-038']

In [39]:
single.time[single.time >= (single.time[0] + 3000000)]

4661      1626275543355618
4662      1626275543356097
4663      1626275543356861
4664      1626275543357816
4665      1626275543358100
                ...       
990209    1626279183568473
990210    1626279183569511
990211    1626279183570472
990212    1626279183572471
990213    1626279183573593
Name: time, Length: 985553, dtype: int64

In [66]:
single_channelized = single[single.induced_state == 'channelized']

In [59]:
single_baseline = single.iloc[:117791]

### The difference btw sensordata mean values of `baseline` & `channelized` statet.  Put into ratio to std

In [69]:
(single_channelized.describe().loc['mean'] - single_baseline.describe().loc['mean'])/single_baseline.describe().loc['std']

time                         20.963801
tlx_score                          inf
E4_BVP                        0.045512
E4_GSR                        0.060257
LooxidLink_EEG_A3            36.275706
LooxidLink_EEG_A4            45.836544
LooxidLink_EEG_FP1           35.861761
LooxidLink_EEG_FP2           28.080833
LooxidLink_EEG_A7            11.396929
LooxidLink_EEG_A8            -0.831278
Muse_EEG_TP9                  0.000000
Muse_EEG_AF7                  0.000000
Muse_EEG_AF8                  0.000000
Muse_EEG_TP10                 0.000000
Muse_PPG_0                    0.000000
Muse_PPG_1                    0.000000
Muse_PPG_2                    0.000000
Myo_GYR_X                     0.000269
Myo_GYR_Y                     0.092695
Myo_GYR_Z                     0.018316
Myo_EMG_0                     0.015856
Myo_EMG_1                     0.015963
Myo_EMG_2                     0.015858
Myo_EMG_3                     0.015655
Myo_EMG_4                     0.015556
Myo_EMG_5                

## Create true future label on sample data

### Preprocess

In [4]:
def preprocess(df):
    # replace  'induced_state'
    df["induced_state"] = df["induced_state"].replace(category)
    # set time as index
    df.set_index('time', inplace=True)

    # define drop cols 
    drop_cols = ['test_suite', 'induced_state'] #, 'tlx_score']

    # split data into Train & Test
    X = df.loc[:, [ c not in drop_cols for c in df.columns]]
    Y = df['induced_state']
    
    return X, Y

In [36]:
sample_data.head()

Unnamed: 0,time,test_suite,induced_state,tlx_score,E4_BVP,E4_GSR,LooxidLink_EEG_A3,LooxidLink_EEG_A4,LooxidLink_EEG_FP1,LooxidLink_EEG_FP2,LooxidLink_EEG_A7,LooxidLink_EEG_A8,Muse_EEG_TP9,Muse_EEG_AF7,Muse_EEG_AF8,Muse_EEG_TP10,Muse_PPG_0,Muse_PPG_1,Muse_PPG_2,Myo_GYR_X,Myo_GYR_Y,Myo_GYR_Z,Myo_EMG_0,Myo_EMG_1,Myo_EMG_2,Myo_EMG_3,Myo_EMG_4,Myo_EMG_5,Myo_EMG_6,Myo_EMG_7,PICARD_fnirs_0,PICARD_fnirs_1,Polar_bpm,Polar_hrv,ViveEye_eyeOpenness_L,ViveEye_pupilDiameter_L,ViveEye_pupilPos_L_X,ViveEye_pupilPos_L_Y,ViveEye_gazeOrigin_L_X,ViveEye_gazeOrigin_L_Y,ViveEye_gazeOrigin_L_Z,ViveEye_gazeDirection_L_X,ViveEye_gazeDirection_L_Y,ViveEye_gazeDirection_L_Z,ViveEye_eyeOpenness_R,ViveEye_pupilDiameter_R,ViveEye_pupilPos_R_X,ViveEye_pupilPos_R_Y,ViveEye_gazeOrigin_R_X,ViveEye_gazeOrigin_R_Y,ViveEye_gazeOrigin_R_Z,ViveEye_gazeDirection_R_X,ViveEye_gazeDirection_R_Y,ViveEye_gazeDirection_R_Z,Zephyr_HR,Zephyr_HRV
0,1626275540354390,Training-Test-038,baseline,10,-9999.9,-9999.9,-0.002007,-0.059057,0.03018,0.01029,0.072202,0.051781,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9
1,1626275540500599,Training-Test-038,baseline,10,-9999.9,-9999.9,-0.002007,-0.059079,0.030179,0.010289,0.072189,0.051773,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,0.753418,-8.74951,-14.877,-2.0,1.0,1.0,0.0,-1.0,-2.0,-3.0,0.0,3931.0,16752.0,-9999.9,-9999.9,1.0,-1.0,-1.0,-1.0,34.4694,-8.68291,-35.2248,0.0,0.0,0.0,1.0,-1.0,-1.0,-1.0,-27.2645,-7.50266,-37.9508,0.0,0.0,0.0,-9999.9,-9999.9
2,1626275541501825,Training-Test-038,baseline,10,-21.18506,7.358181,-0.002018,-0.059191,0.030173,0.010314,0.072199,0.05172,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-1.99707,-6.74951,-15.002,-1.0,0.0,-1.0,-1.0,0.0,3.0,1.0,0.0,4483.0,16331.0,106.0,17.3323,0.0,-1.0,-1.0,-1.0,34.4694,-8.68291,-35.2248,0.0,0.0,0.0,0.0,4.78511,0.274257,0.938343,-27.1029,-7.38116,-37.9733,0.334305,0.16777,0.927399,102.0,26.0
3,1626275542501052,Training-Test-038,baseline,10,-3.672516,7.358181,-0.002375,-0.058885,0.030056,0.010255,0.072125,0.051569,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-2.12207,-3.99951,14.373,0.0,1.0,-2.0,-1.0,-2.0,-1.0,-2.0,-1.0,4247.0,16329.0,106.0,17.3136,1.0,-1.0,-1.0,-1.0,34.4694,-8.68291,-35.2248,0.0,0.0,0.0,1.0,-1.0,-1.0,-1.0,-27.244,-7.43414,-37.9404,0.0,0.0,0.0,100.0,26.0
4,1626275543500284,Training-Test-038,baseline,10,1.859863,7.358181,-0.002526,-0.058668,0.030026,0.010284,0.072184,0.051425,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-9999.9,-0.62207,-6.37451,-14.377,-2.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,4320.0,17036.0,107.0,16.8914,1.0,-1.0,-1.0,-1.0,34.4694,-8.68291,-35.2248,0.0,0.0,0.0,1.0,-1.0,-1.0,-1.0,-27.0874,-7.92627,-38.2178,0.0,0.0,0.0,102.0,26.0


In [37]:
sample_data.time.is_monotonic_increasing

True

In [44]:
sample_data.time.diff().sort_values()

20314    5.700000e+02
374      2.000000e+03
13448    6.009000e+03
22665    6.399000e+03
19238    6.683000e+03
             ...     
22327    8.608332e+11
28085    1.220938e+12
23694    1.302633e+12
28874    2.504075e+12
0                 NaN
Name: time, Length: 31123, dtype: float64

In [3]:
time_df = pd.read_csv(TRAIN_PATH, usecols=['time'])

In [9]:
time_df.time.is_monotonic_increasing

False

In [8]:
time_df.time[3] = time_df.time[0]

In [1]:
time_df.shape

NameError: name 'time_df' is not defined