In [25]:
import datetime
import sys

import numpy as np
import pandas as pd
import xgboost as xgb

## Config (to be imported from separate file)

In [22]:
# Label encoding for the induced states, the reverse lookup, and the
# column order of the submission file.
CATEGORY = dict(
    low=0,
    medium=1,
    high=2,
    baseline=3,
    channelized=4,
    surprise=5,
)
# integer class id -> state name
REV_CATEGORY = dict(zip(CATEGORY.values(), CATEGORY.keys()))
OUTPUT_HEADER = [
    'timestamp',
    'test_suite',
    'predicted_induced_state',
    'three_sec_predicted_induced_state',
    'predicted_induced_state_confidence',
    'three_sec_predicted_induced_state_confidence',
    'top_three_features',
]

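# param = {
#     'booster': 'gbtree',
#     'objective': 'multi:softprob',  # multi-class classification problem
#     'num_class': 6,               # number of classes; used together with multi:softmax
#     'gamma': 0.1,                  # controls post-pruning; larger is more conservative, typically around 0.1 or 0.2
#     'max_depth': 12,               # depth of each tree; the larger it is, the easier it is to overfit
#     'lambda': 2,                   # L2 regularization on the weights that controls model complexity; larger values make overfitting less likely
#     'subsample': 0.7,              # random subsampling of the training rows
#     'colsample_bytree': 0.7,       # column subsampling when building each tree
#     'min_child_weight': 3,
# #     'silent': 1,                   # 1 suppresses runtime output; best left at 0
#     'eta': 0.007,                  # acts like the learning rate
#     'seed': 1000,
# #     'nthread': 4,                  # number of CPU threads
# }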

# # number of iteration for model training
# NUM_ROUND = 2000

### Helper (separate file) 

In [19]:
def round_time(time_serie):
    """Round microsecond epoch timestamps to the nearest whole second.

    The values are parsed with ``pd.to_numeric`` (so numeric strings are
    accepted), interpreted as microseconds since the Unix epoch, rounded to
    a one-second resolution and converted back to microseconds.

    The computation is vectorized and never goes through naive local-time
    ``datetime`` objects, so the result does not depend on the machine's
    timezone or on daylight-saving transitions.

    Parameters
    ----------
    time_serie : pandas.Series
        Timestamps in microseconds since the epoch.

    Returns
    -------
    pandas.Series
        Timestamps in microseconds since the epoch, each a multiple of
        1_000_000, with the same index as the input.
    """
    micros = pd.to_numeric(time_serie)
    stamps = pd.to_datetime(micros, unit='us')
    rounded = stamps.dt.round('1s')

    # Dividing the offset from the epoch by one microsecond yields integer
    # microseconds regardless of the datetime resolution pandas picked.
    return (rounded - pd.Timestamp(0)) // pd.Timedelta(microseconds=1)

# return the mode (most common) element in a series
def mode(series):
    """Return the value that occurs most often in ``series``."""
    frequencies = series.value_counts()
    # value_counts() orders by descending count, so the winner comes first
    most_common = frequencies.index[0]
    return most_common

# return the mean value excluding -9999.9 the default value, if there's normal values
def normal_mean(series):
    """Aggregate a series to a single scalar, ignoring the -9999.9 placeholder.

    * If the series holds more than one distinct (non-NaN) value, return the
      mean of the values strictly greater than -9999.9.
    * Otherwise return the single distinct value as a scalar (which may be
      the -9999.9 placeholder itself), or NaN when there is no non-NaN
      value at all.

    A scalar is always returned so the function can be used directly as a
    ``groupby(...).agg`` reducer.
    """
    if series.nunique() > 1:
        return series[series > -9999.9].mean()

    # At most one distinct non-NaN value remains; unwrap it instead of
    # handing back the array produced by unique().
    distinct = series.dropna().unique()
    if len(distinct) == 0:
        return np.nan
    return distinct[0]

### User Input

In [None]:
# # Test data path
# TEST_PATH = 'data/data_training.csv'
# # OUTPUT PATH
# OUTPUT_PATH = ''
# # MODEL PATH
# MODEL_PATH = 'submissions/output/'


### Utils

In [15]:
# preprocess raw data into model readable format
def data_preprocess(df, full_df):
    """Aggregate one raw chunk to one row per (time, test_suite) and append it.

    Steps:
      1. If an 'induced_state' column is present, replace its values using
         the CATEGORY mapping.
      2. Round the 'time' column to whole seconds with ``round_time``.
      3. Group by 'time' and 'test_suite'; every other column is reduced
         with ``normal_mean``, except 'induced_state' which uses ``mode``.

    Note: ``df`` is modified in place (steps 1 and 2) before aggregation.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw chunk containing at least 'time' and 'test_suite' columns.
    full_df : pandas.DataFrame
        Previously accumulated result, indexed by (time, test_suite).

    Returns
    -------
    pandas.DataFrame
        ``full_df`` followed by the aggregated rows of ``df``.
    """
    key_cols = ['time', 'test_suite']

    # map 'induced_state' for training data
    if 'induced_state' in df.columns:
        df["induced_state"] = df["induced_state"].replace(CATEGORY)

    # round time to whole seconds
    df['time'] = round_time(df['time'])

    # aggregate every non-key column; keys are selected by name rather than
    # by assuming they are the first two columns of the frame
    col_merger = {col: normal_mean for col in df.columns if col not in key_cols}
    if 'induced_state' in col_merger:
        col_merger['induced_state'] = mode
    df = df.groupby(key_cols).agg(col_merger)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement
    return pd.concat([full_df, df])

In [12]:
# loading data in chunks
def load_data_in_chunks(path, chunksize=500000):
    """Read a CSV in chunks and aggregate it to one row per (time, test_suite).

    Each chunk is passed through ``data_preprocess`` and accumulated. Because
    a (time, test_suite) group can straddle a chunk boundary, the accumulated
    frame is aggregated once more at the end.

    Parameters
    ----------
    path : str
        Path of the CSV file; it must contain 'time' and 'test_suite' columns.
    chunksize : int, default 500000
        Number of rows read per chunk.

    Returns
    -------
    pandas.DataFrame
        Aggregated data indexed by (time, test_suite).
    """
    # Let pandas parse the header so the column names match the ones seen in
    # the chunks (quoting, BOM and duplicate names are handled consistently).
    header = pd.read_csv(path, nrows=0).columns.tolist()

    # iterate thru the data in chunks and process them
    full_df = pd.DataFrame(columns=header).set_index(['time', 'test_suite'])
    rows_loaded = 0
    reader = pd.read_csv(path, chunksize=chunksize)
    try:
        for chunk in reader:
            # count the rows actually read; the last chunk is usually short
            rows_loaded += len(chunk)
            full_df = data_preprocess(chunk, full_df)

            print(f'{rows_loaded} rows loaded....')
    finally:
        # release the underlying file handle even if processing fails
        reader.close()

    # further aggregate the duplicates generated from different chunks
    col_merger = {col: normal_mean for col in full_df.columns}
    if 'induced_state' in col_merger:
        col_merger['induced_state'] = mode
    full_df = full_df.groupby(['time', 'test_suite']).agg(col_merger)

    print('Data loaded.')

    return full_df

In [31]:
def test_postprocess(c_time, c_suite, c_prob, c_ttf, full_pred):
    """Format predicted probabilities as submission rows and append them.

    Parameters
    ----------
    c_time, c_suite :
        Timestamp and test-suite values, one per prediction row.
    c_prob :
        2-D array-like of class probabilities, one row per prediction and
        one column per class id (see REV_CATEGORY).
    c_ttf :
        List of feature names written verbatim to every row.
    full_pred : pandas.DataFrame
        Previously accumulated predictions.

    Returns
    -------
    pandas.DataFrame
        ``full_pred`` followed by the newly formatted rows, with the columns
        listed in OUTPUT_HEADER.
    """
    output_header = OUTPUT_HEADER

    # create an empty dataframe for chunk data
    c_df = pd.DataFrame(columns=output_header)

    # setting time & test_suite
    c_df[output_header[0]] = c_time
    c_df[output_header[1]] = c_suite

    raw_rows = np.asarray(c_prob).tolist()

    # Pick the class from the raw probabilities, not from their rounded
    # string form, so that rounding ties cannot change the prediction.
    c_pred = [row.index(max(row)) for row in raw_rows]
    # map the pred index into string instance
    c_pis = [REV_CATEGORY[p] for p in c_pred]

    # trim probabilities to 3 decimal places and render as "[p0 p1 ...]"
    conf_strs = ['[' + ' '.join(format(p, '.3f') for p in row) + ']'
                 for row in raw_rows]

    # TODO: the three-second prediction currently mirrors the instantaneous one
    c_tpis = c_pis
    c_tpis_conf = conf_strs

    # the same feature list is repeated on every row
    ttf_str = str(c_ttf).replace(",", "")

    c_df[output_header[2]] = c_pis
    c_df[output_header[3]] = c_tpis
    c_df[output_header[4]] = conf_strs
    c_df[output_header[5]] = c_tpis_conf
    c_df[output_header[6]] = [ttf_str] * len(raw_rows)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement
    return pd.concat([full_pred, c_df])

## Main

In [None]:
# # mock path
# input_file = 'py_temp/mock_data/test_mock.csv'
# output_file = 'py_temp/output/solution.csv'
# model_file = 'py_temp/output/model.json'

In [32]:
def main():
    """Run inference from the command line.

    Expects three arguments in ``sys.argv``: the input CSV, the output CSV
    and the saved XGBoost model file. Loads and aggregates the input, runs
    the model, and writes the formatted predictions to the output file.

    Returns
    -------
    int
        0 on success, 1 when a required argument is missing.
    """
    if len(sys.argv) < 2 or len(sys.argv[1]) == 0:
        print("Testing input file is missing.")
        return 1

    if len(sys.argv) < 3 or len(sys.argv[2]) == 0:
        print("Testing output file is missing.")
        return 1

    # the model path is read below, so it has to be validated as well
    if len(sys.argv) < 4 or len(sys.argv[3]) == 0:
        print("Testing model file is missing.")
        return 1

    print('Testing started.')

    input_file = sys.argv[1]
    output_file = sys.argv[2]
    model_file = sys.argv[3]

    # load and process test data
    test_df = load_data_in_chunks(input_file)

    # load saved model
    bst = xgb.Booster()
    bst.load_model(model_file)

    # make prediction
    xg_test = xgb.DMatrix(test_df)
    c_prob = bst.predict(xg_test)

    # post process predicted probabilities
    keys = test_df.reset_index()
    c_time = keys['time']
    c_suite = keys['test_suite']
    # top three features ranked by the model's gain importance
    scores = bst.get_score(importance_type='gain')
    c_ttf = sorted(scores, key=scores.get, reverse=True)[:3]

    # pass values into the post-process function
    full_pred = pd.DataFrame(columns=OUTPUT_HEADER)
    full_pred = test_postprocess(c_time, c_suite, c_prob, c_ttf, full_pred)

    # output the solution
    full_pred.to_csv(output_file, index=False)

    print('Testing finished.')

    return 0

if __name__ == "__main__":
    main()


500000 rows loaded....
Data loaded.

Testing finished.


In [50]:
# Path of the solution CSV to inspect below.
# NOTE(review): hard-coded, repository-specific path; the commented line is
# an alternative location kept for reference.
# output_file = 'py_temp/output/solution.csv'
output_file = 'submissions/draft_submission/code/src/sample/submission/output/solution.csv'

In [51]:
# Load the solution file at `output_file` for comparison.
my_sol = pd.read_csv(output_file)

In [36]:
# Load the sample solution used as the reference for the comparison.
S_SOL = 'sample-submission/solution/solution.csv'
s_sol = pd.read_csv(S_SOL)

In [52]:
# Preview the first rows of the solution loaded from `output_file`.
my_sol.head()

Unnamed: 0,timestamp,test_suite,predicted_induced_state,three_sec_predicted_induced_state,predicted_induced_state_confidence,three_sec_predicted_induced_state_confidence,top_three_features
0,1626275616000000,Provisional-Test-025,baseline,baseline,[0.002 0.002 0.003 0.982 0.008 0.003],[0.002 0.002 0.003 0.982 0.008 0.003],['E4_GSR' 'ViveEye_gazeOrigin_R_Y' 'LooxidLink...
1,1626275617000000,Provisional-Test-025,baseline,baseline,[0.002 0.002 0.003 0.982 0.008 0.003],[0.002 0.002 0.003 0.982 0.008 0.003],['E4_GSR' 'ViveEye_gazeOrigin_R_Y' 'LooxidLink...
2,1626275618000000,Provisional-Test-025,baseline,baseline,[0.002 0.002 0.003 0.982 0.007 0.003],[0.002 0.002 0.003 0.982 0.007 0.003],['E4_GSR' 'ViveEye_gazeOrigin_R_Y' 'LooxidLink...
3,1626275619000000,Provisional-Test-025,baseline,baseline,[0.002 0.002 0.003 0.983 0.007 0.003],[0.002 0.002 0.003 0.983 0.007 0.003],['E4_GSR' 'ViveEye_gazeOrigin_R_Y' 'LooxidLink...
4,1626275620000000,Provisional-Test-025,baseline,baseline,[0.002 0.002 0.003 0.983 0.007 0.003],[0.002 0.002 0.003 0.983 0.007 0.003],['E4_GSR' 'ViveEye_gazeOrigin_R_Y' 'LooxidLink...


In [37]:
# Preview the first rows of the sample solution.
s_sol.head()

Unnamed: 0,timestamp,test_suite,predicted_induced_state,three_sec_predicted_induced_state,predicted_induced_state_confidence,three_sec_predicted_induced_state_confidence,top_three_features
0,1626275616000000,Provisional-Test-025,high,high,[0.283 0.609 0.656 0.391 0.897 0.863],[0.95 0.986 0.137 0.793 0.188 0.808],['Myo_EMG_2' 'LooxidLink_EEG_FP1' 'ViveEye_pup...
1,1626275617000000,Provisional-Test-025,high,high,[0.007 0.485 0.693 0.735 0.149 0.004],[0.556 0.07 0.68 0.286 0.721 0.821],['Muse_EEG_TP10' 'Muse_EEG_TP9' 'LooxidLink_EE...
2,1626275618000000,Provisional-Test-025,high,high,[0.505 0.365 0.195 0.467 0.496 0.511],[0.299 0.687 0.999 0.185 0.336 0.284],['ViveEye_gazeDirection_L_X' 'Myo_EMG_2' 'Vive...
3,1626275619000000,Provisional-Test-025,baseline,baseline,[0.16 0.235 0.859 0.074 0.595 0.924],[0.798 0.045 0.238 0.221 0.608 0.218],['ViveEye_pupilPos_L_Y' 'ViveEye_eyeOpenness_R...
4,1626275620000000,Provisional-Test-025,baseline,baseline,[0.687 0.378 0.819 0.749 0.87 0.01 ],[0.364 0.606 0.203 0.132 0.415 0.128],['ViveEye_pupilDiameter_R' 'ViveEye_pupilPos_R...


In [54]:
# Count rows whose test_suite differs from the sample solution. Only as many
# sample rows as my_sol contains are compared, so the check does not depend
# on my_sol having a particular number of rows.
n_rows = len(my_sol)
int((my_sol.test_suite != s_sol.test_suite[:n_rows]).sum())

0