In [3]:
import pandas as pd
pd.set_option('display.max_columns', 56)
import numpy as np
import seaborn as sns
import xgboost as xgb

import datetime

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

### Config

In [4]:
# Input datasets
TRAIN_PATH = 'data/data_training.csv'
PROV_PATH = 'data/data_provisional.csv'
SAMPLE_PATH = 'data/sample-data.csv'

# Directory that receives saved models and submission files
OUTPUT_PATH = 'submissions/output/'

# Files shipped with the sample submission
S_TRAIN = 'sample-submission/code/data/training.csv'
S_TEST = 'sample-submission/code/data/testing.csv'
S_SOL = 'sample-submission/solution/solution.csv'

# Induced-state labels; a label's position in this tuple is its integer class id.
_STATE_LABELS = ('low', 'medium', 'high', 'baseline', 'channelized', 'surprise')
category = {label: code for code, label in enumerate(_STATE_LABELS)}  # label -> id
rev_category = dict(enumerate(_STATE_LABELS))                         # id -> label

# Column order of the submission file.
output_header = [
    'timestamp',
    'test_suite',
    'predicted_induced_state',
    'three_sec_predicted_induced_state',
    'predicted_induced_state_confidence',
    'three_sec_predicted_induced_state_confidence',
    'top_three_features',
]

## Pre-processing & post-processing

In [5]:
def train_preprocess(df):
    """Split a labelled frame into a feature matrix and integer class labels.

    The input frame is not modified, so the function can be called again on
    the same frame (for example when the notebook cell is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time`` column (it becomes the index) and an
        ``induced_state`` column.  A ``test_suite`` column is dropped when
        present.

    Returns
    -------
    X : pandas.DataFrame
        ``df`` indexed by ``time`` without ``test_suite`` and ``induced_state``.
    Y : pandas.Series
        ``induced_state`` with every label found in the module-level
        ``category`` mapping replaced by its integer id, indexed by ``time``.
        Values that are not keys of ``category`` are passed through unchanged.

    Raises
    ------
    KeyError
        If ``time`` or ``induced_state`` is missing from ``df``.
    """
    # Explicit per-value lookup instead of Series.replace: the result holds
    # plain integers without relying on replace() to downcast an object column.
    encoded = df['induced_state'].map(lambda state: category.get(state, state))

    # assign()/set_index() return new frames, leaving the caller's frame intact.
    indexed = df.assign(induced_state=encoded).set_index('time')

    # columns that must not be used as features
    drop_cols = ['test_suite', 'induced_state']  # , 'tlx_score']

    X = indexed.drop(columns=drop_cols, errors='ignore')
    Y = indexed['induced_state']

    return X, Y

In [6]:
def round_time(time_serie):
    """Round epoch timestamps in microseconds to the nearest whole second.

    Parameters
    ----------
    time_serie : pandas.Series
        Timestamps as microseconds since the Unix epoch (numbers or numeric
        strings).

    Returns
    -------
    pandas.Series
        Same index and name as the input; every value is the rounded
        timestamp, still expressed in microseconds.  Duplicates produced by
        the rounding are kept.
    """
    micros = pd.to_numeric(time_serie)

    # Work on epoch offsets only (no naive local datetimes), so the result
    # does not depend on the machine's timezone or on DST transitions, and the
    # whole column is processed in one vectorised pass.
    rounded = pd.to_datetime(micros, unit='us').dt.round('1s')

    # Back to integer microseconds; dividing a timedelta by a timedelta is
    # independent of the datetime resolution pandas picked internally.
    return (rounded - pd.Timestamp(0)) // pd.Timedelta(microseconds=1)

In [7]:
def test_preprocess(df):
    
    # rounding timestamp and set as index
    df['time'] = round_time(df['time'])
    df.set_index('time', inplace=True)
#     df['time'] = pd.to_numeric(df['time'])
#     df['time'] = df['time'].apply(lambda x: datetime.datetime.fromtimestamp(x/1000000))
#     df['time'] = df['time'].dt.round('1s')
#     df['time'] = df['time'].apply(lambda x: int(datetime.datetime.timestamp(x)*1000000))

#     # TODO: drop duplcaites & reindex
#     submission_data = df[['timestamp','test_suite']].copy()
#     submission_data = submission_data.drop_duplicates()
#     submission_data = submission_data.reset_index(drop=True)
    
    # drop not used cols
    drop_cols = ['test_suite']#, 'induced_state'] #, 'tlx_score']
    X = df.loc[:, [ c not in drop_cols for c in df.columns]]
    
    return X

In [8]:
def test_postprocess(c_time, c_suite, c_prob, full_pred):
    
    # create a empty dataframe for chunk data
    c_df = pd.DataFrame(columns=output_header)
    
    # setting time & test_suite
    c_df[output_header[0]] = c_time
    c_df[output_header[1]] = c_suite
    
    # process predicted probabilties
    # trim prob into 3 decimal places
    c_prob = np.vectorize(lambda x: format(x, '.3f'))(c_prob).tolist()
    # find the pred(highest prob) index for each row 
    c_pred = [r.index(max(r)) for r in c_prob]
    # map the pred index into string instance
    c_pis = [rev_category[p] for p in c_pred]
    
    # TODO: 
    c_tpis = c_pis
    c_tpis_prob = c_prob
#     c_ttf = 

    c_df[output_header[2]] = c_pis
    c_df[output_header[3]] = c_tpis
    c_df[output_header[4]] = [str(r).replace(",", "").replace("\'", "") for r in c_prob]
    c_df[output_header[5]] = [str(r).replace(",", "").replace("\'", "") for r in c_tpis_prob]
    
    return full_pred.append(c_df)

## Train model

In [6]:
# Booster settings handed to xgb.train
param = {
    "booster": "gbtree",
    "objective": "multi:softprob",  # multi-class problem
    "num_class": 6,                 # number of classes, used together with the multi-class objective
    "gamma": 0.1,                   # controls post-pruning; larger is more conservative (typically 0.1 or 0.2)
    "max_depth": 12,                # depth of each tree; the deeper, the easier it overfits
    "lambda": 2,                    # L2 regularisation weight; larger values make overfitting less likely
    "subsample": 0.7,               # random sub-sampling of the training rows
    "colsample_bytree": 0.7,        # column sub-sampling when a tree is built
    "min_child_weight": 3,
    # "silent": 1,                  # 1 suppresses run-time messages; 0 is recommended
    "eta": 0.007,                   # acts like a learning rate
    "seed": 1000,
    # "nthread": 4,                 # number of CPU threads
}

In [7]:
# Read the sample dataset and turn it into features / encoded labels.
sample_data = pd.read_csv(SAMPLE_PATH)
X, Y = train_preprocess(sample_data)

# Hold out 30% of the rows for validation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0
)

# Wrap both partitions in DMatrix objects (training first, then validation).
xg_train, xg_test = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [26]:
# Train for at most `num_round` boosting rounds while reporting the metric on
# both partitions.  With several watchlist entries xgboost uses the last one
# ('test') for early stopping: training ends once it has not improved for 10
# consecutive rounds.
num_round = 800
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
bst = xgb.train(
    params=param,
    dtrain=xg_train,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=10,
);

[0]	train-mlogloss:1.77816	test-mlogloss:1.77888
[1]	train-mlogloss:1.76406	test-mlogloss:1.76538
[2]	train-mlogloss:1.74875	test-mlogloss:1.75048
[3]	train-mlogloss:1.73429	test-mlogloss:1.73663
[4]	train-mlogloss:1.71955	test-mlogloss:1.72232
[5]	train-mlogloss:1.70553	test-mlogloss:1.70875
[6]	train-mlogloss:1.69078	test-mlogloss:1.69435
[7]	train-mlogloss:1.67680	test-mlogloss:1.68072
[8]	train-mlogloss:1.66402	test-mlogloss:1.66832
[9]	train-mlogloss:1.65042	test-mlogloss:1.65516
[10]	train-mlogloss:1.63681	test-mlogloss:1.64190
[11]	train-mlogloss:1.62574	test-mlogloss:1.63144
[12]	train-mlogloss:1.61225	test-mlogloss:1.61827
[13]	train-mlogloss:1.60051	test-mlogloss:1.60696
[14]	train-mlogloss:1.58854	test-mlogloss:1.59536
[15]	train-mlogloss:1.57662	test-mlogloss:1.58387
[16]	train-mlogloss:1.56580	test-mlogloss:1.57367
[17]	train-mlogloss:1.55404	test-mlogloss:1.56227
[18]	train-mlogloss:1.54250	test-mlogloss:1.55112
[19]	train-mlogloss:1.53159	test-mlogloss:1.54075
[20]	train

[158]	train-mlogloss:0.66360	test-mlogloss:0.70425
[159]	train-mlogloss:0.65999	test-mlogloss:0.70070
[160]	train-mlogloss:0.65636	test-mlogloss:0.69714
[161]	train-mlogloss:0.65298	test-mlogloss:0.69391
[162]	train-mlogloss:0.64971	test-mlogloss:0.69079
[163]	train-mlogloss:0.64633	test-mlogloss:0.68750
[164]	train-mlogloss:0.64331	test-mlogloss:0.68456
[165]	train-mlogloss:0.63997	test-mlogloss:0.68131
[166]	train-mlogloss:0.63647	test-mlogloss:0.67788
[167]	train-mlogloss:0.63326	test-mlogloss:0.67481
[168]	train-mlogloss:0.63043	test-mlogloss:0.67215
[169]	train-mlogloss:0.62698	test-mlogloss:0.66878
[170]	train-mlogloss:0.62355	test-mlogloss:0.66546
[171]	train-mlogloss:0.62023	test-mlogloss:0.66223
[172]	train-mlogloss:0.61703	test-mlogloss:0.65912
[173]	train-mlogloss:0.61355	test-mlogloss:0.65572
[174]	train-mlogloss:0.61087	test-mlogloss:0.65319
[175]	train-mlogloss:0.60778	test-mlogloss:0.65021
[176]	train-mlogloss:0.60488	test-mlogloss:0.64740
[177]	train-mlogloss:0.60162	te

[319]	train-mlogloss:0.31100	test-mlogloss:0.35997
[320]	train-mlogloss:0.30976	test-mlogloss:0.35879
[321]	train-mlogloss:0.30848	test-mlogloss:0.35750
[322]	train-mlogloss:0.30714	test-mlogloss:0.35618
[323]	train-mlogloss:0.30576	test-mlogloss:0.35482
[324]	train-mlogloss:0.30434	test-mlogloss:0.35341
[325]	train-mlogloss:0.30318	test-mlogloss:0.35229
[326]	train-mlogloss:0.30193	test-mlogloss:0.35106
[327]	train-mlogloss:0.30055	test-mlogloss:0.34967
[328]	train-mlogloss:0.29930	test-mlogloss:0.34843
[329]	train-mlogloss:0.29810	test-mlogloss:0.34724
[330]	train-mlogloss:0.29679	test-mlogloss:0.34591
[331]	train-mlogloss:0.29551	test-mlogloss:0.34462
[332]	train-mlogloss:0.29437	test-mlogloss:0.34350
[333]	train-mlogloss:0.29325	test-mlogloss:0.34239
[334]	train-mlogloss:0.29202	test-mlogloss:0.34114
[335]	train-mlogloss:0.29069	test-mlogloss:0.33981
[336]	train-mlogloss:0.28953	test-mlogloss:0.33865
[337]	train-mlogloss:0.28834	test-mlogloss:0.33748
[338]	train-mlogloss:0.28706	te

[480]	train-mlogloss:0.16974	test-mlogloss:0.21946
[481]	train-mlogloss:0.16923	test-mlogloss:0.21895
[482]	train-mlogloss:0.16868	test-mlogloss:0.21842
[483]	train-mlogloss:0.16805	test-mlogloss:0.21777
[484]	train-mlogloss:0.16747	test-mlogloss:0.21719
[485]	train-mlogloss:0.16694	test-mlogloss:0.21668
[486]	train-mlogloss:0.16644	test-mlogloss:0.21619
[487]	train-mlogloss:0.16585	test-mlogloss:0.21560
[488]	train-mlogloss:0.16530	test-mlogloss:0.21503
[489]	train-mlogloss:0.16473	test-mlogloss:0.21447
[490]	train-mlogloss:0.16420	test-mlogloss:0.21393
[491]	train-mlogloss:0.16364	test-mlogloss:0.21335
[492]	train-mlogloss:0.16309	test-mlogloss:0.21280
[493]	train-mlogloss:0.16252	test-mlogloss:0.21221
[494]	train-mlogloss:0.16197	test-mlogloss:0.21166
[495]	train-mlogloss:0.16146	test-mlogloss:0.21117
[496]	train-mlogloss:0.16095	test-mlogloss:0.21065
[497]	train-mlogloss:0.16041	test-mlogloss:0.21011
[498]	train-mlogloss:0.15995	test-mlogloss:0.20965
[499]	train-mlogloss:0.15944	te

[641]	train-mlogloss:0.10695	test-mlogloss:0.15666
[642]	train-mlogloss:0.10666	test-mlogloss:0.15637
[643]	train-mlogloss:0.10636	test-mlogloss:0.15604
[644]	train-mlogloss:0.10608	test-mlogloss:0.15576
[645]	train-mlogloss:0.10581	test-mlogloss:0.15548
[646]	train-mlogloss:0.10551	test-mlogloss:0.15516
[647]	train-mlogloss:0.10523	test-mlogloss:0.15487
[648]	train-mlogloss:0.10498	test-mlogloss:0.15463
[649]	train-mlogloss:0.10474	test-mlogloss:0.15439
[650]	train-mlogloss:0.10445	test-mlogloss:0.15409
[651]	train-mlogloss:0.10419	test-mlogloss:0.15382
[652]	train-mlogloss:0.10391	test-mlogloss:0.15353
[653]	train-mlogloss:0.10369	test-mlogloss:0.15333
[654]	train-mlogloss:0.10344	test-mlogloss:0.15307
[655]	train-mlogloss:0.10316	test-mlogloss:0.15278
[656]	train-mlogloss:0.10292	test-mlogloss:0.15254
[657]	train-mlogloss:0.10265	test-mlogloss:0.15228
[658]	train-mlogloss:0.10239	test-mlogloss:0.15201
[659]	train-mlogloss:0.10215	test-mlogloss:0.15178
[660]	train-mlogloss:0.10192	te

In [28]:
# Persist the trained booster next to the submission files.
bst.save_model(fname=OUTPUT_PATH + 'sample_800_model.json')

In [9]:
# bst = xgb.Booster()
# bst.load_model(OUTPUT_PATH + "sample_800_model.json")

## Make prediction

In [12]:
# Stream the test file in fixed-size chunks instead of loading it at once.
# NOTE: with `chunksize` set, read_csv returns a reader that can be iterated
# only once -- re-run this cell before iterating `prov_data` again.
chunksize = 100_000
prov_data = pd.read_csv(filepath_or_buffer=S_TEST, chunksize=chunksize)

In [45]:
# c_data = next(prov_data)
# c_data = test_preprocess(c_data)
# xg_c_data = xgb.DMatrix(c_data)
# c_prob = bst.predict( xg_c_data )

In [13]:
# Collects the formatted predictions of every chunk.
full_pred = pd.DataFrame(columns=output_header)
rows_done = 0

# NOTE: `prov_data` is consumed by this loop; re-create the reader before
# running this cell again.
for c_data in prov_data:

    # Keep the test-suite values before the column is dropped.  Only the
    # values are passed on: chunks after the first carry row labels that do
    # not start at 0, so a Series would not line up with the per-chunk output
    # frame, which is numbered from 0.
    c_suite = c_data.test_suite.to_numpy()
    rows_done += len(c_data)

    # pre-process test data
    c_data = test_preprocess(c_data)
    xg_c_data = xgb.DMatrix(c_data)

    # class probabilities from the trained model
    c_prob = bst.predict(xg_c_data)

    # post-process the predicted results
    c_time = c_data.index
    full_pred = test_postprocess(c_time, c_suite, c_prob, full_pred)

    # report the real number of rows handled (the last chunk may be shorter)
    print(f'{rows_done} done....')


100000 done....


In [14]:
# Sanity check: second class probability of the first predicted row.
# The "[p0 p1 ...]" string is split on whitespace instead of sliced at fixed
# character offsets, and .iloc addresses the first row by position so the
# lookup still yields a single string when row labels repeat.
float(full_pred['predicted_induced_state_confidence'].iloc[0].strip('[]').split()[1])

0.003

### Output predictions

In [15]:
# Borrow the `top_three_features` column from the sample solution; values are
# matched to the prediction rows by row label.
s_sol = pd.read_csv(S_SOL)
full_pred = full_pred.assign(top_three_features=s_sol['top_three_features'])

In [16]:
# Write the submission file without the row index.
full_pred.to_csv(path_or_buf=OUTPUT_PATH + "solution_sample_800.csv", index=False)