In [12]:
import datetime
import sys

import pandas as pd
import xgboost as xgb

## Config (to be imported from separate file)

In [21]:
# Label encoding for the induced-state classes and its inverse lookup.
CATEGORY = dict(low=0, medium=1, high=2, baseline=3, channelized=4, surprise=5)
REV_CATEGORY = dict(zip(CATEGORY.values(), CATEGORY.keys()))

# Column layout of the prediction output.
OUTPUT_HEADER = [
    'timestamp',
    'test_suite',
    'predicted_induced_state',
    'three_sec_predicted_induced_state',
    'predicted_induced_state_confidence',
    'three_sec_predicted_induced_state_confidence',
    'top_three_features',
]

# XGBoost booster parameters handed to xgb.train().
param = {
    'booster': 'gbtree',
    # multi-class classification problem
    'objective': 'multi:softprob',
    # number of classes; goes together with the multi-class objective
    'num_class': 6,
    # controls post-pruning; larger is more conservative (typically 0.1 or 0.2)
    'gamma': 0.1,
    # depth of the trees; the larger, the easier it is to overfit
    'max_depth': 12,
    # L2 regularisation weight controlling model complexity;
    # the larger, the less likely the model is to overfit
    'lambda': 2,
    # random subsampling of the training rows
    'subsample': 0.7,
    # column subsampling performed when building each tree
    'colsample_bytree': 0.7,
    'min_child_weight': 3,
    # acts like a learning rate
    'eta': 0.007,
    'seed': 1000,
    # Disabled options, kept for reference:
    #   'silent': 1   -> 1 suppresses run-time output; 0 is recommended
    #   'nthread': 4  -> number of CPU threads
}

# number of boosting iterations for model training
NUM_ROUND = 50

### Helpers (to be imported from a separate file)

In [14]:
def round_time(time_serie):
    """Round epoch timestamps given in microseconds to the nearest second.

    Parameters
    ----------
    time_serie : pandas.Series
        Timestamps as microseconds since the Unix epoch. Values may be
        numeric or strings that ``pd.to_numeric`` can parse.

    Returns
    -------
    pandas.Series
        Integer microsecond timestamps, each a whole number of seconds,
        carrying the same index as the input.
    """
    micros = pd.to_numeric(time_serie)

    # Vectorised and anchored to the Unix epoch: avoids a per-row round trip
    # through naive local-time datetime objects, which is slow on large
    # chunks and depends on the machine's time zone / DST rules. It also
    # works on an empty series.
    rounded = pd.to_datetime(micros, unit='us').dt.round('1s')

    # Timedelta floor-division yields integer microseconds independent of
    # the datetime resolution pandas picked internally.
    return (rounded - pd.Timestamp(0)) // pd.Timedelta(1, unit='us')

def mode(series):
    """Return the most common element of ``series``.

    The element is taken from the first position of ``value_counts()``,
    i.e. the entry with the highest count.
    """
    frequencies = series.value_counts()
    return frequencies.index[0]

def normal_mean(series):
    """Mean of ``series`` ignoring the ``-9999.9`` default placeholder.

    When the series holds more than one distinct value, the mean of the
    values greater than ``-9999.9`` is returned. When it holds a single
    distinct value (placeholder or not), that value is returned as a scalar.
    When it holds no non-missing value at all, NaN is returned.

    Always returning a scalar matters because this function is used as a
    ``groupby().agg`` reducer; the array produced by ``series.unique()``
    is not a valid aggregated value and may even contain several entries
    when NaN is present next to one real value.
    """
    if series.nunique() > 1:
        return series[series > -9999.9].mean()

    # nunique() ignores NaN, so drop it here as well before picking the value.
    distinct = series.dropna().unique()
    if len(distinct) == 0:
        return float('nan')
    return distinct[0]

### User Input

In [15]:
# Train data path
# NOTE(review): not referenced by the code visible in this notebook, which
# takes its input/output paths from sys.argv in main() -- confirm it is used.
TRAIN_PATH = 'data/data_training.csv'
# MODEL PATHS
# NOTE(review): likewise unused in the visible code.
MODEL_PATH = 'submissions/output/'


# Py file

In [16]:
def data_preprocess(df, full_df):
    """Aggregate one raw chunk per second and add it to the running result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw chunk containing at least the ``time`` and ``test_suite``
        columns. It is modified in place (label mapping and time rounding).
    full_df : pandas.DataFrame
        Rows accumulated from previous chunks, indexed by
        ``(time, test_suite)``.

    Returns
    -------
    pandas.DataFrame
        ``full_df`` followed by the aggregated rows of ``df``. Duplicate
        index entries across chunks are NOT merged here.
    """
    group_keys = ['time', 'test_suite']

    # map 'induced_state' labels to their integer codes (training data only)
    if 'induced_state' in df.columns:
        df["induced_state"] = df["induced_state"].replace(CATEGORY)

    # round time to whole seconds
    df['time'] = round_time(df['time'])

    # Aggregate every non-key column. Columns are selected by name rather
    # than by position so the result does not depend on the key columns
    # being the first two in the file.
    col_merger = {col: normal_mean for col in df.columns if col not in group_keys}
    if 'induced_state' in col_merger:
        col_merger['induced_state'] = mode
    aggregated = df.groupby(group_keys).agg(col_merger)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # replacement. Skipping the still-empty accumulator keeps its
    # placeholder dtypes from leaking into the result.
    if len(full_df) == 0:
        return aggregated
    return pd.concat([full_df, aggregated])

In [17]:
def load_data_in_chunks(path, chunksize=500000):
    """Read a CSV in chunks and aggregate it to one row per second and suite.

    Each chunk is reduced by ``data_preprocess``; rows that share the same
    ``(time, test_suite)`` key across chunk boundaries are then merged in a
    final aggregation pass.

    Parameters
    ----------
    path : str
        Location of the CSV file; it must contain ``time`` and
        ``test_suite`` columns.
    chunksize : int, optional
        Number of rows read per chunk.

    Returns
    -------
    pandas.DataFrame
        Aggregated data indexed by ``(time, test_suite)``.
    """
    group_keys = ['time', 'test_suite']

    # Let pandas parse the header row so the column names are exactly the
    # ones the chunks will carry (a manual split(',') disagrees for quoted
    # or otherwise non-trivial headers).
    header = pd.read_csv(path, nrows=0).columns
    full_df = pd.DataFrame(columns=header).set_index(group_keys)

    # iterate through the data in chunks and process them
    reader = pd.read_csv(path, chunksize=chunksize)
    rows_done = 0
    try:
        for chunk in reader:
            # count actual rows so the last, partial chunk is reported correctly
            rows_done += len(chunk)
            full_df = data_preprocess(chunk, full_df)
            print(f'{rows_done} done....')
    finally:
        # release the underlying file handle even if processing fails
        reader.close()

    # further aggregate the duplicates generated from different chunks
    # (note: this averages per-chunk means, not the raw rows)
    col_merger = {col: normal_mean for col in full_df.columns}
    if 'induced_state' in col_merger:
        col_merger['induced_state'] = mode

    return full_df.groupby(group_keys).agg(col_merger)

In [18]:
def train_preprocess(df, test_size=0):
    """Build the XGBoost training matrix from the aggregated data.

    Parameters
    ----------
    df : pandas.DataFrame
        Aggregated data containing the ``induced_state`` label column and a
        ``tlx_score`` column. The frame is not modified.
    test_size : float, optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    xgboost.DMatrix
        All columns except ``induced_state`` as features, with
        ``induced_state`` as the label.
    """
    label_col = 'induced_state'

    # enforce dtypes on a copy instead of writing back into the caller's frame
    typed = df.astype({label_col: int, 'tlx_score': int})

    # NOTE(review): 'tlx_score' stays in the feature set -- confirm it is
    # also available at prediction time.
    features = typed.drop(columns=[label_col])
    labels = typed[label_col]

    return xgb.DMatrix(features, label=labels)

### Main

In [23]:
def main():
    """Train the XGBoost model from the command line.

    Expects two command-line arguments: the training CSV path
    (``sys.argv[1]``) and the path the trained model is written to
    (``sys.argv[2]``).

    Returns
    -------
    int
        ``0`` on success, ``1`` when an argument is missing or empty.
    """
    args = sys.argv[1:]

    if len(args) < 1 or len(args[0]) == 0:
        print("Training input file is missing.")
        return 1

    if len(args) < 2 or len(args[1]) == 0:
        print("Training output file is missing.")
        return 1

    print('Training started.')

    input_file, model_file = args[0], args[1]

    # load & process data
    df = load_data_in_chunks(input_file)
    xg_train = train_preprocess(df)

    # train xgb model
    # Only the training set is monitored, so early stopping is driven by the
    # training loss.
    watchlist = [(xg_train, 'train')]
    # Keyword arguments: recent xgboost releases deprecate passing
    # arguments past dtrain positionally to train().
    bst = xgb.train(
        param,
        xg_train,
        num_boost_round=NUM_ROUND,
        evals=watchlist,
        early_stopping_rounds=10,
    )

    # output trained model
    bst.save_model(model_file)

    print('Training finished.')

    return 0

# Script entry point: run the training only when executed directly.
# NOTE(review): the status code returned by main() is discarded here.
if __name__ == "__main__":
    main()

500000 done....
[0]	train-mlogloss:1.77387
[1]	train-mlogloss:1.75660
[2]	train-mlogloss:1.74001
[3]	train-mlogloss:1.72325
[4]	train-mlogloss:1.70623
[5]	train-mlogloss:1.68996
[6]	train-mlogloss:1.67374
[7]	train-mlogloss:1.65792
[8]	train-mlogloss:1.64252
[9]	train-mlogloss:1.62756
[10]	train-mlogloss:1.61260
[11]	train-mlogloss:1.59823
[12]	train-mlogloss:1.58361
[13]	train-mlogloss:1.56942
[14]	train-mlogloss:1.55513
[15]	train-mlogloss:1.54106
[16]	train-mlogloss:1.52737
[17]	train-mlogloss:1.51398
[18]	train-mlogloss:1.50053
[19]	train-mlogloss:1.48749
[20]	train-mlogloss:1.47457
[21]	train-mlogloss:1.46151
[22]	train-mlogloss:1.44900
[23]	train-mlogloss:1.43626
[24]	train-mlogloss:1.42448
[25]	train-mlogloss:1.41256
[26]	train-mlogloss:1.40056
[27]	train-mlogloss:1.38883
[28]	train-mlogloss:1.37708
[29]	train-mlogloss:1.36566
[30]	train-mlogloss:1.35397
[31]	train-mlogloss:1.34238
[32]	train-mlogloss:1.33092
[33]	train-mlogloss:1.31973
[34]	train-mlogloss:1.30904
[35]	train-mlo