In [1]:
import datetime
import random
import sys

import numpy as np
import pandas as pd
import xgboost as xgb

## Config (to be imported from a separate file)

In [None]:
# Lookup tables between induced-state labels and their integer codes,
# plus the column order of the submission file.
CATEGORY = {
    'low': 0,
    'medium': 1,
    'high': 2,
    'baseline': 3,
    'channelized': 4,
    'surprise': 5,
}
# Inverse mapping: integer code -> label.
REV_CATEGORY = dict(zip(CATEGORY.values(), CATEGORY.keys()))
OUTPUT_HEADER = [
    'timestamp',
    'test_suite',
    'predicted_induced_state',
    'three_sec_predicted_induced_state',
    'predicted_induced_state_confidence',
    'three_sec_predicted_induced_state_confidence',
    'top_three_features',
]

# Booster parameter dictionary (keys follow XGBoost's parameter names).
# NOTE(review): presumably passed to xgb.train together with NUM_ROUND — no
# call site is visible in this file; confirm.
param = {
    'booster': 'gbtree',
    'objective': 'multi:softprob',  # multi-class classification problem
    'num_class': 6,               # number of classes; used together with the multi-class softmax objective
    'gamma': 0.1,                  # controls post-pruning; larger is more conservative, typically around 0.1-0.2
    'max_depth': 12,               # depth of the trees built; the larger, the easier it is to overfit
    'lambda': 2,                   # L2 regularisation on weights controlling model complexity; larger means less prone to overfitting
    'subsample': 0.7,              # random subsampling of the training samples
    'colsample_bytree': 0.7,       # column subsampling performed when building each tree
    'min_child_weight': 3,
#     'silent': 1,                   # 1 suppresses run-time output; 0 is the recommended setting
    'eta': 0.007,                  # acts like a learning rate
    'seed': 1000,
#     'nthread': 4,                  # number of CPU threads
}

# number of boosting iterations used for model training
NUM_ROUND = 2000

### Helpers (to be imported from a separate file)

In [None]:
def round_time(time_serie):
    """Round epoch timestamps given in microseconds to the nearest second.

    The rounding is done arithmetically on the numeric values instead of
    converting every element to a naive local ``datetime`` and back. That
    keeps the result independent of the machine's timezone (the round trip
    through local time can shift values by an hour inside a DST fall-back
    window), works on an empty series, and avoids two per-element Python
    ``apply`` passes.

    Parameters
    ----------
    time_serie : pandas.Series
        Timestamps in microseconds since the epoch; anything
        ``pd.to_numeric`` accepts (ints, floats, numeric strings).

    Returns
    -------
    pandas.Series
        ``int64`` microsecond timestamps that are whole seconds, with the
        index and name of the input. Exact half-second ties round to the
        even second, matching the ``dt.round('1s')`` behaviour this
        function used before.

    Raises
    ------
    ValueError
        If a value is not numeric or is missing (NaN cannot be cast to int).
    """
    microseconds = pd.to_numeric(time_serie)
    # Series.round() rounds half to even, like Series.dt.round does.
    whole_seconds = microseconds.div(1000000).round().astype('int64')
    return whole_seconds.mul(1000000)

def mode(series):
    """Return the most common element of ``series``.

    ``value_counts`` orders the distinct values by descending frequency,
    so the first label of its index is the mode.
    """
    frequencies = series.value_counts()
    ranked_values = frequencies.index
    return ranked_values[0]

def normal_mean(series, sentinel=-9999.9):
    """Return the mean of ``series`` ignoring the ``sentinel`` default value.

    Parameters
    ----------
    series : pandas.Series
        Numeric values, possibly containing the sentinel placeholder.
    sentinel : float, optional
        Placeholder marking "no reading"; only values strictly greater than
        it take part in the mean. Defaults to ``-9999.9``.

    Returns
    -------
    scalar
        * more than one distinct value: the mean of the values above
          ``sentinel``;
        * a single distinct value (for example only the sentinel): that
          value itself;
        * no non-missing value at all: ``nan``.

    The result is always a scalar. The previous fallback returned the
    ``unique()`` array, which gave callers an ndarray (empty for an empty
    series, two entries when NaN was present) instead of a number.
    """
    if series.nunique() > 1:
        return series[series > sentinel].mean()

    # nunique() ignores NaN, so after dropna() at most one distinct value is left.
    distinct = series.dropna().unique()
    if len(distinct) == 0:
        return float('nan')
    return distinct[0]

### User Input

In [None]:
# Path of the CSV file used as test input.
# NOTE(review): the file name says "training" — confirm this is the intended test set.
TEST_PATH = 'data/data_training.csv'
# Output path (currently left empty).
OUTPUT_PATH = ''
# Model path.
# NOTE(review): main() below takes its input/output/model paths from sys.argv
# and does not read these three constants — confirm how they are meant to be used.
MODEL_PATH = 'submissions/output/'


# Py File

## Main

In [None]:
def _random_confidences(n_rows, n_values):
    """Return ``n_rows`` arrays of ``n_values`` random confidences in [0, 1], rounded to 3 decimals."""
    return [
        np.array([round(random.uniform(0, 1), 3) for _ in range(n_values)])
        for _ in range(n_rows)
    ]


def _random_features(n_rows, feature_list, n_values=3):
    """Return ``n_rows`` arrays of ``n_values`` feature names drawn at random from ``feature_list``."""
    return [
        np.array([random.choice(feature_list) for _ in range(n_values)])
        for _ in range(n_rows)
    ]


def main():
    """Command-line entry point: fit a baseline model and write the submission CSV.

    Expects three command-line arguments after the program name:

    1. input file  - CSV with the rows to predict (needs ``time`` and
       ``test_suite`` columns plus every feature column of the model file);
    2. output file - path of the CSV to write;
    3. model file  - CSV whose first column is the target and whose remaining
       columns are the features used to fit the model.

    One output row is produced per distinct (timestamp rounded to the second,
    ``test_suite``) pair, carrying the prediction of the first input row of
    that pair. Confidences and top features are random placeholders.

    Returns
    -------
    int
        ``0`` on success, ``1`` when a required argument is missing or empty.
    """
    # sys.argv[0] is the program name, so argument k lives at index k.
    for position, label in enumerate(('input', 'output', 'model'), start=1):
        if len(sys.argv) <= position or len(sys.argv[position]) == 0:
            print("Testing %s file is missing." % label)
            return 1

    print('Testing started.')

    input_file, output_file, model_file = sys.argv[1:4]

    # NOTE(review): a call to load_data_in_chunks(input_file) used to sit here.
    # Its result was never used and no such helper is defined in this file, so
    # the input is simply read once with pandas below.
    test_data = pd.read_csv(input_file)
    model_data = pd.read_csv(model_file)

    final_cols = test_data.columns.tolist()
    model_cols = model_data.columns.tolist()
    target = model_cols[0]
    columns = model_cols[1:]

    # One submission row per distinct (rounded timestamp, test_suite) pair.
    # The original row index is kept for now so that the per-row predictions
    # below align with the row each pair was first seen on.
    keys = pd.DataFrame({
        'timestamp': round_time(test_data['time']),
        'test_suite': test_data['test_suite'],
    })
    submission_data = keys.drop_duplicates().copy()

    # NOTE(review): LinearRegression is not imported anywhere in this file —
    # presumably sklearn.linear_model.LinearRegression; confirm and import it
    # with the other dependencies.
    model = LinearRegression()
    model.fit(model_data[columns], model_data[target])

    # A regression output can fall outside the valid class codes; clip it so
    # every prediction maps to a label instead of leaking a raw number.
    codes = np.clip(
        np.rint(model.predict(test_data[columns])),
        min(REV_CATEGORY),
        max(REV_CATEGORY),
    ).astype(int)
    labels = pd.Series(codes, index=test_data.index).map(REV_CATEGORY)

    # Assignment aligns on the index, i.e. on the first row of each pair.
    submission_data['predicted_induced_state'] = labels
    submission_data['three_sec_predicted_induced_state'] = labels
    submission_data = submission_data.reset_index(drop=True)

    n_rows = submission_data.shape[0]
    n_classes = len(REV_CATEGORY)

    # Placeholder values: drawn at random, in the same order as before
    # (confidences, three-second confidences, then features).
    submission_data['predicted_induced_state_confidence'] = _random_confidences(n_rows, n_classes)
    submission_data['three_sec_predicted_induced_state_confidence'] = _random_confidences(n_rows, n_classes)

    # NOTE(review): assumes the first 7 input columns are metadata rather than
    # features — confirm against the input schema.
    feature_list = final_cols[7:]
    submission_data['top_three_features'] = _random_features(n_rows, feature_list)

    submission_data.to_csv(output_file, index=False)

    print('Testing finished.')

    return 0

if __name__ == "__main__":
    # Hand main()'s return value (0 on success, 1 on a missing argument) to
    # the caller as the process exit status instead of discarding it.
    raise SystemExit(main())
