In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import datetime as dt
import time as tm

In [2]:
# Load the raw shop metadata table (comma-separated, no header row).
shopInfoFile = '../dataset/shop_info.txt'
shopInfo = pd.read_csv(shopInfoFile, header=None)
# The file carries no header line, so the column names are attached explicitly.
shopInfo.columns = [
    'shopID', 'city', 'locationID', 'perPay', 'score',
    'commentCnt', 'shopLevel', 'cate1', 'cate2', 'cate3',
]

In [3]:
# define evaluation function
def evaluation(preds, dtrain):
    """Mean symmetric relative error between predictions and labels.

    Written in the ``(preds, dtrain) -> (name, value)`` shape so it can be
    handed to xgboost as a custom evaluation metric.

    Each sample contributes ``|label - pred| / (label + pred)``; the result is
    the average of those terms.

    Parameters
    ----------
    preds : array-like
        Predicted values, one per sample.
    dtrain : object exposing ``get_label()``
        Container whose ``get_label()`` returns the true values in the same
        order as ``preds``.

    Returns
    -------
    tuple
        ``('loss', value)`` where ``value`` is a Python float.

    Notes
    -----
    A sample whose label and prediction are both exactly zero is a perfect
    prediction, but evaluates to 0/0. It is counted as zero error instead of
    NaN so that one such sample does not turn the whole metric into NaN.
    Any other zero denominator (label == -pred, non-zero) still yields inf.
    Arithmetic is done in float64.
    """
    labels = np.asarray(dtrain.get_label(), dtype=float)
    preds = np.asarray(preds, dtype=float)

    numerator = np.abs(labels - preds)
    denominator = labels + preds

    # Only the exact 0/0 case is special-cased; everything else is divided
    # normally so genuinely bad predictions are never hidden.
    bothZero = (labels == 0) & (preds == 0)
    terms = np.zeros_like(numerator)
    np.divide(numerator, denominator, out=terms, where=~bothZero)

    return 'loss', float(terms.sum() / labels.size)

In [4]:
#prepare training and testing data
# Feature matrices: plain CSV files without a header row.
trainFile = '../preprocess/trainValidFeatures_ensemble.csv'
testFile = '../preprocess/validFeatures_ensemble.csv'
trainData = pd.read_csv(trainFile, header=None)
testData = pd.read_csv(testFile, header=None)
# Space-separated table without a header row, so its columns get default
# integer labels (0, 1, 2, ...).
payTH = pd.read_csv('../preprocess/payTH_parallel.txt', sep=" ", header=None)

# Column names follow the pattern 'last<period>days_<suffix>'; each family
# below is the cross product of a list of periods and a list of suffixes,
# with the period varying slowest.

# Statistic columns over five look-back periods.
periods = [7, 14, 28, 56, 112]
stats = ['mean', 'std', 'skew', 'kurtosis']
recentDataColumns = ['last{0}days_{1}'.format(p, s) for p in periods for s in stats]

# '...View' statistic columns over the three shortest periods.
periods = [7, 14, 28]
stats = ['meanView', 'stdView', 'skewView', 'kurtosisView']
recentDataViewColumns = ['last{0}days_{1}'.format(p, s) for p in periods for s in stats]

# Trend columns ('copy' / 'ridge') over five look-back periods.
periods = [7, 14, 28, 56, 112]
trends = ['copy', 'ridge']
currentTrendcolumns = ['last{0}days_{1}'.format(p, t) for p in periods for t in trends]

primaryKey = ['shopID', 'year', 'month', 'day']

# Feature groups, keyed by group name.
columnDic = {
    'basicInfo': ['city', 'perPay', 'score', 'commentCnt', 'shopLevel', 'category'],
    'recentData': recentDataColumns,
    'recentDataView': recentDataViewColumns,
    'currentTrend': currentTrendcolumns,
    'temporalInfo': ['dayOfWeek', 'holiday', 'numHolidayLast', 'numHolidayCur', 'numHolidayNext'],
    'weather': ['maxTemp', 'minTemp', 'weather', 'pm'],
}

# Full column layout: the four key columns first, then every feature group
# concatenated in the order given by orderCol.
orderCol = ['basicInfo', 'recentData', 'temporalInfo', 'currentTrend', 'weather', 'recentDataView']
ensembleCol = ['shopID', 'year', 'month', 'day'] + [
    name for group in orderCol for name in columnDic[group]
]
    
# The feature CSVs were read without headers; attach the assembled names.
trainData.columns = ensembleCol
testData.columns = ensembleCol

# Overall calendar span; startDate is the origin for all day offsets below.
startDate = dt.date(2015, 7, 1)
endDate = dt.date(2016, 10, 31)

# Inclusive date windows for the training and validation labels.
startDateTrain = dt.date(2016, 9, 20)
endDateTrain = dt.date(2016, 10, 17)
startDateTest = dt.date(2016, 10, 18)
endDateTest = dt.date(2016, 10, 31)

# Convert each window boundary into a number of days since startDate.
startTrain, endTrain, startValid, endValid = [
    (boundary - startDate).days
    for boundary in (startDateTrain, endDateTrain, startDateTest, endDateTest)
]

# Select the payTH columns whose integer labels fall inside each window and
# flatten row by row, so each payTH row contributes its selected days
# consecutively.
# NOTE(review): this assumes payTH column k holds day startDate + k and that
# the feature files list rows in the same row-major order - confirm against
# the preprocessing step.
trainLabel = payTH[np.arange(startTrain, endTrain + 1)].values.ravel()
validLabel = payTH[np.arange(startValid, endValid + 1)].values.ravel()

In [9]:
#parameter selection
# Exhaustive grid search over tree depth, learning rate and number of
# boosting rounds. Every combination is trained on the training matrix and
# scored on the validation window; scores and their parameter triples are
# collected in validError / paramsList (same index in both lists).
maxDepthList = [5, 6, 7, 8, 9, 10]
etaList = [0.02, 0.05]
numRoundsList = [500, 1000, 1500, 2000]

# NOTE(review): defined but never passed to xgb.train in this cell, so no
# early stopping actually takes place here.
early_stopping_rounds = 100

# NOTE(review): ensembleCol starts with the key columns
# ('shopID', 'year', 'month', 'day'), so they are used as model features -
# confirm that this is intended.
selectCol = ensembleCol

validError = []
paramsList = []

# The matrices do not depend on the hyper-parameters, so build them once
# instead of once per grid point.
dtrain = xgb.DMatrix(data = trainData[selectCol].values, label = trainLabel, missing=np.nan)
dval = xgb.DMatrix(data = testData[selectCol].values, missing=np.nan)

for maxDepth in maxDepthList:
    for eta in etaList:
        for numRounds in numRoundsList:
            params = {
                'objective':'reg:linear',
                'max_depth':maxDepth,
                'silent':1,
                'eta':eta,
                'subsample':0.9,
                'colsample_bytree':0.7,
            }
            num_rounds = numRounds

            print("calculating model with parameters: max_depth = {0}, eta = {1}, num_rounds = {2}".format(maxDepth, eta, numRounds))

            bst = xgb.train(params, dtrain, num_rounds)
            pred = bst.predict(dval)
            # Same symmetric relative error as the evaluation() metric.
            # Stored under its own name: assigning the score to `evaluation`
            # would overwrite that function with a float.
            validLoss = abs((validLabel - pred)/(validLabel + pred)).sum()/len(validLabel)
            print("validation error = {0}".format(validLoss))
            validError.append(validLoss)
            paramsList.append([maxDepth, eta, numRounds])
            

calculating model with parameters: max_depth = 5, eta = 0.02, num_rounds = 500
validation error = 0.10356921445367577
calculating model with parameters: max_depth = 5, eta = 0.02, num_rounds = 1000
validation error = 0.10321155274186349
calculating model with parameters: max_depth = 5, eta = 0.02, num_rounds = 1500
validation error = 0.10344169780216501
calculating model with parameters: max_depth = 5, eta = 0.02, num_rounds = 2000
validation error = 0.10368820614912821
calculating model with parameters: max_depth = 5, eta = 0.05, num_rounds = 500
validation error = 0.10452251643892431
calculating model with parameters: max_depth = 5, eta = 0.05, num_rounds = 1000
validation error = 0.10520010426312451
calculating model with parameters: max_depth = 5, eta = 0.05, num_rounds = 1500
validation error = 0.10601472102415216
calculating model with parameters: max_depth = 5, eta = 0.05, num_rounds = 2000
validation error = 0.10674343252743097
calculating model with parameters: max_depth = 6, 