In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import datetime as dt

In [2]:
# loading original data
shopInfoFile = '../dataset/shop_info.txt'

shopInfo = pd.read_table(shopInfoFile, sep = ',', header = None)
shopInfo.columns = ['shopID', 'city', 'locationID', 'perPay', 'score', 'commentCnt', 'shopLevel', 'cate1', 'cate2', 'cate3']

In [3]:
# define evaluation function
def evaluation(preds, dtrain):
    labels = dtrain.get_label()
    num = len(labels)
    return 'loss', abs((labels - preds)/(labels + preds)).sum()/num

In [4]:
#load training and testing data
payTH = pd.read_csv('../preprocess/payTH_parallel.txt', sep=" ", header = None)
trainFile = '../preprocess/trainValidFeatures_ensemble.csv'
testFile = '../preprocess/validFeatures_ensemble.csv'
trainData = pd.read_csv(trainFile, header = None)
testData = pd.read_csv(testFile, header = None)

periods = [7, 14, 28, 56, 112]
stats = ['mean', 'std', 'skew', 'kurtosis']
recentDataColumns = []
for period in periods:
    for stat in stats:
        column  = 'last' + str(period) + 'days_' + stat
        recentDataColumns.append(column)

periods = [7, 14, 28]
stats = ['meanView', 'stdView', 'skewView', 'kurtosisView']
recentDataViewColumns = []
for period in periods:
    for stat in stats:
        column = 'last' + str(period) + 'days_' + stat
        recentDataViewColumns.append(column)            
        
periods = [7, 14, 28, 56, 112]
trends = ['copy', 'ridge']
currentTrendcolumns = []
for period in periods:
    for trend in trends:
        column = 'last' + str(period) + 'days_' + trend
        currentTrendcolumns.append(column)
        
primaryKey = ['shopID', 'year', 'month', 'day']
columnDic = {
    'basicInfo':['city', 'perPay', 'score', 'commentCnt', 'shopLevel', 'category'],
    'recentData':recentDataColumns,
    'recentDataView':recentDataViewColumns,
    'currentTrend':currentTrendcolumns,
    'temporalInfo':['dayOfWeek', 'holiday', 'numHolidayLast', 'numHolidayCur', 'numHolidayNext'],
    'weather':['maxTemp', 'minTemp', 'weather', 'pm']
}

ensembleCol = ['shopID', 'year', 'month', 'day']
orderCol = ['basicInfo', 'recentData', 'temporalInfo', 'currentTrend', 'weather', 'recentDataView']
for col in orderCol:
    ensembleCol = ensembleCol + columnDic[col]
    
trainData.columns = ensembleCol
testData.columns = ensembleCol

# ensemble model for validation

In [5]:
startDateTrain = dt.date(2016, 9, 20)
endDateTrain = dt.date(2016, 10, 17)
startDateTest = dt.date(2016, 10, 18)
endDateTest = dt.date(2016, 10, 31)
startDate = dt.date(2015, 7, 1)
endDate = dt.date(2016, 10, 31)

startTrain = (startDateTrain - startDate).days
endTrain = (endDateTrain - startDate).days
startValid = (startDateTest - startDate).days
endValid = (endDateTest - startDate).days

In [6]:
featureNum = 25   # 1/3 features
totalNum = len(ensembleCol)

randomNum = 500   # 500 random models

trainLabel = payTH[np.arange(startTrain, endTrain + 1)].values.reshape(1, -1)[0]
validLabel = payTH[np.arange(startValid, endValid + 1)].values.reshape(1, -1)[0]

params = {
    'objective':'reg:linear',
    'max_depth':5,
    'silent':1,
    'eta':0.02,
    'subsample':0.9,
    'colsample_bytree':0.7,
}
num_rounds = 1000

randomPreds = []
randomFeatures = []

for i in range(randomNum):
    print("calculating model {}".format(i + 1))
    indexFeature = np.random.permutation(totalNum)[:featureNum]
    selectCol = [ensembleCol[k] for k in indexFeature]
    randomFeatures.append(selectCol)
    
    dtrain = xgb.DMatrix(data = trainData[selectCol].values, label = trainLabel, missing=np.nan)
    dval = xgb.DMatrix(data = testData[selectCol].values, missing=np.nan)

    bst = xgb.train(params, dtrain, num_rounds)
    pred = bst.predict(dval)
    randomPreds.append(abs(pred.astype(np.int32)))

print("storing the results...")
randomPredsDF = pd.DataFrame(randomPreds)
randomFeaturesDF = pd.DataFrame(randomFeatures)
randomPredsDF.to_csv('../combination/randomPreds.csv', header = False, index = False)
randomFeaturesDF.to_csv('../combination/randomFeatures.csv', header = False, index = False)

combination = []

recordNum = len(validLabel)
randomPreds = np.array(randomPreds)

minCombination = []
minEvaluation = 1.0

num = len(randomFeatures)

for i in range(num):
    combination.append([])
    if i % 50 == 0:
        print(i)
    for j in range(num):
        pair = (randomPreds[i] + randomPreds[j])/2
        evaluation = abs((validLabel - pair)/(validLabel + pair)).sum()/recordNum
        combination[i].append(evaluation)
        if evaluation < minEvaluation:
            minEvaluation = evaluation
            minCombination = [i, j]
        
combinationDF = pd.DataFrame(combination)
combinationDF.to_csv('../combination/combination.csv', header = False, index = False)
combinationDF

calculating model 1
calculating model 2
calculating model 3
calculating model 4
calculating model 5
calculating model 6
calculating model 7
calculating model 8
calculating model 9
calculating model 10
calculating model 11
calculating model 12
calculating model 13
calculating model 14
calculating model 15
calculating model 16
calculating model 17
calculating model 18
calculating model 19
calculating model 20
calculating model 21
calculating model 22
calculating model 23
calculating model 24
calculating model 25
calculating model 26
calculating model 27
calculating model 28
calculating model 29
calculating model 30
calculating model 31
calculating model 32
calculating model 33
calculating model 34
calculating model 35
calculating model 36
calculating model 37
calculating model 38
calculating model 39
calculating model 40
calculating model 41
calculating model 42
calculating model 43
calculating model 44
calculating model 45
calculating model 46
calculating model 47
calculating model 48
c



50
100
150
200
250
300
350
400
450


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.111587,0.107741,0.107569,0.107576,0.108478,0.106346,0.110432,0.104647,0.104715,0.107017,...,0.104174,0.106450,0.108978,0.103836,0.102237,0.105560,0.106061,0.105482,0.105704,0.108112
1,0.107741,0.109399,0.105140,0.105539,0.107649,0.105455,0.108656,0.103225,0.103171,0.105892,...,0.102031,0.104208,0.107979,0.101853,0.100538,0.103863,0.105322,0.103305,0.103787,0.106380
2,0.107569,0.105140,0.109224,0.110747,0.106457,0.107827,0.106680,0.105560,0.104979,0.105537,...,0.104326,0.107941,0.108082,0.105915,0.104038,0.105382,0.104961,0.106217,0.106239,0.109039
3,0.107576,0.105539,0.110747,0.116327,0.107014,0.111102,0.105658,0.107608,0.107302,0.107069,...,0.106436,0.110452,0.109329,0.109367,0.107135,0.106491,0.106544,0.108235,0.108867,0.111975
4,0.108478,0.107649,0.106457,0.107014,0.109456,0.105503,0.109294,0.103737,0.103940,0.106986,...,0.102143,0.105012,0.109235,0.102266,0.100665,0.104536,0.105647,0.104263,0.104442,0.107596
5,0.106346,0.105455,0.107827,0.111102,0.105503,0.113160,0.103731,0.107182,0.104308,0.106236,...,0.104962,0.109896,0.107396,0.106895,0.106189,0.104681,0.106091,0.105272,0.105894,0.107833
6,0.110432,0.108656,0.106680,0.105658,0.109294,0.103731,0.112818,0.103280,0.104144,0.106836,...,0.102289,0.103939,0.109342,0.101608,0.100090,0.105050,0.105500,0.104923,0.104274,0.107794
7,0.104647,0.103225,0.105560,0.107608,0.103737,0.107182,0.103280,0.107240,0.102155,0.103715,...,0.102483,0.107332,0.106556,0.103335,0.104504,0.104404,0.103552,0.103135,0.103034,0.105312
8,0.104715,0.103171,0.104979,0.107302,0.103940,0.104308,0.104144,0.102155,0.103708,0.102468,...,0.101574,0.103720,0.105244,0.102625,0.100605,0.102509,0.101929,0.102727,0.103335,0.106303
9,0.107017,0.105892,0.105537,0.107069,0.106986,0.106236,0.106836,0.103715,0.102468,0.107491,...,0.102182,0.105808,0.108273,0.102850,0.100750,0.103308,0.106109,0.103296,0.104268,0.106506


In [7]:
minEvaluation

0.09579756084276006

In [8]:
randomFeatures[minCombination[0]]

['last28days_meanView',
 'month',
 'dayOfWeek',
 'numHolidayNext',
 'last7days_std',
 'last28days_stdView',
 'category',
 'last28days_kurtosisView',
 'last7days_skewView',
 'last28days_skewView',
 'last28days_kurtosis',
 'last14days_stdView',
 'last56days_ridge',
 'day',
 'last112days_ridge',
 'last14days_ridge',
 'pm',
 'last14days_mean',
 'last28days_std',
 'last7days_copy',
 'holiday',
 'last112days_skew',
 'perPay',
 'last112days_copy',
 'last28days_skew']

In [9]:
randomFeatures[minCombination[1]]

['weather',
 'numHolidayLast',
 'last7days_skew',
 'maxTemp',
 'last28days_stdView',
 'last56days_kurtosis',
 'category',
 'last7days_copy',
 'last14days_skew',
 'year',
 'shopLevel',
 'last7days_kurtosisView',
 'last28days_kurtosisView',
 'month',
 'last7days_std',
 'last14days_std',
 'last28days_std',
 'commentCnt',
 'dayOfWeek',
 'holiday',
 'score',
 'last56days_ridge',
 'last56days_copy',
 'last7days_ridge',
 'numHolidayCur']

# ensemble model for final submission

In [10]:
trainPredFile = '../preprocess/trainTestFeatures_ensemble.csv'
trainPredData = pd.read_csv(trainPredFile, header = None)
testPredFile = '../preprocess/testFeatures_ensemble.csv'
testPredData = pd.read_csv(testPredFile, header = None)
trainPredData.columns = ensembleCol
testPredData.columns = ensembleCol

startDateTrain = dt.date(2016, 10, 4)
endDateTrain = dt.date(2016, 10, 31)
startDateTest = dt.date(2016, 11, 1)
endDateTest = dt.date(2016, 11, 14)
startDate = dt.date(2015, 7, 1)
endDate = dt.date(2016, 10, 31)

startTrain = (startDateTrain - startDate).days
endTrain = (endDateTrain - startDate).days
startValid = (startDateTest - startDate).days
endValid = (endDateTest - startDate).days

params = {
    'objective':'reg:linear',
    'max_depth':5,
    'silent':1,
    'eta':0.02,
    'subsample':0.9,
    'colsample_bytree':0.7,
}
num_rounds = 1000
trainPredLabel = payTH[np.arange(startTrain, endTrain + 1)].values.reshape(1, -1)[0]

In [13]:
minPairPreds = []

for i in range(len(minCombination)):
    print("calculating minCombination model {}".format(i + 1))
    selectCol = randomFeatures[minCombination[i]]
    
    dtrain = xgb.DMatrix(data = trainPredData[selectCol].values, label = trainPredLabel, missing=np.nan)
    dtest = xgb.DMatrix(data = testPredData[selectCol].values, missing=np.nan)
    
    bst = xgb.train(params, dtrain, num_rounds)
    pred = bst.predict(dtest)
    minPairPreds.append(abs(pred.astype(np.int32)))

minPred = (minPairPreds[0] + minPairPreds[1])/2

calculating minCombination model 1
calculating minCombination model 2


In [14]:
# save submission file
submit = minPred.reshape(-1, 14).astype(np.int32)
submitDF = pd.DataFrame(submit, index = (np.arange(2000) + 1))
submitDF.to_csv('../prediction/ensembleXGBoost.csv', header = False, index = True, date_format = 'int32')