In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import datetime as dt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
import time as tm

In [2]:
# loading original data
shopInfoFile = '../dataset/shop_info.txt'

shopInfo = pd.read_table(shopInfoFile, sep = ',', header = None)
shopInfo.columns = ['shopID', 'city', 'locationID', 'perPay', 'score', 'commentCnt', 'shopLevel', 'cate1', 'cate2', 'cate3']

In [3]:
#load training and testing data
payTH = pd.read_csv('../preprocess/payTH_parallel.txt', sep=" ", header = None)
trainFile = '../preprocess/trainValidFeatures_ensemble.csv'
testFile = '../preprocess/validFeatures_ensemble.csv'
trainData = pd.read_csv(trainFile, header = None)
testData = pd.read_csv(testFile, header = None)

In [4]:
# preparing training set and validation set
periods = [7, 14, 28, 56, 112]
stats = ['mean', 'std', 'skew', 'kurtosis']
recentDataColumns = []
for period in periods:
    for stat in stats:
        column  = 'last' + str(period) + 'days_' + stat
        recentDataColumns.append(column)

periods = [7, 14, 28]
stats = ['meanView', 'stdView', 'skewView', 'kurtosisView']
recentDataViewColumns = []
for period in periods:
    for stat in stats:
        column = 'last' + str(period) + 'days_' + stat
        recentDataViewColumns.append(column)            
        
periods = [7, 14, 28, 56, 112]
trends = ['copy', 'ridge']
currentTrendcolumns = []
for period in periods:
    for trend in trends:
        column = 'last' + str(period) + 'days_' + trend
        currentTrendcolumns.append(column)
        
primaryKey = ['shopID', 'year', 'month', 'day']
columnDic = {
    'basicInfo':['city', 'perPay', 'score', 'commentCnt', 'shopLevel', 'category'],
    'recentData':recentDataColumns,
    'recentDataView':recentDataViewColumns,
    'currentTrend':currentTrendcolumns,
    'temporalInfo':['dayOfWeek', 'holiday', 'numHolidayLast', 'numHolidayCur', 'numHolidayNext'],
    'weather':['maxTemp', 'minTemp', 'weather', 'pm']
}

ensembleCol = ['shopID', 'year', 'month', 'day']
orderCol = ['basicInfo', 'recentData', 'temporalInfo', 'currentTrend', 'weather', 'recentDataView']
for col in orderCol:
    ensembleCol = ensembleCol + columnDic[col]
    
trainData.columns = ensembleCol
testData.columns = ensembleCol

startDateTrain = dt.date(2016, 9, 20)
endDateTrain = dt.date(2016, 10, 17)
startDateTest = dt.date(2016, 10, 18)
endDateTest = dt.date(2016, 10, 31)
startDate = dt.date(2015, 7, 1)
endDate = dt.date(2016, 10, 31)

startTrain = (startDateTrain - startDate).days
endTrain = (endDateTrain - startDate).days
startValid = (startDateTest - startDate).days
endValid = (endDateTest - startDate).days

trainLabel = payTH[np.arange(startTrain, endTrain + 1)].values.reshape(1, -1)[0]
validLabel = payTH[np.arange(startValid, endValid + 1)].values.reshape(1, -1)[0]

# data preprocessing

In [5]:
def detectNaN(a):
    for i in range(len(a[0])):
        e = True
        for j in range(len(a) - 1):
            if np.isnan(a[j][i]):
                e = False
                break
        if (not e):
            print(i)
def replace(a):
    for i in range(len(a[0])):
        e = True
        for j in range(len(a)):
            if np.isnan(a[j][i]):
                a[j][i] = a[j - 1][i]
    return a

In [6]:
# preprocessing training set
trainDataArray = np.array(trainData)
trainDataArrayProcessed = np.delete(trainDataArray, [1, 2], 1)
trainDataProcessed = replace(trainDataArrayProcessed)
detectNaN(trainDataProcessed)

scaler = StandardScaler()
scaler.fit(trainDataProcessed)
trainDataNormalized = scaler.transform(trainDataProcessed)
detectNaN(trainDataNormalized)

In [7]:
# preprocessing validation set
testDataArray = np.array(testData)
testDataArrayProcessed = np.delete(testDataArray, [1, 2], 1)
testDataProcessed = replace(testDataArrayProcessed)
detectNaN(testDataProcessed)

scaler = StandardScaler()
scaler.fit(testDataProcessed)
testDataNormalized = scaler.transform(testDataProcessed)
detectNaN(testDataNormalized)

# parameter selection

In [12]:
neighbors = [2, 5, 10, 20, 50]
preds_knn = []
eval_knn = []
recordNum = len(validLabel)

for num in neighbors:
    
    rgs_knn = KNeighborsRegressor(n_neighbors=num)
    rgs_knn.fit(trainDataNormalized, trainLabel)
    pred_knn = rgs_knn.predict(testDataNormalized)
    preds_knn.append(pred_knn)
    evaluation = abs((validLabel - pred_knn)/(validLabel + pred_knn)).sum()/recordNum
    eval_knn.append(evaluation)
    print(num, evaluation)

2 0.15025776744
5 0.135792059521
10 0.131294982587
20 0.129674811501
50 0.131017965476
