In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import datetime as dt
import threadpool as tp
from multiprocessing.dummy import Pool as ThreadPool
import time as tm

In [2]:
#loading data
shopInfoFile = '../dataset/shop_info.txt'
userPayFile = '../dataset/user_pay.txt'
userViewFile = '../dataset/user_view.txt'

shopInfo = pd.read_table(shopInfoFile, sep = ',', header = None)
userPay = pd.read_table(userPayFile, sep = ',', header = None)
userView = pd.read_table(userViewFile, sep = ',', header = None)

In [3]:
#informationi on shops in different cities
cities = np.unique(shopInfo[1])
shopByCity = {}
shopNumByCity = {}
for city in cities:
    shopNumByCity[city] = shopInfo[shopInfo[1] == city].shape[0]
    shopByCity[city] = np.array(shopInfo[shopInfo[1] == city][0])

In [6]:
#prepare to perform parallel computation
shopNum = shopInfo.shape[0]
startDate = dt.datetime.strptime("2015-07-01 00:00:00", "%Y-%m-%d %H:%M:%S")
endDate = dt.datetime.strptime("2016-10-31 23:59:59", "%Y-%m-%d %H:%M:%S")
days = (endDate - startDate).days + 1

def threadFunc(list):
    shopNum = list[0]
    days = list[1]
    records = list[2]
    viewTH = np.zeros((shopNum, days), dtype = 'int32')
    for i in range(records.shape[0]):
        viewRecord = records[i]
        time = (dt.datetime.strptime(viewRecord[2], "%Y-%m-%d %H:%M:%S") - startDate).days
        shopID = viewRecord[1]
        viewTH[shopID - 1][time] = viewTH[shopID - 1][time] + 1
    return viewTH

In [12]:
#build the view time history table(parallel)
threadNum = 16
parm = []
viewNum = userView.shape[0]
interval = int(np.ceil(viewNum/threadNum))

for i in range(threadNum):
    temp = [shopNum, days]
    start = i * interval
    end = min(viewNum, (i + 1) * interval - 1)
    temp.append(np.array(userView.ix[start:end]))
    parm.append(temp)    

t1 = tm.clock()
pool = ThreadPool(threadNum)
result = pool.map(threadFunc, parm)
pool.close()
pool.join()
t2 = tm.clock()
print("using {0}s for calculating the viewTH table(parallel).".format(t2 - t1))

viewTH = sum(result)

using 277.828297s for calculating the viewTH table(parallel).


In [13]:
#save view time history table
np.savetxt('../preprocess/viewTH_parallel.txt', viewTH, fmt = '%s')

In [6]:
#build the pay time history table(parallel)
threadNum = 16
parm = []
payNum = userPay.shape[0]
interval = int(np.ceil(payNum/threadNum))

for i in range(threadNum):
    temp = [shopNum, days]
    start = i * interval
    end = min(payNum, (i + 1) * interval - 1)
    temp.append(np.array(userPay.ix[start:end]))
    parm.append(temp)   

t1 = tm.clock()
pool = ThreadPool(threadNum)
result = pool.map(threadFunc, parm)
pool.close()
pool.join()
t2 = tm.clock()
print("using {0}s for calculating the payTH table(parallel).".format(t2 - t1))

payTH = sum(result)

using 3593.6999s for calculating the payTH table(parallel).


In [7]:
#save pay time history table
np.savetxt('../preprocess/payTH_parallel.txt', payTH, fmt = '%s')