In [1]:
import numpy as np
import pandas as pd
import datetime as dt

In [2]:
#loading data
shopInfoFile = '../dataset/shop_info.txt'

shopInfo = pd.read_table(shopInfoFile, sep = ',', header = None)
shopInfo.columns = ['shopID', 'city', 'locationID', 'perPay', 'score', 'commentCnt', 'shopLevel', 'cate1', 'cate2', 'cate3']

In [3]:
#informationi on shops in different cities
cities = np.unique(shopInfo['city'])
shopByCity = {}
shopNumByCity = {}
for city in cities:
    shopNumByCity[city] = shopInfo[shopInfo['city'] == city].shape[0]
    shopByCity[city] = np.array(shopInfo[shopInfo['city'] == city]['shopID'])

In [4]:
# generate city code
cities = np.unique(shopInfo['city'])
cityDic = {}
for index, city in enumerate(cities):
    cityDic[city] = shopNumByCity[city]
cityDF = pd.DataFrame(cityDic, index = ['code']).T
cityDF.to_csv('../preprocess/cityEncoding.csv', header = False)

In [5]:
# generate category code
categoryDic = {}
classFirst = np.unique(shopInfo['cate1'])
count = 1
cur = 1
interval = 1000
countMS = len(classFirst) * 1000
intervalMS = 30
for index1, class1 in enumerate(classFirst):
    categoryDic[class1] = {}
    classSecond = np.unique(shopInfo[shopInfo['cate1'] == class1]['cate2'])

    for class2 in classSecond:
        categoryDic[class1][class2] = {}
        classThird = np.unique(shopInfo[shopInfo['cate1'] == class1][shopInfo['cate2'] == class2]['cate3'])
        for class3 in classThird:
            if class1 == '美食':
                categoryDic[class1][class2][class3] = countMS
                countMS = countMS + 1
            else:
                categoryDic[class1][class2][class3] = cur
                cur = cur + 1
        if class1 == '美食':
            countMS = countMS + intervalMS - 1
        else:
            cur = cur + intervalMS - 1
            
    if class1 != '美食':
        cur = count * interval
        count = count + 1

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


In [6]:
# save city code and category code data
cityCode = []
categoryCode = []
for shopID in shopInfo['shopID']:
    record = shopInfo.ix[shopID - 1]
    city = record['city']
    cate1 = record['cate1']
    cate2 = record['cate2']
    cate3 = record['cate3']
    cityCode.append(cityDic[city])
    categoryCode.append(categoryDic[cate1][cate2][cate3])

shopInfo_new = pd.read_table(shopInfoFile, sep = ',', header = None)
shopInfo_new.columns = ['shopID', 'city', 'locationID', 'perPay', 'score', 'commentCnt', 'shopLevel', 'cate1', 'cate2', 'cate3']
shopInfo_new['city'] = cityCode
del shopInfo_new['cate1']
del shopInfo_new['locationID']
del shopInfo_new['cate2']
del shopInfo_new['cate3']
shopInfo_new['category'] = categoryCode

shopInfo_new['score'][np.isnan(shopInfo_new['score'])] = 0.0
shopInfo_new['commentCnt'][np.isnan(shopInfo_new['commentCnt'])] = 0.0

shopInfo_new.to_csv('../preprocess/shopInfo.csv', header = False, index = False, date_format = 'int32')

shopInfo_load = pd.read_csv('../preprocess/shopInfo.csv', header = None, dtype = 'int32')
shopInfo_load.columns = ['shopID', 'city', 'perPay', 'score', 'commentCnt', 'shopLevel', 'category']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
# define dates for training and testing
startDateTrain = dt.date(2016, 9, 20)
endDateTrain = dt.date(2016, 10, 17)
startDateTest = dt.date(2016, 10, 18)
endDateTest = dt.date(2016, 10, 31)
columns = ['shopID', 'year', 'month', 'day', 'city', 'perPay', 'score', 'commentCnt', 'shopLevel', 'category']

In [8]:
shopDataTrain = {}
for column in columns:
    shopDataTrain[column] = []

for shopID in shopInfo_load['shopID']:
    curDate = startDateTrain
    endDate = endDateTrain + dt.timedelta(days = 1)
    while curDate != endDate:
        for shopCol in shopInfo_load.columns:
            shopDataTrain[shopCol].append(shopInfo_load[shopCol][shopID - 1])
        shopDataTrain['year'].append(curDate.year)
        shopDataTrain['month'].append(curDate.month)
        shopDataTrain['day'].append(curDate.day)
        curDate = curDate + dt.timedelta(days = 1)

In [9]:
trainFeatures_basicInfo = pd.DataFrame(shopDataTrain, columns = columns)
trainFeatures_basicInfo.to_csv('../preprocess/trainValidFeatures_basicInfo.csv', header = False, index = False, date_format = 'int32')

In [10]:
shopDataTest = {}
for column in columns:
    shopDataTest[column] = []

for shopID in shopInfo_load['shopID']:
    curDate = startDateTest
    endDate = endDateTest + dt.timedelta(days = 1)
    while curDate != endDate:
        for shopCol in shopInfo_load.columns:
            shopDataTest[shopCol].append(shopInfo_load[shopCol][shopID - 1])
        shopDataTest['year'].append(curDate.year)
        shopDataTest['month'].append(curDate.month)
        shopDataTest['day'].append(curDate.day)
        curDate = curDate + dt.timedelta(days = 1)

In [11]:
testFeatures_basicInfo = pd.DataFrame(shopDataTest, columns = columns)
testFeatures_basicInfo.to_csv('../preprocess/validFeatures_basicInfo.csv', header = False, index = False, date_format = 'int32')

In [12]:
#trainTest data
startDateTrain = dt.date(2016, 10, 4)
endDateTrain = dt.date(2016, 10, 31)
startDateTest = dt.date(2016, 11, 1)
endDateTest = dt.date(2016, 11, 14)
columns = ['shopID', 'year', 'month', 'day', 'city', 'perPay', 'score', 'commentCnt', 'shopLevel', 'category']

In [13]:
shopDataTrain = {}
for column in columns:
    shopDataTrain[column] = []

for shopID in shopInfo_load['shopID']:
    curDate = startDateTrain
    endDate = endDateTrain + dt.timedelta(days = 1)
    while curDate != endDate:
        for shopCol in shopInfo_load.columns:
            shopDataTrain[shopCol].append(shopInfo_load[shopCol][shopID - 1])
        shopDataTrain['year'].append(curDate.year)
        shopDataTrain['month'].append(curDate.month)
        shopDataTrain['day'].append(curDate.day)
        curDate = curDate + dt.timedelta(days = 1)

In [14]:
trainFeatures_basicInfo = pd.DataFrame(shopDataTrain, columns = columns)
trainFeatures_basicInfo.to_csv('../preprocess/trainTestFeatures_basicInfo.csv', header = False, index = False, date_format = 'int32')

In [15]:
shopDataTest = {}
for column in columns:
    shopDataTest[column] = []

for shopID in shopInfo_load['shopID']:
    curDate = startDateTest
    endDate = endDateTest + dt.timedelta(days = 1)
    while curDate != endDate:
        for shopCol in shopInfo_load.columns:
            shopDataTest[shopCol].append(shopInfo_load[shopCol][shopID - 1])
        shopDataTest['year'].append(curDate.year)
        shopDataTest['month'].append(curDate.month)
        shopDataTest['day'].append(curDate.day)
        curDate = curDate + dt.timedelta(days = 1)

In [16]:
testFeatures_basicInfo = pd.DataFrame(shopDataTest, columns = columns)
testFeatures_basicInfo.to_csv('../preprocess/testFeatures_basicInfo.csv', header = False, index = False, date_format = 'int32')