In [None]:
import pandas as pd
import os
import glob
import sys
import numpy as np
import matplotlib.pyplot as plt
import math
import datetime
from six.moves import cPickle as pickle

In [None]:
# load district hashmap first
def loadCluster(path):
    """Build the district-hash -> district-id lookup from every file in `path`.

    Each directory entry is read as a header-less, tab-separated table with
    the two columns (district_hash, district_id). The tables are stacked in
    directory-listing order and folded into a plain dict; when a hash occurs
    more than once, the row read last wins.

    Parameters
    ----------
    path : str
        Directory that contains the cluster-map file(s).

    Returns
    -------
    dict
        Mapping of district_hash to district_id.
    """
    header = ['district_hash', 'district_id']
    frames = [
        pd.read_table(os.path.join(path, entry), sep='\t', names=header)
        for entry in os.listdir(path)
    ]
    mapping = pd.concat(frames, ignore_index=True)
    return dict(zip(mapping.district_hash, mapping.district_id))


def loadOrders(path, cluster_map=None):
    """Load every order file in `path` into a single DataFrame.

    Files are header-less, tab-separated tables with the columns
    order_id, driver_id, passenger_id, start_district_id, dest_district_id,
    Price and Time (parsed as datetime).

    Parameters
    ----------
    path : str
        Directory that contains the order files.
    cluster_map : dict, optional
        district_hash -> district_id mapping. Defaults to the notebook-level
        `cluster` built by `loadCluster`, which keeps existing
        `loadOrders(path)` calls working.

    Returns
    -------
    pandas.DataFrame
        The raw columns plus:
        * time_slot - 1-based 10-minute slot of the day (integer),
        * min       - 1-based minute of the day,
        * date      - `Time` truncated to midnight (datetime64).
        District hashes found in `cluster_map` are replaced by their id;
        hashes that are not in the mapping are left untouched.
        Rows dated 2016-01-01 are removed.
    """
    if cluster_map is None:
        # Backward-compatible default: the global mapping defined by the notebook.
        cluster_map = cluster

    col_names = ['order_id', 'driver_id', 'passenger_id', 'start_district_id',
                 'dest_district_id', 'Price', 'Time']

    def to_district_id(value):
        # Same semantics as DataFrame.replace with a dict: unknown values pass through.
        return cluster_map.get(value, value)

    frames = []
    # Sorted so the row order of the result does not depend on the filesystem.
    for name in sorted(os.listdir(path)):
        df = pd.read_table(os.path.join(path, name), sep='\t',
                           parse_dates=[6], names=col_names)
        for col in ('start_district_id', 'dest_district_id'):
            df[col] = df[col].map(to_district_id)
        frames.append(df)
    df = pd.concat(frames, ignore_index=True)

    minute_of_day = df['Time'].dt.hour * 60 + df['Time'].dt.minute
    # Floor division: with `/` Python 3 yields fractional slots (e.g. 2.5),
    # which breaks the later group-bys and merges on `time_slot`.
    df['time_slot'] = minute_of_day // 10 + 1
    df['min'] = minute_of_day + 1

    day = df['Time'].dt.normalize()
    # NOTE(review): 2016-01-01 is excluded, presumably because the holiday is
    # not representative - confirm with the data owner.
    # `.copy()` so the column assignment below does not write into a slice.
    df = df[day != pd.Timestamp('2016-01-01')].copy()
    df['date'] = df['Time'].dt.normalize()
    return df
    
def loadWeather(path):
    """Load every weather file in `path` into a single DataFrame.

    Files are header-less, tab-separated tables with the columns
    Time (parsed as datetime), Weather, Temperature and PM25.

    Parameters
    ----------
    path : str
        Directory that contains the weather files.

    Returns
    -------
    pandas.DataFrame
        The raw columns plus `time_slot` (1-based 10-minute slot of the day,
        integer) and `date` (`Time` truncated to midnight, datetime64).
        Rows dated 2016-01-01 are removed.
    """
    col_names = ['Time', 'Weather', 'Temperature', 'PM25']
    # Sorted listing: downstream de-duplication keeps the *last* reading per
    # slot, so the concatenation order must be reproducible.
    frames = [
        pd.read_table(os.path.join(path, name), sep='\t',
                      parse_dates=[0], names=col_names)
        for name in sorted(os.listdir(path))
    ]
    df = pd.concat(frames, ignore_index=True)

    # Floor division keeps the slot an integer on Python 3 as well.
    df['time_slot'] = (df['Time'].dt.hour * 60 + df['Time'].dt.minute) // 10 + 1

    day = df['Time'].dt.normalize()
    # `.copy()` so the column assignment below does not write into a slice.
    df = df[day != pd.Timestamp('2016-01-01')].copy()
    df['date'] = df['Time'].dt.normalize()
    return df
    
def loadTraffic(path, cluster_map=None):
    """Load every traffic file in `path` into a single DataFrame.

    Files are header-less, tab-separated tables with the columns
    district_id (a district hash), lv1..lv4 and Time (parsed as datetime).

    Parameters
    ----------
    path : str
        Directory that contains the traffic files.
    cluster_map : dict, optional
        district_hash -> district_id mapping. Defaults to the notebook-level
        `cluster` built by `loadCluster`, which keeps existing
        `loadTraffic(path)` calls working.

    Returns
    -------
    pandas.DataFrame
        The raw columns (lv1..lv4 as integers, district hashes replaced by
        ids where known) plus `time_slot` (1-based 10-minute slot of the day,
        integer) and `date` (`Time` truncated to midnight, datetime64).
        Rows dated 2016-01-01 are removed.
    """
    if cluster_map is None:
        # Backward-compatible default: the global mapping defined by the notebook.
        cluster_map = cluster

    col_names = ['district_id', 'lv1', 'lv2', 'lv3', 'lv4', 'Time']

    def level_count(cell):
        # Drops the first two characters and parses the rest as an integer.
        # NOTE(review): presumably the cells look like "1:231"
        # (single-digit level, colon, count) - confirm against the raw files.
        return int(cell[2:])

    def to_district_id(value):
        # Same semantics as DataFrame.replace with a dict: unknown values pass through.
        return cluster_map.get(value, value)

    frames = []
    # Sorted so the row order of the result does not depend on the filesystem.
    for name in sorted(os.listdir(path)):
        df = pd.read_table(os.path.join(path, name), sep='\t',
                           parse_dates=[5], names=col_names,
                           converters={1: level_count, 2: level_count,
                                       3: level_count, 4: level_count})
        df['district_id'] = df['district_id'].map(to_district_id)
        frames.append(df)
    df = pd.concat(frames, ignore_index=True)

    # Floor division keeps the slot an integer on Python 3 as well.
    df['time_slot'] = (df['Time'].dt.hour * 60 + df['Time'].dt.minute) // 10 + 1

    day = df['Time'].dt.normalize()
    # `.copy()` so the column assignment below does not write into a slice.
    df = df[day != pd.Timestamp('2016-01-01')].copy()
    df['date'] = df['Time'].dt.normalize()
    return df

In [None]:
# Load the raw training tables. Order matters: `cluster` must be defined
# before loadOrders / loadTraffic run, because both read that global to
# translate district hashes into district ids.

# load cluster map (district hash -> district id)
path="season_1/training_data/cluster_map"
cluster = loadCluster(path)

# load orders data
path="season_1/training_data/order_data"
orders_train=loadOrders(path)


# load traffic data
path="season_1/training_data/traffic_data"
traffic_train=loadTraffic(path)

# load weather data
path="season_1/training_data/weather_data"
weather_train=loadWeather(path)


In [None]:
def timeSlotData(data):
    """Return the distinct (date, time_slot) pairs present in `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a datetime `Time` column and a `time_slot` column.
        The frame is not modified (the previous version overwrote
        `data['date']` as a side effect).

    Returns
    -------
    pandas.DataFrame
        Columns `date` (datetime64, `Time` truncated to midnight) and
        `time_slot`, one row per distinct pair, sorted by date then slot,
        with a fresh 0..n-1 index.
    """
    keys = pd.DataFrame({
        'date': data['Time'].dt.normalize(),
        'time_slot': data['time_slot'],
    }, columns=['date', 'time_slot'])
    # dropna mirrors groupby, which silently skips rows with a missing key.
    # sort_values replaces DataFrame.sort, which no longer exists in pandas.
    return (keys.dropna()
                .drop_duplicates()
                .sort_values(['date', 'time_slot'])
                .reset_index(drop=True))

def preparetimeSlotDistrict(data, n_districts=66):
    """Cross the (date, time_slot) table with district ids 1..n_districts.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per (date, time_slot), e.g. the output of `timeSlotData`.
        The previous version ignored this argument and read the global
        `timeSlot` instead.
    n_districts : int, optional
        Number of districts; ids run from 1 to `n_districts` inclusive.
        Defaults to 66, the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        `data` repeated once per district, with an added
        `start_district_id` column. Rows are grouped by district: all slots
        of district 1 first, then district 2, and so on.
    """
    n_slots = data.shape[0]
    grid = pd.concat([data] * n_districts, ignore_index=True)
    grid['start_district_id'] = np.repeat(
        np.arange(1, n_districts + 1, dtype='int64'), n_slots)
    return grid

def prepareWeatherData(data, timeSlot, testData =False):
    """Align weather readings to the (date, time_slot) grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Weather readings with at least `Time`, `date` and `time_slot`
        columns (see `loadWeather`). Not modified.
    timeSlot : pandas.DataFrame
        The (date, time_slot) grid to align to.
    testData : bool, optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per row of `timeSlot`, carrying the last reading of each
        slot. Gaps are forward-filled and then backward-filled, each across
        at most 10 consecutive rows; longer gaps remain NaN.
    """
    temp = data.copy(deep=True)
    temp['time_slot'] = temp['time_slot'].astype(int)
    temp = temp.drop(['Time'], axis=1)
    # `keep='last'` is the replacement for the removed `take_last=True`.
    temp = temp.drop_duplicates(['date', 'time_slot'], keep='last')
    temp = pd.merge(timeSlot, temp, on=['time_slot', 'date'], how='left')
    # ffill/bfill replace the deprecated fillna(method=...) spelling.
    temp = temp.ffill(limit=10)
    temp = temp.bfill(limit=10)
    return temp

def prepareTrafficData(data, timeSlotDistrict):
    """Align traffic readings to the (date, time_slot, district) grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Traffic readings with `district_id`, `lv1`..`lv4`, `Time`, `date`
        and `time_slot` columns (see `loadTraffic`). Not modified.
    timeSlotDistrict : pandas.DataFrame
        Grid with `date`, `time_slot` and `start_district_id` columns.

    Returns
    -------
    pandas.DataFrame
        One row per grid row (more if a slot has several readings), sorted
        by date, district and slot, with `lv1`..`lv4` and their shares
        `lv1_pect`..`lv4_pect` of the row total. Missing readings are
        backward-filled across at most 2 rows; anything still missing
        receives the truncated column mean.
    """
    levels = ['lv1', 'lv2', 'lv3', 'lv4']
    join_keys = ['time_slot', 'date', 'start_district_id']

    temp = data.copy(deep=True)
    temp['time_slot'] = temp['time_slot'].astype(int)
    temp['district_id'] = temp['district_id'].astype(int)
    # sort_values replaces DataFrame.sort, which no longer exists in pandas.
    temp = temp.sort_values(['district_id', 'Time'])
    temp = temp.drop(['Time'], axis=1)
    temp = temp.rename(columns={'district_id': 'start_district_id'})
    temp = pd.merge(timeSlotDistrict, temp, on=join_keys, how='left')

    temp = temp.sort_values(['date', 'start_district_id', 'time_slot'])
    # NOTE(review): the back-fill runs over the whole sorted frame, so the
    # last slots of one district can borrow from the first slots of the next
    # district. Behaviour kept as-is; confirm whether that is intended.
    temp = temp.bfill(limit=2)
    temp = temp.fillna({lv: int(temp[lv].mean()) for lv in levels})

    total = temp.lv1 + temp.lv2 + temp.lv3 + temp.lv4
    for lv in levels:
        temp[lv + '_pect'] = temp[lv] / total
    return temp

In [None]:
timeSlot = timeSlotData(orders_train)
#weather data process
weather = prepareWeatherData(weather_train, timeSlot)
# Compare against a Timestamp: `weather.date` is datetime64, and current
# pandas treats a datetime.date as never equal to it, which would leave
# `temp` empty and make the means below NaN.
temp = weather[weather.date == pd.Timestamp('2016-01-21')]
if temp.empty:
    raise ValueError('no weather rows for 2016-01-21 to derive fallback values from')
# Fallbacks for the gaps prepareWeatherData could not fill, derived from the
# 2016-01-21 readings.
# NOTE(review): Weather=4 and blending the PM25 mean with 100 are hand-picked
# values; the notebook does not record the rationale.
weather = weather.fillna({'Weather': 4,
                          'PM25': int((temp.PM25.mean()+100)/2),
                          'Temperature': int(temp.Temperature.mean())})
print(weather.isnull().any())

##traffic data process 
timeSlotDistrict = preparetimeSlotDistrict(timeSlot)
traffic = prepareTrafficData(traffic_train, timeSlotDistrict)
print(traffic.isnull().any())

In [None]:
def _countDemandSupply(orders, index_col, demand_name, supply_name, gap_name):
    """Count rows with an order_id / a non-null driver_id per group of `index_col`, plus their difference."""
    grouped = orders.groupby(index_col)
    counts = pd.DataFrame({
        demand_name: grouped['order_id'].count(),
        supply_name: grouped['driver_id'].count(),
    }, columns=[demand_name, supply_name]).reset_index()
    counts[gap_name] = counts[demand_name] - counts[supply_name]
    counts['date'] = pd.to_datetime(counts['date'])
    counts['start_district_id'] = counts['start_district_id'].astype(int)
    return counts


def prepareOrderData(orders, weather, traffic, timeSlotDistrict):
    """Build the modelling table: the gap of a slot next to lagged predictors.

    Parameters
    ----------
    orders : pandas.DataFrame
        Orders with `order_id`, `driver_id`, `start_district_id`, `date`,
        `time_slot` and `min` columns (see `loadOrders`).
    weather : pandas.DataFrame
        Weather features keyed by (`date`, `time_slot`).
    traffic : pandas.DataFrame
        Traffic features keyed by (`date`, `time_slot`, `start_district_id`).
    timeSlotDistrict : pandas.DataFrame
        Full (`date`, `time_slot`, `start_district_id`) grid.

    Returns
    -------
    pandas.DataFrame
        One row per grid cell with `time_slot` > 3, holding `gap`, `weekday`,
        `time_slot`, `start_district_id` and every predictor of the three
        preceding slots: unsuffixed columns come from slot t-1, `_t_2`
        columns from slot t-2 and `_t_3` columns from slot t-3. Per-minute
        counts are named `demand_min_<i>`, `supply_min_<i>`, `gap_min_<i>`
        for i in 1..10. Grid cells without orders have demand/supply/gap 0.
    """
    slot_keys = ['date', 'time_slot', 'start_district_id']
    minute_keys = slot_keys + ['min']

    # Per-slot counts laid over the full grid.
    data = _countDemandSupply(orders, slot_keys, 'demand', 'supply', 'gap')
    data = pd.merge(timeSlotDistrict, data, on=slot_keys, how='left')
    # ISO weekday (Mon=1 .. Sun=7), derived after the merge so grid cells
    # without any order get the real weekday too. Previously it was computed
    # before the merge and ended up as 0 for those cells after fillna(0).
    data['weekday'] = pd.to_datetime(data['date']).dt.weekday + 1

    # Per-minute counts, attached below one minute-of-slot at a time.
    data_permin = _countDemandSupply(orders, minute_keys, 'demand_per_min',
                                     'supply_per_min', 'gap_per_min')

    col = ['date', 'gap', 'weekday', 'time_slot', 'start_district_id', 'demand', 'supply']
    total_data = data[col].copy()
    total_data['date'] = pd.to_datetime(total_data['date'])
    total_data['start_district_id'] = total_data['start_district_id'].astype(int)
    total_data = pd.merge(total_data, weather, on=['date', 'time_slot'], how='left')
    total_data = pd.merge(total_data, traffic, on=slot_keys, how='left')

    for i in range(1, 11):
        # Minute-of-day of the i-th minute inside each 10-minute slot.
        total_data['min'] = (total_data['time_slot'] - 1) * 10 + i
        total_data = pd.merge(total_data, data_permin, on=minute_keys, how='left')
        min_str = str(i)
        total_data = total_data.rename(columns={
            'demand_per_min': 'demand_min_' + min_str,
            'supply_per_min': 'supply_min_' + min_str,
            'gap_per_min': 'gap_min_' + min_str,
        })

    total_data = total_data.fillna(0)
    # sort_values replaces DataFrame.sort, which no longer exists in pandas.
    total_data = total_data.sort_values(['date', 'start_district_id', 'time_slot'])

    # Move the t-1, t-2 and t-3 predictors onto the row of slot t so they can
    # be used to predict the gap of slot t.
    result = total_data[['date', 'gap', 'weekday', 'time_slot', 'start_district_id']]
    predictors = total_data.drop(['gap', 'weekday'], axis=1)
    for lag in (1, 2, 3):
        # Shifting the slot forward by `lag` makes slot t-lag join onto slot t.
        shifted = predictors.assign(time_slot=predictors['time_slot'] + lag)
        if lag == 1:
            # No column names clash yet, so the t-1 predictors stay unsuffixed.
            result = pd.merge(result, shifted, on=slot_keys, how='left')
        else:
            result = pd.merge(result, shifted, on=slot_keys,
                              suffixes=('', '_t_%d' % lag), how='left')

    # The first three slots of a day have no complete three-slot history.
    result = result[~result['time_slot'].isin([1, 2, 3])]
    # NOTE(review): only the unsuffixed helper column 'min' is dropped;
    # 'min_t_2' and 'min_t_3' stay in the output exactly as before - confirm
    # whether they are wanted as features.
    result = result.drop(['min', 'date'], axis=1)

    return result

In [None]:
# Assemble the training table from the orders plus the processed weather and
# traffic frames; relies on the globals produced by the cells above.
train_data = prepareOrderData(orders_train, weather, traffic, timeSlotDistrict)

In [None]:
# Persist the assembled training table so later sessions can skip the
# expensive preparation steps.
pickle_file = 'train_data.pickle'

try:
  # `with` closes the file even when pickling fails.
  with open(pickle_file, 'wb') as f:
    # Must be a dict with the key 'train_data': the cell that reads this file
    # looks up save['train_data']. The previous `{'traffic_train', traffic_train}`
    # was a set literal, which fails because a DataFrame is not hashable.
    save = {
      'train_data': train_data,
      }
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
  print('Unable to save data to', pickle_file, ':', e)
  raise

In [None]:
## Read the prepared training set straight back from the pickle file.
# NOTE(review): `urlretrieve` is not used in this cell, and `pickle` is
# already imported in the first cell; both imports are left in place in case
# later cells depend on them.
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle

pickle_file = 'train_data.pickle'

# SECURITY: unpickling can execute arbitrary code - only load pickle files
# that this notebook produced itself.
with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  # The file holds a dict; the table is stored under the key 'train_data'.
  train_data = save['train_data']
  del save  # hint to help gc free up memory
  print('train_data', train_data.shape)