In [None]:
import pandas as pd
import os
import glob
import sys
import numpy as np
import matplotlib.pyplot as plt
import math
import datetime
from six.moves import cPickle as pickle

In [None]:
# load district hashmap first
def loadCluster(path):
    """Build the district-hash -> district-id lookup from every file in `path`.

    Each file is read as a headerless, tab-separated table with two columns
    (`district_hash`, `district_id`). All files are stacked and returned as a
    plain dict keyed by hash.
    """
    frames = []
    for entry in os.listdir(path):
        frames.append(
            pd.read_table(os.path.join(path, entry), sep='\t',
                          names=['district_hash', 'district_id'])
        )
    mapping = pd.concat(frames, ignore_index=True)
    return dict(zip(mapping.district_hash, mapping.district_id))


def loadOrders(path):
    """Load every order file in `path` into one DataFrame.

    Files are read as headerless, tab-separated tables with the columns
    order_id, driver_id, passenger_id, start_district_id, dest_district_id,
    Price, Time (parsed as datetime).

    District hashes in both district columns are replaced through the
    module-level `cluster` dict, so `cluster` must be loaded first. Values
    that are not keys of `cluster` are left unchanged.

    Added columns:
      time_slot -- 1-based index of the 10-minute slot within the day
      min       -- 1-based minute of the day
      date      -- calendar day of `Time` as datetime64 (midnight)

    Rows dated 2016-01-01 are dropped.
    """
    col_names = ['order_id', 'driver_id', 'passenger_id', 'start_district_id',
                 'dest_district_id', 'Price', 'Time']
    frames = []
    # sorted() makes the row order reproducible; os.listdir order is arbitrary.
    for name in sorted(os.listdir(path)):
        df = pd.read_csv(os.path.join(path, name), sep='\t',
                         parse_dates=[6], names=col_names)
        df = df.replace({'start_district_id': cluster,
                         'dest_district_id': cluster})
        frames.append(df)
    df = pd.concat(frames, ignore_index=True)

    minute_of_day = df.Time.dt.hour * 60 + df.Time.dt.minute
    # Floor division: plain `/` yields fractional slot ids on Python 3.
    df['time_slot'] = minute_of_day // 10 + 1
    df['min'] = minute_of_day + 1

    # .copy() so the column assignment below does not write into a slice.
    df = df[df.Time.dt.date != datetime.date(2016, 1, 1)].copy()
    df['date'] = pd.to_datetime(df['Time'].dt.date)
    return df
    
def loadWeather(path):
    """Load every weather file in `path` into one DataFrame.

    Files are read as headerless, tab-separated tables with the columns
    Time (parsed as datetime), Weather, Temperature, PM25.

    Added columns:
      time_slot -- 1-based index of the 10-minute slot within the day
      date      -- calendar day of `Time` as datetime64 (midnight)

    Rows dated 2016-01-01 are dropped.
    """
    col_names = ['Time', 'Weather', 'Temperature', 'PM25']
    frames = []
    # sorted() makes the row order reproducible; os.listdir order is arbitrary.
    for name in sorted(os.listdir(path)):
        frames.append(pd.read_csv(os.path.join(path, name), sep='\t',
                                  parse_dates=[0], names=col_names))
    df = pd.concat(frames, ignore_index=True)

    # Floor division: plain `/` yields fractional slot ids on Python 3.
    df['time_slot'] = (df.Time.dt.hour * 60 + df.Time.dt.minute) // 10 + 1

    # .copy() so the column assignment below does not write into a slice.
    df = df[df.Time.dt.date != datetime.date(2016, 1, 1)].copy()
    df['date'] = pd.to_datetime(df['Time'].dt.date)
    return df
    
def loadTraffic(path):
    """Load every traffic file in `path` into one DataFrame.

    Files are read as headerless, tab-separated tables with the columns
    district_id, lv1, lv2, lv3, lv4, Time (parsed as datetime). The four
    level fields are converted to int after dropping their first two
    characters.

    District hashes are replaced through the module-level `cluster` dict, so
    `cluster` must be loaded first.

    Added columns:
      time_slot -- 1-based index of the 10-minute slot within the day
      date      -- calendar day of `Time` as datetime64 (midnight)

    Rows dated 2016-01-01 are dropped.
    """
    col_names = ['district_id', 'lv1', 'lv2', 'lv3', 'lv4', 'Time']

    def myfun(s):
        # presumably the raw field looks like "<level>:<count>" -- TODO confirm
        return int(s[2:])

    frames = []
    # sorted() makes the row order reproducible; os.listdir order is arbitrary.
    for name in sorted(os.listdir(path)):
        df = pd.read_csv(os.path.join(path, name), sep='\t',
                         parse_dates=[5], names=col_names,
                         converters={1: myfun, 2: myfun, 3: myfun, 4: myfun})
        df = df.replace({'district_id': cluster})
        frames.append(df)
    df = pd.concat(frames, ignore_index=True)

    # Floor division: plain `/` yields fractional slot ids on Python 3.
    df['time_slot'] = (df.Time.dt.hour * 60 + df.Time.dt.minute) // 10 + 1

    # .copy() so the column assignment below does not write into a slice.
    df = df[df.Time.dt.date != datetime.date(2016, 1, 1)].copy()
    df['date'] = pd.to_datetime(df['Time'].dt.date)
    return df

In [None]:
# District hash -> id lookup. It has to exist before the order and traffic
# loaders run, because both read the module-level `cluster`.
path = "season_1/training_data/cluster_map"
cluster = loadCluster(path)

# Orders (training period)
path = "season_1/training_data/order_data"
orders_train = loadOrders(path)

# Traffic (training period)
path = "season_1/training_data/traffic_data"
traffic_train = loadTraffic(path)

# Weather (training period)
path = "season_1/training_data/weather_data"
weather_train = loadWeather(path)

In [None]:
def timeSlotData(data, test = False):
    """Return the distinct (date, time_slot) pairs present in `data`.

    Args:
        data: DataFrame with a datetime `Time` column and a `time_slot`
            column. It is not modified (the previous version overwrote the
            caller's `date` column as a side effect).
        test: when True, every third row of the sorted result is copied with
            `time_slot + 3` and added to the output.
            # presumably the test set supplies three observed slots in front
            # of each slot to predict -- TODO confirm against the data.

    Returns:
        DataFrame with columns `date` (datetime64, midnight) and `time_slot`,
        sorted by both, with a fresh 0..n-1 index.
    """
    slots = pd.DataFrame({
        'date': pd.to_datetime(data['Time'].dt.date),
        'time_slot': data['time_slot'],
    })[['date', 'time_slot']]
    # dropna mirrors groupby, which skips rows with a missing key.
    slots = (slots.dropna()
                  .drop_duplicates()
                  .sort_values(['date', 'time_slot'])
                  .reset_index(drop=True))
    if test:
        targets = slots.iloc[::3].copy()
        targets['time_slot'] = targets['time_slot'] + 3
        slots = (pd.concat([slots, targets])
                   .sort_values(['date', 'time_slot'])
                   .reset_index(drop=True))
    return slots

def preparetimeSlotDistrict(data, n_districts=66):
    """Cross every row of `data` with district ids 1..n_districts.

    Args:
        data: slot table (one row per (date, time_slot)). The previous
            version ignored this argument and read the global `timeSlot`.
        n_districts: number of districts; ids run from 1 to n_districts.

    Returns:
        DataFrame holding `n_districts` stacked copies of `data` plus a
        `start_district_id` column; the first copy gets id 1, the second
        id 2, and so on.
    """
    n_slots = data.shape[0]
    timeSlotDistrict = pd.concat([data] * n_districts, ignore_index=True)
    timeSlotDistrict['start_district_id'] = np.repeat(
        np.arange(1, n_districts + 1), n_slots)
    return timeSlotDistrict

def prepareWeatherData(data, timeSlot, testData =False):
    """Align weather readings to the slot table.

    Args:
        data: weather DataFrame with `Time`, `date`, `time_slot` and the
            reading columns. It is copied, not modified.
        timeSlot: table of (date, time_slot) rows to align to.
        testData: unused; kept for call compatibility.

    Returns:
        `timeSlot` left-joined with one weather row per (date, time_slot).
        When a slot has several readings the last one is kept. Gaps are
        filled forward and then backward, at most 10 rows each way.
    """
    temp = data.copy(deep=True)
    temp['time_slot'] = temp['time_slot'].astype(int)
    temp = temp.drop(['Time'], axis=1)
    # keep='last' replaces the removed take_last=True keyword.
    temp = temp.drop_duplicates(['date', 'time_slot'], keep='last')
    temp = pd.merge(timeSlot, temp, on=['time_slot', 'date'], how='left')
    temp = temp.ffill(limit=10)
    temp = temp.bfill(limit=10)
    return temp

def prepareTrafficData(data, timeSlotDistrict, test = False):
    """Align traffic readings to the (date, time_slot, district) grid.

    Args:
        data: traffic DataFrame with `district_id`, `lv1`..`lv4`, `Time`,
            `time_slot`, `date`. It is copied, not modified.
        timeSlotDistrict: grid with `date`, `time_slot`, `start_district_id`.
        test: when falsy, missing values are back-filled (at most 2 rows)
            and any remaining `lv*` gaps get the truncated column mean.
            When truthy, gaps are left as NaN.

    Returns:
        The grid left-joined with the traffic levels, sorted by date,
        district and slot, plus `lv1_pect`..`lv4_pect`: each level's share
        of the four-level total.
    """
    temp = data.copy(deep=True)
    temp['time_slot'] = temp['time_slot'].astype(int)
    temp['district_id'] = temp['district_id'].astype(int)
    temp = (temp.sort_values(['district_id', 'Time'])
                .drop(['Time'], axis=1)
                .rename(columns={'district_id': 'start_district_id'}))
    temp = pd.merge(timeSlotDistrict, temp,
                    on=['time_slot', 'date', 'start_district_id'], how='left')

    temp = temp.sort_values(['date', 'start_district_id', 'time_slot'])
    if not test:
        temp = temp.bfill(limit=2)
        temp = temp.fillna({'lv1': int(temp.lv1.mean()), 'lv2': int(temp.lv2.mean()),
                            'lv3': int(temp.lv3.mean()), 'lv4': int(temp.lv4.mean())})

    # Explicit addition so a NaN level keeps the total (and the shares) NaN.
    total = temp.lv1 + temp.lv2 + temp.lv3 + temp.lv4
    temp['lv1_pect'] = temp.lv1 / total
    temp['lv2_pect'] = temp.lv2 / total
    temp['lv3_pect'] = temp.lv3 / total
    temp['lv4_pect'] = temp.lv4 / total
    return temp

In [None]:
timeSlot = timeSlotData(orders_train)

# Weather: align to the slot table, then fill whatever is still missing with
# fixed defaults derived from the readings of 2016-01-21.
weather = prepareWeatherData(weather_train, timeSlot)
# Compare against a Timestamp: `weather.date` is datetime64, and current
# pandas no longer treats a datetime.date as equal to it.
temp = weather[weather.date == pd.Timestamp('2016-01-21')]
weather = weather.fillna({'Weather': 4,
                          'PM25': int((temp.PM25.mean() + 100) / 2),
                          'Temperature': int(temp.Temperature.mean())})
print(weather.isnull().any())

# Traffic: align to the full (date, time_slot, district) grid.
timeSlotDistrict = preparetimeSlotDistrict(timeSlot)
traffic = prepareTrafficData(traffic_train, timeSlotDistrict)
print(traffic.isnull().any())

In [None]:
def prepareOrderData(orders, weather, traffic, timeSlotDistrict, test = False):
    """Build the model table: one row per (date, time_slot, district) with
    the current gap and the features of the three preceding slots.

    Args:
        orders: order rows with `order_id`, `driver_id`, `date`, `time_slot`,
            `start_district_id` and `min`.
        weather: per-(date, time_slot) weather features.
        traffic: per-(date, time_slot, start_district_id) traffic features.
        timeSlotDistrict: the full grid of (date, time_slot,
            start_district_id) rows to produce.
        test: when True, the slot aggregates are forward-filled by one row
            after joining to the grid.

    Returns:
        DataFrame with `date`, `gap`, `weekday`, `time_slot`,
        `start_district_id` plus lagged features. Because of how the merge
        suffixes apply, slot t-1 columns carry no suffix (except
        `gap_t_1`), while slot t-2 and t-3 columns end in `_t_2` / `_t_3`.
        Slots 1-3 are removed. Only the unsuffixed `min` helper is dropped;
        `min_t_2` and `min_t_3` remain in the output.

    Changes from the previous version:
      * `weekday` is derived from the grid date after the join. Before, a
        slot with no orders had a missing weekday that the later fillna(0)
        turned into 0.
      * DataFrame.sort and fillna(method=...) are replaced by sort_values
        and ffill.
    """
    slot_keys = ['date', 'time_slot', 'start_district_id']
    minute_keys = slot_keys + ['min']

    # --- per-slot demand / supply / gap ---------------------------------
    # count() counts non-null cells, so supply is the number of orders that
    # have a driver_id.
    data = (orders.groupby(slot_keys)[['order_id', 'driver_id']].count()
                  .rename(columns={'order_id': 'demand', 'driver_id': 'supply'})
                  .reset_index())
    data['gap'] = data['demand'] - data['supply']
    data['date'] = pd.to_datetime(data['date'])
    data['start_district_id'] = data['start_district_id'].astype(int)
    data = pd.merge(timeSlotDistrict, data,
                    on=['time_slot', 'date', 'start_district_id'], how='left')
    if test:
        data = data.ffill(limit=1)
    # ISO weekday (Monday=1 .. Sunday=7), defined for every grid row.
    data['weekday'] = pd.to_datetime(data['date']).dt.dayofweek + 1

    # --- per-minute demand / supply / gap -------------------------------
    data_permin = (orders.groupby(minute_keys)[['order_id', 'driver_id']].count()
                         .rename(columns={'order_id': 'demand_per_min',
                                          'driver_id': 'supply_per_min'})
                         .reset_index())
    data_permin['gap_per_min'] = (data_permin['demand_per_min']
                                  - data_permin['supply_per_min'])
    data_permin['date'] = pd.to_datetime(data_permin['date'])
    data_permin['start_district_id'] = data_permin['start_district_id'].astype(int)

    # --- join weather and traffic ---------------------------------------
    col = ['date', 'gap', 'weekday', 'time_slot', 'start_district_id', 'demand', 'supply']
    total_data = data[col].copy()
    total_data['date'] = pd.to_datetime(total_data['date'])
    total_data['start_district_id'] = total_data['start_district_id'].astype(int)
    total_data = pd.merge(total_data, weather, on=['date', 'time_slot'], how='left')
    total_data = pd.merge(total_data, traffic, on=slot_keys, how='left')

    # --- one column triple per minute of the slot -----------------------
    # `min` is rewritten on each pass so the join picks the i-th minute.
    for i in range(1, 11):
        total_data['min'] = (total_data.time_slot - 1) * 10 + i
        total_data = pd.merge(total_data, data_permin, on=minute_keys, how='left')
        min_str = str(i)
        total_data = total_data.rename(columns={
            'demand_per_min': 'demand_min_' + min_str,
            'supply_per_min': 'supply_min_' + min_str,
            'gap_per_min': 'gap_min_' + min_str})

    total_data = total_data.fillna(0)
    total_data = total_data.sort_values(['date', 'start_district_id', 'time_slot'])

    # --- bring the t-1, t-2, t-3 rows onto the row of slot t -------------
    result = total_data[['date', 'gap', 'weekday', 'time_slot', 'start_district_id']]
    for lag in (1, 2, 3):
        shifted = total_data.drop(['weekday'], axis=1)
        # After the shift, the row of slot s is keyed as slot s + lag.
        shifted['time_slot'] = shifted['time_slot'] + lag
        result = pd.merge(result, shifted, on=slot_keys,
                          suffixes=('', '_t_' + str(lag)), how='left')

    # The first three slots of a day have no complete history.
    result = result[~result.time_slot.isin([1, 2, 3])]
    result = result.drop(['min'], axis=1)

    return result

In [None]:
# Training table: current gap plus the lagged features of the three
# preceding slots, for every (date, time_slot, district).
train_data = prepareOrderData(
    orders=orders_train,
    weather=weather,
    traffic=traffic,
    timeSlotDistrict=timeSlotDistrict,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import  cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV
from scipy.stats import randint
import tensorflow as tf
from operator import itemgetter
from time import time

In [None]:
##custom scoring function 
def mape(estimator, X, y, offset = 0):
    """Mean absolute percentage error over slots 46 and later.

    Has the (estimator, X, y) signature scikit-learn expects of a scoring
    callable. Note that it returns an error (lower is better), while
    scikit-learn treats scorer values as higher-is-better.

    Args:
        estimator: fitted model exposing `predict`.
        X: feature DataFrame with a `time_slot` column, row-aligned with `y`.
        y: true gap values.
        offset: constant subtracted from every prediction before rounding.

    Returns:
        sum(|y - y_pred| / y over rows with y != 0) divided by the number of
        rows with time_slot >= 46 (rows with y == 0 included in that count).
        Predictions are rounded and floored at 1. NaN when no row falls in
        the window.
    """
    in_window = np.asarray(X.time_slot >= 46)
    y_true = np.round(np.asarray(y, dtype=float)[in_window])
    y_pred = np.round(
        (np.asarray(estimator.predict(X), dtype=float) - offset)[in_window])
    # A predicted gap must be at least 1. This is done on a plain array: the
    # previous chained assignment on a DataFrame column is not guaranteed to
    # write through.
    y_pred[y_pred <= 0] = 1

    dim = y_true.shape[0]
    if dim == 0:
        return float('nan')
    nonzero = y_true != 0
    return np.sum(np.abs(y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]) / dim

In [None]:
def report(grid_scores, n_top=3):
    """Print the `n_top` lowest-scoring parameter settings.

    Args
    ----
    grid_scores -- score entries from a grid or random search; each entry
                   exposes `parameters`, `mean_validation_score` (also
                   reachable as item 1) and `cv_validation_scores`
    n_top -- number of entries to print, default 3

    Returns
    -------
    top_params -- [dict] parameters of the entry with the lowest mean
                  validation score
    """
    # Ascending order: the entry with the smallest mean score ranks first.
    ranked = list(grid_scores)
    ranked.sort(key=lambda entry: entry[1])
    best = ranked[:n_top]

    rank = 0
    for entry in best:
        rank += 1
        print("Model with rank: {0}".format(rank))
        summary = "Mean validation score: {0:.3f} (std: {1:.3f})".format(
            entry.mean_validation_score,
            np.std(entry.cv_validation_scores))
        print(summary)
        print("Parameters: {0}".format(entry.parameters))
        print("")

    return best[0].parameters

In [None]:
###search for the best parameters
def run_randomsearch(X, y, clf, para_dist, cv=5,
                     n_iter_search=20):
    """Run a randomized hyper-parameter search scored with `mape`.

    Args
    ----
    X -- features
    y -- targets
    clf -- scikit-learn estimator to tune
    para_dist -- [dict] parameter name -> list of values or distribution to
                 sample from
    cv -- number of cross-validation folds, default 5
    n_iter_search -- number of random parameter sets to try,
                     default 20.

    Returns
    -------
    top_params -- [dict] from report(): the setting with the lowest mean
                  validation score

    Notes
    -----
    The previous version read the global `param_dist` instead of the
    `para_dist` argument and never passed `cv` to the search.

    `mape` is an error (lower is better) while scikit-learn maximizes the
    score. The search's own "best" candidate is therefore the worst one, so
    refitting is disabled and the ranking is left to report(), which sorts
    ascending.
    """
    random_search = RandomizedSearchCV(clf,
                        param_distributions=para_dist,
                        n_iter=n_iter_search,
                        scoring=mape,
                        cv=cv,
                        refit=False,
                        n_jobs=25)

    start = time()
    random_search.fit(X, y)
    print(("\nRandomizedSearchCV took {:.2f} seconds "
           "for {:d} candidates parameter settings.").format((time() - start), n_iter_search))

    top_params = report(random_search.grid_scores_, 5)
    return top_params

In [None]:
# Hold out two full days (2016-01-16 and 2016-01-19) for validation.
# Compare against Timestamps: `train_data.date` is datetime64, and current
# pandas no longer treats a datetime.date as equal to it, which would leave
# the validation set empty.
is_valid_day = ((train_data.date == pd.Timestamp('2016-01-16')) |
                (train_data.date == pd.Timestamp('2016-01-19')))
dat_train = train_data[~is_valid_day]
dat_valid = train_data[is_valid_day]

print('train', dat_train.shape)
print('validate', dat_valid.shape)

In [None]:
# Features are every column except the target (`gap`) and the raw `date`.
non_feature_cols = ['gap', 'date']

x = dat_train.drop(non_feature_cols, axis=1)
y = dat_train.gap

x_valid = dat_valid.drop(non_feature_cols, axis=1)
y_valid = dat_valid.gap

In [None]:
from sklearn import ensemble

# NOTE(review): the model is fit on ALL of train_data, which includes the two
# held-out validation days, so the score below is not an out-of-sample
# estimate -- confirm whether this is intended.
x_1 = train_data.drop(['gap', 'date'], axis=1)
y_1 = train_data.gap

params = dict(n_estimators=2000, max_depth=4, min_samples_split=5,
              learning_rate=0.001, loss='lad')
clf = ensemble.GradientBoostingRegressor(**params)
clf.fit(x_1, y_1)

mape(clf, x_valid, y_valid)

In [None]:
# Validation error after subtracting a constant offset from the predictions.
for offset in (0.5, 1, 1.5, 2, 2.5):
    print(mape(clf, x_valid, y_valid, offset))

In [None]:
##optimize the parameter of the gbm 
param_dist = {"n_estimators": randint(100, 2000),
              "max_depth": randint(1, 5),
              "min_samples_split": randint(1,20),
              "learning_rate": [0.1, 0.01, 0.001, 0.0001],
              "loss": 'lad'
             }

gbm_model = ensemble.GradientBoostingRegressor()
gbm_model_search = run_randomsearch(x_1, y_1, gbm_model, param_dist, cv=5, n_iter_search=15)

In [None]:
#develop model for different region
def developmodelForRegion(region_id, repeatSampleTimes, giveModel = False, model = None, ntops=1, test = False, testData=None):
    """Score candidate models for one district and optionally write its predictions.

    Reads the module-level `train_data`. Rows dated 2016-01-16 and 2016-01-19
    form the validation set; all other rows of the district form the
    training set.

    Candidates:
      * when `giveModel` is true, the supplied `model`, scored with offsets
        0 to 3.0 in steps of 0.5 (keys "<region>-0-<offset>");
      * for each i in 1..repeatSampleTimes-1, a GradientBoostingRegressor
        fit on the district's training rows plus (i - 1) extra copies of the
        rows with 1 <= gap <= 5, scored with offsets 0 to 2.0 in steps of
        0.5 (keys "<region>-<i>-<offset>").

    Args:
        region_id: value of `start_district_id` to model.
        repeatSampleTimes: exclusive upper bound of i above; values <= 1
            train no new model.
        giveModel: whether to score `model` as well.
        model: fitted estimator, required when `giveModel` is true.
        ntops: number of best (key, score) pairs to print.
        test: when True, predict `testData` with the best candidate and
            write `result_<region_id>.csv`.
        testData: rows to predict; must have `gap` and `date` columns.
            Required when `test` is True.

    Returns:
        None. The best scores are printed; with `test=True` a CSV file with
        start_district_id, "<date>-<time_slot>" and the predicted gap is
        written, without header or index.

    Raises:
        ValueError: if a required `model` / `testData` is missing, or if
            `test` is True and no candidate was scored.
    """
    if giveModel and model is None:
        raise ValueError("model is required when giveModel is True")
    if test and testData is None:
        raise ValueError("testData is required when test is True")

    # Compare against Timestamps: `train_data.date` is datetime64, and
    # current pandas no longer treats a datetime.date as equal to it.
    is_valid_day = ((train_data.date == pd.Timestamp('2016-01-16')) |
                    (train_data.date == pd.Timestamp('2016-01-19')))
    in_region = train_data.start_district_id == region_id
    train = train_data[~is_valid_day & in_region]
    valid = train_data[is_valid_day & in_region]
    x_valid = valid.drop(['gap', 'date'], axis=1)
    y_valid = valid.gap

    scores = dict()
    model_dict = dict()
    if giveModel:
        for m in [0, 0.5, 1, 1.5, 2.0, 2.5, 3.0]:
            key = str(region_id) + '-' + '0' + '-' + str(m)
            scores[key] = mape(model, x_valid, y_valid, m)
            model_dict[key] = model

    # Rows with a small positive gap are the ones that get repeated.
    duplicate = train[(train.gap <= 5) & (train.gap >= 1)].copy(True)

    for i in range(1, repeatSampleTimes):
        # The training rows plus (i - 1) extra copies of the small-gap rows.
        dat_train_with_duplicates = pd.concat([train] + [duplicate] * (i - 1))

        x = dat_train_with_duplicates.drop(['gap', 'date'], axis=1)
        y = dat_train_with_duplicates.gap
        params = {'n_estimators': 2000, 'max_depth': 4, 'min_samples_split': 5, 'learning_rate': 0.001, 'loss': 'lad'}
        gbm = ensemble.GradientBoostingRegressor(**params)
        gbm.fit(x, y)

        for m in [0, 0.5, 1, 1.5, 2.0]:
            key = str(region_id) + '-' + str(i) + '-' + str(m)
            scores[key] = mape(gbm, x_valid, y_valid, m)
            model_dict[key] = gbm
        del dat_train_with_duplicates

    # Lowest error first.
    top_scores = sorted(scores.items(), key=itemgetter(1), reverse=False)[:ntops]
    print(top_scores)
    if not test:
        return
    if not top_scores:
        raise ValueError("no candidate model was scored; nothing to predict with")

    key = top_scores[0][0]
    finalModel = model_dict[key]
    # The offset that went with the winning score is the last key field.
    offset = float(key.split('-')[2])

    final_result = testData.copy(True)
    x_test = final_result.drop(['gap', 'date'], axis=1)
    result = (finalModel.predict(x_test) - offset).round()
    result[result <= 0] = 1
    final_result['gap'] = result
    final_result = final_result.sort_values(['date', 'time_slot', 'start_district_id'])
    combinedTimeDistrict = lambda x: (x.date.strftime('%Y-%m-%d'))+'-'+str(x.time_slot)
    final_result['Time_district'] = final_result.apply(combinedTimeDistrict, axis=1)
    final_result.to_csv('result_'+str(region_id)+'.csv',columns=['start_district_id', 'Time_district', 'gap'], header = False, index =False)
    return

In [None]:
# District 7: score only the supplied global model (repeatSampleTimes=1
# trains no new model) and print the five best offsets.
developmodelForRegion(region_id=7, repeatSampleTimes=1, giveModel=True,
                      model=clf, ntops=5)

In [None]:
from threading import Thread, current_thread

# One worker thread per district 5, 6 and 7; each scores the global model and
# the re-sampled per-district models.
threads = [
    Thread(name=str(i), target=developmodelForRegion, args=(i, 4, True, clf))
    for i in range(5, 8)
]

for thread in threads:
    thread.start()

# Wait for every worker before moving on.
for thread in threads:
    thread.join()

In [None]:
# Dedicated model for district 2, fit on that district's training rows only.
dat_train_2 = dat_train[dat_train.start_district_id == 2]
dat_valid_2 = dat_valid[dat_valid.start_district_id == 2]

x_valid_2 = dat_valid_2.drop(['gap', 'date'], axis=1)
y_valid_2 = dat_valid_2.gap

x_2 = dat_train_2.drop(['gap', 'date'], axis=1)
y_2 = dat_train_2.gap

params = dict(n_estimators=1000, max_depth=4, min_samples_split=5,
              learning_rate=0.001, loss='lad')
clf_2 = ensemble.GradientBoostingRegressor(**params)
clf_2.fit(x_2, y_2)

# Validation error of the district model at three offsets.
for offset in (0, 0.5, 1):
    print(mape(clf_2, x_valid_2, y_valid_2, offset))
print(mape(clf_2, x_valid_2, y_valid_2, 0.5))
print(mape(clf_2, x_valid_2, y_valid_2, 1))

In [None]:
# Same district-2 validation rows, scored with the global model for comparison.
for offset in (0, 0.5, 1):
    print(mape(clf, x_valid_2, y_valid_2, offset))

In [None]:
## Prepare the test data

In [None]:
# District hash -> id lookup for the test set. This rebinds the module-level
# `cluster` that the order and traffic loaders read.
path = "season_1/test_set_2/cluster_map"
cluster = loadCluster(path)

# Orders (test period)
path = "season_1/test_set_2/order_data"
orders_test = loadOrders(path)

# Traffic (test period)
path = "season_1/test_set_2/traffic_data"
traffic_test = loadTraffic(path)

# Weather (test period)
path = "season_1/test_set_2/weather_data"
weather_test = loadWeather(path)


In [None]:
# Slot table for the test set; test=True also adds the slots to predict.
# Note: this rebinds timeSlot / weather / traffic / timeSlotDistrict, which
# held the training versions until now.
timeSlot = timeSlotData(orders_test, test=True)

# Weather aligned to the test slots
weather = prepareWeatherData(weather_test, timeSlot)
print(weather.isnull().any())

# Traffic aligned to the test grid; test=True leaves gaps unfilled
timeSlotDistrict = preparetimeSlotDistrict(timeSlot)
traffic = prepareTrafficData(traffic_test, timeSlotDistrict, test=True)
print(traffic.isnull().any())

In [None]:
test_data = prepareOrderData(orders_test, weather, traffic, timeSlotDistrict, test=True)
# Keep every fourth row, starting with the fourth.
# presumably these are the slots to predict, each preceded by its three
# observed slots -- TODO confirm; this relies on the row order of the result.
temp = test_data.iloc[3::4]
test_data = temp.copy(deep=True)

In [None]:
# Work on a copy: pd.DataFrame(test_data) can share storage with test_data,
# so the columns added below could leak into the frame the later per-district
# cell predicts from.
final_result = test_data.copy()

# `result` was never defined at notebook level, so this cell failed on a
# fresh kernel. It is rebuilt here from the global model with the same
# post-processing developmodelForRegion uses (round, floor at 1).
# NOTE(review): offset 0 is assumed -- confirm which offset the submitted
# file was meant to use.
result = clf.predict(test_data.drop(['gap', 'date'], axis=1)).round()
result[result <= 0] = 1
final_result['gap'] = result

final_result = final_result.sort_values(['date', 'time_slot', 'start_district_id'])
combinedTimeDistrict = lambda x: (x.date.strftime('%Y-%m-%d'))+'-'+str(x.time_slot)

final_result['Time_district'] = final_result.apply(combinedTimeDistrict, axis=1)
final_result.to_csv('result_2.csv',columns=['start_district_id', 'Time_district', 'gap'], header = False, index =False)

In [None]:
# One worker per district 5, 6 and 7: pick the best candidate on validation
# and write result_<district>.csv for that district's test rows.
# `Thread` comes from the import in the earlier threading cell.
threads = [
    Thread(
        name=str(i),
        target=developmodelForRegion,
        args=(i, 4, True, clf, 1, True, test_data[test_data.start_district_id==i]),
    )
    for i in range(5, 8)
]

for thread in threads:
    thread.start()

# Wait for every worker before moving on.
for thread in threads:
    thread.join()

In [None]:
# Persist the prepared tables and the global model for later sessions.
pickle_file = 'data.pickle'

save = {
    'train_data': train_data,
    'clf': clf,
    'test_data': test_data
    }
try:
  # `with` closes the file even if pickling fails part-way through.
  with open(pickle_file, 'wb') as f:
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
  print('Unable to save data to', pickle_file, ':', e)
  raise