In [1]:
import csv
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import numpy as np
import os
def MAPE(true_result,pred_result):
    """Return the mean absolute percentage error of a prediction.

    Computes ``average(|pred_result - true_result| / true_result)``
    element-wise.

    Parameters
    ----------
    true_result : array-like
        Ground-truth values (list, tuple or ndarray).
    pred_result : array-like
        Predicted values; must broadcast against ``true_result``.

    Returns
    -------
    float
        The average relative error as a fraction (0.15 means 15%).

    Notes
    -----
    Zero entries in ``true_result`` are not masked: they divide by zero and
    turn the average into ``inf``/``nan``.
    """
    # Coerce both inputs to float arrays so that plain Python lists work
    # (list - list raises TypeError) and integer inputs never floor-divide
    # under Python 2.
    true_arr = np.asarray(true_result, dtype=float)
    pred_arr = np.asarray(pred_result, dtype=float)
    return np.average(np.abs(pred_arr - true_arr) / true_arr)
# Timestamp layout shared by every cell: it parses the day strings and, via
# strftime, builds the lookup keys that are compared against the raw timestamp
# text read from the CSV files. The time part is spelled out as %H:%M:%S
# instead of %X because %X is the *locale's* time representation; under a
# non-C locale the generated keys could silently stop matching the CSV text.
fmt = "%Y-%m-%d %H:%M:%S"
# Width of one time window; consecutive windows are 20 minutes apart.
datetime_delta = timedelta(minutes=20)

# Set the files

In [2]:
# Prediction files used to fit the ensemble weights, one entry per model.
# The weights are later matched to files purely by position (zip), so
# `ensemble_files` must list the same models in the same order.
dev_files = [
    'result/res_phase1_MultiTaskLasso_0.15.csv',
    'result/res_phase1_MultiTaskElasticNet_0.147.csv',
    'result/res_phase1_SVR_0.147.csv',
    'result/phase1_submission.csv',
]

# Prediction files that the fitted weights are applied to, in the same
# model order as `dev_files`.
ensemble_files = [
    'result/res_phase2_MultiTaskLasso.csv',
    'result/res_phase2_MultiTaskElasticNet.csv',
    'result/res_phase2_SVR.csv',
    'result/phase2_submission.csv',
]

# Read the predicted results

In [3]:
# Load every file in `dev_files` and flatten its predictions into one vector
# per model. `results[k]` lines up element-for-element with the vectors built
# the same way in the other cells.
results = []
days = []  # keeps `days` defined (and empty) even when `dev_files` is empty
for dev_file in dev_files:
    # Map "<tollgate>-<direction>-<window start>" -> predicted volume.
    cases = {}
    # Distinct days in order of first appearance. Rebuilt per file; the value
    # left over from the LAST file is what the ground-truth cell iterates.
    days = []
    with open(dev_file) as f:
        f.readline()  # skip the header line
        for row in csv.reader(f):
            # row[1] is the time window "[start,end)": keep the start stamp
            # and drop the leading '['.
            start_time = row[1].split(',')[0][1:]
            day = start_time.split()[0]
            if day not in days:
                days.append(day)
            cases[row[0] + '-' + row[2] + '-' + start_time] = float(row[3])

    result = []
    # Named tollgate_id/direction (not id/dir) so the builtins are not
    # shadowed in the notebook's global namespace.
    for tollgate_id in range(1, 4):
        for direction in range(0, 2):
            if tollgate_id == 2 and direction == 1:
                # This pair is skipped on purpose in every cell.
                continue
            key_prefix = str(tollgate_id) + '-' + str(direction) + '-'
            for day in days:
                # 72 consecutive 20-minute windows, the first one starting at
                # 00:20; windows missing from the file stay at 0.
                periods = np.zeros([72])
                curr_datetime = datetime.strptime(day + ' 00:20:00', fmt)
                for slot in range(72):
                    index = key_prefix + curr_datetime.strftime(fmt)
                    curr_datetime += datetime_delta
                    if index in cases:
                        periods[slot] = cases[index]
                # Slots 23-28 start at 08:00..09:40 and slots 50-55 start at
                # 17:00..18:40; only these twelve windows are kept per day.
                result.extend(periods[23:29])
                result.extend(periods[50:56])
    results.append(np.array(result))

# Read ground-truth from data

In [4]:
# Map "<tollgate>-<direction>-<window start>" -> observed volume.
train_cases = {}
# Distinct days present in the ground-truth file, in order of appearance.
# Collected here but not read again in this cell.
train_days = []
with open('data/training2_20min_avg_volume.csv') as f:
    f.readline()  # skip the header line
    for row in csv.reader(f):
        # row[1] is the time window "[start,end)": keep the start stamp and
        # drop the leading '['.
        start_time = row[1].split(',')[0][1:]
        day = start_time.split()[0]
        if day not in train_days:
            train_days.append(day)
        train_cases[row[0] + '-' + row[2] + '-' + start_time] = float(row[3])

# Build the ground-truth vector in exactly the same order as the prediction
# vectors. NOTE: the loop walks `days` (left over from the prediction-reading
# cell), not `train_days`, so only the predicted days are evaluated -- that
# cell must have been run first.
true_result = []
# Named tollgate_id/direction (not id/dir) so the builtins are not shadowed in
# the notebook's global namespace.
for tollgate_id in range(1, 4):
    for direction in range(0, 2):
        if tollgate_id == 2 and direction == 1:
            # This pair is skipped on purpose in every cell.
            continue
        key_prefix = str(tollgate_id) + '-' + str(direction) + '-'
        for day in days:
            # 72 consecutive 20-minute windows, the first one starting at
            # 00:20. Windows with no ground-truth row stay at 0, which MAPE
            # would then divide by.
            periods = np.zeros([72])
            curr_datetime = datetime.strptime(day + ' 00:20:00', fmt)
            for slot in range(72):
                index = key_prefix + curr_datetime.strftime(fmt)
                curr_datetime += datetime_delta
                if index in train_cases:
                    periods[slot] = train_cases[index]
            # Slots 23-28 start at 08:00..09:40 and slots 50-55 start at
            # 17:00..18:40; only these twelve windows are kept per day.
            true_result.extend(periods[23:29])
            true_result.extend(periods[50:56])

# Print the MAPE of each model on phase1_test

In [5]:
# One line per entry of `results`, i.e. in the same order as `dev_files`.
for model_pred in results:
    model_mape = MAPE(true_result, model_pred)
    print(model_mape)

0.150271873677
0.147482319893
0.147473356541
0.282405937004


# Find the best ensemble weights

In [6]:
def find_best_weight(weight,i,results,true_result,step=0.01):
    """Grid-search ensemble weights that sum to 1 and minimise the MAPE.

    Weights ``0 .. i-1`` are taken as already fixed. Weight ``i`` is scanned
    upwards from its current value in increments of ``step`` for as long as
    the fixed weights plus the candidate do not exceed 1, recursing into
    position ``i + 1`` for each candidate. The last weight is never scanned:
    it is set to the remainder ``1 - sum(other weights)``.

    Parameters
    ----------
    weight : array-like
        One entry per model. Entries before ``i`` are the fixed weights and
        entry ``i`` is the scan start. The input is not modified.
    i : int
        Index of the weight scanned at this recursion level (0 at the top).
    results : sequence of ndarray
        Per-model prediction vectors, all of the same shape.
    true_result : array-like
        Ground truth aligned with each vector in ``results``.
    step : float, optional
        Grid spacing, used at every recursion level.

    Returns
    -------
    (best_mape, best_weight) : tuple
        Lowest MAPE found and the weight vector achieving it. If no candidate
        was evaluated, or none produced a finite comparison, the result is
        ``(inf, copy of the input weights)``.

    Raises
    ------
    ValueError
        If ``step`` is not positive (the scan would never terminate).
    """
    if step <= 0:
        raise ValueError("step must be positive")

    # Private float copy: the caller's array is never mutated, and integer
    # inputs cannot truncate the fractional weights.
    weight = np.array(weight, dtype=float)

    if i == len(results)-1:
        # Base case: the last model receives whatever is left of the budget.
        weight[-1] = 1-np.sum(weight[:-1])
        pred = np.zeros(results[0].shape)
        for w,p in zip(weight,results):
            pred += w*p
        return MAPE(true_result,pred),weight

    # Start from +inf (not 1.0) so a best MAPE >= 1 is still reported.
    best_mape = float('inf')
    best_weight = weight.copy()

    budget = 1-np.sum(weight[:i])
    start = weight[i]
    # Candidates are computed as start + k*step rather than by repeated
    # `+= step`, and compared with a tiny tolerance: accumulated rounding
    # error would otherwise push the final grid point (e.g. 1.0) just past
    # the budget and silently skip it.
    tolerance = step*1e-9
    k = 0
    while True:
        candidate = start+k*step
        if candidate > budget+tolerance:
            break
        weight[i] = candidate
        # Forward `step` so a non-default spacing applies at every level.
        mape,curr_weight = find_best_weight(weight,i+1,results,true_result,step)
        if mape < best_mape:
            best_weight = np.array(curr_weight)
            best_mape = mape
        k += 1
    return best_mape,best_weight
# Starting point of the search: every weight at 0 except the last one. The
# scan for each earlier position therefore begins at 0, and the search itself
# recomputes the last entry as the remainder.
initial_weight = np.zeros(len(results))
initial_weight[-1] = 1
mape, weight = find_best_weight(initial_weight, 0, results, true_result)
print(mape,weight)

(0.13913490633336767, array([ 0.29,  0.19,  0.42,  0.1 ]))


In [7]:
# Load every file in `ensemble_files` and flatten its predictions into one
# vector per model, in the same element order as the vectors the weights were
# fitted on.
preds = []
for ensemble_file in ensemble_files:
    # Map "<tollgate>-<direction>-<window start>" -> predicted volume.
    cases = {}
    # Distinct days in order of first appearance. Rebuilt per file; the value
    # left over from the LAST file is what the submission writer iterates.
    days = []
    with open(ensemble_file) as f:
        f.readline()  # skip the header line
        for row in csv.reader(f):
            # row[1] is the time window "[start,end)": keep the start stamp
            # and drop the leading '['.
            start_time = row[1].split(',')[0][1:]
            day = start_time.split()[0]
            if day not in days:
                days.append(day)
            cases[row[0] + '-' + row[2] + '-' + start_time] = float(row[3])

    pred = []
    # Named tollgate_id/direction (not id/dir) so the builtins are not
    # shadowed in the notebook's global namespace.
    for tollgate_id in range(1, 4):
        for direction in range(0, 2):
            if tollgate_id == 2 and direction == 1:
                # This pair is skipped on purpose in every cell.
                continue
            key_prefix = str(tollgate_id) + '-' + str(direction) + '-'
            for day in days:
                # 72 consecutive 20-minute windows, the first one starting at
                # 00:20; windows missing from the file stay at 0.
                periods = np.zeros([72])
                curr_datetime = datetime.strptime(day + ' 00:20:00', fmt)
                for slot in range(72):
                    index = key_prefix + curr_datetime.strftime(fmt)
                    curr_datetime += datetime_delta
                    if index in cases:
                        periods[slot] = cases[index]
                # Slots 23-28 start at 08:00..09:40 and slots 50-55 start at
                # 17:00..18:40; only these twelve windows are kept per day.
                pred.extend(periods[23:29])
                pred.extend(periods[50:56])
    preds.append(np.array(pred))
# Blend the per-model vectors with the fitted weights. This is done BEFORE the
# output file is opened, so a failure here (e.g. empty `preds`) does not leave
# a truncated submission file behind.
ensemble_pred = np.zeros(preds[0].shape)
for w,p in zip(weight,preds):
    ensemble_pred += w*p

# Slots written per day, in the same order in which the prediction vectors
# were flattened: 23-28 start at 08:00..09:40, 50-55 start at 17:00..18:40.
kept_slots = list(range(23,29)) + list(range(50,56))

# `with` guarantees the file is flushed and closed even if a write fails.
with open('result/submission.csv','w') as output:
    output.write('tollgate_id,time_window,direction,volume\n')
    row_idx = 0  # running position inside ensemble_pred
    # Named tollgate_id/direction (not id/dir) so the builtins are not
    # shadowed in the notebook's global namespace.
    for tollgate_id in range(1,4):
        for direction in range(0,2):
            if tollgate_id == 2 and direction == 1:
                # This pair is skipped on purpose in every cell.
                continue
            for day in days:
                # Slot 0 starts at 00:20 on `day`.
                first_slot_start = datetime.strptime(day+' 00:20:00',fmt)
                for k in kept_slots:
                    begin_datetime = first_slot_start+k*datetime_delta
                    end_datetime = begin_datetime+datetime_delta
                    output.write("%d,\"[%s,%s)\",%d,%f\n" % (
                        tollgate_id,
                        begin_datetime.strftime(fmt),
                        end_datetime.strftime(fmt),
                        direction,
                        ensemble_pred[row_idx]))
                    row_idx += 1