In [105]:
import pandas as pd
import numpy as np
import sklearn

import os
from matplotlib import pyplot as plt
import seaborn as sbn

from sklearn.pipeline import Pipeline, FeatureUnion
import dill

%matplotlib inline
plt.style.use('seaborn-paper')

# 辅助函数 

In [106]:
from Tools import *
from sklearn.metrics import make_scorer

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction (not scaled by 100).

    Both inputs are converted to float NumPy arrays first, so plain Python
    sequences are accepted and pandas objects are compared element by element
    in positional order rather than being aligned on their index labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values. An entry equal to zero yields ``inf``/``nan``
        for that element, because the error is divided by ``y_true``.
    y_pred : array-like
        Predicted values, same shape as ``y_true`` (or broadcastable to it).

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|)``.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((actual - predicted) / actual))

# Wrap the MAPE function as a scikit-learn scorer. greater_is_better=False marks it
# as a loss, so the scores reported by model-selection tools are sign-flipped
# (the search output further down shows a negative best_score_).
mape = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

In [107]:
# Restore a previously saved interpreter session from "data.pickle".
# NOTE(review): names used below without a visible definition (e.g. `weather`,
# `train_avg_travel_time`, `train_avg_volume`) presumably come from this session — confirm.
# NOTE(review): dill sessions are pickle-based; only load files from a trusted source.
dill.load_session("data.pickle")

# 预处理

## 特征提取

In [108]:
from Preprocess.TimeProcessor import *
from Preprocess.Utils import *

* ### 天气特征

In [109]:
class WeatherFeatures(BaseEstimator, TransformerMixin):
    """Transformer that left-joins a weather table onto the incoming rows.

    Parameters
    ----------
    weather
        Table merged in ``transform``; it is joined on the ``year``,
        ``day_of_year`` and ``hour`` columns.
    """

    def __init__(self, weather):
        self.weather = weather

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` unchanged."""
        return self

    def transform(self, posts):
        """Left-merge ``self.weather`` onto ``posts`` and replace missing values with -1."""
        join_keys = ['year', 'day_of_year', 'hour']
        merged = posts.merge(self.weather, on=join_keys, how='left')
        return merged.fillna(-1)

* ### 前两小时特征

In [110]:
class LastTwoHour(BaseEstimator, TransformerMixin):
    """Transformer that left-joins two pre-computed feature tables on ``day_of_year``.

    ``info1`` is built from the travel-time feature CSVs and ``info2`` from the
    volume feature CSVs; each is the concatenation of two CSV files with
    missing values replaced by 0.
    """

    def __init__(self):
        # NOTE(review): the travel-time files are read from the working directory
        # while the volume files are read from the parent directory — confirm
        # that this difference is intentional.
        travel_time_parts = [
            pd.read_csv('travel_time_feature.csv'),
            pd.read_csv('test_travel_time_feature.csv'),
        ]
        self.info1 = pd.concat(travel_time_parts).fillna(0)

        volume_parts = [
            pd.read_csv('../train_volume_feature.csv'),
            pd.read_csv('../test_volume_feature.csv'),
        ]
        self.info2 = pd.concat(volume_parts).fillna(0)

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` unchanged."""
        return self

    def transform(self, posts):
        """Merge ``info1`` then ``info2`` onto ``posts``.

        Gaps left by the first merge are filled with -1; gaps left by the
        second merge are filled with 0.
        """
        with_travel_time = posts.merge(self.info1, on=['day_of_year'], how='left')
        with_travel_time = with_travel_time.fillna(-1)
        with_volume = with_travel_time.merge(self.info2, on=['day_of_year'], how='left')
        return with_volume.fillna(0)

## 特征选取

In [111]:
from sklearn.feature_selection import SelectKBest
# k must be the *string* 'all' to keep every feature; the bare name `all` is the
# Python builtin function and is not a valid value for k.
feature_selector = SelectKBest(k='all')

from sklearn.feature_selection import GenericUnivariateSelect

# 训练模型

In [112]:
from mlxtend.regressor import StackingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,BaggingRegressor,ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor

def log(x):
    """Natural logarithm of ``x``, delegated to ``np.log``."""
    result = np.log(x)
    return result

#TODO: 0.65*abs(x) * log(abs(x) + 1)
def obj1(y_true, y_pred):
    """First and second derivative of ``|x| * log(|x| + 1)`` with ``x = y_true - y_pred``.

    The expressions are the simplified closed forms of the original quotients:

        grad = sign(x) * log(|x| + 1) + x / (|x| + 1)
        hess = (|x| + 2) / (|x| + 1) ** 2

    They are algebraically identical for ``x != 0`` but, unlike the quotient
    form, remain finite when a residual is exactly zero (grad = 0, hess = 2)
    instead of evaluating 0/0 to NaN.

    NOTE(review): both derivatives are taken with respect to the residual ``x``,
    not with respect to ``y_pred`` (which would flip the sign of ``grad``) —
    confirm the sign convention before using this as a booster objective.

    Parameters
    ----------
    y_true, y_pred : array-like or scalar
        Targets and predictions; only their difference is used.

    Returns
    -------
    (grad, hess)
        Element-wise derivatives, same shape as ``y_true - y_pred``.
    """
    x = y_true - y_pred
    absx = np.abs(x)

    grad = np.sign(x) * np.log1p(absx) + x / (absx + 1)
    hess = (absx + 2) / (absx + 1) ** 2
    return grad, hess

def obj2(y_true, y_pred):
    """Gradient and hessian of a squared error scaled by ``1 / y_true``.

    Returns
    -------
    (grad, hess)
        ``hess = 2 / y_true`` and ``grad = -hess * (y_true - y_pred)``.
    """
    residual = y_true - y_pred
    curvature = 2 / y_true
    slope = - curvature * residual
    return slope, curvature

'''
    sig = np.sign(y_true - y_pred)
    grad = sig#(sig *  (-1 / y_true)) * 10
    hess = np.zeros(len(y_true))
    return grad, hess
'''
# Meta-regressor of the stack: an XGBoost regressor using the custom obj2 objective.
xgb = XGBRegressor(objective=obj2)
# Base regressors handed to the stacking ensemble; the commented-out entries are
# alternatives that are currently disabled.
regressors = [
    LinearRegression(),
    #RandomForestRegressor(),
    #LGBMRegressor(objective= obj2),
    #BaggingRegressor(),
    LGBMRegressor(boosting_type='dart', objective= obj2),
    XGBRegressor(objective=obj2),
    SVR(),
]

# Combine the base regressors under the XGBoost meta-regressor (mlxtend StackingRegressor).
stack = StackingRegressor(regressors=regressors, meta_regressor=xgb)

# 参数

In [113]:
# displayParam(preprocess, ['<h1>预处理参数</h1>'])
# displayParam(stack, ['<h1>训练参数</h1>'])

# 结果

In [114]:
train_avg_travel_time.time_start

0       2016-07-19 00:00:00
1       2016-07-19 00:20:00
2       2016-07-19 01:40:00
3       2016-07-19 02:00:00
4       2016-07-19 02:40:00
5       2016-07-19 03:40:00
6       2016-07-19 04:00:00
7       2016-07-19 04:20:00
8       2016-07-19 04:40:00
9       2016-07-19 05:00:00
10      2016-07-19 06:00:00
11      2016-07-19 06:20:00
12      2016-07-19 06:40:00
13      2016-07-19 07:00:00
14      2016-07-19 07:20:00
15      2016-07-19 07:40:00
16      2016-07-19 08:00:00
17      2016-07-19 08:20:00
18      2016-07-19 08:40:00
19      2016-07-19 09:00:00
20      2016-07-19 09:20:00
21      2016-07-19 09:40:00
22      2016-07-19 10:00:00
23      2016-07-19 10:20:00
24      2016-07-19 10:40:00
25      2016-07-19 11:00:00
26      2016-07-19 11:20:00
27      2016-07-19 11:40:00
28      2016-07-19 12:00:00
29      2016-07-19 12:20:00
                ...        
25114   2016-10-16 21:00:00
25115   2016-10-16 22:20:00
25116   2016-10-16 23:00:00
25117   2016-10-17 00:20:00
25118   2016-10-17 0

In [115]:
# Mask of rows whose time_start lies strictly between '2016-9-30' and '2016-10-8'.
t = (train_avg_travel_time.time_start < '2016-10-8') & (train_avg_travel_time.time_start > '2016-9-30')
#t = t | (train_avg_travel_time.time_start < '2016-9-1')
#trafic_train = pd.read_csv('../trafic_train.csv')
# Mask of rows whose start hour is 8 or 9.
tt = train_avg_travel_time.time_start.apply(lambda x: (x.hour < 10) & (x.hour >= 8))
# Keep the 08:00-09:59 rows that fall outside the date range selected above.
t = (~t) & tt
#t=tt
t2 = train_avg_travel_time[t]
# Split the selected rows into features and target.
train_x=t2.drop(['avg_travel_time'], axis=1)
train_y=t2.avg_travel_time

# Renumber both indexes 0..n-1 so features and target share the same integer labels.
train_x.index = range(len(train_x))
train_y.index = range(len(train_y))
train_x

Unnamed: 0,intersection_id,tollgate_id,time_start,time_end
0,B,3,2016-07-19 08:00:00,2016-07-19 08:20:00
1,B,3,2016-07-19 08:20:00,2016-07-19 08:40:00
2,B,3,2016-07-19 08:40:00,2016-07-19 09:00:00
3,B,3,2016-07-19 09:00:00,2016-07-19 09:20:00
4,B,3,2016-07-19 09:20:00,2016-07-19 09:40:00
5,B,3,2016-07-19 09:40:00,2016-07-19 10:00:00
6,B,3,2016-07-20 08:00:00,2016-07-20 08:20:00
7,B,3,2016-07-20 08:20:00,2016-07-20 08:40:00
8,B,3,2016-07-20 08:40:00,2016-07-20 09:00:00
9,B,3,2016-07-20 09:00:00,2016-07-20 09:20:00


In [116]:
train_x.shape

(2404, 4)

In [117]:
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
# Search space for the stacked model: only the LightGBM base regressor's
# n_estimators is sampled (the other entries are commented out).
stack_params = {
    #model
    'lgbmregressor__n_estimators': sp_randint(300, 500),
    #'xgbregressor__n_estimators': [300, 400, 500],
    #'lgbmregressor-2__n_estimators': [300,400,500],
}

# Randomized search over `stack`: 4 sampled settings, 2 CV folds, scored with the
# MAPE scorer defined earlier.
# NOTE(review): no random_state is passed, so the sampled n_estimators values
# (and therefore the selected model) can differ from run to run.
param_search = RandomizedSearchCV(
    estimator=stack,
    param_distributions=stack_params, 
    scoring=mape, 
    n_iter=4,
    cv=2
)

In [118]:
# Parameters pushed into the preprocessing pipeline via set_params below.
prep_params = {
    'encoding__columns':['intersection_id'],
    'feature_select__k':  'all'
}

# Feature blocks combined by the FeatureUnion; only the weather join is active here.
feature_process = [
    ('weather', WeatherFeatures(weather)),
  #  ('last2hour', LastTwoHour()),
]

# Preprocessing steps, applied in this order.
estimators = [
    ('transformed_time', TimeProcessor('time_start')),
    ('encoding',MultiColumnLabelEncoder(columns=['intersection_id'])),
    ('drop', DropProcessor(['time_start', 'time_end'])),
    ('save_table', Saver('../Before.pickle')),
    ('combined_features', FeatureUnion(feature_process)),
    ('feature_select', feature_selector),
]

preprocess = Pipeline(estimators)
preprocess.set_params(**prep_params)

# NOTE: train_x is overwritten with the transformed matrix, so this cell cannot be
# re-run without first re-running the cell that builds train_x.
train_x=preprocess.fit_transform(train_x, train_y)
# Column 4 of the transformed matrix is used as the group label for the splits.
# NOTE(review): which feature sits in column 4 is not visible here — confirm.
group=train_x[:,4]

# NOTE(review): param_search was created with an integer cv; `groups` is only
# honoured by group-aware splitters — confirm it has any effect here.
clf = param_search.fit(train_x, train_y, groups=group)
# These two bare expressions are not the last statement of the cell, so they
# display nothing.
param_search.cv_results_
param_search.best_params_

param_search.best_estimator_.verbose=0
from sklearn.model_selection import GroupKFold
gkf = GroupKFold(n_splits=3)
i = 0
clf2 = param_search.best_estimator_

print(param_search.best_score_)
print(param_search.best_params_)
# Per-fold test MAPE, keyed by fold number (three folds, matching n_splits above).
a={0:[],1:[],2:[]}
for train, test in gkf.split(train_x, train_y, groups=group):
    # Refit the best estimator on this fold's training rows, then report the
    # train and test MAPE for the fold.
    clf2.fit(train_x[train], train_y[train])
    print('* {}: train:{}, test:{}'.format(i,
        mean_absolute_percentage_error(train_y[train],  clf2.predict(train_x[train])),
        mean_absolute_percentage_error(train_y[test],  clf2.predict(train_x[test]))))
    a[i]= mean_absolute_percentage_error(train_y[test],  clf2.predict(train_x[test]))
    i+=1
    
# Final refit on all selected rows; 'ALL' is the MAPE on the training data itself,
# 'test ALL' is the mean of the three held-out fold scores.
clf2.fit(train_x, train_y)
print('ALL:', mean_absolute_percentage_error(train_y,  clf2.predict(train_x)),
     '\n','test ALL：',(a[0]+a[1]+a[2])/3)

  f = msb / msw


-0.479724962373
{'lgbmregressor__n_estimators': 363}
* 0: train:0.18558186484451794, test:0.24351458748629815
* 1: train:0.17398303838528195, test:0.23625352462220428
* 2: train:0.18301744924772412, test:0.24924196616880942
ALL: 0.18371980789640097 
 test ALL： 0.24300335942577064


In [13]:
'''from sklearn.externals import joblib
from sklearn.pipeline import make_pipeline
pipe = Pipeline([
    ('preprocess', preprocess),
    ('best_estimator', param_search.best_estimator_)
])
joblib.dump(pipe, 'model.pkl')'''

"from sklearn.externals import joblib\nfrom sklearn.pipeline import make_pipeline\npipe = Pipeline([\n    ('preprocess', preprocess),\n    ('best_estimator', param_search.best_estimator_)\n])\njoblib.dump(pipe, 'model.pkl')"

In [13]:
# Load the sample submission twice: `submission` keeps the original columns for the
# output file, `test_x` is turned into model input.
submission = pd.read_csv('../submission_sample_travelTime.csv').drop('avg_travel_time', axis = 1)
test_x = pd.read_csv('../submission_sample_travelTime.csv').drop('avg_travel_time', axis = 1)
# Replace the time_window column with separate time_start / time_end columns.
test_x['time_start'], test_x['time_end'] = split_time_window(test_x.time_window)
test_x = test_x.drop(['time_window'], axis=1)

# Apply the already-fitted preprocessing pipeline to the test rows.
test_x = preprocess.transform(test_x)

# Refit the best estimator on the full training matrix and predict the test rows.
clf = param_search.best_estimator_.fit(train_x, train_y)
submission['avg_travel_time'] =  clf.predict(test_x)
from datetime import datetime
now = datetime.now()
# Output file name: 'time<date>.<hour>.<minute>.csv', taken from the current time.
submission.to_csv('time{}.{}.{}.csv'.format(
    str(now.date()),
    str(now.hour),
    str(now.minute)
), index = False)

In [19]:
# Feature blocks for the volume model: the weather join plus the previous-two-hour features.
feature_process = [
    ('weather', WeatherFeatures(weather)),
    ('last2hour', LastTwoHour()),
]
combined_features = FeatureUnion(feature_process)

# Preprocessing steps for the volume data (label encoding is disabled here).
estimators = [
    ('transformed_time', TimeProcessor('time_start')),
    #('encoding',MultiColumnLabelEncoder(columns=['intersection_id'])),
    ('drop', DropProcessor(['time_start', 'time_end'])),
    ('save_table', Saver('../VolumeBefore.pickle')),
    ('combined_features', combined_features),
    ('feature_select', feature_selector),
]

preprocess = Pipeline(estimators)

# Exclude rows strictly between '2016-10-1' and '2016-10-8' and rows before
# '2016-9-1', then keep only rows whose start hour is 8 or 9.
t = (train_avg_volume.time_start < '2016-10-8') & (train_avg_volume.time_start > '2016-10-1')
t = t | (train_avg_volume.time_start < '2016-9-1')
tt = train_avg_volume.time_start.apply(lambda x: (x.hour < 10) & (x.hour >= 8))
t = (~t) & tt
t = train_avg_volume[t]
train_x = t.drop(['volume'], axis=1)
train_y = t.volume

# Renumber both indexes 0..n-1 so the positional fold indices below are valid labels.
train_x.index = range(len(train_x))
train_y.index = range(len(train_y))

# This pipeline was just constructed, so it has to be fitted on the volume data
# before transforming. Calling transform() alone only works if the shared
# feature_selector still carries a fit from another cell, and that fit would
# belong to a different feature set.
train_x = preprocess.fit_transform(train_x, train_y)

# Column 4 of the transformed matrix is used as the group label for the splits.
# NOTE(review): which feature sits in column 4 is not visible here — confirm.
group=train_x[:,4]

param_search = RandomizedSearchCV(
    estimator=stack,
    param_distributions=stack_params, 
    scoring=mape, 
    n_iter=4,
    cv=2
)

clf = param_search.fit(train_x, train_y, groups=group)
print(param_search.best_score_)
print(param_search.best_params_)

param_search.best_estimator_.verbose=0

from sklearn.model_selection import GroupKFold
gkf = GroupKFold(n_splits=4)
i = 0
clf = param_search.best_estimator_

# Refit the best estimator on each group fold and report train / test MAPE.
for train, test in gkf.split(train_x, train_y, groups=group):
    clf.fit(train_x[train], train_y[train])
    print('* {}: train:{}, test:{}'.format(i,
        mean_absolute_percentage_error(train_y[train],  clf.predict(train_x[train])),
        mean_absolute_percentage_error(train_y[test],  clf.predict(train_x[test]))))
    i+=1

-0.218864897012
{'lgbmregressor__n_estimators': 386}
* 0: train:0.1155416042845095, test:0.16700947412091863
* 1: train:0.11139846190030658, test:0.15497675730306343
* 2: train:0.10072806365069632, test:0.24363375204898205
* 3: train:0.1036005840129719, test:0.18046532519717998


In [20]:
# Load the sample submission twice: `submission` keeps the original columns for the
# output file, `test_x` is turned into model input.
submission = pd.read_csv('../submission_sample_volume.csv').drop('volume', axis = 1)
test_x = pd.read_csv('../submission_sample_volume.csv').drop('volume', axis = 1)
# Replace the time_window column with separate time_start / time_end columns.
test_x['time_start'], test_x['time_end'] = split_time_window(test_x.time_window)
test_x = test_x.drop(['time_window'], axis=1)

# Apply the volume preprocessing pipeline to the test rows.
test_x = preprocess.transform(test_x)

# Refit the best estimator on the full training matrix; the printed value is the
# MAPE on the training data itself, not a held-out score.
clf = param_search.best_estimator_.fit(train_x, train_y)
print(mean_absolute_percentage_error(train_y,  clf.predict(train_x)))
submission['volume'] =  clf.predict(test_x)
from datetime import datetime
now = datetime.now()
# Output file name: 'volume<date>.<hour>.<minute>.csv', taken from the current time.
submission.to_csv('volume{}.{}.{}.csv'.format(
    str(now.date()),
    str(now.hour),
    str(now.minute)
), index = False)

0.10635953804442032
