In [1]:
import pandas as pd
import numpy as np
import sklearn

import os
from matplotlib import pyplot as plt
import seaborn as sbn

from sklearn.pipeline import Pipeline, FeatureUnion

%matplotlib inline
plt.style.use('seaborn-paper')

# 辅助函数 

In [2]:
from Tools import *
from sklearn.metrics import make_scorer

def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true))

mape = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

# 读取数据

In [3]:
train_data = {
    'trajectories': pd.read_csv('../trajectories(table 5)_training.csv'),
    'volume': pd.read_csv('../volume(table 6)_training.csv'),
    'avg_travel_time': pd.read_csv('../training_20min_avg_travel_time.csv'),
    'avg_volume': pd.read_csv('../training_20min_avg_volume.csv'),
}

test_data = {
    'trajectories': pd.read_csv('../trajectories(table 5)_training.csv'),
    'volume': pd.read_csv('../volume(table 6)_training.csv'),
    'avg_travel_time': pd.read_csv('../test1_20min_avg_travel_time.csv'),
    'avg_volume': pd.read_csv('../test1_20min_avg_volume.csv'),
}

links = pd.read_csv('../links (table 3).csv')
routes = pd.read_csv('../routes (table 4).csv')
weather = pd.concat([
    pd.read_csv('../weather (table 7)_training_update.csv'),
    pd.read_csv('../weather (table 7)_test1.csv')])

train_path = pd.read_csv('../train_avg_travel_time_4h&path.csv').drop(
    ['Unnamed: 0', 'time_window_start', 'time_start_of_day'], axis=1)

test_path = pd.read_csv('../test_avg_travel_time_4h&path.csv').drop(
    ['Unnamed: 0', 'time_window_start', 'time_start_of_day'], axis=1)

train_trajectories = train_data['trajectories']
train_volume = train_data['volume']

train_avg_travel_time = train_data['avg_travel_time']
train_avg_travel_time = train_avg_travel_time.merge(train_path, how='left')

train_avg_volume = train_data['avg_volume']

test_avg_travel_time = test_data['avg_travel_time']
test_avg_travel_time = test_avg_travel_time.merge(test_path, how='left')

test_avg_volume = test_data['avg_volume']

###  整理数据

In [4]:
# 处理天气数据 插值填充每小时数据

weather.date = pd.to_datetime(weather.date)
weather['time'] = weather.date + pd.to_timedelta(weather.hour, unit='h')

weather = weather.set_index(['time'])
weather = weather.reindex(pd.date_range(start= weather.date.min(), end=weather.date.max(), freq='1H'), fill_value='NaN')

weather['date'] = pd.to_datetime([t.date() for t in weather.index])
weather['year'] = [t.year for t in weather.index]
weather['day_of_year'] = [t.dayofyear for t in weather.index]
weather['hour'] = [t.hour for t in weather.index]
for col in ['pressure', 'sea_pressure', 'wind_direction', 'wind_speed', 'temperature', 'rel_humidity', 'precipitation']:
    weather[col] = weather[col].astype(float).interpolate()
    
weather = weather.drop(['date'], axis=1)

In [5]:
pd.to_pickle(weather, 'weather.pickle')

In [6]:
# 拆分时间窗

def split_time_window(time_window):
    time_start = [time[1:-1].split(',')[0] for time in time_window]
    time_end = [time[1:-1].split(',')[1] for time in time_window]

    return pd.to_datetime(pd.Series(time_start)), pd.to_datetime(pd.Series(time_end))

train_avg_volume['time_start'], train_avg_volume['time_end'] = split_time_window(train_avg_volume.time_window)
train_avg_volume = train_avg_volume.drop(['time_window'], axis=1)

train_avg_travel_time['time_start'], train_avg_travel_time['time_end'] = split_time_window(train_avg_travel_time.time_window)
train_avg_travel_time = train_avg_travel_time.drop(['time_window'], axis=1)

test_avg_volume['time_start'], test_avg_volume['time_end'] = split_time_window(test_avg_volume.time_window)
test_avg_volume = test_avg_volume.drop(['time_window'], axis=1)

test_avg_travel_time['time_start'], test_avg_travel_time['time_end'] = split_time_window(test_avg_travel_time.time_window)
test_avg_travel_time = test_avg_travel_time.drop(['time_window'], axis=1)

# 预处理

## 特征提取

In [7]:
from Preprocess.TimeProcessor import *
from Preprocess.Utils import *

* ### 天气特征

In [8]:
class WeatherFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, weather):
        self.weather = weather
        
    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        table = posts.merge(self.weather,  how='left' ,on=['year','day_of_year','hour'])
        return table.fillna(-1)

* ### 前两小时特征

In [9]:
class LastTwoHour(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
        #self.traffic = pd.read_csv('../trafic_train.csv')
    
    def fit(self,x,y=None):
        return self
    
    def transform(self, posts):
        return posts

In [10]:
feature_process = [
    ('weather', WeatherFeatures(weather)),
    #('last2hour', LastTwoHour()),
]
combined_features = FeatureUnion(feature_process)

## 特征选取

In [11]:
from sklearn.feature_selection import SelectKBest
feature_selector = SelectKBest(k=all)

from sklearn.feature_selection import GenericUnivariateSelect

# 训练模型

In [12]:
estimators = [
    ('transformed_time', TimeProcessor('time_start')),
    ('encoding',MultiColumnLabelEncoder(columns=['intersection_id'])),
    ('drop', DropProcessor(['time_start', 'time_end'])),
    ('save_table', Saver('../Before.pickle')),
    ('combined_features', combined_features),
    ('feature_select', feature_selector),
]

preprocess = Pipeline(estimators)

In [13]:
from mlxtend.regressor import StackingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,BaggingRegressor,ExtraTreesRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor

lr = LinearRegression()

def log(x):
    return np.log(x)

#TODO: 0.65*abs(x) * log(abs(x) + 1)
def obj1(y_true, y_pred):
    x = y_true - y_pred
    absx = np.abs(x)
    
    grad =((x*abs(x)+x)*log(abs(x)+1)+x*abs(x))/(abs(x)+x**2)
    hess =(2*abs(x)+x**2)/(x**2*abs(x)+abs(x)+2*x**2)
    return grad, hess

def obj2(y_true, y_pred):
    tmp = y_true - y_pred
    hess = 2 / y_true    
    grad = - hess *  tmp
       
    return grad, hess

'''
    sig = np.sign(y_true - y_pred)
    grad = sig#(sig *  (-1 / y_true)) * 10
    hess = np.zeros(len(y_true))
    return grad, hess
'''
xgb = XGBRegressor(objective=obj2)

regressors = [
    #LinearRegression(),
    #RandomForestRegressor(),
    #ExtraTreesRegressor(),
    #LGBMRegressor(objective=obj2),
    LGBMRegressor(boosting_type='dart', objective= obj2),
    XGBRegressor(objective=obj2),
    #SVR()
]

stack = StackingRegressor(regressors=regressors,meta_regressor=xgb,verbose=1)



# 参数

In [14]:
displayParam(preprocess, ['<h1>预处理参数</h1>'])

In [15]:
displayParam(stack, ['<h1>训练参数</h1>'])

# 结果

In [16]:
from scipy.stats import randint as sp_randint

prep_params = {
    'encoding__columns':['intersection_id'],
    'feature_select__k':  'all'
}

stack_params = {
    #model
    'lgbmregressor__n_estimators': [100,300,500],
    'xgbregressor__n_estimators': [100,300,500,700],
}

In [17]:
preprocess.set_params(**prep_params)

from sklearn.model_selection import RandomizedSearchCV
param_search = RandomizedSearchCV(
    estimator=stack,
    param_distributions=stack_params, 
    scoring=mape, 
    n_iter=6,
    cv=3
)

In [None]:
t = (train_avg_travel_time.time_start < '2016-10-8') & (train_avg_travel_time.time_start > '2016-10-1')
#trafic_train = pd.read_csv('../trafic_train.csv')

t2 = train_avg_travel_time.drop(t)
train_x=t2.drop(['avg_travel_time'], axis=1)
train_y=t2.avg_travel_time

train_x.index = range(len(train_x))
train_y.index = range(len(train_y))
train_x=preprocess.fit_transform(train_x, train_y)
group=train_x[:,4]

  f = msb / msw


In [None]:
clf = param_search.fit(train_x, train_y, groups=group)

Fitting 2 regressors...
Fitting regressor1: lgbmregressor (1/2)
Fitting regressor2: xgbregressor (2/2)
Fitting 2 regressors...
Fitting regressor1: lgbmregressor (1/2)
Fitting regressor2: xgbregressor (2/2)
Fitting 2 regressors...
Fitting regressor1: lgbmregressor (1/2)
Fitting regressor2: xgbregressor (2/2)
Fitting 2 regressors...
Fitting regressor1: lgbmregressor (1/2)
Fitting regressor2: xgbregressor (2/2)
Fitting 2 regressors...
Fitting regressor1: lgbmregressor (1/2)
Fitting regressor2: xgbregressor (2/2)
Fitting 2 regressors...
Fitting regressor1: lgbmregressor (1/2)
Fitting regressor2: xgbregressor (2/2)
Fitting 2 regressors...
Fitting regressor1: lgbmregressor (1/2)
Fitting regressor2: xgbregressor (2/2)
Fitting 2 regressors...
Fitting regressor1: lgbmregressor (1/2)
Fitting regressor2: xgbregressor (2/2)
Fitting 2 regressors...
Fitting regressor1: lgbmregressor (1/2)
Fitting regressor2: xgbregressor (2/2)
Fitting 2 regressors...
Fitting regressor1: lgbmregressor (1/2)
Fitting r

In [None]:
param_search.cv_results_

In [None]:
param_search.best_params_

In [None]:
param_search.best_estimator_.verbose=0
from sklearn.model_selection import GroupKFold
gkf = GroupKFold(n_splits=5)
i = 0
clf2 = param_search.best_estimator_
#clf2 = LinearRegression()

for train, test in gkf.split(train_x, train_y, groups=group):
    clf2.fit(train_x[train], train_y[train])
    print('* {}: train:{}, test:{}'.format(i,
        mean_absolute_percentage_error(train_y[train],  clf2.predict(train_x[train])),
        mean_absolute_percentage_error(train_y[test],  clf2.predict(train_x[test]))))
    i+=1

In [None]:
'''from sklearn.externals import joblib
from sklearn.pipeline import make_pipeline
pipe = Pipeline([
    ('preprocess', preprocess),
    ('best_estimator', param_search.best_estimator_)
])
joblib.dump(pipe, 'model.pkl')'''

In [None]:
submission = pd.read_csv('../submission_sample_travelTime.csv').drop('avg_travel_time', axis = 1)
test_x = pd.read_csv('../submission_sample_travelTime.csv').drop('avg_travel_time', axis = 1)
test_x['time_start'], test_x['time_end'] = split_time_window(test_x.time_window)
test_x = test_x.drop(['time_window'], axis=1)

test_x = preprocess.transform(test_x)

clf = param_search.best_estimator_.fit(train_x, train_y)
submission['avg_travel_time'] =  clf.predict(test_x)
submission.to_csv('avg_travel_time.csv', index = False)

In [None]:
train_avg_travel_time.columns