### Preprocess training data

In [2]:
import pandas as pd
import datetime as date
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
# get the training dataset
train_data = pd.read_csv('train.csv', parse_dates = ['date'], dayfirst= True)
print(train_data.info())
print(train_data.describe())
print(train_data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14006 entries, 0 to 14005
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   id      14006 non-null  int64         
 1   date    14006 non-null  datetime64[ns]
 2   speed   14006 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 328.4 KB
None
                id         speed
count  14006.00000  14006.000000
mean    7002.50000     32.779118
std     4043.32827     13.573813
min        0.00000      2.573417
25%     3501.25000     19.301089
50%     7002.50000     36.580595
75%    10503.75000     45.877665
max    14005.00000     53.161286
   id                date      speed
0   0 2017-01-01 00:00:00  43.002930
1   1 2017-01-01 01:00:00  46.118696
2   2 2017-01-01 02:00:00  44.294158
3   3 2017-01-01 03:00:00  41.067468
4   4 2017-01-01 04:00:00  46.448653


In [3]:
# 缺失值处理，向上填充或者回填
# 将日期设置为索引

# train_data.set_index('date',inplace=True)

# 缺失值位置
# train_data['2018-01'].head()
# fill the date with the last known value
# 结果是处理到2018/12/31 20:00
# train_data = train_data.resample("H").ffill()
# 线性填充，取前后均值

# train_data = train_data.resample("H").interpolate()

# train_data['2018-01'].head()
# 将索引重新转回为列

# train_data.reset_index(level=0, inplace=True)

# train_data.head()
# fill the date with the next known value
# train_data.resample("H").bfill()
# train_data['2018-01'].head()

In [4]:
public_vacation_list = [
    '20170101','20170102','20170128','20170130',
    '20170131','20170404','20170414','20170415',
    '20170417','20170501','20170503','20170530',
    '20170701','20171001','20171002','20171005',
    '20171028','20171222','20171225','20171226'
    
    '20180101','20180216','20180217','20180219',
    '20180330','20180331','20180402','20180405',
    '20180501','20180522','20180618','20180701',
    '20180702','20180925','20181001','20181017',
    '20181222','20181225','20181226'
]

# bigrain_list = [
#     '20170524','20170613','20170617','20170717',
#     '20170803','20180608','20180826','20180829',
#     '20180916'
# ]
yellow_rain_list = [
    '20170421','20170504','20170524','20170613',
    '20170617','20170621','20170717','20170718'
    '20170723','20170803','20170804','20170823'
    '20170827','20170828','20170923','20171017'
    '20180606','20180608','20180613','20180622'
    '20180623','20180702','20180715','20180810'
    '20180811','20180812','20180817','20180819'
    '20180820','20180822','20180826','20180827'
    '20180828','20180829','20180901','20180902'
    '20180916','20180924'
]
red_rain_list = [
    '20170524','20170613','20170617','20170717'
    '20170803','20180608','20180826','20180829'
    '20180916'
]
black_rain_list = [
    '20170524'
]
# extract the date
def extract_date(df,col):
    df["year"] = df[col].apply(lambda x: x.year)
    df["month"] = df[col].apply(lambda x: x.month)
    df["day"] = df[col].apply(lambda x: x.day)
    df["hour"] = df[col].apply(lambda x: x.hour)
#     df["is_morning_work"] = df[col].apply(lambda x : 1 if x.hour in [8,9] else 0)
#     df["is_evening_work"] = df[col].apply(lambda x : 1 if x.hour in [17,18,19] else 0)
#     df["is_sunday"] = df[col].apply(lambda x : 1 if x.weekday() in [6] else 0)
    df["dayofweek"] = df[col].apply(lambda x : x.weekday()+1)
    df["tmp"] = df[col].apply(lambda x: x.strftime('%Y%m%d'))
    df["is_holiday"] = df["tmp"].apply(lambda x: 1 if x in public_vacation_list else 0)
    df["is_shanzhu"] = df["tmp"].apply(lambda x : 1 if x in ['20180916'] else 0)
    df["is_jam"] = df["tmp"].apply(lambda x : 1 if x in ['20181016'] else 0)
    df["is_yellowrain"] = df["tmp"].apply(lambda x: 1 if x in yellow_rain_list else 0)
    df["is_redrain"] = df["tmp"].apply(lambda x: 1 if x in red_rain_list else 0)
    df["is_blackrain"] = df["tmp"].apply(lambda x: 1 if x in black_rain_list else 0)
    
extract_date(train_data,'date')
train_data.head(165)
# train_data.head(160)


Unnamed: 0,id,date,speed,year,month,day,hour,dayofweek,tmp,is_holiday,is_shanzhu,is_jam,is_yellowrain,is_redrain,is_blackrain
0,0,2017-01-01 00:00:00,43.002930,2017,1,1,0,7,20170101,1,0,0,0,0,0
1,1,2017-01-01 01:00:00,46.118696,2017,1,1,1,7,20170101,1,0,0,0,0,0
2,2,2017-01-01 02:00:00,44.294158,2017,1,1,2,7,20170101,1,0,0,0,0,0
3,3,2017-01-01 03:00:00,41.067468,2017,1,1,3,7,20170101,1,0,0,0,0,0
4,4,2017-01-01 04:00:00,46.448653,2017,1,1,4,7,20170101,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,160,2017-01-07 16:00:00,16.578257,2017,1,7,16,6,20170107,0,0,0,0,0,0
161,161,2017-01-07 17:00:00,11.493021,2017,1,7,17,6,20170107,0,0,0,0,0,0
162,162,2017-01-07 18:00:00,22.530929,2017,1,7,18,6,20170107,0,0,0,0,0,0
163,163,2017-01-07 19:00:00,31.559262,2017,1,7,19,6,20170107,0,0,0,0,0,0


In [5]:
#构建交叉项，只对线性回归有效
#hour_day = train_data[['dayofweek', 'hour']].copy()
# onc = OneHotEncoder()

# onehot 星期几
# hour_day.loc[:,'dayofweek'] = hour_day.loc[:,'dayofweek'].astype(str)
# hour_day = pd.get_dummies(hour_day, columns=['dayofweek'])
# onehot 小时
# hour_day.loc[:,'hour'] = hour_day.loc[:,'hour'].astype(str)
# hour_day = pd.get_dummies(hour_day, columns=['hour'])

# hour_day.head()

# p = PolynomialFeatures(degree=2,interaction_only=True,include_bias=False).fit(hour_day)
# hour_day_poly = pd.DataFrame(p.transform(hour_day), columns=p.get_feature_names(hour_day.columns))
# pd.options.display.max_columns = None
# hour_day_poly.head()

# train_data = pd.merge(train_data, hour_day_poly,left_index=True,right_index=True)

# p = PolynomialFeatures(degree=2,i).fit(hour_day)
# f = pd.DataFrame(p.transform(df), columns=p.get_feature_names(df.columns))
# print('deg 2\n', f)

# # 多项式特征
# train_data = train_data[['Attack', 'Defense']]

In [6]:
# onehot 星期几
train_data['dayofweek'] = train_data['dayofweek'].astype(str)
train_data = pd.get_dummies(train_data, columns=['dayofweek'])
# onehot 小时
# train_data['hour'] = train_data['hour'].astype(str)
# train_data = pd.get_dummies(train_data, columns=['hour'])
display(train_data)

Unnamed: 0,id,date,speed,year,month,day,hour,tmp,is_holiday,is_shanzhu,...,is_yellowrain,is_redrain,is_blackrain,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,dayofweek_7
0,0,2017-01-01 00:00:00,43.002930,2017,1,1,0,20170101,1,0,...,0,0,0,0,0,0,0,0,0,1
1,1,2017-01-01 01:00:00,46.118696,2017,1,1,1,20170101,1,0,...,0,0,0,0,0,0,0,0,0,1
2,2,2017-01-01 02:00:00,44.294158,2017,1,1,2,20170101,1,0,...,0,0,0,0,0,0,0,0,0,1
3,3,2017-01-01 03:00:00,41.067468,2017,1,1,3,20170101,1,0,...,0,0,0,0,0,0,0,0,0,1
4,4,2017-01-01 04:00:00,46.448653,2017,1,1,4,20170101,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14001,14001,2018-12-31 12:00:00,19.865269,2018,12,31,12,20181231,0,0,...,0,0,0,1,0,0,0,0,0,0
14002,14002,2018-12-31 15:00:00,17.820375,2018,12,31,15,20181231,0,0,...,0,0,0,1,0,0,0,0,0,0
14003,14003,2018-12-31 16:00:00,12.501851,2018,12,31,16,20181231,0,0,...,0,0,0,1,0,0,0,0,0,0
14004,14004,2018-12-31 18:00:00,15.979319,2018,12,31,18,20181231,0,0,...,0,0,0,1,0,0,0,0,0,0


In [7]:
# drop the id
# train_data.drop(columns=["id", "date","date_year"], inplace=True)
# train_data.drop(columns=["id", "date","tmp","dayofweek","hour"], inplace=True)
train_data.drop(columns=["id", "date","tmp"], inplace=True)
train_data.head()

Unnamed: 0,speed,year,month,day,hour,is_holiday,is_shanzhu,is_jam,is_yellowrain,is_redrain,is_blackrain,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,dayofweek_7
0,43.00293,2017,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1
1,46.118696,2017,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1
2,44.294158,2017,1,1,2,1,0,0,0,0,0,0,0,0,0,0,0,1
3,41.067468,2017,1,1,3,1,0,0,0,0,0,0,0,0,0,0,0,1
4,46.448653,2017,1,1,4,1,0,0,0,0,0,0,0,0,0,0,0,1


### Preprocess test data

In [8]:
test_data = pd.read_csv('test.csv', parse_dates = ['date'], dayfirst= True)
print(test_data.info())
print(test_data.describe())
print(test_data.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3504 entries, 0 to 3503
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   id      3504 non-null   int64         
 1   date    3504 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(1)
memory usage: 54.9 KB
None
                id
count  3504.000000
mean   1751.500000
std    1011.661999
min       0.000000
25%     875.750000
50%    1751.500000
75%    2627.250000
max    3503.000000
   id                date
0   0 2018-01-01 02:00:00
1   1 2018-01-01 05:00:00
2   2 2018-01-01 07:00:00
3   3 2018-01-01 08:00:00
4   4 2018-01-01 10:00:00


In [9]:
extract_date(test_data,'date')

In [10]:
# onehot 星期几
test_data['dayofweek'] = test_data['dayofweek'].astype(str)
test_data = pd.get_dummies(test_data, columns=['dayofweek'])
# onehot 小时
# test_data['hour'] = test_data['hour'].astype(str)
# test_data = pd.get_dummies(test_data, columns=['hour'])
test_data.head()

Unnamed: 0,id,date,year,month,day,hour,tmp,is_holiday,is_shanzhu,is_jam,is_yellowrain,is_redrain,is_blackrain,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,dayofweek_7
0,0,2018-01-01 02:00:00,2018,1,1,2,20180101,0,0,0,0,0,0,1,0,0,0,0,0,0
1,1,2018-01-01 05:00:00,2018,1,1,5,20180101,0,0,0,0,0,0,1,0,0,0,0,0,0
2,2,2018-01-01 07:00:00,2018,1,1,7,20180101,0,0,0,0,0,0,1,0,0,0,0,0,0
3,3,2018-01-01 08:00:00,2018,1,1,8,20180101,0,0,0,0,0,0,1,0,0,0,0,0,0
4,4,2018-01-01 10:00:00,2018,1,1,10,20180101,0,0,0,0,0,0,1,0,0,0,0,0,0


In [11]:
# drop the id
# test_data.drop(columns=["id", "date","date_year"], inplace=True)
test_data.drop(columns=["id", "date","tmp"], inplace=True)
test_data.head()

Unnamed: 0,year,month,day,hour,is_holiday,is_shanzhu,is_jam,is_yellowrain,is_redrain,is_blackrain,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,dayofweek_7
0,2018,1,1,2,0,0,0,0,0,0,1,0,0,0,0,0,0
1,2018,1,1,5,0,0,0,0,0,0,1,0,0,0,0,0,0
2,2018,1,1,7,0,0,0,0,0,0,1,0,0,0,0,0,0
3,2018,1,1,8,0,0,0,0,0,0,1,0,0,0,0,0,0
4,2018,1,1,10,0,0,0,0,0,0,1,0,0,0,0,0,0


### Fit model

In [12]:
# Plots
import seaborn as sns
import matplotlib.pyplot as plt

# Models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression,SGDRegressor
from sklearn.linear_model import Ridge, RidgeCV, Lasso
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Misc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn import metrics

In [13]:
X_train = train_data.drop(columns=["speed"])
Y_train = train_data["speed"]
X_test = test_data

In [52]:
# # 训练随机森林解决回归问题
# rf = RandomForestRegressor(n_estimators=50, n_jobs=-1)
# rf.fit(X_train, Y_train)
# y_pred_train = rf.predict(X_train)

# # 评估回归性能
# print('Mean Squared Error:', metrics.mean_squared_error(Y_train, y_pred_train))

Mean Squared Error: 1.8278263076943413


In [69]:
# # lightgbm
# lightgbm = LGBMRegressor(boosting_type='gbdt',objective='regression', n_estimators=1000)
# lightgbm.fit(X_train, Y_train)
# y_pred_train = lightgbm.predict(X_train)

# # 评估回归性能
# print('Mean Squared Error:', metrics.mean_squared_error(Y_train, y_pred_train))

Mean Squared Error: 5.180054755928473


In [54]:
# # ridge
# ridge = Ridge()
# ridge.fit(X_train, Y_train)
# y_pred_train = ridge.predict(X_train)

# # 评估回归性能
# print('Mean Squared Error:', metrics.mean_squared_error(Y_train, y_pred_train))

Mean Squared Error: 138.03960207145627


In [55]:
# # LinearRegression
# lr = LinearRegression()
# lr.fit(X_train, Y_train)
# y_pred_train = lr.predict(X_train)

# # 评估回归性能
# print('Mean Squared Error:', metrics.mean_squared_error(Y_train, y_pred_train))

Mean Squared Error: 138.03942363273163


In [33]:
# # Lasso
# lasso = Lasso()
# lasso.fit(X_train, Y_train)
# y_pred_train = lasso.predict(X_train)

# # 评估回归性能
# print('Mean Squared Error:', metrics.mean_squared_error(Y_train, y_pred_train))

Mean Squared Error: 145.35853782898383


In [34]:
# GradientBoostingRegressor
# params = {'n_estimators': 500, 'max_depth': 5, 'random_state': 0, 'min_samples_leaf' : 10, 'learning_rate': 0.1, 'subsample': 0.7, 'loss': 'ls'}
# gbr = GradientBoostingRegressor(n_estimators=350)
# gbr.fit(X_train, Y_train)
# y_pred_train = gbr.predict(X_train)

# # 评估回归性能
# print('Mean Squared Error:', metrics.mean_squared_error(Y_train, y_pred_train))

Mean Squared Error: 12.736545755763267


In [62]:
# # SVR
# svr = SVR()
# svr.fit(X_train, Y_train)
# y_pred_train = svr.predict(X_train)

# # 评估回归性能
# print('Mean Squared Error:', metrics.mean_squared_error(Y_train, y_pred_train))

Mean Squared Error: 198.58090366257778


In [63]:
# # SGDRegressor
# sgd = SGDRegressor()
# sgd.fit(X_train, Y_train)
# y_pred_train = sgd.predict(X_train)

# # 评估回归性能
# print('Mean Squared Error:', metrics.mean_squared_error(Y_train, y_pred_train))

Mean Squared Error: 2.2664771328665101e+30


In [64]:
# # ExtraTreesRegressor
# extra = ExtraTreesRegressor()
# extra.fit(X_train, Y_train)
# y_pred_train = extra.predict(X_train)

# # 评估回归性能
# print('Mean Squared Error:', metrics.mean_squared_error(Y_train, y_pred_train))

Mean Squared Error: 2.0943961478312985e-27


In [14]:
# XGBRegressor
# other_params = {'learning_rate': 0.1, 'n_estimators': 50, 'max_depth': 4, 'min_child_weight': 1, 'seed': 0,
#                     'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0.5, 'reg_alpha': 3, 'reg_lambda': 3}
# xgb = XGBRegressor(**other_params)
xgb = XGBRegressor(n_estimators=100,n_jobs=-1)
xgb.fit(X_train, Y_train)
y_pred_train = xgb.predict(X_train)

# 评估回归性能
print('Mean Squared Error:', metrics.mean_squared_error(Y_train, y_pred_train))

Mean Squared Error: 7.1119299185431455


In [137]:
# cv_params = {'n_estimators': [100, 250, 500, 750, 1000]}

# cv_params = {'max_depth': [3, 4, 5, 6, 7, 8, 9, 10], 'min_child_weight': [1, 2, 3, 4, 5, 6]}
# cv_params = {'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]}
# cv_params = {'subsample': [0.6, 0.7, 0.8, 0.9], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9]}
# cv_params = {'reg_alpha': [0.05, 0.1, 1, 2, 3], 'reg_lambda': [0.05, 0.1, 1, 2, 3]}
# cv_params = {'learning_rate': [0.01, 0.05, 0.07, 0.1, 0.2,0.3]}
# other_params = {'learning_rate': 0.1, 'n_estimators': 500, 'max_depth': 4, 'min_child_weight': 1, 'seed': 0,
#                     'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0.5, 'reg_alpha': 3, 'reg_lambda': 3}
# other_params = {'n_estimators':500}

# model = XGBRegressor(**other_params)
# model = XGBRegressor()
# grid = GridSearchCV(estimator=model, param_grid=cv_params, scoring='neg_mean_squared_error', cv=3, verbose=1, n_jobs=-1)
# grid.fit(X_train, Y_train)

# print("最高得分：%.3f" % grid.best_score_)
# print("最优参数: %s" % grid.best_params_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:    3.6s remaining:    3.1s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    6.3s finished


最高得分：-78.905
最优参数: {'n_estimators': 100}


In [19]:
# # Setup cross validation folds
# kf = KFold(n_splits=12, random_state=42, shuffle=True)
# # Define error metrics
# def mse(y, y_pred):
#     return mean_squared_error(y, y_pred)


In [15]:
from pandas.core.frame import DataFrame

y_pred_test = xgb.predict(X_test)
# y_pred_test = lightgbm.predict(X_test)
# print(X_test)

# shanzhu = X_test[(X_test.year==2018)&(X_test.month==9)].index.tolist()
# shanzhu = X_test[(X_test.year==2018)&(X_test.month==9)&(X_test.day==16)&(X_test.hour>=11)&(X_test.hour<=15)].index.tolist()
# print(shanzhu)
# y_pred_test[shanzhu] = y_pred_test[shanzhu] - 5
# print(y_pred_test[shanzhu])

# tanhuan = X_test[(X_test.year==2018)&(X_test.month==10)&(X_test.day==16)&(X_test.hour>=7)&(X_test.hour<=19)].index.tolist()
# print(tanhuan)
# y_pred_test[tanhuan] = y_pred_test[tanhuan] - 3
# print(y_pred_test[tanhuan])
# print(y_pred_test)

submit = {"speed": y_pred_test}
submit_data = DataFrame(submit)
# submit_data["speed"].to_list()


submit_data.to_csv("final.csv", index_label="id")