In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [None]:
# Folder that holds the input CSV files
DATA_FOLDER_PATH = 'data/'


def _read_dataset(file_name):
  """Read `file_name` from DATA_FOLDER_PATH, indexed by its parsed `date` column."""
  frame = pd.read_csv(
    DATA_FOLDER_PATH + file_name,
    parse_dates=['date'],
    index_col="date",
    dayfirst=True
  )
  # run the index through pd.to_datetime once more after loading
  frame.index = pd.to_datetime(frame.index)
  return frame


# read in train dataset
train_set = _read_dataset('train.csv')

# read in test dataset
test_set = _read_dataset('test.csv')

In [None]:
# define following functions used for data formatting
def match_date(date, match_date_list):
  """Return 1 if `date`, formatted as YYYY-MM-DD, is in `match_date_list`, otherwise 0."""
  day_key = date.strftime("%Y-%m-%d")
  return int(day_key in match_date_list)

def match_month(datetime, match_month_list):
  """Return 1 if the month number of `datetime` (its `.month`) is in `match_month_list`, otherwise 0."""
  return 1 if datetime.month in match_month_list else 0

def match_day_of_week(datetime, match_dayofweek_list):
  """Return 1 if `datetime.weekday()` is in `match_dayofweek_list`, otherwise 0."""
  weekday_number = datetime.weekday()
  return int(weekday_number in match_dayofweek_list)

def match_hour_of_day(datetime, match_hourofday_list):
  """Return 1 if the hour of `datetime` (its `.hour`) is in `match_hourofday_list`, otherwise 0."""
  return 1 if datetime.hour in match_hourofday_list else 0

# Hong Kong public holidays in 2017-2018, as 'YYYY-MM-DD' strings
# (the format produced by strftime("%Y-%m-%d") in the lookups that use this list).
# Ching Ming Festival 2017 is listed as 2017-04-04 (it was previously entered as
# 2017-04-05, which is the 2018 date of the festival).
HK_PUBLIC_HOLIDAY_LIST = [
  # 2017
  '2017-01-01', '2017-01-02', '2017-01-28', '2017-01-29', '2017-01-30',
  '2017-01-31', '2017-04-04', '2017-04-14', '2017-04-15', '2017-04-17',
  '2017-05-01', '2017-05-03', '2017-05-30', '2017-07-01', '2017-10-01',
  '2017-10-02', '2017-10-05', '2017-10-28', '2017-12-25', '2017-12-26',
  # 2018
  '2018-01-01', '2018-02-16', '2018-02-17', '2018-02-18', '2018-02-19',
  '2018-03-30', '2018-03-31', '2018-04-02', '2018-04-05', '2018-05-01',
  '2018-05-22', '2018-06-18', '2018-07-02', '2018-09-25', '2018-10-01',
  '2018-10-17', '2018-12-25', '2018-12-26',
]


In [None]:
# to expand dataset with more feature columns
def expand_date_related_features(df):
  """Add calendar-derived feature columns to `df` (in place) and return it.

  `df` must be indexed by a DatetimeIndex. Columns written:
    week_day          index.dayofweek (Monday is 0)
    hour              index.hour
    holiday           1 if the date is in HK_PUBLIC_HOLIDAY_LIST, else 0
    weekend           1 if dayofweek is 5 or 6, else 0
    month_0..11       one-hot month flags; month_0 is January, month_11 is December
    day_of_week_0..6  one-hot flags of dayofweek
    hour_of_day_0..23 one-hot flags of hour
  """
  stamps = df.index
  df['week_day'] = stamps.dayofweek
  df['hour'] = stamps.hour
  df['holiday'] = stamps.strftime("%Y-%m-%d").isin(HK_PUBLIC_HOLIDAY_LIST).astype(int)
  df['weekend'] = stamps.dayofweek.isin([5, 6]).astype(int)
  for i in range(12):
    # calendar months are numbered 1-12, so column month_i flags month i + 1;
    # comparing against i would leave month_0 all zeros and December unflagged
    df['month_' + str(i)] = (stamps.month == i + 1).astype(int)
  for i in range(7):
    df['day_of_week_' + str(i)] = (stamps.dayofweek == i).astype(int)
  for i in range(24):
    df['hour_of_day_' + str(i)] = (stamps.hour == i).astype(int)
  return df

In [None]:
# to retrieve speed by a given date
def get_speed_by_date(datetime, df):
  """Look up the speed recorded in `df` for the timestamp `datetime`.

  `df` needs a `speed` column plus the `week_day`, `hour` and `holiday`
  columns. Resolution order:
    1. the speed of the first row whose index equals `datetime`;
    2. otherwise the mean speed of all rows with the same weekday, the same
       hour and the same holiday flag (taken from HK_PUBLIC_HOLIDAY_LIST);
    3. otherwise 0.

  Note: the parameter name `datetime` shadows the imported class inside this
  function; it is kept so existing callers are unaffected.
  """
  exact = df.loc[df.index == datetime, 'speed']
  if len(exact) > 0:
    return exact.iloc[0]

  is_holiday = 1 if datetime.strftime("%Y-%m-%d") in HK_PUBLIC_HOLIDAY_LIST else 0
  # Combine the three conditions into one mask over the full frame. Chaining
  # df[a][b][c] would apply full-length masks to already-filtered frames, which
  # makes pandas re-align them (with a warning) and fails on duplicate labels.
  similar_mask = (
    (df.week_day == datetime.weekday())
    & (df.hour == datetime.hour)
    & (df.holiday == is_holiday)
  )
  similar = df.loc[similar_mask, 'speed']
  if len(similar) > 0:
    return similar.mean()
  return 0

# to expand dataset with prev and next hour speed as features
def expand_speed_related_features(df, df_ref):
  """Add `prev_hour_speed` and `next_hour_speed` columns to `df` (in place) and return it.

  For every timestamp in `df.index`, the speed one hour earlier and one hour
  later is looked up in `df_ref` through `get_speed_by_date`.
  """
  one_hour = timedelta(hours=1)
  stamps = df.index.to_series()

  def speeds_shifted_by(shift):
    return stamps.map(lambda moment: get_speed_by_date(moment + shift, df_ref))

  df['prev_hour_speed'] = speeds_shifted_by(-one_hour)
  df['next_hour_speed'] = speeds_shifted_by(one_hour)
  return df


In [None]:
# Add the calendar features to both datasets first: the speed lookups below
# go through the week_day / hour / holiday columns of the reference frame.
train_set, test_set = [
  expand_date_related_features(frame) for frame in (train_set, test_set)
]

# Neighbouring-hour speeds are always looked up in the train set, for both
# the train rows and the test rows.
train_set = expand_speed_related_features(train_set, df_ref=train_set)
test_set = expand_speed_related_features(test_set, df_ref=train_set)

In [None]:
# Feature columns used for training: the two neighbouring-hour speeds, the
# holiday/weekend flags and the one-hot month / weekday / hour columns.
# The names are generated so they cannot drift from the loops that create them;
# the resulting order is the same as the hand-written list it replaces.
train_feature_list = (
  ['prev_hour_speed', 'next_hour_speed', 'holiday', 'weekend']
  + ['month_' + str(i) for i in range(12)]
  + ['day_of_week_' + str(i) for i in range(7)]
  + ['hour_of_day_' + str(i) for i in range(24)]
)

# label column
train_label = ['speed']

# Take explicit copies: the feature frames are modified later (speed scaling),
# and writing into a slice of train_set / test_set triggers pandas'
# SettingWithCopyWarning.
train_set_X = train_set[train_feature_list].copy()
train_set_Y = train_set[train_label]
test_set_X = test_set[train_feature_list].copy()

In [None]:
from sklearn.preprocessing import RobustScaler

# Rescale the prev/next hour speed features.
# The scaler is fitted on the train-set `speed` values as a plain array, so it
# records no column name; the transforms below therefore also pass plain
# arrays instead of DataFrames with differently named columns.
speed_scaler = RobustScaler()
speed_scaler = speed_scaler.fit(train_set[['speed']].to_numpy())


def _scale_speed_features(features):
  """Return a copy of `features` with both neighbouring-hour speed columns scaled by `speed_scaler`."""
  scaled_columns = {
    column: speed_scaler.transform(features[[column]].to_numpy()).ravel()
    for column in ('prev_hour_speed', 'next_hour_speed')
  }
  # assign() builds a new frame (column order unchanged) rather than writing
  # (n, 1) arrays into what may be a slice of train_set / test_set
  return features.assign(**scaled_columns)


train_set_X = _scale_speed_features(train_set_X)
test_set_X = _scale_speed_features(test_set_X)
# The label column (train_set_Y['speed']) is left unscaled.

In [None]:
from tensorflow import keras

# Build a bidirectional LSTM model for training: 128 LSTM units per direction
# (the default Bidirectional merge concatenates them into 256 outputs),
# followed by dropout and a single-unit output layer.
# Every row becomes a one-step sequence: (samples, 1, n_features).
X_train_np = np.expand_dims(train_set_X.values.astype(np.float32), axis=1)
y_train = train_set_Y

model = keras.Sequential()
# Declare the input shape on the model itself. Passing `input_shape` to the
# LSTM wrapped inside Bidirectional is not reliably picked up by Sequential
# and is deprecated in newer Keras releases.
model.add(keras.Input(shape=(X_train_np.shape[1], X_train_np.shape[2])))
model.add(
  keras.layers.Bidirectional(
    keras.layers.LSTM(units=128)
  )
)

# regularise, then add the output layer with 1 unit
model.add(keras.layers.Dropout(rate=0.2))
model.add(keras.layers.Dense(units=1))
model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
# Fit the model on the training arrays; the returned History object is kept
# in `history`.
# NOTE(review): validation_split is documented to hold out the last fraction of
# the samples before shuffling - confirm the row order makes that a sensible
# validation set.
fit_options = dict(
    epochs=50,
    batch_size=20,
    validation_split=0.1,
    shuffle=True,
)
history = model.fit(X_train_np, y_train, **fit_options)

In [None]:
# Apply the model to the test set and write the predictions to result.csv,
# one row per test `id`.
X_test_np = np.expand_dims(test_set_X.values.astype(np.float32), axis=1)

# df_test_y holds the raw output of model.predict
df_test_y = model.predict(X_test_np)
nn_df = pd.DataFrame(df_test_y, columns=['speed'], index=test_set.id)
# (the former `nn_df['speed'] = nn_df[['speed']]` self-assignment changed
# nothing and has been removed)
nn_df.to_csv(DATA_FOLDER_PATH + 'result.csv')