In [None]:
import pandas as pd
import numpy as np

# modeling
import keras
from keras.utils import pad_sequences
from keras.layers import Dense, Dropout, LSTM
from keras.models import Sequential

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme so every matplotlib figure drawn later in the
# notebook shares the same styling.
sns.set()

In [None]:
# Prerequisite: the lstm_traffic_only_1 notebook must have been run first, since it
# is what writes ../data/traffic_pre_lstm.csv.
traffic_df = (
    pd.read_csv("../data/traffic_pre_lstm.csv")
    # parse the timestamp strings into datetimes, then promote them to the index
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .set_index('timestamp')
)
traffic_df.head()

In [None]:
# Sanity check: (rows, columns) of the loaded traffic frame.
traffic_df.shape

In [None]:
def create_sequence_training_set(training_sample, seq_length, pred_horizon, input_features, output_features):
    """Build sliding-window (input, label) pairs for training.

    Sample ``i`` uses rows ``i .. i + seq_length - 1`` of the first
    ``input_features`` columns as its input window, and row
    ``i + seq_length + pred_horizon`` of the first ``output_features`` columns
    as its label (i.e. ``pred_horizon`` rows after the first row that follows
    the window).

    Parameters
    ----------
    training_sample : pandas.DataFrame
        Rows in time order; columns are selected by position.
    seq_length : int
        Number of consecutive rows in each input window.
    pred_horizon : int
        Offset of the label row, as described above.
    input_features : int
        Number of leading columns used as inputs.
    output_features : int
        Number of leading columns used as labels.

    Returns
    -------
    seq_arrays : numpy.ndarray, float32, shape (n, seq_length, n_input_columns)
    seq_labs : numpy.ndarray, float32, shape (n, n_output_columns)
        ``n = len(training_sample) - seq_length - pred_horizon``. When that is
        not positive, correctly shaped empty arrays are returned.
    """
    # Convert once, directly to float32. Slicing the DataFrame inside the loop
    # and round-tripping through an object array boxes every single value as a
    # Python float, which is slow and very memory hungry for wide frames.
    inputs = training_sample.iloc[:, :input_features].to_numpy(dtype=np.float32)
    targets = training_sample.iloc[:, :output_features].to_numpy(dtype=np.float32)

    n_samples = inputs.shape[0] - seq_length - pred_horizon
    if n_samples <= 0:
        # Not enough rows for even one window: keep the rank/width of a normal
        # result so downstream shape checks fail loudly instead of confusingly.
        return (np.empty((0, seq_length, inputs.shape[1]), dtype=np.float32),
                np.empty((0, targets.shape[1]), dtype=np.float32))

    starts = np.arange(n_samples)
    # window_rows[i, j] = i + j  ->  row indices of the j-th step of window i
    window_rows = starts[:, None] + np.arange(seq_length)

    seq_arrays = inputs[window_rows]
    seq_labs = targets[starts + seq_length + pred_horizon]

    return seq_arrays, seq_labs

# create validation dataset


def create_validation_set(validation_sample, seq_length, pred_horizon, input_features, output_features):
    """Build (input, label) pairs for validation, including zero-padded warm-up windows.

    For ``i < seq_length`` the input is the growing prefix of rows ``0 .. i``,
    left-padded with zeros to ``seq_length`` rows. For ``i >= seq_length`` the
    input is the full window of rows ``i .. i + seq_length - 1``, exactly as in
    ``create_sequence_training_set``.

    In both cases the label is the row ``pred_horizon`` rows after the first
    row that follows the window. The warm-up branch previously used a label one
    row earlier than that, so the first ``seq_length`` samples were scored
    against a different horizon than the one the model is trained on.

    Parameters
    ----------
    validation_sample : pandas.DataFrame
        Rows in time order; columns are selected by position.
    seq_length : int
        Number of rows in a full input window (and the padded length).
    pred_horizon : int
        Offset of the label row, as described above.
    input_features : int
        Number of leading columns used as inputs.
    output_features : int
        Number of leading columns used as labels.

    Returns
    -------
    val_arrays : numpy.ndarray, float32, shape (n, seq_length, n_input_columns)
    val_labs : numpy.ndarray, float32, shape (n, n_output_columns)
        ``n = len(validation_sample) - seq_length - pred_horizon``. When that
        is not positive, correctly shaped empty arrays are returned.
    """
    # Convert once to float32 instead of slicing the DataFrame per iteration.
    inputs = validation_sample.iloc[:, :input_features].to_numpy(dtype=np.float32)
    targets = validation_sample.iloc[:, :output_features].to_numpy(dtype=np.float32)

    n_samples = inputs.shape[0] - seq_length - pred_horizon
    if n_samples <= 0:
        # pad_sequences cannot handle an empty list; return empties explicitly.
        return (np.empty((0, seq_length, inputs.shape[1]), dtype=np.float32),
                np.empty((0, targets.shape[1]), dtype=np.float32))

    val_arrays = []
    label_rows = []

    for i in range(n_samples):
        if i < seq_length:
            # warm-up: only rows 0..i are available, so the window is a prefix
            window_stop = i + 1
            val_arrays.append(inputs[:window_stop])
        else:
            window_stop = i + seq_length
            val_arrays.append(inputs[i:window_stop])

        # same offset for both branches: pred_horizon rows past the first row
        # after the window (window_stop is that row's index)
        label_rows.append(window_stop + pred_horizon)

    # maxlen pins the time dimension to seq_length even when every window is a
    # short warm-up prefix (i.e. n_samples < seq_length).
    val_arrays = pad_sequences(
        val_arrays, maxlen=seq_length, padding='pre', dtype='float32')

    val_labs = targets[label_rows]

    return val_arrays, val_labs

In [None]:
def train_lstm_model(model_path,
                     seq_length,
                     seq_arrays,
                     seq_labs,
                     input_features,
                     output_features,
                     epochs=1000,
                     batch_size=80,
                     validation_split=0.05,
                     learning_rate=0.001,
                     patience=5,
                     min_delta=0.1,
                     restore_best_weights=False):
    """Build, compile and fit a three-layer stacked LSTM regressor.

    Architecture: LSTM(100) -> Dropout -> LSTM(64) -> Dropout -> LSTM(32)
    -> Dense(output_features, linear), trained with Adam on mean squared error.

    Parameters
    ----------
    model_path : str
        Where ``ModelCheckpoint`` writes the model with the lowest ``val_loss``.
    seq_length : int
        Number of time steps in each input window.
    seq_arrays : array-like
        Training inputs passed to ``model.fit``; expected to match
        ``(samples, seq_length, input_features)``.
    seq_labs : array-like
        Training targets passed to ``model.fit``.
    input_features : int
        Number of features per time step.
    output_features : int
        Width of the final ``Dense`` layer.
    epochs, batch_size, validation_split : optional
        Forwarded to ``model.fit``. Defaults reproduce the original
        hard-coded values.
    learning_rate : float, optional
        Adam learning rate.
    patience, min_delta : optional
        Early-stopping settings on ``val_loss``. ``min_delta`` is an absolute
        change in the loss, so a sensible value depends on the target scale.
    restore_best_weights : bool, optional
        Forwarded to ``EarlyStopping``. With the default ``False`` the returned
        model keeps the weights of the last epoch run; the best checkpoint is
        still available on disk at ``model_path``.

    Returns
    -------
    model : keras.Model
        The fitted model.
    history : keras.callbacks.History
        Per-epoch training and validation metrics.
    """
    # build the network
    output_size = output_features

    model = Sequential()

    # Declare the input explicitly; passing input_shape= to a layer is
    # discouraged by current Keras in favour of an Input object.
    model.add(keras.Input(shape=(seq_length, input_features)))
    model.add(LSTM(units=100, activation="relu", return_sequences=True))
    model.add(Dropout(0.025))

    model.add(LSTM(units=64, activation="relu", return_sequences=True))
    model.add(Dropout(0.025))
    # last recurrent layer collapses the sequence to a single vector
    model.add(LSTM(units=32, activation="relu", return_sequences=False))
    model.add(Dense(units=output_size, activation="linear"))

    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss='mean_squared_error',
                  optimizer=optimizer, metrics=['mse'])

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only adds a stray "None" line.
    model.summary()

    # fit the network
    history = model.fit(seq_arrays,
                        seq_labs,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_split=validation_split,
                        verbose=2,
                        callbacks=[
                            # stop once val_loss has not improved by more than
                            # min_delta for `patience` consecutive epochs
                            keras.callbacks.EarlyStopping(
                                monitor='val_loss',
                                min_delta=min_delta,
                                patience=patience,
                                verbose=0,
                                mode='min',
                                restore_best_weights=restore_best_weights),
                            # keep the best-so-far model on disk
                            keras.callbacks.ModelCheckpoint(
                                model_path,
                                monitor='val_loss',
                                save_best_only=True,
                                mode='min',
                                verbose=0)
                        ])

    return model, history

In [None]:
def plot_prediction_across_network(model, val_arrays, val_labs, title, start=100, ts=1000):
    """Plot predicted vs. actual values, each summed across all outputs per sample.

    Parameters
    ----------
    model : object with a ``predict`` method
        Called as ``model.predict(val_arrays)``.
    val_arrays : array-like
        Inputs to predict on.
    val_labs : array-like
        True targets, one row per sample.
    title : str
        Figure title.
    start : int, optional
        Index of the first sample to plot (default 100, the original value).
    ts : int, optional
        Maximum number of samples to plot (default 1000, the original value).
    """
    def _row_totals(values):
        # Sum over every axis except the first, so each sample collapses to
        # one number; 1-D input passes through unchanged.
        values = np.asarray(values)
        return values.sum(axis=tuple(range(1, values.ndim)))

    # aggregating for easier visualization
    y_pred_dv = _row_totals(model.predict(val_arrays))
    y_true_dv = _row_totals(val_labs)

    shown = slice(start, start + ts)

    plt.plot(y_pred_dv[shown], label='Predicted Value')
    plt.plot(y_true_dv[shown], label='Actual Value')
    plt.title(title,
            fontsize=22, fontweight='bold')
    plt.ylabel('value')
    plt.xlabel('row')
    plt.legend()
    plt.show()

# Alternative 3: Parking information

In [None]:
# Load the raw parking observations and parse their update times.
parking_df = pd.read_csv("../data/aarhus_parking_geolocated.csv")
parking_df['updatetime'] = pd.to_datetime(parking_df['updatetime'])

# create a timestamp at the 5 minute interval in line with the traffic data
# (vectorised .dt.ceil instead of a per-element apply; it also tolerates NaT)
parking_df["timestamp"] = parking_df["updatetime"].dt.ceil("5min")
parking_df = parking_df.set_index("timestamp", drop=True)

# aggregate over the timestamp: total vehicles and total spaces per 5-minute bin
parking_df = parking_df.groupby(parking_df.index).agg({
    'vehiclecount': 'sum',
    'totalspaces': 'sum'
})

parking_df['occupancy_rate'] = parking_df['vehiclecount'] / \
    parking_df['totalspaces']

# merge with a complete timestamp index so that missing 5-minute bins show up
# as NaN rows between the first and last observation
timestamp_index = pd.DataFrame({"timestamp": pd.date_range(
    parking_df.index.min(), parking_df.index.max(), freq="5min")})

parking_df = timestamp_index.merge(
    parking_df,
    how="left",
    left_on="timestamp",
    right_index=True
)

# impute missing values using linear interpolation.
# Assign the result back: calling interpolate(inplace=True) on the selected
# column is a chained in-place operation that does not update the frame when
# pandas Copy-on-Write is active.
parking_df['occupancy_rate'] = parking_df['occupancy_rate'].interpolate(
    method="linear")
parking_df = parking_df.set_index("timestamp")

parking_features = ['occupancy_rate']

# attach the parking feature(s) to the traffic frame by timestamp
combined_parking_traffic = traffic_df.merge(
    parking_df[parking_features],
    how="left",
    left_index=True,
    right_index=True
)

# remove traffic rows that have no occupancy data (timestamps not covered by
# the parking series)
combined_parking_traffic = combined_parking_traffic.dropna(
    subset=["occupancy_rate"])

In [None]:
# Sanity check: (rows, columns) after merging parking occupancy into the traffic frame.
combined_parking_traffic.shape

In [None]:
# Positional, chronological split: rows 100..16999 train, rows 17001.. validate.
# NOTE(review): row 17000 falls in neither split — confirm the one-row gap is intended.
training_sample = combined_parking_traffic.iloc[100:17000]
validation_sample = combined_parking_traffic.iloc[17001:]

validation_sample.shape

In [None]:
# prediction horizon and input window length, both measured in rows
ph = 5
seq_length = 12

# number of leading columns used as model inputs / as prediction targets
# (input_features must not exceed the number of columns, or the sample below fails)
input_features = 450
output_features = 449

# sorted random draw (without replacement) of `input_features` column names.
# NOTE(review): `sensor` is not passed to the builders below, which take the
# first N columns by position — confirm whether it is still needed.
column_names = pd.Series(training_sample.columns)
sensor = column_names.sample(
    n=input_features, replace=False).sort_values().to_list()


seq_arrays, seq_labs = create_sequence_training_set(
    training_sample=training_sample,
    seq_length=seq_length,
    pred_horizon=ph,
    input_features=input_features,
    output_features=output_features,
)

val_arrays, val_labs = create_validation_set(
    validation_sample=validation_sample,
    seq_length=seq_length,
    pred_horizon=ph,
    input_features=input_features,
    output_features=output_features,
)

In [None]:
# Train the traffic + parking model; the best checkpoint goes to the given path.
model, history = train_lstm_model(
    model_path='lstm_traffic_parking.keras',
    seq_length=seq_length,
    seq_arrays=seq_arrays,
    seq_labs=seq_labs,
    input_features=input_features,
    output_features=output_features,
)
# show which metrics were recorded per epoch
print(history.history.keys())

In [None]:
def summarize_history_loss(history, title):
    """Plot training and validation loss per epoch.

    Parameters
    ----------
    history : object with a ``history`` mapping
        Must provide the ``'loss'`` and ``'val_loss'`` series.
    title : str
        Figure title.
    """
    # summarize history for Loss/MSE
    plt.plot(history.history['loss'], label='train')
    # val_loss is measured on the validation split held out during fit,
    # not on a separate test set, so label it accordingly
    plt.plot(history.history['val_loss'], label='validation')
    plt.title(title)
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(loc='upper left')
    plt.show()

# learning curves for the combined model
summarize_history_loss(history=history, title="Traffic + Parking model loss/mse")

# evaluate on the validation windows; index 1 is the 'mse' metric
# (index 0 is the loss)
scores_test = model.evaluate(x=val_arrays, y=val_labs, verbose=2)
validation_mse = scores_test[1]
print('\nMSE: {}'.format(validation_mse))

In [None]:
# compare predicted and actual values on the validation windows
plot_prediction_across_network(
    model=model,
    val_arrays=val_arrays,
    val_labs=val_labs,
    title='Sum of Average Speed Across Network',
)