In [4]:
import pandas as pd
import numpy as np

df = pd.read_csv('data/traffic.csv')

df['DateTime'] = pd.to_datetime(df['DateTime'])

#extra temporal features
df['day_of_month'] = df['DateTime'].dt.day
df['week_of_year'] = df['DateTime'].dt.isocalendar().week.astype(int)
print(df[['DateTime', 'day_of_month', 'week_of_year']].head())

             DateTime  day_of_month  week_of_year
0 2015-11-01 00:00:00             1            44
1 2015-11-01 01:00:00             1            44
2 2015-11-01 02:00:00             1            44
3 2015-11-01 03:00:00             1            44
4 2015-11-01 04:00:00             1            44


In [5]:
df.shape

(48120, 6)

In [6]:
df.columns

Index(['DateTime', 'Junction', 'Vehicles', 'ID', 'day_of_month',
       'week_of_year'],
      dtype='object')

In [7]:
missing_timestamps = {}

for junction in df['Junction'].unique():
    df_j = df[df['Junction'] == junction]
    
    # Create full hourly date range from min to max DateTime for this junction
    full_range = pd.date_range(start=df_j['DateTime'].min(), end=df_j['DateTime'].max(), freq='H')
    
    # Find missing timestamps by set difference
    missing = full_range.difference(df_j['DateTime'])
    
    missing_timestamps[junction] = missing
    print(f'Junction {junction} missing timestamps count: {len(missing)}')

#missing timestamps for junction 4:
print("\nMissing timestamps for Junction 4:")
print(missing_timestamps[4])


Junction 1 missing timestamps count: 0
Junction 2 missing timestamps count: 0
Junction 3 missing timestamps count: 0
Junction 4 missing timestamps count: 0

Missing timestamps for Junction 4:
DatetimeIndex([], dtype='datetime64[ns]', freq='h')


  full_range = pd.date_range(start=df_j['DateTime'].min(), end=df_j['DateTime'].max(), freq='H')


In [8]:
#sort data by junction and datetime for correct lag calculation
df = df.sort_values(['Junction', 'DateTime'])

#creates lag features
df['Vehicles_lag_1'] = df.groupby('Junction')['Vehicles'].shift(1)
df['Vehicles_lag_2'] = df.groupby('Junction')['Vehicles'].shift(2)

#first few rows to confirm lag features
print(df[['DateTime', 'Junction', 'Vehicles', 'Vehicles_lag_1', 'Vehicles_lag_2']].head(10))


             DateTime  Junction  Vehicles  Vehicles_lag_1  Vehicles_lag_2
0 2015-11-01 00:00:00         1        15             NaN             NaN
1 2015-11-01 01:00:00         1        13            15.0             NaN
2 2015-11-01 02:00:00         1        10            13.0            15.0
3 2015-11-01 03:00:00         1         7            10.0            13.0
4 2015-11-01 04:00:00         1         9             7.0            10.0
5 2015-11-01 05:00:00         1         6             9.0             7.0
6 2015-11-01 06:00:00         1         9             6.0             9.0
7 2015-11-01 07:00:00         1         8             9.0             6.0
8 2015-11-01 08:00:00         1        11             8.0             9.0
9 2015-11-01 09:00:00         1        12            11.0             8.0


In [11]:
df = df.dropna(subset=['Vehicles_lag_1', 'Vehicles_lag_2']).copy()
# more lag features (up to 6 hours ago)
for lag in range(3, 7):
    df[f'Vehicles_lag_{lag}'] = df.groupby('Junction')['Vehicles'].shift(lag)

#rolling (moving) average features
df['Vehicles_roll_mean_3'] = df.groupby('Junction')['Vehicles'].shift(1).rolling(window=3).mean()
df['Vehicles_roll_mean_6'] = df.groupby('Junction')['Vehicles'].shift(1).rolling(window=6).mean()

In [13]:
print(df[['DateTime', 'Junction', 'Vehicles',
          'Vehicles_lag_1', 'Vehicles_lag_2', 'Vehicles_lag_3', 'Vehicles_lag_4',
          'Vehicles_lag_5', 'Vehicles_lag_6', 'Vehicles_roll_mean_3', 'Vehicles_roll_mean_6']].head(10))
print(f"Rows after dropping NaNs: {len(df)}")
print(df.isnull().sum())



              DateTime  Junction  Vehicles  Vehicles_lag_1  Vehicles_lag_2  \
2  2015-11-01 02:00:00         1        10            13.0            15.0   
3  2015-11-01 03:00:00         1         7            10.0            13.0   
4  2015-11-01 04:00:00         1         9             7.0            10.0   
5  2015-11-01 05:00:00         1         6             9.0             7.0   
6  2015-11-01 06:00:00         1         9             6.0             9.0   
7  2015-11-01 07:00:00         1         8             9.0             6.0   
8  2015-11-01 08:00:00         1        11             8.0             9.0   
9  2015-11-01 09:00:00         1        12            11.0             8.0   
10 2015-11-01 10:00:00         1        15            12.0            11.0   
11 2015-11-01 11:00:00         1        17            15.0            12.0   

    Vehicles_lag_3  Vehicles_lag_4  Vehicles_lag_5  Vehicles_lag_6  \
2              NaN             NaN             NaN             NaN   
3

## Modelling


In [18]:
df_model = df.dropna().copy()

In [23]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

feature_cols = [f'Vehicles_lag_{lag}' for lag in range(1, 7)]  # lag_1 to lag_6
target_col = 'Vehicles'

df_model = df_model.dropna(subset=feature_cols + [target_col])  # drops rows with NaNs in relevant cols

X = df_model[feature_cols].values 
y = df_model[target_col].values    

#Reshape X for LSTM input: (samples, timesteps, features)
X = X.reshape((X.shape[0], X.shape[1], 1))  # 6 timesteps, 1 feature
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=False)

scaler_X = MinMaxScaler()
X_train_2d = X_train.reshape(-1, 1)
X_test_2d = X_test.reshape(-1, 1)

scaler_X.fit(X_train_2d)
X_train_scaled = scaler_X.transform(X_train_2d).reshape(X_train.shape)
X_test_scaled = scaler_X.transform(X_test_2d).reshape(X_test.shape)

#Scales target values
scaler_y = MinMaxScaler()
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
scaler_y.fit(y_train)
y_train_scaled = scaler_y.transform(y_train)
y_test_scaled = scaler_y.transform(y_test)

#LSTM model
model = Sequential()
model.add(LSTM(50, activation='tanh', input_shape=(X_train_scaled.shape[1], X_train_scaled.shape[2])))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

#model with early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    X_train_scaled, y_train_scaled,
    epochs=50,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1
)

#Evaluat on test set
loss = model.evaluate(X_test_scaled, y_test_scaled)
print(f'Test Loss (MSE): {loss:.6f}')

# predictions and inverse scale
y_pred_scaled = model.predict(X_test_scaled)
y_pred = scaler_y.inverse_transform(y_pred_scaled)
y_true = scaler_y.inverse_transform(y_test_scaled)

#error metrics in original scale
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)

print(f'MAE: {mae:.4f}')
print(f'MSE: {mse:.4f}')
print(f'RMSE: {rmse:.4f}')


Epoch 1/50


  super().__init__(**kwargs)


[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - loss: 0.0054 - val_loss: 0.0021
Epoch 2/50
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.0012 - val_loss: 0.0021
Epoch 3/50
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.0011 - val_loss: 0.0018
Epoch 4/50
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.0011 - val_loss: 0.0017
Epoch 5/50
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.0011 - val_loss: 0.0018
Epoch 6/50
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 9.9108e-04 - val_loss: 0.0019
Epoch 7/50
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 9.7361e-04 - val_loss: 0.0017
Epoch 8/50
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 9.5643e-04 - val_loss: 0.0017
Epoch 9/50
[1m1082/108