In [24]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras

In [25]:
# Load detectors.csv
detectors = pd.read_csv('dataset/detectors.csv')
detectors['detid'] = detectors['detid'].astype('str')
detectors.set_index(['citycode', 'detid'], inplace=True)

# Pull and clean up traffic data for a city
def convert_traffic_data(city):
    # Load {city}.csv
    city_data = pd.read_csv(f'dataset/cities/{city}.csv')

    # Divide the interval column by 3600 to get the hour
    city_data['interval'] = city_data['interval'].apply(lambda x: (x // 3600) % 24)

    # Merge rows
    city_data = city_data.groupby(['city', 'detid', 'day', 'interval']).mean().reset_index()

    # Convert day column to day of the week
    city_data['date'] = city_data['day']
    city_data['day'] = pd.to_datetime(city_data['day']).dt.dayofweek

    # Drop the datetime and timestamp columns
    city_data = city_data.drop(columns=['error', 'speed'])
    # Merge with detectors.csv
    city_data['detid'] = city_data['detid'].astype('str')
    city_data = city_data.merge(detectors, left_on=['city', 'detid'], right_index=True)

    # Remove rows with missing data
    city_data = city_data.drop(columns=['road', 'long', 'lat', 'detid'])

    city_data['linkid'] = city_data['linkid'].astype('str')
    city_data['limit'] = city_data['limit'].astype('int64', errors='ignore').fillna(0)
    city_data['lanes'] = city_data['lanes'].astype('int64', errors='ignore').fillna(1)

    # Convert the fclass column to a range
    fclass_ids = [ 'other', 'living_street', 'residential', 'service', 'tertiary_link', 'tertiary', 'secondary_link', 'secondary', 'primary_link', 'primary', 'trunk_link', 'trunk', 'motorway_link', 'motorway' ]
    city_data['fclass'] = city_data['fclass'].apply(lambda x: fclass_ids.index(x))

    # Multiply the flow column by the lane count and the road length
    city_data['flow'] = city_data['flow'] * city_data['lanes'] * city_data['length']

    # Convert the limit column to int
    city_data['limit'] = city_data['limit'].astype('int64', errors='ignore').fillna(0)

    # Average the columns over all links in a city
    city_data_avg = city_data.copy()
    city_data_avg = city_data_avg.drop(columns=['linkid', 'date'])
    city_data_avg = city_data_avg.groupby(['city', 'day', 'interval']).mean().reset_index()

    return city_data, city_data_avg

# cities = [ 'augsburg', 'basel', 'bern', 'bolton', 'bordeaux', 'bremen', 'cagliari', 'constance', 'darmstadt', 'essen', 'graz', 'groningen', 'hamburg', 'kassel', 'london', 'luzern', 'manchester', 'marseille', 'munich', 'paris', 'rotterdam', 'santander', 'speyer', 'strasbourg', 'stuttgart', 'torino', 'toulouse', 'vilnius', 'wolfsburg', 'zurich' ]
cities = ['zurich']

data = pd.DataFrame()
data_avg = pd.DataFrame()

for city in cities:
    print(f'Loading {city}...          ', end='\r')
    city_data, city_data_avg = convert_traffic_data(city)

    data = pd.concat([data, city_data])
    data_avg = pd.concat([data_avg, city_data_avg])

Loading zurich...          

In [26]:
data.head()

Unnamed: 0,city,day,interval,flow,occ,date,length,pos,fclass,limit,lanes,linkid
0,zurich,0,0,10.581907,0.007719,2015-10-26,0.304631,0.012527,9,50,1,633.0
1,zurich,0,1,6.092613,0.002278,2015-10-26,0.304631,0.012527,9,50,1,633.0
2,zurich,0,2,4.874091,0.002194,2015-10-26,0.304631,0.012527,9,50,1,633.0
3,zurich,0,3,3.960199,0.002194,2015-10-26,0.304631,0.012527,9,50,1,633.0
4,zurich,0,4,6.092613,0.002556,2015-10-26,0.304631,0.012527,9,50,1,633.0


In [27]:
data_avg.head()

Unnamed: 0,city,day,interval,flow,occ,length,pos,fclass,limit,lanes
0,zurich,0,0,9.225859,0.00906,0.26251,0.069447,6.054956,45.691855,1.0
1,zurich,0,1,6.31931,0.004902,0.26251,0.069447,6.054956,45.691855,1.0
2,zurich,0,2,4.535509,0.003499,0.26251,0.069447,6.054956,45.691855,1.0
3,zurich,0,3,4.989186,0.005162,0.26251,0.069447,6.054956,45.691855,1.0
4,zurich,0,4,9.486045,0.00628,0.26251,0.069447,6.054956,45.691855,1.0


In [28]:
# Print row counts for each city
data_avg.groupby('city').count()

Unnamed: 0_level_0,day,interval,flow,occ,length,pos,fclass,limit,lanes
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
zurich,168,168,168,168,168,168,168,168,168


In [29]:
# cities = [ 'augsburg', 'basel', 'bern', 'bordeaux', 'bremen', 'constance', 'essen', 'hamburg', 'london', 'luzern', 'manchester', 'marseille', 'paris', 'speyer', 'strasbourg', 'torino', 'wolfsburg', 'zurich' ]

data = data[data['city'].isin(cities)]
data_avg = data_avg[data_avg['city'].isin(cities)]

In [30]:
# Add hour sin/cos and day sin/cos columns
data['hour_sin'] = np.sin(2 * np.pi * data['interval'] / 24)
data['hour_cos'] = np.cos(2 * np.pi * data['interval'] / 24)
data['day_sin'] = np.sin(2 * np.pi * data['day'] / 7)
data['day_cos'] = np.cos(2 * np.pi * data['day'] / 7)

# Remove days without all 24 hours
data = data[data.groupby(['city', 'linkid', 'day'])['interval'].transform('count') == 24]

In [31]:
# Index by city and linkid
data = data.reset_index().set_index(['city', 'linkid', 'date', 'interval'])

# Replace NaNs and infs with 0
data = data.replace([np.inf, -np.inf], np.nan).fillna(0)

print(data.shape)
data.head()

(171288, 13)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,index,day,flow,occ,length,pos,fclass,limit,lanes,hour_sin,hour_cos,day_sin,day_cos
city,linkid,date,interval,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
zurich,633.0,2015-10-26,0,0,0,10.581907,0.007719,0.304631,0.012527,9,50,1,0.0,1.0,0.0,1.0
zurich,633.0,2015-10-26,1,1,0,6.092613,0.002278,0.304631,0.012527,9,50,1,0.258819,0.965926,0.0,1.0
zurich,633.0,2015-10-26,2,2,0,4.874091,0.002194,0.304631,0.012527,9,50,1,0.5,0.866025,0.0,1.0
zurich,633.0,2015-10-26,3,3,0,3.960199,0.002194,0.304631,0.012527,9,50,1,0.707107,0.707107,0.0,1.0
zurich,633.0,2015-10-26,4,4,0,6.092613,0.002556,0.304631,0.012527,9,50,1,0.866025,0.5,0.0,1.0


In [32]:
# Create a train/test/validation split
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.2, shuffle=False)
train, val = train_test_split(train, test_size=0.2, shuffle=False)

print(train.shape, test.shape, val.shape)

(109624, 13) (34258, 13) (27406, 13)


In [40]:
class WindowGenerator():
    def __init__(self, input_width, label_width, shift,
               train_df=train, val_df=val, test_df=test,
               label_columns=None):
        # Store the raw data.
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df

        # Work out the label column indices.
        self.label_columns = label_columns
        if label_columns is not None:
            self.label_columns_indices = {name: i for i, name in
                                            enumerate(label_columns)}
            self.column_indices = {name: i for i, name in
                                enumerate(train_df.columns)}

        # Work out the window parameters.
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift

        self.total_window_size = input_width + shift

        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    def __repr__(self):
        return '\n'.join([
            f'Total window size: {self.total_window_size}',
            f'Input indices: {self.input_indices}',
            f'Label indices: {self.label_indices}',
            f'Label column name(s): {self.label_columns}'])
    
    def split_window(self, features):
        inputs = features[:, self.input_slice, :]
        labels = features[:, self.labels_slice, :]
        if self.label_columns is not None:
            labels = tf.stack(
                [labels[:, :, self.column_indices[name]] for name in self.label_columns],
                axis=-1)

        # Slicing doesn't preserve static shape information, so set the shapes
        # manually. This way the `tf.data.Datasets` are easier to inspect.
        inputs.set_shape([None, self.input_width, None])
        labels.set_shape([None, self.label_width, None])

        return inputs, labels
    
    def plot(self, model=None, plot_col='flow', max_subplots=3):
        import matplotlib.pyplot as plt

        inputs, labels = self.example()
        plt.figure(figsize=(12, 8))
        plot_col_index = self.column_indices[plot_col]
        max_n = min(max_subplots, len(inputs))
        for n in range(max_n):
            plt.subplot(max_n, 1, n+1)
            plt.ylabel(f'{plot_col} [normed]')
            plt.plot(self.input_indices, inputs[n, :, plot_col_index],
                    label='Inputs', marker='.', zorder=-10)

            if self.label_columns:
                label_col_index = self.label_columns_indices.get(plot_col, None)
            else:
                label_col_index = plot_col_index

            if label_col_index is None:
                continue

            plt.scatter(self.label_indices, labels[n, :, label_col_index],
                        edgecolors='k', label='Labels', c='#2ca02c', s=64)
            if model is not None:
                predictions = model(inputs)
                plt.scatter(self.label_indices, predictions[n, :, label_col_index],
                        marker='X', edgecolors='k', label='Predictions',
                        c='#ff7f0e', s=64)

            if n == 0:
                plt.legend()

        plt.xlabel('Time [hr]')

  
    def make_dataset(self, data):
        data = np.array(data, dtype=np.float32)
        ds = tf.keras.utils.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=1,
            shuffle=True,
            batch_size=32,)

        ds = ds.map(self.split_window)

        return ds
  
    def train(self):
        return self.make_dataset(self.train_df)

    def val(self):
        return self.make_dataset(self.val_df)

    def test(self):
        return self.make_dataset(self.test_df)

    def example(self):
        """Get and cache an example batch of `inputs, labels` for plotting."""
        result = getattr(self, '_example', None)
        if result is None:
            # No example batch was found, so get one from the `.train` dataset
            result = next(iter(self.train()))
            # And cache it for next time
            self._example = result
        return result
    
window = WindowGenerator(input_width=24, label_width=24, shift=24, label_columns=['flow', 'occ'])
print(window.train_df.describe())
window

               index            day           flow            occ  \
count  109624.000000  109624.000000  109624.000000  109624.000000   
mean    54821.101693       3.001095      58.886910       0.101923   
std     31651.092307       1.999808     114.959584       0.131762   
min         0.000000       0.000000       0.000000       0.000000   
25%     27405.750000       1.000000       7.896211       0.012778   
50%     54825.500000       3.000000      26.168560       0.048806   
75%     82231.250000       5.000000      64.428927       0.134417   
max    109637.000000       6.000000    1860.797125       1.169778   

              length            pos         fclass          limit     lanes  \
count  109624.000000  109624.000000  109624.000000  109624.000000  109624.0   
mean        0.249714       0.074578       5.964387      46.214698       1.0   
std         0.244585       0.135409       2.182895      13.051461       0.0   
min         0.023588       0.000000       0.000000       0.000

Total window size: 48
Input indices: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
Label indices: [24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47]
Label column name(s): ['flow', 'occ']

In [43]:
# Create an LSTM model
def create_model(input_shape):
    from keras.models import Sequential
    from keras.layers import LSTM, Dense, Dropout, GRU

    model = Sequential([
        LSTM(256, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        LSTM(128, return_sequences=True),
        Dropout(0.2),
        LSTM(64, return_sequences=True),
        Dropout(0.2),
        Dense(1)
    ])

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001, clipvalue=1.0),
        loss='mean_squared_error',
        metrics=['mean_absolute_error']
    )

    return model

# Create the model for hourly predictions for one week
model = create_model(input_shape=(window.label_width, len(window.train_df.columns)))
model.summary()
print(model.input_shape)



Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_9 (LSTM)               (None, 24, 256)           276480    
                                                                 
 dropout_15 (Dropout)        (None, 24, 256)           0         
                                                                 
 lstm_10 (LSTM)              (None, 24, 128)           197120    
                                                                 
 dropout_16 (Dropout)        (None, 24, 128)           0         
                                                                 
 lstm_11 (LSTM)              (None, 24, 64)            49408     
                                                                 
 dropout_17 (Dropout)        (None, 24, 64)            0         
                                                                 
 dense_5 (Dense)             (None, 24, 1)            

In [44]:
# Train the model
def train_model(model, window, epochs=10):
    from keras.callbacks import EarlyStopping

    early_stop = EarlyStopping(monitor='val_loss', patience=10)

    # Print model inputs and outputs
    print(f'Inputs: {model.input_shape}')
    print(f'Outputs: {model.output_shape}')

    history = model.fit(
        window.train(),
        epochs=epochs,
        validation_data=window.val(),
        callbacks=[early_stop],
        batch_size=32
    )

    return history

history = train_model(model, window)

Inputs: (None, 24, 13)
Outputs: (None, 24, 1)
Epoch 1/10


2023-12-20 21:26:30.775295: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/AssignAddVariableOp.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
