In [5]:
import sys
import pandas
sys.path.append('../')

from src.graph_utils import partition_graph_by_lonlat
import networkx as nx
from jurbey.jurbey import JURBEY

with open("data/1558537930325.jurbey", 'rb') as tempf:
    g = JURBEY.load(tempf.read())
print(g.number_of_nodes())
g_partition = partition_graph_by_lonlat(g)

A = nx.adjacency_matrix(g_partition)


290100


In [8]:
#Our speed data uses segment ids, but the model uses sequential indexes, based on `.nodes()`
import math
id_to_idx = {}
# defaultdict won't do what you expect in Pandas
df = pandas.read_csv("data/timeseries_speed_april_first_week.csv")


for id_ in df["from_node"].unique():
    id_to_idx[id_] = math.nan
for id_ in df["to_node"].unique():
    id_to_idx[id_] = math.nan
    
for idx, id_ in enumerate(g_partition.nodes()):
    id_to_idx[id_] = idx
# -

# Let's transform ids to indeces
df["from_node_idx"] = df.replace({"from_node": id_to_idx})["from_node"]
df["to_node_idx"] = df.replace({"to_node": id_to_idx})["to_node"]

df.head()

# ## First let's build sparse 3D data tensor

# +
import torch
TOTAL_T_STEPS = 144


def snapshot(t, df=df, g_partition=g_partition):
    df_t = df[[t, "from_node_idx", "to_node_idx"]]
    df_t = df_t.dropna()
    row = df_t["from_node_idx"].tolist()
    col = df_t["to_node_idx"].tolist()
    data = df_t[t].tolist()
    size = len(g_partition.nodes())  

    return {"indices": (row, col), "values": data, "shape": (size, size)}


# +
from scipy.sparse import hstack

def build_sparse_dataset(from_=0, to=TOTAL_T_STEPS):
    dataset = {"indices": ([], [], []), "values": []}
    for t in range(from_, to):

        snap = snapshot(str(t))
        dataset["indices"][0].extend([t] * len(snap["indices"][0]))
        dataset["indices"][1].extend(snap["indices"][0])
        dataset["indices"][2].extend(snap["indices"][1])
        dataset["values"].extend(snap["values"])

    i = torch.LongTensor(dataset["indices"])
    v = torch.FloatTensor(dataset["values"])
    return torch.sparse.FloatTensor(i, v, torch.Size((to, *snap["shape"])))

dataset = build_sparse_dataset()


In [11]:
type(dataset)

torch.Tensor

In [13]:
print(f"Shape of data: {df.shape}. Missing in data: {df.isnull().sum().sum()}.")

Shape of data: (6162, 149). Missing in data: 818224.


In [233]:
df = pandas.read_csv("data/timeseries_speed_april_first_week.csv")


df = df.T
print(f"original number of road segment: {df.shape[1]}")
df = df.loc[:, df.isnull().mean() < 0.5]
print(f"original number of road segment after filtering: {df.shape[1]}")
df = df.T
df = df.loc[:, df.columns != 'Unnamed: 0']
df = df.loc[:, df.columns != 'from_node']
df = df.loc[:, df.columns != 'to_node']
print(df[0:10])
df = df.T

original number of road segment: 6162
original number of road segment after filtering: 213
           0        1    2        3        4        5          6         7  \
4        NaN      NaN  NaN  10.4688  10.6366  10.2885    10.6175   1.75854   
7    10.2788      NaN  NaN      NaN  10.8314      NaN    3.14074   2.07388   
16   10.8814      NaN  NaN      NaN  10.9489  10.1905    8.48206   10.6813   
124  10.2644      NaN  NaN  10.5666      NaN  7.37511   0.915255   10.1382   
157      NaN      NaN  NaN      NaN  10.8314      NaN    7.53377   5.43425   
204      NaN  10.0555  NaN      NaN      NaN      NaN  0.0954053    10.709   
351      NaN      NaN  NaN  10.4688  10.4049  7.20193    7.68655   7.31936   
352      NaN      NaN  NaN   10.471  10.3797  0.40066   0.149782  0.486138   
379  10.3584      NaN  NaN      NaN  10.6964  6.91769    5.38908   10.8364   
389      NaN      NaN  NaN  10.4688    10.38   7.0872    8.65096   10.2432   

            8        9  ...      134        135   

In [321]:
import math
import numpy as np
import pandas as pd

class DataLoader():
    """A class for loading and transforming data for the lstm model"""

    def __init__(self, dataframe, split):
        i_split = int(len(dataframe) * split)
        self.data_train = dataframe.values[:i_split]
        self.data_test  = dataframe.values[i_split:]
        self.len_train  = len(self.data_train)
        self.len_test   = len(self.data_test)
        self.len_train_windows = None

    def get_test_data(self, seq_len, normalise):
        '''
        Create x, y test data windows
        Warning: batch method, not generative, make sure you have enough memory to
        load data, otherwise reduce size of the training split.
        '''
        data_windows = []
        for i in range(self.len_test - seq_len):
            data_windows.append(self.data_test[i:i+seq_len])

        data_windows = np.array(data_windows).astype(float)
        data_windows = self.normalise_windows(data_windows, single_window=False) if normalise else data_windows

        x = data_windows[:, :-1]
        y = data_windows[:, -1, [0]]
        return x,y

    def get_train_data(self, seq_len, normalise):
        '''
        Create x, y train data windows
        Warning: batch method, not generative, make sure you have enough memory to
        load data, otherwise use generate_training_window() method.
        '''
        data_x = []
        data_y = []
        for i in range(self.len_train - seq_len):
            x, y = self._next_window(i, seq_len, normalise)
            data_x.append(x)
            data_y.append(y)
        return np.array(data_x, dtype=np.float32), np.array(data_y, dtype=np.float32)

    def generate_train_batch(self, seq_len, batch_size, normalise):
        '''Yield a generator of training data from filename on given list of cols split for train/test'''
        i = 0
        while i < (self.len_train - seq_len):
            x_batch = []
            y_batch = []
            for b in range(batch_size):
                if i >= (self.len_train - seq_len):
                    # stop-condition for a smaller final batch if data doesn't divide evenly
                    yield np.array(x_batch), np.array(y_batch)
                    i = 0
                x, y = self._next_window(i, seq_len, normalise)
                x_batch.append(x)
                y_batch.append(y)
                i += 1
            yield np.array(x_batch, dtype=np.float32), np.array(y_batch, dtype=np.float32)

    def _next_window(self, i, seq_len, normalise):
        '''Generates the next data window from the given index location i'''
        window = self.data_train[i:i+seq_len]
        window = self.normalise_windows(window, single_window=True)[0] if normalise else window
        x = window[:-1,:]
        y = window[-1:,:]
        print(x.shape)
        return x, y

    def normalise_windows(self, window_data, single_window=False):
        '''Normalise window with a base value of zero'''
        normalised_data = []
        window_data = [window_data] if single_window else window_data
        for window in window_data:
            normalised_window = []
            for col_i in range(window.shape[1]):
                normalised_col = [((float(p) / float(window[0, col_i])) - 1) for p in window[:, col_i]]
                normalised_window.append(normalised_col)
            normalised_window = np.array(normalised_window).T # reshape and transpose array back into original multidimensional format
            normalised_data.append(normalised_window)
        return np.array(normalised_data)


In [322]:
print(len(df))
data = DataLoader(df, 0.8)

144


In [329]:
x, y = data.get_train_data(seq_len=30, normalise=False)
x = np.swapaxes(x,0,2)
y = np.swapaxes(y,0,2)
x = np.swapaxes(x,1,2)
y = np.swapaxes(y,1,2)
print(y.shape)
print(x.shape)

(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(29, 213)
(213, 85, 1)
(213, 85, 29)


In [330]:
from src.tgcn.temporal_spatial_model import TGCN

In [331]:
print(x[0])
print(y[0])
x = torch.from_numpy(x)
y = torch.from_numpy(y)
print(x.size())
print(y.size())


[[       nan        nan        nan ...        nan 10.559644         nan]
 [       nan        nan 10.468811  ... 10.559644         nan  1.6303675]
 [       nan 10.468811  10.6366205 ...        nan  1.6303675        nan]
 ...
 [ 1.8972399 10.166076  10.721033  ...  4.6672025        nan 10.232055 ]
 [10.166076  10.721033         nan ...        nan 10.232055  10.66193  ]
 [10.721033         nan        nan ... 10.232055  10.66193   10.476906 ]]
[[ 1.6303675 ]
 [        nan]
 [10.732706  ]
 [        nan]
 [        nan]
 [ 6.3112288 ]
 [        nan]
 [10.337724  ]
 [ 1.3691394 ]
 [10.610737  ]
 [        nan]
 [10.888935  ]
 [10.657227  ]
 [        nan]
 [10.035851  ]
 [10.17424   ]
 [ 0.40861756]
 [        nan]
 [        nan]
 [        nan]
 [        nan]
 [        nan]
 [        nan]
 [        nan]
 [        nan]
 [10.202348  ]
 [10.224177  ]
 [10.866926  ]
 [        nan]
 [        nan]
 [        nan]
 [        nan]
 [        nan]
 [ 1.8014525 ]
 [ 5.4981275 ]
 [10.538274  ]
 [10.052774  ]
 

In [332]:
from test_tube import Experiment 
from pytorch_lightning import Trainer
import os

# PyTorch summarywriter with a few bells and whistles    
exp = Experiment(save_dir=os.getcwd())

# train on cpu using only 10% of the data (for demo purposes)
# pass in experiment for automatic tensorboard logging.    
trainer = Trainer(experiment=exp, max_nb_epochs=100, train_percent_check=0.1)


gpu available: False, used: False


In [338]:
train_x = torch.from_numpy(x)
train_y = torch.from_numpy(y)
train=torch.zeros(len(train_x),213,look_back+1)
train[:,:,:look_back]=train_x
train[:,:,look_back:(look_back+1)]=train_y

In [345]:
import imp  
import src.tgcn.layers.lstm as l
l = imp.reload(l)
LSTMs = l.LSTMs
model = LSTMs(input_dim=29, hidden_dim=32, data=train)
trainer.fit(model)

  0%|          | 0/3 [00:00<?, ?it/s]

     Name    Type  Params
0    lstm    LSTM   16512
1  linear  Linear      33


  return {'loss': F.mse_loss(y_hat, y)}


RuntimeError: The size of tensor a (10) must match the size of tensor b (85) at non-singleton dimension 1