In [1]:
import sys
import pandas
sys.path.append('../')

from src.graph_utils import partition_graph_by_lonlat
import networkx as nx
from jurbey.jurbey import JURBEY

with open("../data/1558537930325.jurbey", 'rb') as tempf:
    g = JURBEY.load(tempf.read())
print(g.number_of_nodes())
g_partition = partition_graph_by_lonlat(g)


290100


**Convert to edge-based graph**

In [2]:
import networkx as nx
L = nx.line_graph(nx.DiGraph(g_partition))

In [3]:
nodes = list(L.nodes())
g_partition[nodes[10][0]][nodes[10][1]]['data']

Arc(arcType=<ArcType.LANE_STRAIGHT: 'LANE_STRAIGHT'>, roadClass=<RoadClass.MajorRoad: 2>, roadAccessibility=<RoadAccessibility.NoRestriction: 1>, metadata={'bicycle': 'no', 'highway': 'primary', 'lanes': '4', 'lit': 'yes', 'maxspeed': '50', 'name': 'Bismarckstraße', 'oneway': 'yes', 'postal_code': '10625', 'ref': 'B 2;B 5', 'surface': 'asphalt', 'turn:lanes': 'through|through|through;right|right'}, signs=[], vehicleAccessibility=[], geometry=[GeoCoordinates(lon=13.3207077, lat=52.5123944, alt=nan), GeoCoordinates(lon=13.3207877, lat=52.5123711, alt=nan)])

**Extract dynamic (speed) + static features from nodes**

In [4]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
enc = OneHotEncoder(handle_unknown='ignore')
ienc = OrdinalEncoder()
scaler = StandardScaler()
def arc_features(arc):
    arc = g_partition[arc[0]][arc[1]]
    return [
        arc['data'].metadata['highway'],
        arc['data'].metadata.get('surface', 'no_sur'),
        arc['data'].roadClass.name
    ],  [float(arc['data'].metadata.get('maxspeed', '50')), 
        int(arc['data'].metadata.get('lanes', '1'))]
def construct_features():
    data = list()
    data_ord = list()
    for node in L.nodes:
        data.append(arc_features(node)[0])
        data_ord.append(arc_features(node)[1])
    return enc.fit_transform(data), ienc.fit_transform(data_ord)
    
x, y = construct_features()
  

In [None]:
enc.categories_

In [None]:
ienc.categories_

In [None]:
x.shape

In [None]:
x

**Preprocess adjacency matrix**

In [5]:
adj = nx.to_scipy_sparse_matrix(L, format="coo")
import scipy.sparse as sp
import numpy as np
import torch

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)
                                    
def normalize(mx):
    """Row-normalize sparse matrix"""
    rowsum = np.array(mx.sum(1))
    
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    mx = r_mat_inv.dot(mx)
    return mx

# build symmetric adjacency matrix
adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
                                    
adj = normalize(adj + sp.eye(adj.shape[0]))
adj = sparse_mx_to_torch_sparse_tensor(adj)
                                    

In [None]:
# adj = nx.to_scipy_sparse_matrix(L, format="coo")
# adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

In [None]:
# def diagonal_degree_matrix(adj):
#     diag = np.zeros([adj.shape[0], adj.shape[0]]) # basically dimensions of your graph
#     rows, cols = adj.nonzero()
#     for row, col in zip(rows, cols):
#         diag[row, row] += 1
#     return diag

# D = diagonal_degree_matrix(adj)
# np.power(D, -0.5).dot(adj).dot(np.power(D, -0.5))

In [29]:
#Our speed data uses segment ids, but the model uses sequential indexes, based on `.nodes()`
import math
id_to_idx = {}
# defaultdict won't do what you expect in Pandas
df = pandas.read_csv("../data/timeseries_speed_april_first_week.csv")
df = df.T
l = (df.isnull().mean() < 0.5).tolist()

indices = [i for i, x in enumerate(l) if x == True]
print(indices)

[4, 7, 16, 124, 157, 204, 351, 352, 379, 389, 430, 487, 492, 513, 539, 551, 568, 569, 587, 613, 630, 639, 649, 675, 680, 698, 711, 713, 780, 782, 784, 865, 1037, 1062, 1070, 1079, 1080, 1088, 1091, 1125, 1206, 1215, 1217, 1238, 1241, 1397, 1400, 1450, 1517, 1550, 1556, 1560, 1564, 1574, 1703, 1733, 1756, 1804, 1805, 1806, 1807, 1819, 1824, 1853, 1907, 1917, 1964, 1996, 1999, 2003, 2007, 2029, 2030, 2033, 2036, 2037, 2040, 2042, 2098, 2099, 2102, 2104, 2116, 2119, 2123, 2128, 2137, 2143, 2210, 2261, 2281, 2284, 2289, 2358, 2382, 2387, 2394, 2406, 2411, 2412, 2473, 2487, 2550, 2568, 2604, 2605, 2649, 2657, 2674, 2677, 2738, 2753, 2766, 2788, 2794, 2809, 2851, 2853, 2869, 2871, 2917, 2925, 2956, 3081, 3086, 3087, 3088, 3090, 3117, 3120, 3132, 3223, 3264, 3450, 3458, 3459, 3530, 3577, 3587, 3627, 3629, 3645, 3728, 3792, 3870, 3974, 3984, 3993, 4006, 4064, 4065, 4083, 4084, 4303, 4445, 4448, 4550, 4551, 4572, 4602, 4655, 4673, 4688, 4698, 4700, 4704, 4798, 4814, 4866, 4889, 4953, 4967, 4992

In [30]:
id_to_idx = {}

for idx, id_ in enumerate(L.nodes()):
    id_to_idx[id_] = idx
df = df.T
df = df.loc[:, df.columns != 'Unnamed: 0']

df2 = df['from_node']
df3 = df['to_node']

df_filled = df.loc[:, df.columns != 'from_node']
df_filled = df.loc[:, df.columns != 'to_node']


#df_filled = df_filled.interpolate(method='nearest', axis=1)
SPEED_COLUMNS = list(map(str, range(TOTAL_T_STEPS)))
df_filled = df_filled.fillna(df_filled[SPEED_COLUMNS].mean())
df_filled['from_node'] = df2
df_filled['to_node'] = df3

print(df_filled[0:10])


KeyError: "None of [Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',\n       ...\n       '134', '135', '136', '137', '138', '139', '140', '141', '142', '143'],\n      dtype='object', length=144)] are in the [columns]"

In [25]:
df_filled[0:10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,136,137,138,139,140,141,142,143,from_node,to_node
0,11632500.0,11632500.0,11632500.0,10.85446,11632500.0,5.680978,7.036838,1.499997,10.08863,10.58746,...,10.58112,11632500.0,11632500.0,11632500.0,11632500.0,11632500.0,10.85084,10.03641,628154368,1023689595
1,11632500.0,11632500.0,11632500.0,10.85446,11632500.0,5.680978,7.036838,1.499997,10.08863,10.58746,...,10.58112,11632500.0,11632500.0,11632500.0,11632500.0,11632500.0,10.85084,10.03641,628154368,1023689595
2,47922460.0,47922460.0,47922460.0,47922460.0,47922460.0,10.48621,47922460.0,47922460.0,10.93722,47922460.0,...,47922460.0,47922460.0,47922460.0,47922460.0,47922460.0,47922460.0,47922460.0,47922460.0,527147009,27537239
3,43928930.0,43928930.0,43928930.0,43928930.0,43928930.0,43928930.0,43928930.0,43928930.0,43928930.0,43928930.0,...,43928930.0,43928930.0,43928930.0,43928930.0,43928930.0,43928930.0,43928930.0,43928930.0,527147009,26908815
4,8604863.0,8604863.0,8604863.0,10.46881,10.63662,10.28853,10.61751,1.758539,3.404401,10.22457,...,6.098767,10.25766,10.58617,8604863.0,10.66948,4.313417,10.3929,10.51831,628154370,3804638178
5,11851980.0,11851980.0,11851980.0,11851980.0,11851980.0,11851980.0,10.287,11851980.0,11851980.0,10.83357,...,10.53758,10.287,11851980.0,11851980.0,11851980.0,7.238553,10.49798,10.49173,628154372,26938222
6,10469250.0,10469250.0,10469250.0,10.85446,10469250.0,5.680978,7.036838,1.499997,5.044317,10.58746,...,0.9618658,10469250.0,10469250.0,10469250.0,10469250.0,10469250.0,10.85084,10.03641,628154375,1560866145
7,10.27885,60964180.0,60964180.0,60964180.0,10.83141,60964180.0,3.140737,2.073876,0.3852635,4.10448,...,10.50583,10.51731,60964180.0,60964180.0,60964180.0,10.18992,10.43675,4.196431,5791596551,1321327852
8,579162100.0,579162100.0,579162100.0,579162100.0,579162100.0,10.29825,1.220956,10.29825,579162100.0,579162100.0,...,579162100.0,579162100.0,579162100.0,579162100.0,579162100.0,579162100.0,579162100.0,579162100.0,5791621141,4782446443
9,1930540000.0,1930540000.0,1930540000.0,1930540000.0,1930540000.0,1930540000.0,1930540000.0,1930540000.0,1930540000.0,1930540000.0,...,1930540000.0,1930540000.0,1930540000.0,1930540000.0,1930540000.0,1930540000.0,1930540000.0,1930540000.0,5791621141,26875019


**Create rolling window tensor dataset**

In [9]:
import torch
import scipy.sparse
TOTAL_T_STEPS = 144

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, RobustScaler
enc = OneHotEncoder(handle_unknown='ignore')
ienc = OrdinalEncoder()
scaler = RobustScaler()
import math
  
speed_features = df_filled.values.flatten()
speed_features = np.array([s for s in speed_features if not math.isnan(s)]).reshape(-1,1)
print(speed_features[0:10])  
scaler.fit(speed_features)

[[10.8544569 ]
 [ 5.68097802]
 [ 7.03683772]
 [ 1.49999732]
 [10.08863449]
 [10.58745861]
 [ 5.72889727]
 [10.42498398]
 [10.69523907]
 [10.86074638]]


RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)

In [10]:
scaler.transform(np.array(speed_features[0:10]).reshape(-1,1))

array([[ 0.06501617],
       [-0.50318748],
       [-0.35427329],
       [-0.96238497],
       [-0.01909418],
       [ 0.03569172],
       [-0.49792451],
       [ 0.01784712],
       [ 0.04752926],
       [ 0.06570694]])

In [11]:
def build_dataset_to_numpy_tensor(from_=0, to=TOTAL_T_STEPS, df=None, norm=False):
    """
    We extract features from speed (actual speed, whether speed is missing)
    and combine with static features.
    :return:
         np.ndarray: dataset tensor of shape [num_time_steps, num_nodes, num_features]
    """
    dataset = list()
    for t in range(from_, to):
        cat_features_at_t = [['primary', 'asphalt', 'MajorRoad']] * len(L.nodes)
        ord_features_at_t = [[50.0, 4]] * len(L.nodes)
        speed_features_at_t = [50] * len(L.nodes) 
        speed_is_nan_feature = [1] * len(L.nodes)
        for _, row in df.iterrows():

            arc = (row['from_node'], row['to_node'])
            cat_features_at_t[id_to_idx[arc]], ord_features_at_t[id_to_idx[arc]]  = arc_features(arc)
            speed_features_at_t[id_to_idx[arc]] = row[str(t)]
            if np.isnan(row[str(t)]): 
                speed_is_nan_feature[id_to_idx[arc]] = 0
        dataset.append(np.concatenate([scaler.transform(np.array(speed_features_at_t).reshape(-1, 1)) if norm else  np.array(speed_features_at_t).reshape(-1, 1), 
                                       np.array(speed_is_nan_feature).reshape(-1, 1), 
                                       ienc.fit_transform(ord_features_at_t),
                                       enc.fit_transform(cat_features_at_t).toarray()], axis=1))
    return np.stack(dataset, axis=0)

Y = build_dataset_to_numpy_tensor(df=df)
Y_filled = build_dataset_to_numpy_tensor(df=df_filled, norm=True)

In [13]:
X = np.moveaxis(Y, source=(0,1,2), destination=(2,0,1))
X_filled = np.moveaxis(Y_filled, source=(0,1,2), destination=(2,0,1))


# num_vertices, num_features, num_timesteps
X.shape

(6163, 29, 144)

In [14]:
# Build mask tensor
X_masked = torch.where(torch.isnan(torch.from_numpy(X)), torch.tensor([0]), torch.tensor([1]))
X_masked = X_masked.bool()
print(X.shape)
print(X_masked.shape)

(6163, 29, 144)
torch.Size([6163, 29, 144])


In [15]:
def generate_dataset_concat(X, X_masked, num_timesteps_input, num_timesteps_output):
    """
    Takes node features for the graph and divides them into multiple samples
    along the time-axis by sliding a window of size (num_timesteps_input+
    num_timesteps_output) across it in steps of 1.
    :param X: Node features of shape (num_vertices, num_features,
    num_timesteps)
    :return:
        - Node data (features + labels) divided into multiple samples. Shape is
          (num_samples, num_vertices, num_features, num_timesteps_input).
        
    """
    # Generate the beginning index and the ending index of a sample, which
    # contains (num_points_for_training + num_points_for_predicting) points
    indices = [(i, i + (num_timesteps_input + num_timesteps_output)) for i
               in range(X.shape[2] - (
                num_timesteps_input + num_timesteps_output) + 1)]

    # Save samples
    features, target = [], []
    mask = []
    for i, j in indices:
        features.append(X[:, :, i: i + num_timesteps_input])
        target.append(X[:, 0, i + num_timesteps_input: j])
        mask.append(X_masked[:, 0, i + num_timesteps_input: j])

    return torch.from_numpy(np.array(features)), torch.from_numpy(np.array(target)),torch.stack(mask)

In [16]:
# training, validation, testing : 0.7, 0.1, 0.2
split_line1 = int(X.shape[2] * 0.7)
split_line2 = int(X.shape[2] * 0.9)
train_original_data = X_filled[:, :, :split_line1]
val_original_data = X_filled[:, :, split_line1:split_line2]
test_original_data = X_filled[:, :, split_line2:]

train_mask = X_masked[:, :, :split_line1]
valid_mask = X_masked[:, :, split_line1:split_line2]
test_mask = X_masked[:, :, split_line2:]


look_back = 9
look_ahead = 1
# num_samples, num_nodes, num_timesteps, num_features

training_data, training_target, train_mask = generate_dataset_concat(train_original_data, train_mask,
                                                       num_timesteps_input=look_back,
                                                       num_timesteps_output=look_ahead)
valid_data, valid_target, valid_mask = generate_dataset_concat(val_original_data, valid_mask,
                                             num_timesteps_input=look_back,
                                             num_timesteps_output=look_ahead)
test_data, test_target, test_mask = generate_dataset_concat(test_original_data, test_mask,
                                               num_timesteps_input=look_back,
                                               num_timesteps_output=look_ahead)

print(f"shape of training: {training_data.shape}, {training_target.shape}, {train_mask.shape}")
print(f"shape of validation: {valid_data.shape}, {valid_target.shape}, {valid_mask.shape}")
print(f"shape of testing: {test_data.shape}, {test_target.shape}, {test_mask.shape}")

shape of training: torch.Size([91, 6163, 29, 9]), torch.Size([91, 6163, 1]), torch.Size([91, 6163, 1])
shape of validation: torch.Size([20, 6163, 29, 9]), torch.Size([20, 6163, 1]), torch.Size([20, 6163, 1])
shape of testing: torch.Size([6, 6163, 29, 9]), torch.Size([6, 6163, 1]), torch.Size([6, 6163, 1])


In [17]:
train_mask[1, :, :].shape

torch.Size([6163, 1])

In [18]:
from test_tube import Experiment 
from pytorch_lightning import Trainer
import os

# PyTorch summarywriter with a few bells and whistles    
exp = Experiment(save_dir=os.getcwd())

# pass in experiment for automatic tensorboard logging.    
trainer = Trainer(experiment=exp, max_nb_epochs=45, train_percent_check=1)



gpu available: False, used: False


In [None]:
import h5py
with h5py.File('data.hdf5', 'w') as f:
    f.create_dataset("train", data=training_data)
    f.create_dataset("valid", data=valid_data)
    f.create_dataset("test", data=test_data)
with h5py.File('target.hdf5', 'w') as f:
    f.create_dataset("train", data=training_target)
    f.create_dataset("valid", data=valid_target)
    f.create_dataset("test", data=test_target)
with h5py.File('mask.hdf5', 'w') as f:
    f.create_dataset("train", data=train_mask)
    f.create_dataset("valid", data=valid_mask)
    f.create_dataset("test", data=test_mask)

In [None]:
type(test_data)

In [19]:
data = {}
data['train'] = training_data
data['valid'] = valid_data
data['test'] = test_data

target = {}
target['train'] = training_target
target['valid'] = valid_target
target['test'] = test_target

mask = {}
mask['train'] = train_mask
mask['valid'] = valid_mask
mask['test'] = test_mask
# batch shape: torch.Size([1, 6163, 26, 10])
print(valid_data.shape)
print(valid_mask[10, :, :])
print(valid_target[10, :, :])

# IMPORTANT: we normalize speed
print(valid_target[10, :, :].masked_select(valid_mask[10, :, :]))

torch.Size([20, 6163, 29, 9])
tensor([[ True],
        [ True],
        [False],
        ...,
        [False],
        [False],
        [False]])
tensor([[-0.0253],
        [-0.0253],
        [    nan],
        ...,
        [    nan],
        [    nan],
        [    nan]], dtype=torch.float64)
tensor([-2.5327e-02, -2.5327e-02,  2.0929e-02,  2.0929e-02, -2.5327e-02,
        -2.5327e-02,  6.3787e-02,  5.7759e-02,  4.3152e-02,  1.7935e-02,
        -9.1229e-01,  6.9816e-02,  6.9816e-02,  2.3782e-02,  2.3782e-02,
         4.3152e-02,  4.3152e-02,  4.3152e-02,  2.0929e-02,  5.7759e-02,
         6.6219e-02, -2.2938e-02, -2.2938e-02,  4.9663e-02,  4.9663e-02,
        -2.2938e-02, -2.2938e-02,  7.4679e-02,  7.4679e-02,  3.2882e-02,
         3.2882e-02,  2.6664e-02,  4.4069e-02,  3.5913e-02, -1.1271e+00,
        -9.6343e-01,  1.3115e-03, -9.1869e-01,  6.9645e-02,  2.4169e-02,
        -6.3686e-03,  4.3152e-02,  4.3152e-02,  1.3596e-02,  1.5644e-02,
         1.3596e-02,  4.3152e-02,  4.3152e-02, -

**Now start training**

In [20]:
from torch.utils.data import DataLoader, TensorDataset
dataloader = DataLoader(TensorDataset(data['train'], target['train']), batch_size=1, shuffle=False)

In [21]:
import imp  
import src.tgcn.temporal_spatial_model as l
import src.tgcn.layers.lstmcell as h
h = imp.reload(h)

l = imp.reload(l)
TGCN = l.TGCN
model = TGCN(input_dim=29, hidden_dim=29, layer_dim=2, output_dim=1, adj=adj, 
             datasets=data, targets= target, mask=mask, scaler=scaler)
trainer.fit(model)

          Name        Type  Params
0      gc_lstm  GCLSTMCell    7830
1  gc_lstm.x2h      Linear    3480
2  gc_lstm.h2h      Linear    3480
3           fc      Linear      30
4      dropout     Dropout       0


 34%|███▍      | 38/111 [00:14<00:27,  2.65it/s, batch_nb=36, epoch=0, tng_loss=nan, v_nb=1]

KeyboardInterrupt: 

In [None]:
import imp  
import src.tgcn.stgcn as l

# PyTorch summarywriter with a few bells and whistles    
exp = Experiment(save_dir="../data")

# pass in experiment for automatic tensorboard logging.    
trainer = Trainer(experiment=exp, max_nb_epochs=30, train_percent_check=1)

l = imp.reload(l)
STGCN = l.STGCN
model2 = STGCN(adj=adj, datasets=data, targets=target, mask=mask, scaler=scaler)
trainer.fit(model2)

In [None]:
import imp  
import src.tgcn.layers.lstm as l

# PyTorch summarywriter with a few bells and whistles    
exp = Experiment(save_dir="../data")

# pass in experiment for automatic tensorboard logging.    
trainer = Trainer(experiment=exp, max_nb_epochs=30, train_percent_check=1)

l = imp.reload(l)
LSTMs = l.LSTMs
model3 = LSTMs(input_dim=9, datasets=data, targets=target, mask=mask)
trainer.fit(model3)