In [90]:
%reload_ext autoreload
%autoreload 2

In [91]:
import sys
import pandas
sys.path.append('../')

from src.graph_utils import partition_graph_by_lonlat
import networkx as nx
from jurbey.jurbey import JURBEY

with open("../data/berlin.jurbey", 'rb') as tempf:
    g = JURBEY.load(tempf.read())
print(g.number_of_nodes())
g_partition = partition_graph_by_lonlat(g)

230206


**Convert to edge-based graph**

In [92]:
import networkx as nx
L = nx.line_graph(nx.DiGraph(g_partition))

In [3]:
nodes = list(L.nodes())
g_partition[nodes[10][0]][nodes[10][1]]['data']

Arc(arcType=<ArcType.LANE_STRAIGHT: 'LANE_STRAIGHT'>, roadClass=<RoadClass.MajorRoad: 2>, roadAccessibility=<RoadAccessibility.NoRestriction: 1>, metadata={'bicycle': 'no', 'highway': 'primary', 'lanes': '4', 'lit': 'yes', 'maxspeed': '50', 'name': 'Bismarckstraße', 'oneway': 'yes', 'postal_code': '10625', 'ref': 'B 2;B 5', 'surface': 'asphalt', 'turn:lanes': 'through|through|through;right|right'}, signs=[], vehicleAccessibility=[], geometry=[GeoCoordinates(lon=13.3207077, lat=52.5123944, alt=nan), GeoCoordinates(lon=13.3207877, lat=52.5123711, alt=nan)])

**Extract dynamic (speed) + static features from nodes**

In [4]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
enc = OneHotEncoder(handle_unknown='ignore')
ienc = OrdinalEncoder()
scaler = StandardScaler()
def arc_features(arc):
    arc = g_partition[arc[0]][arc[1]]
    return [
        arc['data'].metadata['highway'],
        arc['data'].metadata.get('surface', 'no_sur'),
        arc['data'].roadClass.name
    ],  [float(arc['data'].metadata.get('maxspeed', '50')), 
        int(arc['data'].metadata.get('lanes', '1'))]

def construct_features():
    data = list()
    data_ord = list()
    for node in L.nodes:
        data.append(arc_features(node)[0])
        data_ord.append(arc_features(node)[1])
    return enc.fit_transform(data), ienc.fit_transform(data_ord)
    
x, y = construct_features()
  

In [5]:
enc.categories_

[array(['access_ramp', 'corridor', 'living_street', 'platform', 'primary',
        'residential', 'secondary', 'secondary_link', 'service',
        'tertiary', 'tertiary_link', 'unclassified'], dtype=object),
 array(['asphalt', 'cobblestone', 'cobblestone:flattened', 'concrete',
        'concrete:plates', 'grass_paver', 'no_sur', 'paved',
        'paving_stones', 'sett'], dtype=object),
 array(['DirtRoad', 'LocalRoad', 'MajorRoad'], dtype=object)]

In [6]:
ienc.categories_

[array([ 5., 10., 20., 30., 50.]), array([1., 2., 3., 4., 5.])]

In [7]:
x.shape

(6161, 25)

In [8]:
x

<6161x25 sparse matrix of type '<class 'numpy.float64'>'
	with 18483 stored elements in Compressed Sparse Row format>

**Preprocess adjacency matrix**

In [9]:
adj = nx.to_scipy_sparse_matrix(L, format="coo")
import scipy.sparse as sp
import numpy as np
import torch

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)
                                    
def normalize(mx):
    """Row-normalize sparse matrix"""
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    mx = r_mat_inv.dot(mx)
    return mx

In [10]:
# build symmetric adjacency matrix
adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

adj = normalize(adj + sp.eye(adj.shape[0]))
adj = sparse_mx_to_torch_sparse_tensor(adj)

In [11]:
adj.shape

torch.Size([6161, 6161])

In [12]:
#Our speed data uses segment ids, but the model uses sequential indexes, based on `.nodes()`
import math
id_to_idx = {}
# defaultdict won't do what you expect in Pandas
df = pandas.read_csv("../data/timeseries_speed_april_first_week.csv")
df = df.T
l = (df.isnull().mean() < 0.5).tolist()

indices = [i for i, x in enumerate(l) if x == True]
print(indices)

[0, 1, 4, 5, 6, 7, 16, 25, 62, 70, 75, 76, 92, 93, 95, 124, 130, 131, 132, 133, 135, 157, 160, 161, 164, 165, 168, 191, 201, 202, 203, 204, 214, 215, 216, 218, 235, 236, 245, 262, 265, 267, 268, 272, 278, 289, 321, 322, 333, 348, 349, 350, 353, 358, 361, 363, 366, 369, 377, 378, 384, 387, 398, 402, 403, 428, 462, 464, 469, 470, 471, 472, 476, 481, 482, 485, 490, 502, 503, 505, 506, 509, 511, 522, 524, 525, 533, 534, 537, 543, 544, 549, 562, 563, 564, 565, 566, 567, 568, 585, 596, 611, 615, 616, 619, 620, 628, 629, 636, 637, 647, 670, 671, 672, 673, 678, 696, 704, 705, 707, 708, 709, 711, 724, 734, 735, 736, 777, 778, 779, 780, 782, 787, 788, 796, 803, 805, 827, 828, 862, 863, 871, 875, 882, 891, 893, 903, 912, 913, 931, 944, 947, 949, 950, 977, 987, 988, 1035, 1055, 1058, 1060, 1067, 1068, 1076, 1077, 1078, 1083, 1085, 1086, 1089, 1091, 1092, 1093, 1094, 1097, 1098, 1107, 1108, 1112, 1113, 1115, 1116, 1123, 1136, 1147, 1149, 1153, 1192, 1193, 1204, 1210, 1211, 1213, 1215, 1217, 1218, 1

In [13]:
id_to_idx = {}

for idx, id_ in enumerate(L.nodes()):
    id_to_idx[id_] = idx
df = df.T
df = df.loc[:, df.columns != 'Unnamed: 0']

df2 = df['from_node']
df3 = df['to_node']

df_filled = df.loc[:, df.columns != 'from_node']
df_filled = df.loc[:, df.columns != 'to_node']


df_filled = df_filled.T
for column in df_filled:
    df_filled[column] = pandas.to_numeric(df_filled[column])

df_filled = df_filled.interpolate(method='nearest', axis=1)

df_filled = df_filled.fillna(method='backfill')
df_filled = df_filled.T
df_filled['from_node'] = df2
df_filled['to_node'] = df3

print(df_filled[0:10])


           0          1          2          3          4          5  \
0  28.817616  28.817616  28.817616  28.817616  29.111668   8.288389   
1  28.817616  28.817616  28.817616  28.817616  29.111668   8.288389   
2  18.285511  18.285511  18.285511  28.817616  29.111668  10.486210   
3  18.285511  18.285511  18.285511  20.377332  19.033088  10.486210   
4  10.952773  10.952773  10.952773  20.377332   9.472034   7.588203   
5  11.614436  11.463308  16.867678  12.195039   9.297380   8.159171   
6  11.614436  11.463308  16.867678  28.817616  29.111668   7.857661   
7  11.974769  11.485186  11.188807  12.195039  10.916819  12.367358   
8  11.974769  11.485186  11.188807  12.195039  10.916819   8.030382   
9  11.974769  11.485186  11.188807  12.195039  10.916819  28.050392   

           6          7          8          9  ...        136        137  \
0   6.779508  10.833259   9.540780   7.870202  ...   9.136869  11.530145   
1   6.779508  10.833259   9.540780   7.870202  ...   9.136869  11.

In [14]:
df[0:10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,136,137,138,139,140,141,142,143,from_node,to_node
0,,,,28.8176,29.1117,8.28839,6.77951,10.8333,9.54078,7.8702,...,9.13687,11.5301,8.26313,21.0634,6.51702,8.93157,9.54278,7.23683,628154368,1023689595
1,,,,28.8176,29.1117,8.28839,6.77951,10.8333,9.54078,7.8702,...,9.13687,11.5301,8.26313,21.0634,6.51702,8.93157,9.54278,7.23683,628154368,1023689595
2,,,18.2855,,,10.4862,16.6318,11.1338,7.66307,14.4178,...,,,,,,,,,527147009,27537239
3,,,,,19.0331,,,4.87397,12.138,9.49861,...,,,,12.8901,,,13.1788,,527147009,26908815
4,,,10.9528,20.3773,9.47203,7.5882,9.70223,6.16726,5.86041,7.97867,...,6.70508,9.68438,29.0125,14.253,8.93251,6.7644,10.1548,10.8212,628154370,3804638178
5,11.6144,11.4633,16.8677,12.195,9.29738,8.15917,9.5067,6.45886,6.60621,29.3148,...,8.88396,11.1116,9.41114,11.0165,8.95801,7.74332,9.64047,10.3062,628154372,26938222
6,,,,28.8176,29.1117,7.85766,6.80283,10.8333,8.56587,8.0793,...,2.42212,8.3045,8.42848,21.0634,6.51702,5.53282,9.54278,6.62726,628154375,1560866145
7,11.9748,11.4852,11.1888,12.195,10.9168,12.3674,8.4414,4.90624,6.8405,16.2578,...,13.0102,11.0444,8.37879,8.96592,9.21806,9.49643,11.131,9.62171,5791596551,1321327852
8,,,,,,8.03038,2.81968,6.19882,23.8004,,...,,,,,,,,,5791621141,4782446443
9,,,,,,28.0504,,,,,...,,,,,,7.28012,,,5791621141,26875019


**Create rolling window tensor dataset**

In [22]:
import torch
import scipy.sparse
TOTAL_T_STEPS = 144

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
ienc = OrdinalEncoder()
    
def build_dataset_to_numpy_tensor(from_=0, to=TOTAL_T_STEPS, df=None):
    """
    We extract features from speed (actual speed, whether speed is missing)
    and combine with static features.
    :return:
         np.ndarray: dataset tensor of shape [num_time_steps, num_nodes, num_features]
    """
    dataset = list()
    for t in range(from_, to):
        cat_features_at_t = [['primary', 'asphalt', 'MajorRoad']] * len(L.nodes)
        ord_features_at_t = [[50.0, 4]] * len(L.nodes)
        speed_features_at_t = [50] * len(L.nodes) 
        speed_is_nan_feature = [1] * len(L.nodes)
        for _, row in df.iterrows():

            arc = (row['from_node'], row['to_node'])
            cat_features_at_t[id_to_idx[arc]], ord_features_at_t[id_to_idx[arc]]  = arc_features(arc)
            speed_features_at_t[id_to_idx[arc]] = row[str(t)]
            if np.isnan(row[str(t)]): 
                speed_is_nan_feature[id_to_idx[arc]] = 0
        dataset.append(np.concatenate([scaler.fit_transform(np.array(speed_features_at_t).reshape(-1, 1)), 
                                       np.array(speed_is_nan_feature).reshape(-1, 1), 
                                       ienc.fit_transform(ord_features_at_t),
                                       enc.fit_transform(cat_features_at_t).toarray()], axis=1))
    return np.stack(dataset, axis=0)

data = build_dataset_to_numpy_tensor(df=df)

In [42]:
# Build mask tensor
data_speed_only = data[:,:,0]
data_masked = torch.where(torch.isnan(torch.from_numpy(data_speed_only)), torch.tensor([0]), torch.tensor([1]))
data_masked = data_masked.bool()

In [43]:
data.shape

(144, 6161, 29)

In [44]:
data_masked.shape

torch.Size([144, 6161])

In [40]:
split_line1 = int(data.shape[0] * 0.7)
split_line2 = int(data.shape[0] * 0.9)

In [41]:
trg_data = data[:split_line1,:, :]
val_data = data[split_line1:split_line2,:, :]
tst_data = data[split_line2:,:,:]

trg_mask = data[:split_line1,:, :]
val_mask = data[split_line1:split_line2,:, :]
tst_mask = data[split_line2:,:,:]

In [74]:
from src.utils.dataset import SlidingWindowDataset
from torch.utils.data import DataLoader

In [109]:
import numpy as np
import torch
import torch.utils.data


class SlidingWindowDataset(torch.utils.data.Dataset):
    def __init__(self, *tensors, window=1, horizon=1, dtype=torch.float):
        super().__init__()
        print(locals())
        assert all(tensors[0].shape[0] == t.shape[0] for t in tensors)

        self._tensors = tensors
        self._window = window
        self._horizon = horizon
        self._dtype = dtype

    def __getitem__(self, index):
        item = []
        for t in self._tensors:
            x = t[index : index + self._window]
            y = t[index + self._window : index + self._window + self._horizon]
            item.append(
                {
                    "x": torch.from_numpy(x).type(self._dtype),
                    "y": torch.from_numpy(y).type(self._dtype),
                }
            )
        return item

    def __len__(self):
        return self._tensors[0].shape[0] - self._window - self._horizon + 1


In [110]:
dataset = SlidingWindowDataset(trg_data, trg_mask, window=10, horizon=1)

{'dtype': torch.float32, 'horizon': 1, 'window': 10, 'self': <__main__.SlidingWindowDataset object at 0x14581be10>, 'tensors': (array([[[        nan,  0.        ,  4.        , ...,  0.        ,
          0.        ,  1.        ],
        [        nan,  0.        ,  4.        , ...,  0.        ,
          0.        ,  1.        ],
        [        nan,  0.        ,  4.        , ...,  0.        ,
          1.        ,  0.        ],
        ...,
        [        nan,  0.        ,  4.        , ...,  1.        ,
          0.        ,  0.        ],
        [        nan,  0.        ,  4.        , ...,  0.        ,
          1.        ,  0.        ],
        [-0.29626711,  1.        ,  4.        , ...,  0.        ,
          0.        ,  1.        ]],

       [[        nan,  0.        ,  4.        , ...,  0.        ,
          0.        ,  1.        ],
        [        nan,  0.        ,  4.        , ...,  0.        ,
          0.        ,  1.        ],
        [        nan,  0.        ,  4.   

In [112]:
dataloader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
    drop_last=True,
    pin_memory=True
)

In [113]:
next(iter(dataloader))

[{'x': tensor([[[[-2.6279e-01,  1.0000e+00,  4.0000e+00,  ...,  0.0000e+00,
              0.0000e+00,  1.0000e+00],
            [-2.4440e-01,  1.0000e+00,  4.0000e+00,  ...,  0.0000e+00,
              0.0000e+00,  1.0000e+00],
            [-1.8680e-01,  1.0000e+00,  4.0000e+00,  ...,  0.0000e+00,
              1.0000e+00,  0.0000e+00],
            ...,
            [        nan,  0.0000e+00,  4.0000e+00,  ...,  1.0000e+00,
              0.0000e+00,  0.0000e+00],
            [-2.2350e-01,  1.0000e+00,  4.0000e+00,  ...,  0.0000e+00,
              1.0000e+00,  0.0000e+00],
            [-1.4239e-01,  1.0000e+00,  4.0000e+00,  ...,  0.0000e+00,
              0.0000e+00,  1.0000e+00]],
  
           [[-1.6694e-01,  1.0000e+00,  4.0000e+00,  ...,  0.0000e+00,
              0.0000e+00,  1.0000e+00],
            [-1.5884e-01,  1.0000e+00,  4.0000e+00,  ...,  0.0000e+00,
              0.0000e+00,  1.0000e+00],
            [ 8.7631e+00,  1.0000e+00,  4.0000e+00,  ...,  0.0000e+00,
              1