In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../')

In [3]:
TOTAL_T_STEPS = 144

# Get adjacency matrix for our partitions of Jurbey map

In [4]:
from src.graph_utils import partition_graph_by_lonlat
import networkx as nx
from jurbey.jurbey import JURBEY

with open("../data/1556798416403.jurbey", 'rb') as tempf:
    g = JURBEY.load(tempf.read())
    
g_partition = partition_graph_by_lonlat(g)

A = nx.adjacency_matrix(g_partition)

# Convert timeseries raw data into sparse tensor
Tensor size: 144 timestamps by adjacency matrix of speeds

In [5]:
import pandas

In [6]:
df = pandas.read_csv("../data/timeseries_speed_april_first_week.csv")

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,136,137,138,139,140,141,142,143,from_node,to_node
0,result.average,,,,28.817616,29.111668,8.288389,6.779508,10.833259,9.54078,...,9.136869,11.530145,8.263133,21.063414,6.517024,8.931566,9.542779,7.236827,628154368,1023689595
1,result.average,,,,28.817616,29.111668,8.288389,6.779508,10.833259,9.54078,...,9.136869,11.530145,8.263133,21.063414,6.517024,8.931566,9.542779,7.236827,628154368,1023689595
2,result.average,,,18.285511,,,10.48621,16.631804,11.133786,7.663071,...,,,,,,,,,527147009,27537239
3,result.average,,,,,19.033088,,,4.873969,12.137977,...,,,,12.890066,,,13.178754,,527147009,26908815
4,result.average,,,10.952773,20.377332,9.472034,7.588203,9.702233,6.167263,5.860414,...,6.705081,9.68438,29.012508,14.253002,8.932515,6.7644,10.154796,10.82118,628154370,3804638178


In [8]:
#Our speed data uses segment ids, but the model uses sequential indexes, based on `.nodes()`
import math
id_to_idx = {}
# defaultdict won't do what you expect in Pandas

for id_ in df["from_node"].unique():
    id_to_idx[id_] = math.nan
for id_ in df["to_node"].unique():
    id_to_idx[id_] = math.nan
    
for idx, id_ in enumerate(g_partition.nodes()):
    id_to_idx[id_] = idx

In [9]:
# Let's transform ids to indeces
df["from_node_idx"] = df.replace({"from_node": id_to_idx})["from_node"]
df["to_node_idx"] = df.replace({"to_node": id_to_idx})["to_node"]

In [10]:
df.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,138,139,140,141,142,143,from_node,to_node,from_node_idx,to_node_idx
0,result.average,,,,28.817616,29.111668,8.288389,6.779508,10.833259,9.54078,...,8.263133,21.063414,6.517024,8.931566,9.542779,7.236827,628154368,1023689595,0.0,350.0
1,result.average,,,,28.817616,29.111668,8.288389,6.779508,10.833259,9.54078,...,8.263133,21.063414,6.517024,8.931566,9.542779,7.236827,628154368,1023689595,0.0,350.0
2,result.average,,,18.285511,,,10.48621,16.631804,11.133786,7.663071,...,,,,,,,527147009,27537239,1.0,1608.0
3,result.average,,,,,19.033088,,,4.873969,12.137977,...,,12.890066,,,13.178754,,527147009,26908815,1.0,2629.0
4,result.average,,,10.952773,20.377332,9.472034,7.588203,9.702233,6.167263,5.860414,...,29.012508,14.253002,8.932515,6.7644,10.154796,10.82118,628154370,3804638178,2.0,1197.0


## First let's build sparse 3D data tensor

In [11]:
import torch

def snapshot(t, df=df, g_partition=g_partition):
    df_t = df[[t, "from_node_idx", "to_node_idx"]]
    df_t = df_t.dropna()
    row = df_t["from_node_idx"].tolist()
    col = df_t["to_node_idx"].tolist()
    data = df_t[t].tolist()
    size = len(g_partition.nodes())  

    return {"indices": (row, col), "values": data, "shape": (size, size)}

In [12]:
from scipy.sparse import hstack

def build_sparse_dataset(from_=0, to=TOTAL_T_STEPS):
    dataset = {"indices": ([], [], []), "values": []}
    for t in range(from_, to):

        snap = snapshot(str(t))
        dataset["indices"][0].extend([t] * len(snap["indices"][0]))
        dataset["indices"][1].extend(snap["indices"][0])
        dataset["indices"][2].extend(snap["indices"][1])
        dataset["values"].extend(snap["values"])

    i = torch.LongTensor(dataset["indices"])
    v = torch.FloatTensor(dataset["values"])
    return torch.sparse.FloatTensor(i, v, torch.Size((to, *snap["shape"])))

dataset = build_sparse_dataset()

## Now let's split sparse TxKxK Tensor into 3 TxKxK tensors for training, validation and testing

In [13]:
nonzero_values_cnt = len(dataset._values())
# what percent goes into training/validation/testing
tng_pct = 0.7
val_pct = 0.1
tst_pct = 1 - tng_pct - val_pct
# now we want to split list of all non-zeros promortionally:
# [0, split1_idx], [split1_idx, split2_idx] and [split2_idx:]
split1_idx = int(nonzero_values_cnt * tng_pct)
split2_idx = -int(nonzero_values_cnt * tst_pct)

In [14]:
from random import shuffle

# but we select indexes randomly
idxs = list(range(nonzero_values_cnt))
shuffle(idxs)
# these are non-zero indexes
tng_idxs = idxs[:split1_idx]
val_idxs = idxs[split1_idx:split2_idx]
tst_inxs = idxs[split2_idx:]

In [15]:
dataset_split = {}
for name, idxs in [('tng', tng_idxs), ('val', val_idxs), ('tst', tst_inxs)]:
    i = torch.LongTensor([
        dataset._indices()[0][idxs].tolist(),
        dataset._indices()[1][idxs].tolist(),
        dataset._indices()[2][idxs].tolist()
    ])
    v = torch.FloatTensor(dataset._values()[idxs])
    # NOTE sparse tensor is not supported yet by the model
    dataset_split[name] = torch.sparse.FloatTensor(i, v, dataset.shape).to_dense()

# Let's train model

In [None]:
from src.nmf.lsm_rn import LSM_RN
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from test_tube import Experiment


model = LSM_RN(TOTAL_T_STEPS, n=3475, k=50, λ=0.1, adj_mat=A, datasets=dataset_split, batch_size=8)
exp = Experiment(save_dir='lsm_rn_logs')
checkpoint_callback = ModelCheckpoint(
    filepath='lsm_rn.ckpt',
    save_best_only=True,
    verbose=True,
    monitor='avg_val_mae',
    mode='min'
)


# most basic trainer, uses good defaults
trainer = Trainer(experiment=exp, checkpoint_callback=checkpoint_callback)    
trainer.fit(model)
#TODO lr decay

[autoreload of src.nmf.lsm_rn failed: Traceback (most recent call last):
  File "/Users/dscsade/.local/share/virtualenvs/speed-imputation-gDyT7lob/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/Users/dscsade/.local/share/virtualenvs/speed-imputation-gDyT7lob/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 450, in superreload
    update_generic(old_obj, new_obj)
  File "/Users/dscsade/.local/share/virtualenvs/speed-imputation-gDyT7lob/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 387, in update_generic
    update(a, b)
  File "/Users/dscsade/.local/share/virtualenvs/speed-imputation-gDyT7lob/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 357, in update_class
    update_instances(old, new)
  File "/Users/dscsade/.local/share/virtualenvs/speed-imputation-gDyT7lob/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 317, in u

gpu available: False, used: False
Empty DataFrame
Columns: [Name, Type, Params]
Index: []


 97%|█████████▋| 35/36 [02:19<00:03,  3.76s/it, avg_laplacian_term=102, avg_val_fro=234, avg_val_loss=336, avg_val_mae=9.64, batch_nb=16, epoch=0, tng_loss=739.647, v_nb=0]

save callback...

Epoch 00001: avg_val_mae improved from inf to 9.64308, saving model to lsm_rn.ckpt/_ckpt_epoch_1.ckpt


 97%|█████████▋| 35/36 [02:15<00:03,  3.86s/it, avg_laplacian_term=100, avg_val_fro=234, avg_val_loss=334, avg_val_mae=9.6, batch_nb=16, epoch=1, tng_loss=751.846, v_nb=0] 

save callback...

Epoch 00002: avg_val_mae improved from 9.64308 to 9.59567, saving model to lsm_rn.ckpt/_ckpt_epoch_2.ckpt


 97%|█████████▋| 35/36 [02:15<00:03,  3.71s/it, avg_laplacian_term=99.1, avg_val_fro=234, avg_val_loss=333, avg_val_mae=9.53, batch_nb=16, epoch=2, tng_loss=754.271, v_nb=0]

save callback...

Epoch 00003: avg_val_mae improved from 9.59567 to 9.52733, saving model to lsm_rn.ckpt/_ckpt_epoch_3.ckpt


 97%|█████████▋| 35/36 [02:13<00:03,  3.76s/it, avg_laplacian_term=97.7, avg_val_fro=233, avg_val_loss=331, avg_val_mae=9.44, batch_nb=16, epoch=3, tng_loss=752.452, v_nb=0]

save callback...

Epoch 00004: avg_val_mae improved from 9.52733 to 9.44399, saving model to lsm_rn.ckpt/_ckpt_epoch_4.ckpt


 58%|█████▊    | 21/36 [01:22<01:01,  4.11s/it, avg_laplacian_term=97.7, avg_val_fro=233, avg_val_loss=331, avg_val_mae=9.44, batch_nb=16, epoch=4, tng_loss=747.000, v_nb=0]