In [None]:
# Graph Neural Network
import torch_geometric as pyG
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.utils import to_networkx

# Neural Networks
import torch
import torch.nn.functional as F
import torch.nn as nn


# data processing
import pandas as pd
import networkx as nx

# plotting
import matplotlib.pyplot as plt

# utils
import random
from tqdm import tqdm

In [None]:
# Read the Montreal bike-lane counts from the datasets folder and tidy them in one chain:
#   1. drop every column that contains at least one missing value,
#   2. parse "Date" (day/month/year) into real datetimes,
#   3. order the rows chronologically by Date, then Time, and renumber the index.
bike_data = (
    pd.read_csv('datasets/MontrealBikeLane.csv')
    .dropna(axis=1)
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"], format="%d/%m/%Y"))
    .sort_values(by=['Date', 'Time'], ascending=True)
    .reset_index(drop=True)
)
bike_data.head(12)

In [None]:
# Plot every remaining column against "Date"; the "Time" column is masked out first.
bike_data.loc[:, bike_data.columns != "Time"].plot(x="Date", figsize=(20, 10))

In [None]:
# Map every bike-lane column name to a consecutive integer node id (0, 1, 2, ...).
# "Date" and "Time" are excluded because they are not bike-lane counters.
# Ids are assigned by position among the lane columns only, so they stay
# contiguous and match the column order used when node features are built,
# wherever "Date" and "Time" happen to sit in the frame.
node_encoder = {
    lane: node_id
    for node_id, lane in enumerate(
        name for name in bike_data.columns if name not in ("Date", "Time")
    )
}
node_encoder

In [None]:
# Open BikeLanesConnections.txt and extract the connections (links) between nodes in the PyG edge-index format.
# The node_encoder mapping defined above translates bike-lane names into their integer node ids.
# Path: BikeLanesConnections.txt; each line has the format "X <-> Y",
# where X and Y are the names of bike lanes
# and "X <-> Y" means that X and Y are connected.
def get_edge_connections(path='BikeLanesConnections.txt', encoder=None, undirected=False):
    """Read "X <-> Y" lane connections from a text file into an edge-index tensor.

    Args:
        path: text file with one connection per line, written as "X <-> Y".
            Blank lines are ignored.
        encoder: mapping from lane name to integer node id. When None, the
            notebook-level ``node_encoder`` dictionary is used.
        undirected: when True, every pair is also added in the reverse
            direction (Y -> X). The default False keeps one column per line of
            the file, in file order.
            NOTE(review): "<->" reads as a two-way link, and message-passing
            layers follow edge_index direction -- confirm whether the reverse
            edges should be enabled for training.

    Returns:
        A contiguous ``torch.long`` tensor of shape (2, num_edges); row 0 holds
        the source ids and row 1 the target ids. An empty file gives shape (2, 0).

    Raises:
        ValueError: if a non-blank line does not contain "<->".
        KeyError: if a lane name is missing from ``encoder``.
    """
    if encoder is None:
        encoder = node_encoder

    pairs = []
    with open(path) as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # tolerate blank lines (e.g. trailing newlines at the end of the file)
                continue
            names = [name.strip() for name in line.split('<->')]
            if len(names) < 2:
                raise ValueError(
                    f"{path}, line {line_number}: expected 'X <-> Y', got {line!r}"
                )
            pairs.append([encoder[names[0]], encoder[names[1]]])

    if undirected:
        pairs = pairs + [[target, source] for source, target in pairs]

    if not pairs:
        # torch.tensor([]) would be 1-D float; keep the (2, E) long layout instead
        return torch.empty((2, 0), dtype=torch.long)
    return torch.tensor(pairs, dtype=torch.long).t().contiguous()

# create function to get edge and node index
def get_node_and_edge_index(bike_data):
    """Return ``(node_index, edge_index)`` for the bike-lane graph.

    ``node_index`` is the list 0..N-1, where N is the number of columns in
    ``bike_data`` other than "Date" and "Time". ``edge_index`` is whatever
    ``get_edge_connections()`` returns.
    """
    lane_count = sum(1 for name in bike_data.columns if name not in ("Date", "Time"))
    node_index = list(range(lane_count))
    return node_index, get_edge_connections()


# create function to build, for every node, the lagged feature vectors and matching labels (later assembled into PyTorch Geometric Data objects)
def get_all_nodes_features_and_labels(dataframe, k = 12):
    """Build lagged feature windows and next-step labels for every node.

    Every column other than "Date" and "Time" is treated as one node. For each
    row ``i >= k`` the feature vector is the column's ``k`` values in rows
    ``i-k .. i-1`` and the label is its value in row ``i``. Rows without ``k``
    predecessors produce no sample. The dataframe is assumed to be ordered by
    Date and Time already.

    Args:
        dataframe: frame whose non-"Date"/"Time" columns hold numeric values.
        k: number of lagged rows per feature vector (must be >= 0).

    Returns:
        ``(all_nodes_features, all_nodes_labels)``: two lists with one entry
        per node, in column order. Each entry is a list over rows ``k..n-1``
        containing, respectively, float32 tensors of shape ``(k,)`` and 0-dim
        float32 tensors.

    Raises:
        ValueError: if ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    n_rows = dataframe.shape[0]
    all_nodes_features = []
    all_nodes_labels = []
    for lane in [col for col in dataframe.columns if col not in ("Date", "Time")]:
        # Convert the column once instead of re-slicing pandas for every row.
        series = torch.tensor(dataframe[lane].values).float()
        # Start at row k: earlier rows do not have k predecessors.
        # clone() gives every sample its own storage rather than a view of the column.
        node_features_across_dates = [series[i - k:i].clone() for i in range(k, n_rows)]
        node_labels_across_dates = [series[i].clone() for i in range(k, n_rows)]
        all_nodes_features.append(node_features_across_dates)
        all_nodes_labels.append(node_labels_across_dates)

    return all_nodes_features, all_nodes_labels

# Build the node ids / edge index and the per-node lag windows (7 lags),
# then print the first feature vector and label of the first node.
node_index, edge_index = get_node_and_edge_index(bike_data=bike_data)
all_nodes_features, all_nodes_labels = get_all_nodes_features_and_labels(
    dataframe=bike_data,
    k=7,
)
print(all_nodes_features[0][0], all_nodes_labels[0][0])

In [None]:
# Assemble one PyG Data object per sample position: x stacks every node's lag
# window, y stacks every node's label, and all graphs reuse the same edge_index.
graph_list = []
for date_index in range(len(all_nodes_features[0])):
    node_features = [per_node[date_index] for per_node in all_nodes_features]
    node_labels = [per_node[date_index] for per_node in all_nodes_labels]
    graph_list.append(
        Data(
            x=torch.stack(node_features),
            y=torch.stack(node_labels),
            edge_index=edge_index,
        )
    )
print(len(graph_list))
# keep the first graph around for inspection
graph = graph_list[0]
graph

In [None]:
def convert_to_networkx(graph, n_sample=None):
    """Convert a PyG graph to networkx, optionally restricted to random nodes.

    Args:
        graph: PyG graph passed to ``to_networkx``; its ``x`` attribute is
            copied onto the networkx nodes.
        n_sample: when given, the number of nodes to draw at random (without
            replacement); the induced subgraph on those nodes is returned.
            When None the full graph is returned.

    Returns:
        The networkx graph, or a subgraph view of it when ``n_sample`` is set.

    Raises:
        ValueError: raised by ``random.sample`` when ``n_sample`` is negative
            or larger than the number of nodes.
    """
    g = to_networkx(graph, node_attrs=["x"])

    if n_sample is not None:
        # random.sample needs a real sequence: g.nodes is a set-like view, which
        # is deprecated as a population since Python 3.9 and rejected from 3.11 on.
        sampled_nodes = random.sample(list(g.nodes), n_sample)
        g = g.subgraph(sampled_nodes)

    return g


def plot_graph(g):
    """Draw ``g`` with networkx's planar layout in a 4x3 figure and show it."""
    draw_options = {"node_size": 30, "arrows": False, "node_color": None}
    plt.figure(figsize=(4, 3))
    nx.draw_planar(g, **draw_options)
    plt.show()
    
    
# Plot the full first graph (no node sampling, which is the default).
g = convert_to_networkx(graph)
plot_graph(g)

In [None]:
class RandomNodeSplitter:
    """Attach random, mutually exclusive train/val/test node masks to graphs.

    A split is drawn once and then reused for every graph passed to the
    instance, so all graphs receive the same masks (and share the same mask
    tensors).

    Args:
        train_val_test_ratio: sequence whose first two entries are the train
            and validation fractions. Nodes left over after those two are
            assigned to the test set. Sizes are truncated with ``int()``, so on
            small graphs the validation set can be empty.
        seed: optional integer seed for a private random generator. With a
            seed, the sequence of splits produced by an instance is
            reproducible; with None the global torch RNG is used.

    Raises:
        ValueError: if fewer than two ratios are given, if the train or
            validation ratio is negative, or if together they exceed 1.
    """

    def __init__(self, train_val_test_ratio=(0.8, 0.1, 0.1), seed=None):
        # Stored as a tuple so instances never share (or mutate) a default list.
        self.train_val_test_ratio = tuple(train_val_test_ratio)
        train_ratio, val_ratio = self.train_val_test_ratio[:2]
        if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
            raise ValueError(
                "train and val ratios must be non-negative and sum to at most 1, "
                f"got {self.train_val_test_ratio}"
            )
        self.has_mask = False
        self._generator = None if seed is None else torch.Generator().manual_seed(seed)

    def setup_split(self, num_nodes) -> None:
        """Draw a new random split for ``num_nodes`` nodes and cache its masks."""
        train_ratio, val_ratio = self.train_val_test_ratio[:2]
        train_n = int(num_nodes * train_ratio)
        val_n = int(num_nodes * val_ratio)

        # A single permutation sliced into three consecutive parts keeps the
        # masks disjoint and makes them cover every node.
        permutation = torch.randperm(num_nodes, generator=self._generator)

        train_mask = torch.zeros(num_nodes, dtype=torch.bool)
        val_mask = torch.zeros(num_nodes, dtype=torch.bool)
        test_mask = torch.zeros(num_nodes, dtype=torch.bool)
        train_mask[permutation[:train_n]] = True
        val_mask[permutation[train_n:train_n + val_n]] = True
        test_mask[permutation[train_n + val_n:]] = True

        self.train_mask = train_mask
        self.val_mask = val_mask
        self.test_mask = test_mask

        self.has_mask = True

    def __call__(self, graph, save_mask=True) -> "pyG.data.data.Data":
        """Attach the masks to ``graph`` (modified in place) and return it.

        A new split is generated when none is cached yet, when ``save_mask`` is
        False, or when the cached masks were drawn for a different number of
        nodes than ``graph.num_nodes``.
        """
        stale = self.has_mask and self.train_mask.numel() != graph.num_nodes
        if self.has_mask is False or save_mask is False or stale:
            print("Generating new random split.")
            self.setup_split(graph.num_nodes)

        # assign masks to graph
        graph.train_mask = self.train_mask
        graph.val_mask = self.val_mask
        graph.test_mask = self.test_mask

        return graph

# Demo on a single graph: a fresh splitter has no cached masks yet, so this
# call draws a random split and attaches the three masks to the graph.
split = RandomNodeSplitter()
graph = split(graph, save_mask=True)
graph

In [None]:
# Draw one random split, attach the same masks to every graph, then use a
# loader whose batch size equals the dataset size to get the full dataset
# (with train/val/test masks) as a single batch.
split = RandomNodeSplitter()
split.setup_split(graph.num_nodes)
graph_list = [split(sample) for sample in graph_list]
loader = DataLoader(graph_list, batch_size=len(graph_list), shuffle=False)
dataset = next(iter(loader))
dataset

# GraphGP