In [1]:
# This notebook converts the graph data from the pickle file containing the list of graph representations
# into another pickle file containing the list of graph representations in the PyG format.
import torch
import pickle
import numpy as np
from pyg_preprocess import create_tensordata, convert_to_Data, pseudo_data, convert_to_PairData

#### Step 1: Load original data of GRs (Pickle file)

We first load in the pickle file of the previously generated graph representations, containing a list with entries of the form $[[A, NF, EF], Y]$ where each component is a NumPy array.

In [2]:
# Load the previously generated graph representations: a list whose entries
# have the form [[A, NF, EF], Y].
# NOTE: pickle.load can execute arbitrary code, so only open files you created or trust.

# Load data (MAC)
# pickle_data_path = "/Users/xaviermootoo/Documents/Data/ssl-seizure-detection/pickle/jh101_grs.pickle"

# Load data (PC)
pickle_data_path = r"C:\Users\xmoot\Desktop\Data\ssl-seizure-detection\patient_gr\jh101_grs.pickle"

# Use a context manager so the file handle is closed as soon as the data is read,
# instead of leaving it open until the garbage collector gets to it.
with open(pickle_data_path, "rb") as pickle_file:
    pickle_data = pickle.load(pickle_file)

#### Step 2: Convert standard GRs to PyG GRs
The function `create_tensordata` converts the list of standard graph representations loaded from the pickle file, namely a list with entries of the form $[ [A, NF, EF] , Y]$, where $A$ is the weighted adjacency matrix, $NF$ are the node features, and $EF$ are the edge features, into PyTorch Geometric (PyG) tensors. The function first converts $A$ to the `edge_index` format used by PyTorch Geometric, which is a tensor of shape [2, num_edges] where each column $[i \ \ j]^T$ indicates the directed edge $i \to j$. If $A$ is a binary adjacency matrix, it is converted directly. If $A$ is instead a weighted adjacency matrix for a complete graph, the function generates the `edge_index` of a complete graph and stacks the weights together with $EF$ in the edge feature object `edge_attr` that PyTorch Geometric uses. The node features $NF$ are left unchanged apart from being converted to a float32 tensor, which is typically denoted by $x$ in PyTorch Geometric. By the end we have a list with entries of the form `[[edge_index, x, edge_attr], y]` where each component is a tensor; `x` and `edge_attr` are float32, whereas `edge_index` and `y` are long.

In [3]:
# Turn every standard GR into its PyG tensor form:
#   [[A, NF, EF], Y] -> [[edge_index, x, edge_attr], y]
# save=True together with logdir presumably writes the result to pyg_grs_path
# (behaviour lives in pyg_preprocess.create_tensordata -- confirm there).
pyg_grs_path = r"C:\Users\xmoot\Desktop\Data\ssl-seizure-detection\patient_gr\jh101_pyg_grs.pt"
pyg_grs = create_tensordata(
    num_nodes=107,
    data_list=pickle_data,
    complete=True,
    save=True,
    logdir=pyg_grs_path,
)

In [3]:
# Check if it works: reload the saved tensors and compare them with the
# original pickle data.
# Reuse pyg_grs_path so the check reads the exact file create_tensordata was
# asked to write, rather than a differently named file that may be stale or
# missing on a fresh run.
pyg_data_path = pyg_grs_path
pyg_data = torch.load(pyg_data_path)

# Both lists should contain one entry per graph representation.
print(len(pyg_data))
print(len(pickle_data))

# Entry layout is [[edge_index, x, edge_attr], y]; index [0][0][0] is edge_index
# and [0][0][2] is edge_attr.
print(type(pyg_data[0][0][0]))
print("Edge features shape:", pyg_data[0][0][2].shape)
print("Edge features stored in edge_attr:", pyg_data[0][0][2])
print("Edge features stored in adj:", pickle_data[0][0][0])

#### Step 3: Convert PyG GRs to PyG Data objects
Stop after this step if you only need PyG Data objects for supervised learning! Here we take the PyG graph representations and apply the `convert_to_Data` function to create a new list where each entry is now a `torch_geometric.data.Data` object. This is the main object used to hold graphs in PyG, so we need to use it, especially for batching.

In [6]:
# Wrap each PyG graph representation in a PyG Data object; the output path is
# handed to the helper through logdir with save=True.
pyg_Data_path = r"C:\Users\xmoot\Desktop\Data\ssl-seizure-detection\patient_gr\jh101_pyg_Data.pt"
Data_list = convert_to_Data(
    pyg_grs,
    save=True,
    logdir=pyg_Data_path,
)

#### Step 4: Create pseudolabeled dataset
In this step we take the output of Step 2 and apply the relative positioning pseudolabeled dataset generation for graph pairs.

In [4]:
# Build the pseudolabeled graph pairs; each entry has the form
#   [[edge_index1, x1, edge_attr1], [edge_index2, x2, edge_attr2], y]
# The thresholds are floor-divided by 0.12, i.e. expressed as a number of
# 0.12-sized steps (presumably 12 s and 7 min at 0.12 s per graph -- TODO confirm).
pdata = pseudo_data(
    pyg_data,
    tau_pos=12 // 0.12,
    tau_neg=(7 * 60) // 0.12,
    stats=True,
    save=False,
    patientid="patient",
    logdir=None,
)

Number of examples: 926986
0    483636
1    443350
Name: y, dtype: int64


In [6]:
# Sanity check: look at the size of the pseudolabeled dataset and its first pair.
print(len(pdata))

# Every entry is [graph1, graph2, label]; every graph is [edge_index, x, edge_attr].
example = pdata[0]
graph1, graph2, label = example
(edge_index1, x1, edge_attr1), (edge_index2, x2, edge_attr2) = graph1, graph2

# Show the first graph's edge features next to the original adjacency matrix.
print("Edge features shape:", edge_attr1.shape)
print("Edge features stored in edge_attr:", edge_attr1)
print("Edge features stored in adj:", pickle_data[0][0][0])

926986
Edge features shape: torch.Size([11342, 1])
Edge features stored in edge_attr: tensor([[ 0.6607],
        [-0.1258],
        [-2.1098],
        ...,
        [ 1.7842],
        [-1.2728],
        [ 1.8921]])


#### Step 5: Convert pseudolabeled dataset to PairData
Rather than converting each graph pair in the pseudolabeled dataset to a `torch_geometric.data.Data` object, we create a new class called `PairData` that inherits from the `torch_geometric.data.Data` class, which will allow us to do batching on pairs of graphs.

In [7]:
# Convert the pseudolabeled pairs into PairData objects; the output path is
# handed to the helper through logdir with save=True.
Pair_Data_path = r"C:\Users\xmoot\Desktop\Data\ssl-seizure-detection\patient_pseudolabeled\relative_positioning\jh101_12s_7min_PairData.pt"
Pair_Data = convert_to_PairData(
    pdata,
    save=True,
    logdir=Pair_Data_path,
)

In [None]:
# Link for pairs of graphs: https://pytorch-geometric.readthedocs.io/en/latest/advanced/batching.html
# Link for creating datasets: https://pytorch-geometric.readthedocs.io/en/latest/tutorial/create_dataset.html
# Link for Data handling tutorial: https://www.youtube.com/watch?v=Vz5bT8Xw6Dc&list=PLGMXrbDNfqTzqxB1IGgimuhtfAhGd8lHF&index=5