<a href="https://colab.research.google.com/github/veersangha/CPSC483-FinalProject/blob/main/CPSC_483_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import the pytorch library into environment and check its version
import os
import torch
print("Using torch", torch.__version__)

Using torch 1.13.0+cu116


In [None]:
# Install PyTorch Geometric and its compiled companion packages. The -f wheel index
# has to match the torch/CUDA build printed by the cell above (1.13.0+cu116 here).
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.13.0+cu116.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.13.0+cu116.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-1.13.0%2Bcu116/torch_scatter-2.1.0%2Bpt113cu116-cp38-cp38-linux_x86_64.whl (9.4 MB)
[K     |████████████████████████████████| 9.4 MB 41.2 MB/s 
[?25hCollecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-1.13.0%2Bcu116/torch_sparse-0.6.16%2Bpt113cu116-cp38-cp38-linux_x86_64.whl (4.5 MB)
[K     |████████████████████████████████| 4.5 MB 64.5 MB/s 
[?25hCollecting torch-cluster
  Downloading https://data.pyg.org/whl/torch-1.13.0%2Bcu116/torch_cluster-1.6.0%2Bpt113cu116-cp38-cp38-linux_x86_64.whl (3.2 MB)
[K     |████████████████████████████████| 3.2 MB 65.9 MB/s 
[?25hCollecting torch-spline-conv
  Downloading https://data.pyg.org/whl/torch-1.13.0%2Bcu116/torch_spline_conv-1.2.1%2Bpt113cu116-cp38-cp38-linux_x86_64.whl (873 kB)
[K     |█

In [None]:
# import torch_geometric.data into environment
from torch_geometric.data import Dataset, Data, DataLoader
import numpy as np
import pandas as pd
import os

In [None]:
from scipy.signal import resample
from scipy.ndimage import median_filter

Instructions: 
Please copy the 5 example ECG files and ecg_df.csv from the "example data" folder of the GitHub repository into the working folder of this Colab session (e.g. by drag and drop). They should be stored in the home directory, next to (not inside) the sample_data folder that Colab populates automatically.

In [None]:
# Read in the label file that we have downloaded from GitHub.
# NOTE: this reads ecg_df.csv from the working directory, so it has to run before
# the setup cell further down moves the file into raw/.
ecg_df = pd.read_csv('ecg_df.csv')

In [None]:
import numpy as np
import matplotlib.mlab as mlab
from scipy.signal import hilbert
from sklearn.feature_selection import mutual_info_regression
# As we discuss in the report, there are different ways to define edges, here we will demonstrate the fully connected method

def gen_edges_cg(x):
    """
    Generate the edges of a complete (fully connected) directed graph.

    Every channel is paired with every other channel; self-pairs are left out.
    Only the number of channels in ``x`` is used, never its values.

    :param x: (T, C)
    :return: edge_index: integer array of shape (2, C * C - C). Column k holds
             one ordered pair (i, j) with i != j; pairs are ordered by i, then j.
             With fewer than two channels the result is an empty (2, 0) array.
    """
    _, channels = x.shape
    # Each off-diagonal cell (i, j) of a C x C grid is one ordered pair.
    # np.nonzero scans the grid row by row, which yields pairs sorted by i, then j.
    off_diagonal = ~np.eye(channels, dtype=bool)
    first, second = np.nonzero(off_diagonal)
    # Stacking keeps the (2, E) layout even when there are no pairs at all.
    return np.stack([first, second])

# Load the time series data as feature for each node
def gen_features_raw(x):
    """
    Build node features straight from the raw signal.

    The input is transposed, so each channel becomes one row that holds the
    channel's full time series.

    :param x: (T, C)
    :return: features: (C, T)
    """
    return x.T



In [None]:
# This is the code to make our graph objects, i.e. take the ECG data and labels and turn them into graphs that we load during training
def gen_data_list(data, label, edge_type='corr'):
    """
    Generate graph data list from matrix data and label.

    Only the complete-graph edge builder (``edge_type='cg'``) is defined in this
    demo notebook. The correlation ('corr') and mutual-information ('mi')
    builders are not included here, so requesting them raises a clear error
    instead of failing later on an unbound variable.

    :param data: training or testing data in matrix form, shape: (N, T, C)
    :param label: training or testing label in matrix form, shape: (N, )
    :param edge_type: how edges are defined; 'cg' is the only value usable here
    :return: training or testing data list,
             each item in this list is a torch_geometric.data.Data object.
    :raises NotImplementedError: if edge_type is 'corr' or 'mi'
    :raises ValueError: if edge_type is any other unknown value
    """
    data_list = []
    for trial in range(data.shape[0]):
        trial_data = data[trial, ...]
        trial_label = label[trial]

        # Build the edge index (2, E) and one weight per edge (E, 1).
        if edge_type == 'cg':
            edge_index = gen_edges_cg(trial_data)
            # The complete graph carries no edge information: all weights are zero.
            edge_weight = np.zeros((edge_index.shape[-1], 1))
        elif edge_type in ('corr', 'mi'):
            raise NotImplementedError(
                f"edge_type={edge_type!r} is not included in this demo; "
                "use edge_type='cg'")
        else:
            raise ValueError(
                f"unknown edge_type={edge_type!r}; expected 'cg', 'corr' or 'mi'")

        x = gen_features_raw(trial_data)

        graph_data = Data(x=torch.from_numpy(x).float(),
                          edge_index=torch.from_numpy(edge_index).long(),
                          y=trial_label,
                          edge_attr=torch.from_numpy(edge_weight).float())
        data_list.append(graph_data)
    return data_list




In [None]:
# Here we define the Data Loader so we can read in ECGs as we go. 
# We need to do this because we have thousands of ECGs we train on, which we cannot load into memory at once

class EcgDataset(Dataset):
    """
    On-disk graph dataset of ECG recordings.

    The label CSV named by ``filename`` has one row per recording; the columns
    ``FileID`` (a .npy file name, resolved relative to ``root``) and ``Gender``
    (the label) are read. ``process`` turns every recording into a graph and
    saves it as ``data_{i}.pt`` in the processed directory; ``get`` loads one of
    those files on demand, so the recordings never have to be in memory all at once.
    """

    # Pre-processing settings used by ``process``.
    CROP_SAMPLES = 2500       # leading samples kept ("5 seconds" -- implies 500 Hz, TODO confirm)
    BASELINE_WINDOW = 500     # median-filter window, in samples, along the time axis
    RESAMPLED_LENGTH = 500    # samples per lead after down-sampling
    NUM_LEADS = 12            # leading columns kept; each becomes one graph node

    def __init__(self, root, filename, transform=None, pre_transform=None):
        self.filename = filename
        # Cache for ``processed_file_names``. It is created before the base-class
        # constructor runs because that constructor may already query the property.
        self._processed_names = None
        super().__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        """Name of the label CSV, exactly as passed to the constructor."""
        return self.filename

    @property
    def processed_file_names(self):
        """
        One ``data_{i}.pt`` name per row of the label CSV.

        The CSV is read once and the list is cached: ``len`` relies on this
        property, so re-reading the file on every access would repeat the same
        disk read over and over during training.
        """
        if self._processed_names is None:
            self.data = pd.read_csv(self.raw_paths[0]).reset_index()
            self._processed_names = [f'data_{i}.pt' for i in self.data.index]
        return self._processed_names

    def download(self):
        """Nothing to download: the raw files are copied in by hand."""
        pass

    def process(self):
        """Convert every recording listed in the label CSV into a saved graph file."""
        records = pd.read_csv(self.raw_paths[0])
        for index, row in records.iterrows():
            # Load label and signal for this recording.
            label = self._get_labels(row["Gender"])
            signal = np.load(os.path.join(self.root, row['FileID']))

            # Take 5 seconds of data
            signal = signal[:self.CROP_SAMPLES]

            # Baseline wander correction: subtract a running median computed
            # along time, separately for each column (window width 1 across columns).
            signal = signal - median_filter(signal, size=(self.BASELINE_WINDOW, 1))

            # Downsample along the time axis
            signal = resample(signal, self.RESAMPLED_LENGTH, axis=0)

            signal = signal[:, 0:self.NUM_LEADS]

            # Build the graph with the complete-graph ('cg') edge definition,
            # the only edge builder defined in this notebook.
            data = gen_data_list(np.expand_dims(signal, 0), label, edge_type='cg')[0]

            # NOTE(review): a rejected sample leaves no file for this index even
            # though processed_file_names still lists it -- confirm this is
            # acceptable before passing a pre_filter.
            if self.pre_filter is not None and not self.pre_filter(data):
                continue

            if self.pre_transform is not None:
                data = self.pre_transform(data)

            torch.save(data, os.path.join(self.processed_dir, f'data_{index}.pt'))

    def _get_labels(self, label):
        """Wrap a single label value in an int64 tensor of shape (1,)."""
        # presumably the CSV already encodes Gender as 0/1 -- verify against the file
        return torch.tensor(np.asarray([label]), dtype=torch.int64)

    def len(self):
        """Number of graphs, i.e. number of rows in the label CSV."""
        return len(self.processed_file_names)

    def get(self, idx):
        """Load the processed graph saved under index ``idx``."""
        return torch.load(os.path.join(self.processed_dir, f'data_{idx}.pt'))


In [None]:
# Set up the folder layout so that the data loader can find its files:
# the label CSV goes into raw/, and processed/ receives the generated graph files.
# Safe to re-run: existing folders are kept, and the CSV is only moved while it
# is still sitting in the working directory.
for folder in ("processed", "raw"):
    os.makedirs(folder, exist_ok=True)
if os.path.isfile("ecg_df.csv"):
    os.replace("ecg_df.csv", os.path.join("raw", "ecg_df.csv"))

mkdir: cannot create directory ‘processed’: File exists


Processing...
Done!


In [None]:
# Now we run the data loader, which will actually process the raw npy files and create processed .pt files
# in the newly created /processed folder.
# We will then access these newly created files when loading into the data loader.
# NOTE(review): `ecg_df` is rebound here from the DataFrame read earlier to a plain file-name string.
ecg_df = 'ecg_df.csv'
dataset = EcgDataset('.',ecg_df)

In [None]:
# Serve the graphs two at a time; shuffle=False keeps the same order on every pass.
train_loader = DataLoader(dataset,batch_size=2,shuffle=False)

In [None]:
# Here we will demonstrate pipeline with a very simple GCN
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool



class GCN(torch.nn.Module):
    """
    Minimal graph-level binary classifier.

    Pipeline: two graph convolutions over the nodes, a mean over the nodes of
    each graph, two dense layers, and a final sigmoid.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        """
        :param in_channels: length of each node's input feature vector
        :param hidden_channels: width of both graph-convolution layers
        :param out_channels: width of the dense layer ahead of the 1-unit output
        """
        super().__init__()

        # Node-level layers
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        # Graph-level head
        self.linear1 = Linear(hidden_channels, out_channels)
        self.linear2 = Linear(out_channels, 1)
        # Activation functions, stored as attributes
        self.relu = F.relu
        self.sigmoid = torch.sigmoid

    def forward(self, x, edge_index, batch):
        """
        :param x: node features
        :param edge_index: graph connectivity handed to both convolutions
        :param batch: node-to-graph assignment used by the pooling step
        :return: sigmoid output of the final 1-unit layer
        """
        node_emb = self.conv1(x, edge_index)
        node_emb = self.relu(node_emb)
        node_emb = self.conv2(node_emb, edge_index)

        graph_emb = global_mean_pool(node_emb, batch)

        hidden = self.relu(self.linear1(graph_emb))
        score = self.linear2(hidden)
        return self.sigmoid(score)

In [None]:
# in_channels has to equal the per-node feature length: EcgDataset.process resamples
# every recording to 500 time samples, and those samples are the node features.
model = GCN(in_channels=500, hidden_channels=64, out_channels=16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
# The training cells call AUROC(task='binary'); pin the release recorded in this
# notebook's original install output so a re-run gets the same API.
%pip install -q torchmetrics==0.11.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.11.0-py3-none-any.whl (512 kB)
[K     |████████████████████████████████| 512 kB 18.6 MB/s 
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.11.0


In [None]:
# Binary cross-entropy on probabilities: the GCN's forward pass already ends in a sigmoid.
loss_func = torch.nn.BCELoss()
from torchmetrics.classification import AUROC

def train(model, loader, optimizer, loss_func):
    """
    Run one training epoch and score the predictions made during it.

    :param model: network called as ``model(x, edge_index, batch)``
    :param loader: iterable of batches exposing ``x``, ``edge_index``, ``batch`` and ``y``
    :param optimizer: optimizer holding the model's parameters
    :param loss_func: callable ``(pred, target) -> scalar loss``
    :return: ``(model, auroc_score)`` -- the same model object and the binary AUROC
             over all predictions collected during the epoch
    """
    model.train()
    all_labels = []
    all_preds = []
    for data in loader:
        pred = model(data.x, data.edge_index, data.batch)
        # Reshape the labels into a float column so they line up with pred.
        target = data.y.unsqueeze(1).float()
        loss = loss_func(pred, target)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        all_labels.append(target)
        # Detach: keeping the raw predictions would keep every batch's autograd graph alive.
        all_preds.append(pred.detach())
    all_labels = torch.flatten(torch.cat(all_labels))
    all_preds = torch.flatten(torch.cat(all_preds))
    print(f'True Labels: {all_labels}')
    print(f'Predicted Labels: {all_preds}')
    auroc = AUROC(task='binary')
    # torchmetrics metrics are called as metric(preds, target), with integer class labels as target.
    auroc_score = auroc(all_preds, all_labels.long())

    return model, auroc_score

In [None]:
def test(model, loader):
    """
    Score the model on every batch of ``loader`` without updating it.

    :param model: network called as ``model(x, edge_index, batch)``
    :param loader: iterable of batches exposing ``x``, ``edge_index``, ``batch`` and ``y``
    :return: binary AUROC over all predictions from the loader
    """
    model.eval()

    all_labels = []
    all_preds = []
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        for data in loader:  # Iterate in batches over the training/test dataset.
            out = model(data.x, data.edge_index, data.batch)
            all_labels.append(data.y.unsqueeze(1))
            all_preds.append(out)
    all_labels = torch.flatten(torch.cat(all_labels))
    all_preds = torch.flatten(torch.cat(all_preds))
    auroc = AUROC(task='binary')
    # torchmetrics metrics are called as metric(preds, target), with integer class labels as target.
    auroc_score = auroc(all_preds, all_labels.long())

    return auroc_score

We will show training on this toy dataset for 10 epochs

In [None]:
epochs = 10

# range() excludes its upper bound, so go to epochs + 1 to really run `epochs`
# passes, numbered 1..epochs.
for epoch in range(1, epochs + 1):
    model,train_auroc = train(model, train_loader, optimizer, loss_func)
    # This toy demo has no held-out split: the "Test" score is computed on the training loader.
    test_auroc = test(model, train_loader)
    print(f'Epoch: {epoch:03d}, Train AUC: {train_auroc:.4f} Test AUC: {test_auroc:.4f}')

True Labels: tensor([0., 1., 0., 0., 1.])
Predicted Labels: tensor([4.1827e-10, 1.0000e+00, 1.1154e-11, 2.6577e-17, 1.0000e+00],
       grad_fn=<ReshapeAliasBackward0>)
Epoch: 001, Train AUC: 1.0000 Test AUC: 1.0000
True Labels: tensor([0., 1., 0., 0., 1.])
Predicted Labels: tensor([4.1826e-10, 1.0000e+00, 1.1055e-11, 2.6563e-17, 1.0000e+00],
       grad_fn=<ReshapeAliasBackward0>)
Epoch: 002, Train AUC: 1.0000 Test AUC: 1.0000
True Labels: tensor([0., 1., 0., 0., 1.])
Predicted Labels: tensor([4.1827e-10, 1.0000e+00, 1.0957e-11, 2.6548e-17, 1.0000e+00],
       grad_fn=<ReshapeAliasBackward0>)
Epoch: 003, Train AUC: 1.0000 Test AUC: 1.0000
True Labels: tensor([0., 1., 0., 0., 1.])
Predicted Labels: tensor([4.1826e-10, 1.0000e+00, 1.0860e-11, 2.6533e-17, 1.0000e+00],
       grad_fn=<ReshapeAliasBackward0>)
Epoch: 004, Train AUC: 1.0000 Test AUC: 1.0000
True Labels: tensor([0., 1., 0., 0., 1.])
Predicted Labels: tensor([4.1826e-10, 1.0000e+00, 1.0765e-11, 2.6519e-17, 1.0000e+00],
       