# libraries, datasets

In [1]:
# libraries
import torch
from torch_geometric.data import Data
import pandas as pd 
from torch_geometric.transforms import RemoveTrainingClasses, RandomNodeSplit  
import torch.nn.functional as F
import torch.nn
from torch_geometric.nn import SAGEConv
import matplotlib.pyplot as plt

#datasets
from torch_geometric.datasets import AttributedGraphDataset
from torch_geometric.datasets import Planetoid 
from torch_geometric.transforms import NormalizeFeatures

# evaluation 
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA

In [2]:
# download datasets
datasets = ['cora_A', 'citeseer_A', 'pubmed_A', 'cora_P', 'citeseer_P', 'pubmed_P']

# AGD
cora_A = AttributedGraphDataset(root='AGD', name='Cora')
citeseer_A = AttributedGraphDataset(root='AGD', name='CiteSeer')
pubmed_A =  AttributedGraphDataset(root='AGD', name='PubMed')

# planetoid
cora_P = Planetoid(root='Planetoid', name='Cora', transform=NormalizeFeatures())
citeseer_P = Planetoid(root='Planetoid', name='CiteSeer',transform=NormalizeFeatures())
pubmed_P =  Planetoid(root='Planetoid', name='PubMed',transform=NormalizeFeatures())

# network model

In [3]:
#neural network model 
#implement a two-layer GraphSage from GCN example:

class GraphSage(torch.nn.Module):
    def __init__(self, attributes, classes, dimension=32):
        super(GraphSage, self).__init__() 
        self.conv1 = SAGEConv(attributes, dimension) 
        self.conv2 = SAGEConv(dimension, classes)
    
    def forward(self, data):
        x, edge_index = data.x, data.edge_index 

        x = self.conv1(x, edge_index) #layer 1 
        x = F.relu(x)
        x = F.dropout(x, training=self.training) 
        x = self.conv2(x, edge_index) #layer 2 

        return x

In [4]:
## training & evaluating 
class accuracy():
    def __init__(self, model, device, dataset, data):
        self.model = model(attributes=dataset.num_node_features, classes=dataset.num_classes, dimension=32).to(device)
        self.data = data
    
    def train(self, num_epochs, lr=0.001):
        loss_vals= []
        valid_vals= []
        optimizer = torch.optim.Adam(self.model.parameters(), lr, weight_decay=5e-4)
        print('Training the model\n')

        for epoch in range(num_epochs): 
            optimizer.zero_grad() 
            out = self.model(self.data)
            loss = F.cross_entropy(input=out[self.data.train_mask], target=self.data.y[self.data.train_mask]) 
            loss.backward() 
            optimizer.step()  
 
    def evaluate(self, dimension=1, write_confusion=False):
        # returns a confusion matrix if set write_confusion=True
        print('Evaluating the model\n')
        self.model.eval()
        pred = self.model(self.data).argmax(dimension)        
        correct = (pred[self.data.test_mask] == self.data.y[self.data.test_mask]).sum()
        acc = int(correct) / int(self.data.test_mask.sum())
        print(f'Accuracy: {acc:.4f}')

        if write_confusion: 
            new = confusion_matrix(self.data.y[self.data.test_mask].numpy(), pred[self.data.test_mask].numpy(), normalize='true')
            return(pd.DataFrame(data=new))

# Default split vs Random split
- leave one out manually, default splits and random splits

## functions - take out a node/multiple nodes

In [5]:
#defining a manual train mask function 
#train_mask = a list of booleans to mask, same length as num_nodes
from typing import Optional, Tuple, Union
import torch
from torch import Tensor
from torch_geometric.data import Data, HeteroData
from torch_geometric.transforms import BaseTransform


def random_train_mask(dataset):
#     cora_A_data = transform_nodes(cora_A[0])'
    transform_nodes = RandomNodeSplit(split = 'test_rest') # RandomNodeSplit is a class 
    data = transform_nodes(dataset[0])
    return data

def split(which_nodes, num_nodes, num_test_nodes) -> Tuple[Tensor, Tensor, Tensor]:
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask = torch.ones(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)

    count = 0
    for i in range(num_nodes):  #separate the testing set
        if count == num_test_nodes: break 
        if i not in which_nodes: 
            test_mask[i] = True
            count+=1

    for i in which_nodes: #separate the training and validation set
        train_mask[i] = True  
        val_mask[i] = False

    return train_mask, val_mask, test_mask

def create_train_mask(dataset, which_nodes, test_ratio, default_train=False):
    """
    which_nodes: a list of integers(node indices) that will be masked
    test_ratio: float, what percentage of the entire nodes that will be used for the testing set
    
    returns: data with train_mask, val_mask, test_mask attributes manually set
    """
    
    data = dataset[0]
    num_nodes = data.num_nodes
    num_test_nodes = test_ratio * data.num_nodes
        
    if default_train: 
        # random split using the pyg function 
        return random_train_mask(dataset) 
    
    else: 
        # manual split taking out the designated nodes
        for store in data.node_stores:
            train_masks, val_masks, test_masks = zip(*[split(which_nodes, num_nodes, num_test_nodes)])
            store.train_mask = torch.stack(train_masks, dim=-1).squeeze(-1)
            store.val_mask = torch.stack(val_masks, dim=-1).squeeze(-1)
            store.test_mask = torch.stack(test_masks, dim=-1).squeeze(-1)
        return data

def from_numpy(train, val, test):
    return torch.from_numpy(train), torch.from_numpy(val), torch.from_numpy(test)

def mask_a_node(data, which_node):
    """
    given a dataset, mask a node of given index into False (i.e. mask one datapoint in the training set)
    note: the dataset must already have a preexisting train_mask tensor
    
    which_node: an integer index of a node to be masked 
    """
    train_mask = data.train_mask.numpy()
    val_mask = data.val_mask.numpy()
    test_mask = data.test_mask.numpy()
    
    for store in data.node_stores:
        train_mask[which_node] = 'True'
        val_mask[which_node] = 'False'
        test_mask[which_node] = 'False'
        

        train_masks, val_masks, test_masks = zip(*[from_numpy(train_mask, val_mask, test_mask)])
        
        store.train_mask = torch.stack(train_masks, dim=-1).squeeze(-1)
        store.val_mask = torch.stack(val_masks, dim=-1).squeeze(-1)
        store.test_mask = torch.stack(test_masks, dim=-1).squeeze(-1)
    return data

In [6]:
## define transform_nodes
cora_A_data_manual = create_train_mask(cora_A, [3, 4, 5], 0.2, default_train=False)
print(cora_A_data_manual.train_mask[:10])
cora_A_data_one = mask_a_node(cora_A_data_manual, [0, 1])
print(cora_A_data_manual.train_mask[:10])

tensor([False, False, False,  True,  True,  True, False, False, False, False])
tensor([ True,  True, False,  True,  True,  True, False, False, False, False])


### function - take out a class

In [7]:
# define a manual train class function 
def random_train_class(dataset, which_class):
    """
    note: dataset must have a preexisting train_mask
    classes (List[int]) â€“ The classes to remove from the training set.
    """
#     cora_A_data = transform_nodes(cora_A[0])'
    transform_class = RemoveTrainingClasses(which_class) # RandomNodeSplit is a class 
    data = transform_class(dataset[0])
    return data

# training

In [8]:
cora_A_random_data = random_train_mask(cora_A)
citeseer_A_random_data = random_train_mask(citeseer_A)
pubmed_A_random_data = random_train_mask(pubmed_A) # caution, don't reiterate

## cora_A
- cora_A_random_data
- cora_A_data_manual
- cora_A_data_one

In [9]:
#cora_A 
deviceA = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
deviceB = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
deviceC = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataA = cora_A_random_data.to(deviceA)
dataB = cora_A_data_manual.to(deviceB)
dataC = cora_A_data_one.to(deviceC)

In [10]:
cora_AA = accuracy(GraphSage,deviceA, cora_A, dataA)
cora_AA.train(50) 
evalA = cora_AA.evaluate()

Training the model

Evaluating the model

Accuracy: 0.6349


In [11]:
cora_AB = accuracy(GraphSage, deviceB, cora_A, dataB)
cora_AB.train(50) 
evalB = cora_AB.evaluate()

Training the model

Evaluating the model

Accuracy: 0.2144


In [12]:
cora_AC = accuracy(GraphSage, deviceC, cora_A, dataC)
cora_AC.train(50) 
evalB = cora_AC.evaluate()

Training the model

Evaluating the model

Accuracy: 0.2181


In [13]:
# idea: build a class that returns a custom train_mask tensor? 
# train_mask usage: data[train_mask]

In [14]:
cora_P_class = random_train_class(cora_P, [0])
print(cora_P_class.node_stores)

[{'x': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'edge_index': tensor([[   0,    0,    0,  ..., 2707, 2707, 2707],
        [ 633, 1862, 2582,  ...,  598, 1473, 2706]]), 'y': tensor([3, 4, 4,  ..., 3, 3, 3]), 'train_mask': tensor([ True,  True,  True,  ..., False, False, False]), 'val_mask': tensor([False, False, False,  ..., False, False, False]), 'test_mask': tensor([False, False, False,  ...,  True,  True,  True])}]
