In [5]:
import torch
from torch_geometric.data import Data
import pandas as pd
import pickle
import numpy as np

from torch_geometric.nn import SAGEConv
import torch
import torch_geometric
import torch.nn.functional as F
torch_geometric.set_debug(True)

<torch_geometric.debug.set_debug at 0x7f90db269710>

# Helper Functions

In [6]:
def read_data(nodes_df_path, edges_df_path, subject_mapping_path):
    nodes_df = pd.read_csv(nodes_df_path)
    edges_df = pd.read_csv(edges_df_path)
    with open(subject_mapping_path, 'rb') as f:
        subject_mapping = pickle.load(f)
    return nodes_df, edges_df, subject_mapping


def get_node_id_mapping(nodes_df):
    node_id_mapping, inverse_node_id_mapping = dict(), dict()
    for i, node_id in enumerate(nodes_df['nodeId']):
        node_id_mapping[i] = node_id
        inverse_node_id_mapping[node_id] = i
    return node_id_mapping, inverse_node_id_mapping

In [18]:
nodes_df

Unnamed: 0,nodeId,subject,features
0,31336,Neural_Networks,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1061127,Rule_Learning,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
2,1106406,Reinforcement_Learning,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,13195,Reinforcement_Learning,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,37879,Probabilistic_Methods,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
2703,1128975,Genetic_Algorithms,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2704,1128977,Genetic_Algorithms,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2705,1128978,Genetic_Algorithms,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2706,117328,Case_Based,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [71]:
def get_feature_vectors(nodes_df):
    features = nodes_df['features'].apply((lambda x: x.strip('][').split(', ')))
    features = np.array([[float(val) for val in feature] for feature in features])
    return torch.from_numpy(features).to(torch.double)


def get_edges(edges_df, id_mapping):
    source_labels = edges_df['sourceNodeId'].apply(lambda x: id_mapping[x]).to_numpy()
    target_labels = edges_df['targetNodeId'].apply(lambda x: id_mapping[x]).to_numpy()
    edges_indices = np.stack((source_labels, target_labels), axis=0)
    return torch.from_numpy(edges_indices)


def get_labels(nodes_df, subject_mapping):
    labels = nodes_df['subject'].apply(lambda x: subject_mapping[x]).to_numpy()
    return torch.from_numpy(labels)

# Graph Generation

In [72]:
nodes_df_path = 'nodes.csv'
edges_df_path = 'edges.csv'
subject_mapping_path = 'subject_mapping.pkl'
nodes_df, edges_df, subject_mapping = read_data(nodes_df_path, edges_df_path, subject_mapping_path)

In [73]:
node_id_mapping, inverse_node_id_mapping = get_node_id_mapping(nodes_df)
# TODO: These functions need to be implemented. You can decide what are the input arguments to these functions.
x = get_feature_vectors(nodes_df)
edge_index = get_edges(edges_df, inverse_node_id_mapping)
y = get_labels(nodes_df, subject_mapping)

In [74]:
with open('indices_dict_part2.pkl', 'rb') as f:
    indices_dict = pickle.load(f)

In [75]:
train_mask = torch.tensor([1 if node_id_mapping[i] in indices_dict['train_indices'] else 0 for i in range(x.shape[0])], dtype=torch.bool)
valid_mask = torch.tensor([1 if node_id_mapping[i] in indices_dict['valid_indices'] else 0 for i in range(x.shape[0])], dtype=torch.bool)
test_mask = torch.tensor([1 if node_id_mapping[i] in indices_dict['test_indices'] else 0 for i in range(x.shape[0])], dtype=torch.bool)

In [93]:
data = Data(x=x.float(), y=y, edge_index=edge_index, train_mask=train_mask, valid_mask=valid_mask, test_mask=test_mask)

# GNN Model

In [134]:
class GraphSAGE(torch.nn.Module):
    def __init__(self, hidden_channels, output_dim, seed):
        super().__init__()
        torch.cuda.manual_seed(seed)
        self.conv1 = SAGEConv(in_channels=hidden_channels, out_channels=hidden_channels)
        self.conv2 = SAGEConv(in_channels=hidden_channels, out_channels=output_dim)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.dropout(x, training=self.training)
        x = x.relu()
        x = self.conv2(x, edge_index)
        return x

# Training

In [135]:
output_dim = len(subject_mapping)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)

In [136]:
def evaluate(best_model, rel_mask):
    best_model.eval()
    with torch.no_grad():
        preds = best_model(data.x, data.edge_index).argmax(dim=1)
        correct = (preds[rel_mask] == data.y[rel_mask]).sum()
        return round(int(correct) / int(rel_mask.sum()), 3)

In [141]:
def train(model, optimizer, criterion, data, num_epochs=100, save_path='best_model.pt'):
    loss_steps = list()
    best_val_acc = 0
    best_loss = np.inf
    for epoch in range(1, num_epochs+1):
        model.train()
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()

        val_acc = evaluate(model, data.valid_mask)

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_loss = loss.item()
            torch.save(model.state_dict(), save_path)
    
        if epoch % 20 == 0 or epoch == 1:
            print(f"Epoch: {epoch:03d}  "
                  f"Best Val Acc: {best_val_acc:.4f}  "
                  f"Best Loss: {best_loss:.4f}  "
            )
        loss_steps.append(loss.item())
    return loss_steps
    

# Evaluation

In [142]:
test_scores = []
for seed in range(1, 4):
    model = GraphSAGE(x.shape[1], output_dim, seed).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    # TODO: Change the call for train if needed
    train(model, optimizer, torch.nn.CrossEntropyLoss(), data)
    best_model = torch.load('best_model.pt')
    model.load_state_dict(best_model)
    curr_seed_score = evaluate(model, data.test_mask)
    test_scores.append(curr_seed_score)
    print(f"Seed: {seed}, Test score: {curr_seed_score}")

Epoch: 001  Best Val Acc: 0.3220  Best Loss: 1.9422  
Epoch: 020  Best Val Acc: 0.7730  Best Loss: 0.1375  
Epoch: 040  Best Val Acc: 0.7810  Best Loss: 0.0037  
Epoch: 060  Best Val Acc: 0.7870  Best Loss: 0.0026  
Epoch: 080  Best Val Acc: 0.7870  Best Loss: 0.0026  
Epoch: 100  Best Val Acc: 0.7910  Best Loss: 0.0027  
Seed: 1, Test score: 0.821


  best_model = torch.load('best_model.pt')


Epoch: 001  Best Val Acc: 0.3030  Best Loss: 1.9512  
Epoch: 020  Best Val Acc: 0.7930  Best Loss: 0.1344  
Epoch: 040  Best Val Acc: 0.7930  Best Loss: 0.1344  
Epoch: 060  Best Val Acc: 0.7930  Best Loss: 0.1344  
Epoch: 080  Best Val Acc: 0.7930  Best Loss: 0.1344  
Epoch: 100  Best Val Acc: 0.7950  Best Loss: 0.0032  
Seed: 2, Test score: 0.822
Epoch: 001  Best Val Acc: 0.3070  Best Loss: 1.9403  
Epoch: 020  Best Val Acc: 0.7850  Best Loss: 0.1681  
Epoch: 040  Best Val Acc: 0.7890  Best Loss: 0.0033  
Epoch: 060  Best Val Acc: 0.7910  Best Loss: 0.0028  
Epoch: 080  Best Val Acc: 0.7970  Best Loss: 0.0030  
Epoch: 100  Best Val Acc: 0.7970  Best Loss: 0.0030  
Seed: 3, Test score: 0.819


In [143]:
print(test_scores)

[0.821, 0.822, 0.819]
