In [9]:
# Pip Install If you do not have these packages
# Python version = 3.10
!pip install torch==2.0.1
!pip install numpy==1.24.3
!pip install scikit-learn==1.3.0
!pip install torch-geometric==2.3.1

You should consider upgrading via the '/Users/jasonzhao/PycharmProjects/blogcatalog/venv/bin/python/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/jasonzhao/PycharmProjects/blogcatalog/venv/bin/python/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/jasonzhao/PycharmProjects/blogcatalog/venv/bin/python/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/jasonzhao/PycharmProjects/blogcatalog/venv/bin/python/bin/python -m pip install --upgrade pip' command.[0m


In [10]:
import torch
import numpy as np



In [11]:
# read the actual graph from network.txt
# assuming the graph is undirected
def read_graph():
    with open('graphs.npz', 'r') as f:
        lines = f.readlines()

    edges = []
    for line in lines[1:]:
        node1, node2 = map(int, line.split())
        edges.append((node1, node2))  # Add both directions to make the graph undirected
        edges.append((node2, node1))

    edge_index = torch.tensor(edges).t().contiguous()
    return edge_index

In [12]:
# read the true labels for the nodes
# same format as the input obtained from read_features
def read_labels(file_path):
    num_nodes = 10312
    num_labels = 39
    labels = torch.zeros((num_nodes, num_labels), dtype=torch.float)

    with open(file_path, 'r') as file:
        # Skip the first line
        next(file)
        for line in file:
            node, *label = map(int, line.strip().split())
            for l in label:
                labels[node][l] = 1  # set the label position to 1
    return labels

In [13]:
from torch.nn import Linear, Dropout
from torch_geometric.nn import GCNConv

# The Graph Convolutional Network
# Need refinement and fine tuning of hyperparameters
class GCN(torch.nn.Module):
    def __init__(self, num_node_features, num_classes, num_intermediate_features=64, dropout_rate=0.5):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        # 4 convolutional layers, with dropout in between, and a final linear classifier
        self.conv1 = GCNConv(num_node_features, num_intermediate_features)
        self.conv2 = GCNConv(num_intermediate_features, num_intermediate_features)
        self.conv3 = GCNConv(num_intermediate_features, num_intermediate_features)
        self.conv4 = GCNConv(num_intermediate_features, num_intermediate_features)
        self.classifier = Linear(num_intermediate_features, num_classes)
        self.dropout = Dropout(dropout_rate)

    def forward(self, x, edge_index):
        # forward pass, apply the convolutions, and relu
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.dropout(x)

        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.dropout(x)

        x = self.conv3(x, edge_index)
        x = x.relu()
        x = self.dropout(x)

        x = self.conv4(x, edge_index)
        x = x.relu()
        x = self.dropout(x)

        # Final GNN embedding space.
        out = self.classifier(x)

        return out, x

In [14]:

from sklearn.metrics import f1_score
from torch_geometric.data import Data

# read the features
x, train_mask, test_mask = read_features()
# read the labels
y = read_labels('blogcatalogue-group.txt')
# read the graph
edge_index = read_graph()
# define the GCN Data class
data = Data(x=x, y=y, edge_index=edge_index, train_mask=train_mask,test_mask=test_mask)


model = GCN(num_node_features=data.num_node_features, num_classes=39) # Create the model
criterion = torch.nn.BCEWithLogitsLoss()  # Initialize the CrossEntropyLoss function.

# all potential optimizers to try: SFG, ADAM, RMSprop
# optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)  # Initialize the Adam optimizer.
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.0003)


def train(data):
    optimizer.zero_grad()  # Clear gradients.
    out, h = model(data.x, data.edge_index)  # Perform a single forward pass.
    loss = criterion(out[data.train_mask],
                     data.y[data.train_mask].float()) # Compute the loss solely based on the training nodes.
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return out, loss, h

def test(data, out):
    model.eval() # evaluate model
    out = torch.sigmoid(out) # use a sigmoid function to map result to probabilities
    pred_labels = (out[train_mask] > 0.5).float()  # Directly use the output
    true_labels = data.y[train_mask].float() # Obtain the true labels

    f1 = f1_score(true_labels.cpu().numpy(), pred_labels.cpu().numpy(), average='micro') # Calculate F1 score

    return f1

for epoch in range(400): # Run 400 epochs
    out, loss, h = train(data)
    f1 = test(data, out)
    print(f'Epoch: {epoch}, Loss: {loss}, f1: {f1}')

Epoch: 0, Loss: 0.6928028464317322, f1: 0.06521590974621365
Epoch: 1, Loss: 0.6901902556419373, f1: 0.059343076049943246
Epoch: 2, Loss: 0.687747597694397, f1: 0.06467518198049718
Epoch: 3, Loss: 0.6849347949028015, f1: 0.06879105137231595
Epoch: 4, Loss: 0.6814907789230347, f1: 0.07258674141079542
Epoch: 5, Loss: 0.677045464515686, f1: 0.0782207218388522
Epoch: 6, Loss: 0.6713627576828003, f1: 0.0828932098619236
Epoch: 7, Loss: 0.6641836762428284, f1: 0.09420823363396527
Epoch: 8, Loss: 0.6551318168640137, f1: 0.10619200908918765
Epoch: 9, Loss: 0.6440539956092834, f1: 0.11876884245951062
Epoch: 10, Loss: 0.6310425400733948, f1: 0.1224390243902439
Epoch: 11, Loss: 0.6160212755203247, f1: 0.11954868154158216
Epoch: 12, Loss: 0.5991202592849731, f1: 0.10820231385345595
Epoch: 13, Loss: 0.5804398655891418, f1: 0.09187655196878326
Epoch: 14, Loss: 0.5599813461303711, f1: 0.06201727787630871


KeyboardInterrupt: 