### Trial Graph NN

Graph Neural Networks (GNNs) are a type of deep learning model specifically designed to work with data that is structured as a graph, where entities are represented by nodes and relationships are captured by edges. Unlike traditional neural networks, GNNs are able to account for the connections between nodes, learning from how they interact with one another. These models have proven highly effective in tasks like predicting links between nodes, classifying nodes, and even analyzing entire graphs, making them useful in fields like social networks, drug discovery, and recommendation systems.

Since our data describes citation relationships between papers, which naturally form a graph, we try this technique here.

### 1. Import the already preprocessed dataset
from the `data_splitting_1` notebook

In [46]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from sklearn.preprocessing import StandardScaler

# Step 1: Load the preprocessed train/test splits.
train_data = pd.read_pickle('Data/train_dataset_title_sim.pkl')
test_data = pd.read_pickle('Data/test_dataset_title_sim.pkl')

# Columns excluded from the model inputs (text, citation, index and label columns).
non_feature_columns = [
    'title', 'title_b', 'abstract', 'abstract_b',
    'citations', 'citations_b', 'index', 'index_b', 'label',
]

# Targets.
y_train = train_data['label']
y_test = test_data['label']

# Feature frames: every remaining column.
X_train_raw = train_data.drop(columns=non_feature_columns)
X_test_raw = test_data.drop(columns=non_feature_columns)

# Fit the scaler on the training split only and reuse its statistics for the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [47]:
# Display the names of the columns that remain as model features after the drop above.
X_train_raw.columns

Index(['paper_a', 'paper_b', 'year', 'venue', 'category_0', 'category_1',
       'category_2', 'category_3', 'category_4', 'category_5', 'category_6',
       'category_7', 'category_8', 'category_9', 'category_10', 'category_11',
       'category_12', 'category_13', 'category_14', 'category_15',
       'category_16', 'category_17', 'category_18', 'category_19',
       'category_20', 'category_21', 'category_22', 'category_23',
       'category_24', 'category_25', 'category_26', 'category_27',
       'category_28', 'category_29', 'category_30', 'category_31',
       'category_32', 'category_33', 'category_34', 'year_b', 'venue_b',
       'category_0_b', 'category_1_b', 'category_2_b', 'category_3_b',
       'category_4_b', 'category_5_b', 'category_6_b', 'category_7_b',
       'category_8_b', 'category_9_b', 'category_10_b', 'category_11_b',
       'category_12_b', 'category_13_b', 'category_14_b', 'category_15_b',
       'category_16_b', 'category_17_b', 'category_18_b', 'category_19_b

The model is a Graph Convolutional Network (GCN) that leverages graph-structured data to learn node representations through message passing, using two graph convolution layers followed by a sigmoid output for binary classification. Despite its ability to capture relationships between nodes, the model performs suboptimally, with a test accuracy of about 0.73 (0.7267 in the run below), indicating room for improvement in both architecture and tuning.

In [48]:
import pandas as pd
import torch
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Step 1: Define the function to create edge indices
def create_edge_index(data, num_nodes):
    """Build a ``[2, num_edges]`` edge index from the positively labelled pairs.

    Paper identifiers are numbered in order of first appearance, scanning all
    of ``paper_a`` first and then all of ``paper_b``. Every row with
    ``label == 1`` contributes one directed edge ``paper_a -> paper_b``; an
    edge is dropped when either endpoint id is ``>= num_nodes`` (or missing).

    Args:
        data: DataFrame with ``paper_a``, ``paper_b`` and ``label`` columns.
        num_nodes: Exclusive upper bound on the node ids kept in the result.

    Returns:
        A ``torch.long`` tensor of shape ``[2, num_edges]``. When no row
        qualifies the shape is ``[2, 0]``, so the result is always a valid
        (possibly empty) edge index.

    NOTE(review): the ids produced here number unique *papers*, while the
    callers in this notebook pass the number of dataset rows as ``num_nodes``
    and use one feature row per pair as node features. Confirm that node ids
    and feature rows are meant to refer to the same entities. The edges are
    also derived from ``label``, i.e. from the prediction target itself.
    """
    n_rows = len(data)

    # factorize numbers values in order of first appearance, which matches
    # enumerating the unique ids of paper_a followed by paper_b. Missing ids
    # receive the sentinel -1 and are filtered out below.
    all_ids = pd.concat([data['paper_a'], data['paper_b']], ignore_index=True)
    codes, _ = pd.factorize(all_ids)
    codes = torch.as_tensor(codes, dtype=torch.long)
    src, dst = codes[:n_rows], codes[n_rows:]

    is_citation = torch.as_tensor((data['label'] == 1).to_numpy(dtype=bool))
    in_bounds = (src >= 0) & (dst >= 0) & (src < num_nodes) & (dst < num_nodes)
    keep = is_citation & in_bounds

    # Stacking two 1-D tensors always yields shape [2, num_edges], even when
    # num_edges is zero; the bounds mask already guarantees valid indices.
    return torch.stack([src[keep], dst[keep]])

Convert the dataset into tensors (PyTorch Geometric `Data` objects)

In [49]:
# Step 2: Build the edge indices; the node count is taken from the number of feature rows.
num_nodes_train, num_nodes_test = X_train.shape[0], X_test.shape[0]

edge_index_train = create_edge_index(train_data, num_nodes=num_nodes_train)
edge_index_test = create_edge_index(test_data, num_nodes=num_nodes_test)

# Step 3: Wrap features, edges and labels into PyTorch Geometric Data objects
# and move each one straight to the selected device.
data_train = Data(
    x=torch.tensor(X_train, dtype=torch.float),
    edge_index=edge_index_train,
    y=torch.tensor(y_train.values, dtype=torch.float),
).to(device)

data_test = Data(
    x=torch.tensor(X_test, dtype=torch.float),
    edge_index=edge_index_test,
    y=torch.tensor(y_test.values, dtype=torch.float),
).to(device)

In [50]:
# Step 4: Define the GCN Model
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network emitting one probability per node.

    Args:
        num_node_features: Size of each node's input feature vector.
    """

    def __init__(self, num_node_features):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, 1)  # single output channel for the binary target

    def forward(self, data):
        """Compute per-node probabilities.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            Sigmoid outputs with the trailing channel dimension removed.
        """
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        # Remove only the channel dimension: a bare squeeze() would also drop
        # the node dimension for a single-node graph, and the output would no
        # longer line up with data.y.
        return torch.sigmoid(x).squeeze(-1)

# Step 5: Train the GCN Model
def train_model(model, data, epochs=1000, lr=0.01):
    """Train ``model`` on ``data`` with Adam and binary cross-entropy.

    Each epoch runs one forward/backward pass over the whole ``data`` object;
    the loss is printed every tenth epoch.

    Args:
        model: Module called as ``model(data)`` that returns probabilities.
        data: Input passed to the model; must expose the targets as ``data.y``.
        epochs: Number of optimisation steps.
        lr: Adam learning rate.

    Returns:
        The same ``model`` instance after training.
    """
    adam = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        adam.zero_grad()

        loss = F.binary_cross_entropy(model(data), data.y)
        loss.backward()
        adam.step()

        is_report_epoch = epoch % 10 == 0
        if is_report_epoch:
            print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

    return model

# Step 6: Build the baseline GCN on the selected device and train it on the training graph.
gcn = GCN(num_node_features=X_train.shape[1]).to(device)
gcn = train_model(gcn, data_train)

Epoch 0, Loss: 0.8140
Epoch 10, Loss: 0.6453
Epoch 20, Loss: 0.6095
Epoch 30, Loss: 0.5901
Epoch 40, Loss: 0.5791
Epoch 50, Loss: 0.5706
Epoch 60, Loss: 0.5643
Epoch 70, Loss: 0.5591
Epoch 80, Loss: 0.5544
Epoch 90, Loss: 0.5505
Epoch 100, Loss: 0.5475
Epoch 110, Loss: 0.5451
Epoch 120, Loss: 0.5433
Epoch 130, Loss: 0.5418
Epoch 140, Loss: 0.5407
Epoch 150, Loss: 0.5394
Epoch 160, Loss: 0.5384
Epoch 170, Loss: 0.5374
Epoch 180, Loss: 0.5365
Epoch 190, Loss: 0.5357
Epoch 200, Loss: 0.5350
Epoch 210, Loss: 0.5343
Epoch 220, Loss: 0.5337
Epoch 230, Loss: 0.5331
Epoch 240, Loss: 0.5325
Epoch 250, Loss: 0.5320
Epoch 260, Loss: 0.5314
Epoch 270, Loss: 0.5308
Epoch 280, Loss: 0.5301
Epoch 290, Loss: 0.5293
Epoch 300, Loss: 0.5283
Epoch 310, Loss: 0.5271
Epoch 320, Loss: 0.5261
Epoch 330, Loss: 0.5253
Epoch 340, Loss: 0.5246
Epoch 350, Loss: 0.5240
Epoch 360, Loss: 0.5236
Epoch 370, Loss: 0.5232
Epoch 380, Loss: 0.5228
Epoch 390, Loss: 0.5225
Epoch 400, Loss: 0.5222
Epoch 410, Loss: 0.5219
Epo

In [51]:
# Step 6: Evaluate GCN Model
def evaluate_model(model, data):
    """Print the accuracy of ``model`` on ``data`` and return its binary predictions.

    Args:
        model: Module called as ``model(data)`` that returns probabilities.
        data: Input passed to the model; must expose the targets as ``data.y``.

    Returns:
        Float tensor of 0/1 predictions (probabilities strictly above 0.5 map to 1).
    """
    model.eval()
    with torch.no_grad():
        probabilities = model(data)
        binary = (probabilities > 0.5).float()
        n_correct = (binary == data.y).sum().item()
        accuracy = n_correct / len(data.y)
        print(f"Accuracy: {accuracy:.4f}")
        return binary

# Report the baseline GCN's accuracy on the test graph and keep its binary predictions.
print("\nEvaluating GCN:")
gcn_predictions = evaluate_model(gcn, data_test)


Evaluating GCN:
Accuracy: 0.7267


## Improved model

Here we try a model that adds a third convolutional layer, applies dropout after each hidden convolutional layer to reduce overfitting, and uses early stopping during training to halt the process when the training loss stops improving.

In [52]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ImprovedGCN(nn.Module):
    """Three-layer GCN with dropout after each of the two hidden layers.

    Args:
        num_node_features: Size of each node's input feature vector.
        num_classes: Number of output channels of the last convolution.
        hidden_units: Width of the two hidden convolutions.
        dropout_rate: Dropout probability applied after each hidden layer.
    """

    def __init__(self, num_node_features, num_classes, hidden_units=64, dropout_rate=0.5):
        super().__init__()

        # Sub-modules are created in the order conv1, conv2, conv3, dropout.
        self.conv1 = GCNConv(num_node_features, hidden_units)
        self.conv2 = GCNConv(hidden_units, hidden_units)
        self.conv3 = GCNConv(hidden_units, num_classes)

        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, data):
        """Return sigmoid outputs of the last convolution (no squeeze is applied)."""
        edge_index = data.edge_index
        hidden = data.x

        # Hidden layers: convolution -> ReLU -> dropout.
        for conv in (self.conv1, self.conv2):
            hidden = self.dropout(F.relu(conv(hidden, edge_index)))

        # Output layer: convolution followed by a sigmoid for binary classification.
        return torch.sigmoid(self.conv3(hidden, edge_index))

In [53]:
def train_model(model, data, epochs=100, lr=0.01, patience=10):
    """Train ``model`` with Adam (weight decay 5e-4) and early stopping on the training loss.

    Training stops once the loss has failed to improve on the best value seen
    so far for more than ``patience`` consecutive epochs. The loss is printed
    every tenth epoch.

    Args:
        model: Module called as ``model(data)`` that returns probabilities.
        data: Input passed to the model; must expose the targets as ``data.y``.
        epochs: Maximum number of optimisation steps.
        lr: Adam learning rate.
        patience: Number of non-improving epochs tolerated before stopping.

    Returns:
        The same ``model`` instance, in its state after the last executed epoch.
    """
    adam = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
    lowest_loss, stale_epochs = float('inf'), 0

    for epoch in range(epochs):
        model.train()
        adam.zero_grad()
        # squeeze() aligns the model output with the shape of the labels
        loss = F.binary_cross_entropy(model(data).squeeze(), data.y)
        loss.backward()
        adam.step()

        # Early-stopping bookkeeping
        loss_value = loss.item()
        if loss_value < lowest_loss:
            lowest_loss, stale_epochs = loss_value, 0
        else:
            stale_epochs += 1
            if stale_epochs > patience:
                print(f"Early stopping at epoch {epoch}")
                break

        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

    return model


In [54]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

def evaluate_model(model, data):
    """Print the accuracy of ``model`` on ``data``; return probabilities and predictions.

    Args:
        model: Module called as ``model(data)`` that returns probabilities.
        data: Input passed to the model; must expose the targets as ``data.y``.

    Returns:
        Tuple ``(out, pred)``: the squeezed model output and the float 0/1
        predictions obtained by thresholding it strictly above 0.5.
    """
    model.eval()
    with torch.no_grad():
        # squeeze() aligns the model output with the shape of the labels
        probabilities = model(data).squeeze()
        binary = (probabilities > 0.5).float()

        # Fraction of predictions that equal the targets
        n_correct = (binary == data.y).sum().item()
        accuracy = n_correct / len(data.y)
        print(f"Accuracy: {accuracy:.4f}")

        return probabilities, binary

In [55]:
# Build the three-layer model on the selected device, then train it with a
# lower learning rate and a longer patience than the train_model defaults.
gcn = ImprovedGCN(num_node_features=X_train.shape[1], num_classes=1).to(device)
gcn = train_model(gcn, data_train, epochs=1000, lr=0.005, patience=15)

Epoch 0, Loss: 0.8820
Epoch 10, Loss: 0.6748
Epoch 20, Loss: 0.6379
Epoch 30, Loss: 0.6158
Epoch 40, Loss: 0.6014
Epoch 50, Loss: 0.5921
Epoch 60, Loss: 0.5857
Epoch 70, Loss: 0.5791
Epoch 80, Loss: 0.5727
Epoch 90, Loss: 0.5684
Epoch 100, Loss: 0.5644
Epoch 110, Loss: 0.5610
Epoch 120, Loss: 0.5580
Epoch 130, Loss: 0.5558
Epoch 140, Loss: 0.5540
Epoch 150, Loss: 0.5531
Epoch 160, Loss: 0.5505
Epoch 170, Loss: 0.5502
Epoch 180, Loss: 0.5498
Epoch 190, Loss: 0.5486
Epoch 200, Loss: 0.5473
Epoch 210, Loss: 0.5476
Epoch 220, Loss: 0.5466
Epoch 230, Loss: 0.5461
Epoch 240, Loss: 0.5459
Epoch 250, Loss: 0.5454
Early stopping at epoch 254


In [56]:
print("\nEvaluating GCN:")
# This version of evaluate_model returns (probabilities, binary predictions);
# unpack both so that gcn_predictions holds only the predictions, not the tuple.
gcn_probabilities, gcn_predictions = evaluate_model(gcn, data_test)


Evaluating GCN:
Accuracy: 0.7295


The accuracy did not really improve (0.7295 vs. 0.7267 for the baseline).

---

### Trying new method
See whether the results improve when the model is more complex.

In [57]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

# Step 1: Reload both splits and pull out their `label` targets.
train_data = pd.read_pickle('Data/train_dataset_title_sim.pkl')
y_train = train_data['label']

test_data = pd.read_pickle('Data/test_dataset_title_sim.pkl')
y_test = test_data['label']

# Step 2: Preprocess Features
def preprocess_data(df):
    """Assemble the model's feature matrix from ``df``.

    The matrix holds, in this order, every column whose name starts with
    ``category`` (in the frame's column order) followed by ``year``,
    ``venue``, ``year_b``, ``venue_b`` and ``title_similarity``.

    Args:
        df: DataFrame containing the columns listed above.

    Returns:
        2-D array with one row per row of ``df`` and one column per feature.
    """
    category_columns = [name for name in df.columns if name.startswith('category')]
    extra_columns = ['year', 'venue', 'year_b', 'venue_b', 'title_similarity']

    # Stack the individual column arrays side by side.
    selected = category_columns + extra_columns
    return np.column_stack([df[name].values for name in selected])

# Build the raw feature matrices and standardize them, fitting the scaler on
# the training split only and reusing its statistics for the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(preprocess_data(train_data))
X_test = scaler.transform(preprocess_data(test_data))

# Step 3: Create Edge Indices
def create_edge_index(data, num_nodes):
    """Build a ``[2, num_edges]`` edge index from the positively labelled pairs.

    Paper identifiers are numbered in order of first appearance, scanning all
    of ``paper_a`` first and then all of ``paper_b``. Every row with
    ``label == 1`` contributes one directed edge ``paper_a -> paper_b``; an
    edge is dropped when either endpoint id is ``>= num_nodes`` (or missing).

    Args:
        data: DataFrame with ``paper_a``, ``paper_b`` and ``label`` columns.
        num_nodes: Exclusive upper bound on the node ids kept in the result.

    Returns:
        A ``torch.long`` tensor of shape ``[2, num_edges]``. When no row
        qualifies the shape is ``[2, 0]``, so the result is always a valid
        (possibly empty) edge index.

    NOTE(review): the ids produced here number unique *papers*, while the
    callers in this notebook pass the number of dataset rows as ``num_nodes``
    and use one feature row per pair as node features. Confirm that node ids
    and feature rows are meant to refer to the same entities. The edges are
    also derived from ``label``, i.e. from the prediction target itself.
    """
    n_rows = len(data)

    # factorize numbers values in order of first appearance, which matches
    # enumerating the unique ids of paper_a followed by paper_b. Missing ids
    # receive the sentinel -1 and are filtered out below.
    all_ids = pd.concat([data['paper_a'], data['paper_b']], ignore_index=True)
    codes, _ = pd.factorize(all_ids)
    codes = torch.as_tensor(codes, dtype=torch.long)
    src, dst = codes[:n_rows], codes[n_rows:]

    is_citation = torch.as_tensor((data['label'] == 1).to_numpy(dtype=bool))
    in_bounds = (src >= 0) & (dst >= 0) & (src < num_nodes) & (dst < num_nodes)
    keep = is_citation & in_bounds

    # Stacking two 1-D tensors always yields shape [2, num_edges], even when
    # num_edges is zero; the bounds mask already guarantees valid indices.
    return torch.stack([src[keep], dst[keep]])

# The node count of each split is taken from the number of feature rows.
num_nodes_train, num_nodes_test = X_train.shape[0], X_test.shape[0]

edge_index_train = create_edge_index(train_data, num_nodes=num_nodes_train)
edge_index_test = create_edge_index(test_data, num_nodes=num_nodes_test)

# Step 4: Wrap features, edges and labels into PyTorch Geometric Data objects
# and move each one straight to the selected device.
data_train = Data(
    x=torch.tensor(X_train, dtype=torch.float),
    edge_index=edge_index_train,
    y=torch.tensor(y_train.values, dtype=torch.float),
).to(device)

data_test = Data(
    x=torch.tensor(X_test, dtype=torch.float),
    edge_index=edge_index_test,
    y=torch.tensor(y_test.values, dtype=torch.float),
).to(device)

In [58]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

# Step 1: Define a more complex GCN model with dense layers added
class ComplexGCNWithDense(nn.Module):
    """Five graph convolutions followed by a three-layer dense head.

    Each of the first four convolutions is followed by ReLU, batch
    normalization and dropout, in that order. The fifth convolution projects
    to ``num_classes`` channels; the dense head then maps
    ``num_classes -> 2 * hidden_units -> hidden_units -> num_classes`` and a
    sigmoid is applied to the result.

    Args:
        num_node_features: Size of each node's input feature vector.
        num_classes: Channels produced by the last convolution and by the head.
        hidden_units: Base hidden width; later layers use twice this width.
        dropout_rate: Dropout probability used by every dropout layer.
    """

    def __init__(self, num_node_features, num_classes, hidden_units=128, dropout_rate=0.5):
        super().__init__()
        wide = hidden_units * 2

        # Graph stage 1
        self.conv1 = GCNConv(num_node_features, hidden_units)
        self.bn1 = nn.BatchNorm1d(hidden_units)
        self.dropout1 = nn.Dropout(dropout_rate)

        # Graph stage 2
        self.conv2 = GCNConv(hidden_units, wide)
        self.bn2 = nn.BatchNorm1d(wide)
        self.dropout2 = nn.Dropout(dropout_rate)

        # Graph stage 3
        self.conv3 = GCNConv(wide, wide)
        self.bn3 = nn.BatchNorm1d(wide)
        self.dropout3 = nn.Dropout(dropout_rate)

        # Graph stage 4
        self.conv4 = GCNConv(wide, wide)
        self.bn4 = nn.BatchNorm1d(wide)
        self.dropout4 = nn.Dropout(dropout_rate)

        # Graph stage 5: projection down to num_classes channels
        self.conv5 = GCNConv(wide, num_classes)

        # Dense head, stage 1
        self.fc1 = nn.Linear(num_classes, wide)
        self.bn_fc1 = nn.BatchNorm1d(wide)
        self.dropout_fc1 = nn.Dropout(dropout_rate)

        # Dense head, stage 2
        self.fc2 = nn.Linear(wide, hidden_units)
        self.bn_fc2 = nn.BatchNorm1d(hidden_units)
        self.dropout_fc2 = nn.Dropout(dropout_rate)

        # Dense head, output layer
        self.fc3 = nn.Linear(hidden_units, num_classes)

    def forward(self, data):
        """Return the squeezed sigmoid output of the dense head."""
        hidden, edge_index = data.x, data.edge_index

        # Convolution -> ReLU -> batch norm -> dropout, four times over.
        graph_stages = (
            (self.conv1, self.bn1, self.dropout1),
            (self.conv2, self.bn2, self.dropout2),
            (self.conv3, self.bn3, self.dropout3),
            (self.conv4, self.bn4, self.dropout4),
        )
        for conv, norm, drop in graph_stages:
            hidden = drop(norm(F.relu(conv(hidden, edge_index))))

        # Last convolution has no activation, normalization or dropout.
        hidden = self.conv5(hidden, edge_index)

        # Linear -> ReLU -> batch norm -> dropout, twice.
        dense_stages = (
            (self.fc1, self.bn_fc1, self.dropout_fc1),
            (self.fc2, self.bn_fc2, self.dropout_fc2),
        )
        for linear, norm, drop in dense_stages:
            hidden = drop(norm(F.relu(linear(hidden))))

        return torch.sigmoid(self.fc3(hidden)).squeeze()

# Step 2: Train the Model (same as before)
def train_model(model, data, epochs=1000, lr=0.01, patience=10):
    """Train ``model`` with Adam (weight decay 5e-4) and early stopping on the training loss.

    Training stops once the loss has failed to improve on the best value seen
    so far for more than ``patience`` consecutive epochs. The loss is printed
    every tenth epoch.

    Args:
        model: Module called as ``model(data)`` that returns probabilities.
        data: Input passed to the model; must expose the targets as ``data.y``.
        epochs: Maximum number of optimisation steps.
        lr: Adam learning rate.
        patience: Number of non-improving epochs tolerated before stopping.

    Returns:
        The same ``model`` instance, in its state after the last executed epoch.
    """
    adam = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
    lowest_loss = float('inf')
    stale_epochs = 0

    for epoch in range(epochs):
        model.train()
        adam.zero_grad()
        probabilities = model(data).squeeze()  # align with the label shape
        loss = F.binary_cross_entropy(probabilities, data.y)
        loss.backward()
        adam.step()

        # Early-stopping bookkeeping
        loss_value = loss.item()
        if loss_value < lowest_loss:
            lowest_loss = loss_value
            stale_epochs = 0
        else:
            stale_epochs += 1
            if stale_epochs > patience:
                print(f"Early stopping at epoch {epoch}")
                break

        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

    return model

# Step 3: Instantiate the deeper model on the selected device and train it
# with the default training settings.
gcn = ComplexGCNWithDense(
    num_node_features=X_train.shape[1],
    num_classes=1,
    hidden_units=128,
    dropout_rate=0.5,
).to(device)
gcn = train_model(gcn, data_train)

Epoch 0, Loss: 0.7476
Epoch 10, Loss: 0.7014
Epoch 20, Loss: 0.6429
Epoch 30, Loss: 0.6044
Epoch 40, Loss: 0.5781
Epoch 50, Loss: 0.5615
Epoch 60, Loss: 0.5491
Epoch 70, Loss: 0.5431
Epoch 80, Loss: 0.5360
Epoch 90, Loss: 0.5302
Epoch 100, Loss: 0.5282
Epoch 110, Loss: 0.5703
Early stopping at epoch 117


In [59]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 8: Evaluate the Model
def evaluate_model(model, data, device):
    """Score ``model`` on ``data`` with accuracy, precision, recall and F1.

    Args:
        model: Module called as ``model(data)`` that returns probabilities.
        data: Input passed to the model; must expose the targets as ``data.y``
            and support ``.to(device)``.
        device: Device the data is moved to before the forward pass.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` computed by scikit-learn
        from predictions thresholded at ``>= 0.5``.
    """
    model.eval()
    data = data.to(device)

    with torch.no_grad():
        probabilities = model(data)
        binary = (probabilities >= 0.5).float()

    # scikit-learn works on plain Python lists held on the CPU
    y_true = data.y.detach().cpu().tolist()
    y_pred = binary.detach().cpu().tolist()

    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_true, y_pred) for scorer in scorers)

# Ensure test data is on the correct device
# (evaluate_model moves its input to `device` as well, so this is a safeguard only).
data_test = data_test.to(device)

# Perform evaluation on test data; the four metrics come back as one tuple
accuracy, precision, recall, f1 = evaluate_model(gcn, data_test, device)

# Report each metric with four decimal places
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")


Test Accuracy: 0.7174
Test Precision: 0.6898
Test Recall: 0.7900
Test F1 Score: 0.7365


The results obtained do not seem to be better than those of the standard models. These GNN models probably need more careful tuning, and they may still be too simple.