### Trial Graph NN

In [78]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from sklearn.preprocessing import StandardScaler

# Step 1: Load and Prepare Data
# Both frames hold one row per (paper_a, paper_b) pair with a binary `label`.
# NOTE(review): read_pickle can execute arbitrary code while loading; only use
# it on pickle files from a trusted source.
train_data = pd.read_pickle('Data/train_dataset_title_sim.pkl')
test_data = pd.read_pickle('Data/test_dataset_title_sim.pkl')

# Drop the text columns, the per-paper `index`/`citations` columns and the
# target itself; every remaining column becomes a model input.
# NOTE(review): `paper_a` and `paper_b` are NOT dropped, so the raw paper IDs
# are scaled and used as features — confirm that this is intended.
X_train_raw = train_data.drop(columns=['title', 'title_b', 'abstract', 'abstract_b', 'citations', 'citations_b', 'index', 'index_b', 'label'])
y_train = train_data['label']
X_test_raw = test_data.drop(columns=['title', 'title_b', 'abstract', 'abstract_b', 'citations', 'citations_b', 'index', 'index_b', 'label'])
y_test = test_data['label']

# Standardize features
# The scaler is fitted on the training frame only and then applied unchanged
# to the test frame, so no test statistics enter the scaling.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [79]:
# Show which columns survive the drop above and are passed to the scaler.
X_train_raw.columns

Index(['paper_a', 'paper_b', 'year', 'venue', 'category_0', 'category_1',
       'category_2', 'category_3', 'category_4', 'category_5', 'category_6',
       'category_7', 'category_8', 'category_9', 'category_10', 'category_11',
       'category_12', 'category_13', 'category_14', 'category_15',
       'category_16', 'category_17', 'category_18', 'category_19',
       'category_20', 'category_21', 'category_22', 'category_23',
       'category_24', 'category_25', 'category_26', 'category_27',
       'category_28', 'category_29', 'category_30', 'category_31',
       'category_32', 'category_33', 'category_34', 'year_b', 'venue_b',
       'category_0_b', 'category_1_b', 'category_2_b', 'category_3_b',
       'category_4_b', 'category_5_b', 'category_6_b', 'category_7_b',
       'category_8_b', 'category_9_b', 'category_10_b', 'category_11_b',
       'category_12_b', 'category_13_b', 'category_14_b', 'category_15_b',
       'category_16_b', 'category_17_b', 'category_18_b', 'category_19_b

In [84]:
import pandas as pd
import torch
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

# Step 1: Define the function to create edge indices
def create_edge_index(data, num_nodes):
    """Build a ``(2, num_edges)`` edge index from the positive pairs in ``data``.

    Every distinct value found in the ``paper_a`` / ``paper_b`` columns is
    given an integer index in order of first appearance (the whole ``paper_a``
    column is scanned before ``paper_b``). Each row whose ``label`` equals 1
    contributes one directed edge ``paper_a -> paper_b``.

    Args:
        data: DataFrame with ``paper_a``, ``paper_b`` and ``label`` columns.
        num_nodes: Exclusive upper bound for node indices. An edge is dropped
            silently when either endpoint index is ``>= num_nodes``.

    Returns:
        A contiguous ``torch.long`` tensor of shape ``(2, num_edges)``. When no
        edge qualifies the result is an empty tensor of shape ``(2, 0)``
        (previously a 1-D tensor of shape ``(0,)`` was returned, which is not a
        valid edge index).

    NOTE(review): the indices produced here identify unique papers, while the
    callers in this notebook pass ``num_nodes = X.shape[0]``, i.e. the number
    of pair rows of the feature matrix. Node ``i`` of the graph is therefore
    not row ``i`` of the features — confirm that this is intended.
    NOTE(review): edges are derived from ``label``, the same column that is
    used as the prediction target (for the test frame as well) — confirm that
    this does not leak the target into the graph structure.
    """
    # Create a mapping from unique paper IDs to numeric IDs
    paper_ids = pd.concat([data['paper_a'], data['paper_b']]).unique()
    paper_id_map = {paper: idx for idx, paper in enumerate(paper_ids)}

    # Select the positive rows once instead of walking every row with iterrows().
    cited = data.loc[data['label'] == 1, ['paper_a', 'paper_b']]

    edges = []
    for paper_a, paper_b in zip(cited['paper_a'], cited['paper_b']):
        src = paper_id_map.get(paper_a)
        dst = paper_id_map.get(paper_b)
        if src is None or dst is None:
            continue
        # Keep only edges whose endpoints are valid node indices. Because of
        # this filter no out-of-bound index can reach the tensor, so the
        # former post-hoc bounds check (which could never fire) is gone.
        if src < num_nodes and dst < num_nodes:
            edges.append([src, dst])

    if not edges:
        # torch.tensor([]).t() would yield shape (0,); return a well-formed
        # empty edge index instead.
        return torch.empty((2, 0), dtype=torch.long)

    return torch.tensor(edges, dtype=torch.long).t().contiguous()

# Step 2: Prepare edge indices
# The node count is taken from the number of rows of the scaled feature
# matrices, i.e. one "node" per (paper_a, paper_b) pair row.
# NOTE(review): create_edge_index numbers unique papers, not pair rows, so the
# edge endpoints and the rows of `x` below are indexed differently — confirm.
num_nodes_train = X_train.shape[0]
num_nodes_test = X_test.shape[0]

edge_index_train = create_edge_index(train_data, num_nodes=num_nodes_train)
edge_index_test = create_edge_index(test_data, num_nodes=num_nodes_test)

# Step 3: Prepare PyTorch Geometric Data Objects
# x: scaled features as float32, edge_index: citation edges,
# y: labels as float32 (the training loop below feeds them to
# binary_cross_entropy together with the model output).
data_train = Data(
    x=torch.tensor(X_train, dtype=torch.float),
    edge_index=edge_index_train,
    y=torch.tensor(y_train.values, dtype=torch.float)
)

data_test = Data(
    x=torch.tensor(X_test, dtype=torch.float),
    edge_index=edge_index_test,
    y=torch.tensor(y_test.values, dtype=torch.float)
)

# Step 4: Define the GCN Model
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network producing one probability per node.

    Args:
        num_node_features: Width of each node's input feature vector.
    """

    def __init__(self, num_node_features):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_node_features, 16)  # input -> 16 hidden units
        self.conv2 = GCNConv(16, 1)  # Binary output

    def forward(self, data):
        """Run both convolutions and return sigmoid probabilities.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            Tensor with the trailing size-1 channel removed; assuming the
            second convolution yields ``(num_nodes, 1)``, the result has shape
            ``(num_nodes,)``.
        """
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        # squeeze(-1) instead of squeeze(): a bare squeeze() also removes the
        # node dimension when the graph has a single node, giving a 0-d tensor
        # that no longer matches a (1,)-shaped label tensor.
        return torch.sigmoid(x).squeeze(-1)  # Binary probability output

# Step 5: Train the GCN Model
def train_model(model, data, epochs=1000, lr=0.01):
    """Train ``model`` full-batch on ``data`` with Adam and binary cross-entropy.

    Args:
        model: Callable module; ``model(data)`` must return probabilities with
            the same shape as ``data.y``.
        data: Object exposing the label tensor ``y`` plus whatever the model
            reads.
        epochs: Number of optimisation steps (one per epoch).
        lr: Adam learning rate.

    Returns:
        The same ``model`` object, updated in place.
    """
    adam = torch.optim.Adam(model.parameters(), lr=lr)

    def run_epoch():
        # One full-batch forward/backward pass followed by a parameter update.
        model.train()
        adam.zero_grad()
        loss = F.binary_cross_entropy(model(data), data.y)
        loss.backward()
        adam.step()
        return loss

    for epoch in range(epochs):
        loss = run_epoch()
        # Report the loss on every tenth epoch, starting with epoch 0.
        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item():.4f}")
    return model

# Step 6: Initialize and train GCN
# The input width is the number of scaled feature columns. Training uses the
# defaults of train_model (1000 epochs, lr=0.01).
# NOTE(review): no random seed is set in the visible cells, so the printed loss
# values will differ between runs.
gcn = GCN(num_node_features=X_train.shape[1])
gcn = train_model(gcn, data_train)

Epoch 0, Loss: 0.7682
Epoch 10, Loss: 0.6239
Epoch 20, Loss: 0.5951
Epoch 30, Loss: 0.5823
Epoch 40, Loss: 0.5743
Epoch 50, Loss: 0.5672
Epoch 60, Loss: 0.5608
Epoch 70, Loss: 0.5551
Epoch 80, Loss: 0.5503
Epoch 90, Loss: 0.5468
Epoch 100, Loss: 0.5446
Epoch 110, Loss: 0.5429
Epoch 120, Loss: 0.5416
Epoch 130, Loss: 0.5405
Epoch 140, Loss: 0.5396
Epoch 150, Loss: 0.5387
Epoch 160, Loss: 0.5380
Epoch 170, Loss: 0.5373
Epoch 180, Loss: 0.5366
Epoch 190, Loss: 0.5360
Epoch 200, Loss: 0.5355
Epoch 210, Loss: 0.5351
Epoch 220, Loss: 0.5347
Epoch 230, Loss: 0.5342
Epoch 240, Loss: 0.5339
Epoch 250, Loss: 0.5335
Epoch 260, Loss: 0.5331
Epoch 270, Loss: 0.5327
Epoch 280, Loss: 0.5323
Epoch 290, Loss: 0.5319
Epoch 300, Loss: 0.5315
Epoch 310, Loss: 0.5312
Epoch 320, Loss: 0.5309
Epoch 330, Loss: 0.5305
Epoch 340, Loss: 0.5302
Epoch 350, Loss: 0.5299
Epoch 360, Loss: 0.5297
Epoch 370, Loss: 0.5294
Epoch 380, Loss: 0.5292
Epoch 390, Loss: 0.5289
Epoch 400, Loss: 0.5287
Epoch 410, Loss: 0.5284
Epo

In [85]:
# Step 6: Evaluate GCN Model
def evaluate_model(model, data):
    """Print the accuracy of ``model`` on ``data`` and return its hard predictions.

    Args:
        model: Module whose ``model(data)`` output is a tensor of probabilities.
        data: Object exposing the label tensor ``y``.

    Returns:
        Float tensor of 0/1 predictions (1 where the probability is strictly
        greater than 0.5).
    """
    model.eval()
    with torch.no_grad():
        scores = model(data)
    # Strict threshold: a probability of exactly 0.5 maps to class 0.
    predictions = (scores > 0.5).float()
    n_correct = (predictions == data.y).sum().item()
    accuracy = n_correct / len(data.y)
    print(f"Accuracy: {accuracy:.4f}")
    return predictions

# Score the trained baseline on the held-out graph; evaluate_model prints the
# accuracy itself and returns the hard 0/1 predictions.
print("\nEvaluating GCN:")
gcn_predictions = evaluate_model(gcn, data_test)


Evaluating GCN:
Accuracy: 0.7277


## Improved model

In [86]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
# Check if CUDA is available
# NOTE(review): `device` is not referenced by any cell visible here — nothing
# moves the model or the tensors onto it, so it currently has no effect.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ImprovedGCN(nn.Module):
    """Three-layer GCN with ReLU and dropout after each of the two hidden layers.

    Args:
        num_node_features: Width of each node's input feature vector.
        num_classes: Number of output channels of the last convolution.
        hidden_units: Width of both hidden layers.
        dropout_rate: Probability passed to ``nn.Dropout``.
    """

    def __init__(self, num_node_features, num_classes, hidden_units=64, dropout_rate=0.5):
        super().__init__()
        # Layer widths: input -> hidden -> hidden -> output.
        self.conv1 = GCNConv(num_node_features, hidden_units)
        self.conv2 = GCNConv(hidden_units, hidden_units)
        self.conv3 = GCNConv(hidden_units, num_classes)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, data):
        """Return the sigmoid of the last convolution's output (not squeezed)."""
        edges = data.edge_index
        hidden = data.x
        # Both hidden layers share the same conv -> ReLU -> dropout pattern.
        for layer in (self.conv1, self.conv2):
            hidden = self.dropout(F.relu(layer(hidden, edges)))
        # The output layer gets no ReLU/dropout, only the sigmoid.
        return torch.sigmoid(self.conv3(hidden, edges))

In [87]:
def train_model(model, data, epochs=100, lr=0.01, patience=10):
    """Train ``model`` full-batch with Adam, weight decay and early stopping.

    Early stopping watches the *training* loss: training ends as soon as the
    loss has failed to improve on its best value for ``patience`` consecutive
    epochs. (The previous check ``counter > patience`` needed ``patience + 1``
    such epochs, one more than the parameter name promises.)

    Args:
        model: Module whose ``model(data)`` output, once squeezed, has the same
            shape as ``data.y`` and holds probabilities.
        data: Object exposing the label tensor ``y`` plus whatever the model
            reads.
        epochs: Maximum number of epochs.
        lr: Adam learning rate.
        patience: Number of consecutive non-improving epochs tolerated before
            training stops.

    Returns:
        The same ``model`` object with the weights of the last executed epoch
        (the best-loss weights are not restored).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
    best_loss = float('inf')
    stale_epochs = 0  # consecutive epochs without a new best loss

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data).squeeze()  # Squeeze to match the shape of the labels
        loss = F.binary_cross_entropy(out, data.y)
        loss.backward()
        optimizer.step()

        # Early stopping check
        current_loss = loss.item()
        if current_loss < best_loss:
            best_loss = current_loss
            stale_epochs = 0
        else:
            stale_epochs += 1
            if stale_epochs >= patience:
                print(f"Early stopping at epoch {epoch}")
                break

        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

    return model


In [88]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

def evaluate_model(model, data):
    """Print the accuracy of ``model`` on ``data``; return probabilities and labels.

    Args:
        model: Module whose ``model(data)`` output holds probabilities.
        data: Object exposing the label tensor ``y``.

    Returns:
        Tuple ``(out, pred)``: the squeezed model output and the float 0/1
        predictions (1 where the output is strictly greater than 0.5).
    """
    model.eval()
    with torch.no_grad():
        out = model(data).squeeze()  # drop size-1 dims so it lines up with the labels

    pred = (out > 0.5).float()
    hits = (pred == data.y).sum().item()
    accuracy = hits / len(data.y)
    print(f"Accuracy: {accuracy:.4f}")
    return out, pred

In [89]:
# Initialize and train the Improved GCN
#gcn = ImprovedGCN(num_node_features=X_train.shape[1], num_classes=1)
#gcn = train_model(gcn, data_train, epochs=1000, lr=0.005, patience=15)

In [90]:
# Only the header is printed: the evaluation call below is commented out, so
# this cell's output contains no metric.
print("\nEvaluating GCN:")
#gcn_predictions = evaluate_model(gcn, data_test)


Evaluating GCN:


Not really improved

## Trying new method

In [98]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Step 1: Load and Prepare Data
# Re-reads the same two pickle files that the first cell of the notebook loads.
# NOTE(review): read_pickle can execute arbitrary code while loading; only use
# it on pickle files from a trusted source.
train_data = pd.read_pickle('Data/train_dataset_title_sim.pkl')
test_data = pd.read_pickle('Data/test_dataset_title_sim.pkl')

# Extracting the label column
y_train = train_data['label']
y_test = test_data['label']


In [99]:
# Display the column index of both frames side by side for comparison.
train_data.columns, test_data.columns

(Index(['paper_a', 'paper_b', 'label', 'title', 'year', 'venue', 'index',
        'citations', 'abstract', 'category_0', 'category_1', 'category_2',
        'category_3', 'category_4', 'category_5', 'category_6', 'category_7',
        'category_8', 'category_9', 'category_10', 'category_11', 'category_12',
        'category_13', 'category_14', 'category_15', 'category_16',
        'category_17', 'category_18', 'category_19', 'category_20',
        'category_21', 'category_22', 'category_23', 'category_24',
        'category_25', 'category_26', 'category_27', 'category_28',
        'category_29', 'category_30', 'category_31', 'category_32',
        'category_33', 'category_34', 'title_b', 'year_b', 'venue_b', 'index_b',
        'citations_b', 'abstract_b', 'category_0_b', 'category_1_b',
        'category_2_b', 'category_3_b', 'category_4_b', 'category_5_b',
        'category_6_b', 'category_7_b', 'category_8_b', 'category_9_b',
        'category_10_b', 'category_11_b', 'category_12_b',

In [104]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

# Step 1: Load Data
# NOTE(review): read_pickle can execute arbitrary code while loading; only use
# it on pickle files from a trusted source.
train_data = pd.read_pickle('Data/train_dataset_title_sim.pkl')
test_data = pd.read_pickle('Data/test_dataset_title_sim.pkl')

# Binary target of each (paper_a, paper_b) row.
y_train = train_data['label']
y_test = test_data['label']

# Step 2: Preprocess Features
def preprocess_data(df):
    """Assemble the numeric feature matrix for a pair DataFrame.

    Columns are taken in this order: every column whose name starts with
    ``'category'`` (in the frame's own column order), then ``year``, ``venue``,
    ``year_b``, ``venue_b`` and ``title_similarity``.

    Args:
        df: DataFrame containing the columns listed above.

    Returns:
        2-D NumPy array with one row per row of ``df`` and one column per
        selected feature.
    """
    category_cols = [name for name in df.columns if name.startswith('category')]
    extra_cols = ['year', 'venue', 'year_b', 'venue_b', 'title_similarity']

    # Stack the 1-D column arrays side by side into a (rows, features) matrix.
    column_arrays = [df[name].values for name in category_cols + extra_cols]
    return np.column_stack(column_arrays)

# Build the raw feature matrices (category flags, years, venues, title
# similarity) for both splits.
X_train = preprocess_data(train_data)
X_test = preprocess_data(test_data)

# Standardize features
# The scaler is fitted on the training matrix only and reused for the test
# matrix. Note that X_train / X_test are rebound to their scaled versions, so
# the unscaled arrays are no longer available after this point.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 3: Create Edge Indices
def create_edge_index(data, num_nodes):
    """Build a ``(2, num_edges)`` edge index from the positive pairs in ``data``.

    Every distinct value found in the ``paper_a`` / ``paper_b`` columns is
    given an integer index in order of first appearance (the whole ``paper_a``
    column is scanned before ``paper_b``). Each row whose ``label`` equals 1
    contributes one directed edge ``paper_a -> paper_b``.

    Args:
        data: DataFrame with ``paper_a``, ``paper_b`` and ``label`` columns.
        num_nodes: Exclusive upper bound for node indices. An edge is dropped
            silently when either endpoint index is ``>= num_nodes``.

    Returns:
        A contiguous ``torch.long`` tensor of shape ``(2, num_edges)``. When no
        edge qualifies the result is an empty tensor of shape ``(2, 0)``
        (previously a 1-D tensor of shape ``(0,)`` was returned, which is not a
        valid edge index).

    NOTE(review): the indices produced here identify unique papers, while the
    callers in this notebook pass ``num_nodes = X.shape[0]``, i.e. the number
    of pair rows of the feature matrix. Node ``i`` of the graph is therefore
    not row ``i`` of the features — confirm that this is intended.
    NOTE(review): edges are derived from ``label``, the same column that is
    used as the prediction target (for the test frame as well) — confirm that
    this does not leak the target into the graph structure.
    """
    paper_ids = pd.concat([data['paper_a'], data['paper_b']]).unique()
    paper_id_map = {paper: idx for idx, paper in enumerate(paper_ids)}

    # Select the positive rows once instead of walking every row with iterrows().
    cited = data.loc[data['label'] == 1, ['paper_a', 'paper_b']]

    edges = []
    for paper_a, paper_b in zip(cited['paper_a'], cited['paper_b']):
        src = paper_id_map.get(paper_a)
        dst = paper_id_map.get(paper_b)
        if src is None or dst is None:
            continue
        # Keep only edges whose endpoints are valid node indices. Because of
        # this filter no out-of-bound index can reach the tensor, so the
        # former post-hoc bounds check (which could never fire) is gone.
        if src < num_nodes and dst < num_nodes:
            edges.append([src, dst])

    if not edges:
        # torch.tensor([]).t() would yield shape (0,); return a well-formed
        # empty edge index instead.
        return torch.empty((2, 0), dtype=torch.long)

    return torch.tensor(edges, dtype=torch.long).t().contiguous()

# The node count is taken from the number of rows of the scaled feature
# matrices, i.e. one "node" per (paper_a, paper_b) pair row.
# NOTE(review): create_edge_index numbers unique papers, not pair rows, so the
# edge endpoints and the rows of `x` below are indexed differently — confirm.
num_nodes_train = X_train.shape[0]
num_nodes_test = X_test.shape[0]

edge_index_train = create_edge_index(train_data, num_nodes=num_nodes_train)
edge_index_test = create_edge_index(test_data, num_nodes=num_nodes_test)

# Step 4: Create PyTorch Geometric Data Objects
# x: scaled features as float32, edge_index: citation edges,
# y: labels as float32 (the training loop below feeds them to
# binary_cross_entropy together with the model output).
data_train = Data(
    x=torch.tensor(X_train, dtype=torch.float),
    edge_index=edge_index_train,
    y=torch.tensor(y_train.values, dtype=torch.float)
)

data_test = Data(
    x=torch.tensor(X_test, dtype=torch.float),
    edge_index=edge_index_test,
    y=torch.tensor(y_test.values, dtype=torch.float)
)

# Step 5: Define GCN Model
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network producing one probability per node.

    Args:
        num_node_features: Width of each node's input feature vector.
    """

    def __init__(self, num_node_features):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_node_features, 16)  # input -> 16 hidden units
        self.conv2 = GCNConv(16, 1)  # Binary output

    def forward(self, data):
        """Run both convolutions and return sigmoid probabilities.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            Tensor with the trailing size-1 channel removed; assuming the
            second convolution yields ``(num_nodes, 1)``, the result has shape
            ``(num_nodes,)``.
        """
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        # squeeze(-1) instead of squeeze(): a bare squeeze() also removes the
        # node dimension when the graph has a single node, giving a 0-d tensor
        # that no longer matches a (1,)-shaped label tensor.
        return torch.sigmoid(x).squeeze(-1)  # Binary probability output

# Step 6: Train GCN Model
def train_model(model, data, epochs=1000, lr=0.01):
    """Optimise ``model`` on ``data`` with Adam, one full-batch step per epoch.

    Args:
        model: Module whose ``model(data)`` output holds probabilities shaped
            like ``data.y``.
        data: Object exposing the label tensor ``y`` plus whatever the model
            reads.
        epochs: Number of epochs to run.
        lr: Adam learning rate.

    Returns:
        The same ``model`` object, updated in place.
    """
    optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        # Binary cross-entropy between predicted probabilities and the labels.
        loss = F.binary_cross_entropy(model(data), data.y)
        loss.backward()
        optimizer.step()
        # Only every tenth epoch (0, 10, 20, ...) is logged.
        if epoch % 10:
            continue
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")
    return model

# Step 7: Initialize and Train the Model
# The input width is the number of scaled feature columns. Training uses the
# defaults of train_model (1000 epochs, lr=0.01).
# NOTE(review): no random seed is set in the visible cells, so the printed loss
# values will differ between runs.
gcn = GCN(num_node_features=X_train.shape[1])
gcn = train_model(gcn, data_train)


Epoch 0, Loss: 0.9715
Epoch 10, Loss: 0.6665
Epoch 20, Loss: 0.6163
Epoch 30, Loss: 0.5951
Epoch 40, Loss: 0.5834
Epoch 50, Loss: 0.5752
Epoch 60, Loss: 0.5692
Epoch 70, Loss: 0.5646
Epoch 80, Loss: 0.5608
Epoch 90, Loss: 0.5574
Epoch 100, Loss: 0.5547
Epoch 110, Loss: 0.5522
Epoch 120, Loss: 0.5502
Epoch 130, Loss: 0.5486
Epoch 140, Loss: 0.5473
Epoch 150, Loss: 0.5462
Epoch 160, Loss: 0.5452
Epoch 170, Loss: 0.5443
Epoch 180, Loss: 0.5435
Epoch 190, Loss: 0.5428
Epoch 200, Loss: 0.5421
Epoch 210, Loss: 0.5415
Epoch 220, Loss: 0.5409
Epoch 230, Loss: 0.5403
Epoch 240, Loss: 0.5397
Epoch 250, Loss: 0.5392
Epoch 260, Loss: 0.5387
Epoch 270, Loss: 0.5382
Epoch 280, Loss: 0.5378
Epoch 290, Loss: 0.5374
Epoch 300, Loss: 0.5371
Epoch 310, Loss: 0.5367
Epoch 320, Loss: 0.5364
Epoch 330, Loss: 0.5361
Epoch 340, Loss: 0.5359
Epoch 350, Loss: 0.5356
Epoch 360, Loss: 0.5354
Epoch 370, Loss: 0.5351
Epoch 380, Loss: 0.5349
Epoch 390, Loss: 0.5347
Epoch 400, Loss: 0.5345
Epoch 410, Loss: 0.5343
Epo

In [106]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 8: Evaluate the Model
def evaluate_model(model, data):
    """Compute accuracy, precision, recall and F1 of ``model`` on ``data``.

    Probabilities ``>= 0.5`` count as the positive class.

    Args:
        model: Module whose ``model(data)`` output holds probabilities.
        data: Object exposing the label tensor ``y``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` as returned by the
        corresponding scikit-learn metric functions.
    """
    model.eval()  # switch off training-only behaviour
    with torch.no_grad():
        hard_labels = (model(data) >= 0.5).float()  # inclusive threshold
        y_true = data.y.cpu().numpy()
        y_pred = hard_labels.cpu().numpy()

    # Evaluate the four metrics in the order they are returned.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(score(y_true, y_pred) for score in scorers)

# Perform evaluation on test data
# evaluate_model returns the four metrics in this exact order.
accuracy, precision, recall, f1 = evaluate_model(gcn, data_test)

# Report each metric with four decimal places.
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")


Test Accuracy: 0.7257
Test Precision: 0.7198
Test Recall: 0.7392
Test F1 Score: 0.7294


The results obtained do not seem to be better than those of the normal models; these models probably need better tuning, and they may be too simple.