### Trial Graph NN

Graph Neural Networks (GNNs) are a type of deep learning model specifically designed to work with data that is structured as a graph, where entities are represented by nodes and relationships are captured by edges. Unlike traditional neural networks, GNNs are able to account for the connections between nodes, learning from how they interact with one another. These models have proven highly effective in tasks like predicting links between nodes, classifying nodes, and even analyzing entire graphs, making them useful in fields like social networks, drug discovery, and recommendation systems.

This is why we are trying this technique

### 1 Import already preprocessed dataset 
from `data_splitting_1` notebook

In [53]:
import pickle
import os
import sys
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [54]:
df = pd.read_pickle('Data\\preprocessed_df.pkl')

In [55]:
df.head(5)

Unnamed: 0,title,year,venue,index,citations,abstract,category_0,category_1,category_2,category_3,...,category_25,category_26,category_27,category_28,category_29,category_30,category_31,category_32,category_33,category_34
57929,Proceedings of the 15th International Conferen...,2008,1,57929,,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3640,Digital Reference,2006,1,3640,,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
447127,Freenet P,1900,1,447127,,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
192455,A linear algebraic theory of complexes,1941,1,192455,,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
185904,The embedding of products and joins of complex...,1947,1,185904,,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Data split

In [56]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
df.fillna('', inplace=True)
# Split the data by paper indices, ensuring balanced class distribution (stratify)
def split_data_balanced(df, test_size=0.2):
    # Create a label column for stratification (0 = no citations, 1 = citations)
    df['label'] = df['citations'].apply(lambda x: 1 if pd.notna(x) and x != '' else 0)

    # Split the dataset while maintaining the class distribution (stratify by 'label')
    train_df, test_df = train_test_split(df, test_size=test_size, stratify=df['label'], random_state=55)
    
    return train_df, test_df

# Optimized function to create positive pairs
def create_positive_pairs(df, max_positive):
    pairs = []
    indices_set = set(df['index'].values)

    for _, row in df.iterrows():
        if len(pairs) >= max_positive:
            break
        citing_paper = row['index']
        cited_papers = row['citations'].split(';') if row['citations'] else []
        valid_cited_papers = [cited for cited in cited_papers if cited in indices_set]
        
        for cited in valid_cited_papers:
            if len(pairs) >= max_positive:
                break
            pairs.append((citing_paper, cited, 1))

    return pairs

# Optimized function to create negative pairs
def create_negative_pairs(df, max_negative, num_negatives=2):
    pairs = []
    indices = df['index'].tolist()
    indices_set = set(indices)
    
    while len(pairs) < max_negative:
        citing_paper = random.choice(indices)
        negative_samples = random.sample(indices, num_negatives)
        
        for neg in negative_samples:
            if len(pairs) >= max_negative:
                break
            if neg != citing_paper and neg in indices_set:
                pairs.append((citing_paper, neg, 0))
    
    return pairs

# Parameters
max_pairs_train = 100000
max_positive_train = max_pairs_train // 2
max_negative_train = max_pairs_train - max_positive_train

max_pairs_test = 20000
max_positive_test = max_pairs_test // 2
max_negative_test = max_pairs_test - max_positive_test
# Example dataset split with balanced classes in train and test sets
train_df, test_df = split_data_balanced(df, test_size=0.2)
print(f"Training set: {len(train_df)} papers, {train_df['label'].sum()} citations")
print(f"Testing set: {len(test_df)} papers, {test_df['label'].sum()} citations")
# Generate positive and negative pairs for training, ensuring only training papers are used
train_positive_pairs = create_positive_pairs(train_df, max_positive_train)
train_negative_pairs = create_negative_pairs(train_df, max_negative_train, num_negatives=2)
train_pairs = train_positive_pairs + train_negative_pairs
random.shuffle(train_pairs)

# Generate positive and negative pairs for testing, ensuring only test papers are used
test_positive_pairs = create_positive_pairs(test_df, max_positive_test)
test_negative_pairs = create_negative_pairs(test_df, max_negative_test, num_negatives=2)
test_pairs = test_positive_pairs + test_negative_pairs
random.shuffle(test_pairs)

# Convert to DataFrames
train_pairs_df = pd.DataFrame(train_pairs, columns=['paper_a', 'paper_b', 'label'])
test_pairs_df = pd.DataFrame(test_pairs, columns=['paper_a', 'paper_b', 'label'])

# Check distribution
print(f"Training pairs: {len(train_pairs_df)} (Positive: {len([p for p in train_pairs if p[2] == 1])}, Negative: {len([p for p in train_pairs if p[2] == 0])})")
print(f"Testing pairs: {len(test_pairs_df)} (Positive: {len([p for p in test_pairs if p[2] == 1])}, Negative: {len([p for p in test_pairs if p[2] == 0])})")

Training set: 503845 papers, 100297 citations
Testing set: 125962 papers, 25075 citations
Training pairs: 100000 (Positive: 50000, Negative: 50000)
Testing pairs: 20000 (Positive: 10000, Negative: 10000)


In [57]:
df.columns

Index(['title', 'year', 'venue', 'index', 'citations', 'abstract',
       'category_0', 'category_1', 'category_2', 'category_3', 'category_4',
       'category_5', 'category_6', 'category_7', 'category_8', 'category_9',
       'category_10', 'category_11', 'category_12', 'category_13',
       'category_14', 'category_15', 'category_16', 'category_17',
       'category_18', 'category_19', 'category_20', 'category_21',
       'category_22', 'category_23', 'category_24', 'category_25',
       'category_26', 'category_27', 'category_28', 'category_29',
       'category_30', 'category_31', 'category_32', 'category_33',
       'category_34', 'label'],
      dtype='object')

In [58]:
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Function to add venue features
def add_venue_features(merged_ab):
    # Count occurrences of each venue in paper_a and paper_b
    venue_counts = Counter(merged_ab['venue'].fillna('') + merged_ab['venue_b'].fillna(''))

    # Compute common venue and count encoding features
    merged_ab['common_venue'] = (merged_ab['venue'] == merged_ab['venue_b']).astype(int)
    merged_ab['venue_a_count'] = merged_ab['venue'].map(venue_counts).fillna(0)
    merged_ab['venue_b_count'] = merged_ab['venue_b'].map(venue_counts).fillna(0)

    # Drop original venue columns
    merged_ab.drop(columns=['venue', 'venue_b'], inplace=True)
    
    return merged_ab

def prepare_features(pairs_df, df, fit_tfidf=False, tfidf_vectorizer=None):
    # Merge pairs_df with df to get features for paper_a
    merged_a = pairs_df.merge(df, left_on='paper_a', right_on='index', suffixes=('', '_a'))
    merged_ab = merged_a.merge(df, left_on='paper_b', right_on='index', suffixes=('', '_b'))

    # Combine the titles for comparison
    titles_a = merged_ab['title'].fillna('')  # Handle missing titles in paper_a
    titles_b = merged_ab['title_b'].fillna('')  # Handle missing titles in paper_b

    if fit_tfidf:
        # If fit_tfidf is True, fit the TF-IDF vectorizer on the titles of the training set
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_vectorizer.fit(titles_a)
    
    # Transform titles_a and titles_b using the same fitted tfidf_vectorizer
    tfidf_matrix_a = tfidf_vectorizer.transform(titles_a)
    tfidf_matrix_b = tfidf_vectorizer.transform(titles_b)

    # Compute pairwise cosine similarity for aligned pairs
    title_similarity = [cosine_similarity(tfidf_matrix_a[i], tfidf_matrix_b[i])[0][0] for i in range(tfidf_matrix_a.shape[0])]

    # Assign title similarity scores to the merged DataFrame
    merged_ab['title_similarity'] = title_similarity

    # Select features for model (same as in the original implementation)
    features = merged_ab.drop(columns=['citations', 'citations_b', 'index', 'index_b', 'label', 'title', 'title_b', 'abstract', 'abstract_b'])

    # Add venue and author features
    features = add_venue_features(features)

    # Extract labels if they exist in the pairs DataFrame
    labels = merged_ab['label'] if 'label' in merged_ab.columns else None

    return features, labels, tfidf_vectorizer, merged_ab

# Example usage for train set:
train_features, train_labels, tfidf_vectorizer, merged_ab_train = prepare_features(train_pairs_df, df, fit_tfidf=True)

# Example usage for test set:
test_features, test_labels, _, merged_ab_test = prepare_features(test_pairs_df, df, fit_tfidf=False, tfidf_vectorizer=tfidf_vectorizer)

In [59]:
X_train, y_train, X_test, y_test = train_features.copy(), train_labels.copy(), test_features.copy(), test_labels.copy()

In [60]:
X_train.head(5)

Unnamed: 0,paper_a,paper_b,year,category_0,category_1,category_2,category_3,category_4,category_5,category_6,...,category_30_b,category_31_b,category_32_b,category_33_b,category_34_b,label_b,title_similarity,common_venue,venue_a_count,venue_b_count
0,43933,535178,2008,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,1,0,0
1,494732,281876,2004,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,1,0,0
2,381063,47297,2000,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,1,0,0
3,413037,210346,2008,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0.0,1,0,0
4,44872,76716,2007,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0.42067,1,0,0


---

In [61]:
# Check if there are any overlapping papers between train and test sets
train_papers_a = set(X_train['paper_a']).union(set(X_train['paper_b']))
test_papers_a = set(X_test['paper_a']).union(set(X_test['paper_b']))

# Find intersection between train and test sets to detect any overlapping papers
overlap_train_test = train_papers_a.intersection(test_papers_a)

if overlap_train_test:
    print("Overlapping papers found between train and test sets:", len(overlap_train_test))
else:
    print("No overlapping papers between train and test sets.")


No overlapping papers between train and test sets.


In [62]:
X_train, X_test = X_train.drop(columns=['paper_a', 'paper_b']), X_test.drop(columns=['paper_a', 'paper_b'])

In [63]:
# add the label to traiun_features and call the var data_train
data_train_initial = train_features.copy()
data_train_initial['label'] = y_train
data_test_initial = test_features.copy()
data_test_initial['label'] = y_test

In [64]:
import networkx as nx
import pandas as pd

def create_graph_from_data(data_train):
    # Create an empty graph
    G = nx.Graph()

    # Add nodes and edges from the data
    for idx, row in data_train.iterrows():
        paper_a = row['paper_a']
        paper_b = row['paper_b']
        label = row['label']

        # Add nodes (papers) and edges (label)
        G.add_edge(paper_a, paper_b, weight=label)

    return G

def calculate_graph_metrics(G):
    # Calculate graph metrics
    metrics = {}

    # Degree centrality
    degree_centrality = nx.degree_centrality(G)
    metrics['degree_centrality'] = degree_centrality

    # Betweenness centrality
    betweenness_centrality = nx.betweenness_centrality(G)
    metrics['betweenness_centrality'] = betweenness_centrality

    # Closeness centrality
    closeness_centrality = nx.closeness_centrality(G)
    metrics['closeness_centrality'] = closeness_centrality

    # Clustering coefficient
    clustering_coefficient = nx.clustering(G)
    metrics['clustering_coefficient'] = clustering_coefficient

    # In-degree and Out-degree (for directed graphs, if applicable)
    in_degree = G.in_degree() if G.is_directed() else {}
    out_degree = G.out_degree() if G.is_directed() else {}
    metrics['in_degree'] = in_degree
    metrics['out_degree'] = out_degree

    # PageRank
    pagerank = nx.pagerank(G)
    metrics['pagerank'] = pagerank

    # Eigenvector centrality
    eigenvector_centrality = nx.eigenvector_centrality(G)
    metrics['eigenvector_centrality'] = eigenvector_centrality

    return metrics

def add_graph_metrics_to_train(data_train, G):
    # Calculate graph metrics
    metrics = calculate_graph_metrics(G)

    # Add metrics as features to data_train
    data_train['degree_centrality_a'] = data_train['paper_a'].map(metrics['degree_centrality'])
    data_train['degree_centrality_b'] = data_train['paper_b'].map(metrics['degree_centrality'])
    
    data_train['betweenness_centrality_a'] = data_train['paper_a'].map(metrics['betweenness_centrality'])
    data_train['betweenness_centrality_b'] = data_train['paper_b'].map(metrics['betweenness_centrality'])
    
    data_train['closeness_centrality_a'] = data_train['paper_a'].map(metrics['closeness_centrality'])
    data_train['closeness_centrality_b'] = data_train['paper_b'].map(metrics['closeness_centrality'])
    
    data_train['clustering_coefficient_a'] = data_train['paper_a'].map(metrics['clustering_coefficient'])
    data_train['clustering_coefficient_b'] = data_train['paper_b'].map(metrics['clustering_coefficient'])

    # Add in-degree and out-degree metrics
    data_train['in_degree_a'] = data_train['paper_a'].map(metrics['in_degree'])
    data_train['out_degree_a'] = data_train['paper_a'].map(metrics['out_degree'])
    data_train['in_degree_b'] = data_train['paper_b'].map(metrics['in_degree'])
    data_train['out_degree_b'] = data_train['paper_b'].map(metrics['out_degree'])

    # Add PageRank and Eigenvector centrality metrics
    data_train['pagerank_a'] = data_train['paper_a'].map(metrics['pagerank'])
    data_train['pagerank_b'] = data_train['paper_b'].map(metrics['pagerank'])
    
    data_train['eigenvector_centrality_a'] = data_train['paper_a'].map(metrics['eigenvector_centrality'])
    data_train['eigenvector_centrality_b'] = data_train['paper_b'].map(metrics['eigenvector_centrality'])

    return data_train

G_train = create_graph_from_data(data_train_initial)  # Create the graph from the training data
G_test = create_graph_from_data(data_test_initial)  # Create the graph from the testing data
data_train_with_metrics = add_graph_metrics_to_train(data_train_initial, G_train)  # Add graph metrics to train data
data_test_with_metrics = add_graph_metrics_to_train(data_test_initial, G_test)  # Add graph

# Print the updated data_train with graph metrics
print(data_train_with_metrics.head())


  paper_a paper_b  year  category_0  category_1  category_2  category_3  \
0   43933  535178  2008           0           0           0           0   
1  494732  281876  2004           0           0           0           0   
2  381063   47297  2000           1           0           0           0   
3  413037  210346  2008           0           0           0           0   
4   44872   76716  2007           0           0           0           0   

   category_4  category_5  category_6  ...  clustering_coefficient_a  \
0           0           0           0  ...                  0.027778   
1           0           0           0  ...                  0.000000   
2           0           0           0  ...                  0.000000   
3           0           0           0  ...                  0.000000   
4           0           0           1  ...                  0.066667   

   clustering_coefficient_b  in_degree_a  out_degree_a  in_degree_b  \
0                       0.0          NaN     

In [65]:
data_train_with_metrics.to_pickle('Data_metrics\\data_train_with_metrics.pkl')
data_test_with_metrics.to_pickle('Data_metrics\\data_test_with_metrics.pkl')

In [82]:
data_train = pd.read_pickle('Data_metrics\\data_train_with_metrics.pkl')
data_test = pd.read_pickle('Data_metrics\\data_test_with_metrics.pkl')

In [83]:
data_train[data_train['in_degree_a'].isnull() & data_train['in_degree_b'].isnull() & data_train['out_degree_a'].isnull() & data_train['out_degree_b'].isnull()]

Unnamed: 0,paper_a,paper_b,year,category_0,category_1,category_2,category_3,category_4,category_5,category_6,...,clustering_coefficient_a,clustering_coefficient_b,in_degree_a,out_degree_a,in_degree_b,out_degree_b,pagerank_a,pagerank_b,eigenvector_centrality_a,eigenvector_centrality_b
0,43933,535178,2008,0,0,0,0,0,0,0,...,0.027778,0.000000,,,,,0.000055,0.000018,6.074048e-05,1.156455e-05
1,494732,281876,2004,0,0,0,0,0,0,0,...,0.000000,0.000000,,,,,0.000003,0.000003,4.207958e-27,2.975476e-27
2,381063,47297,2000,1,0,0,0,0,0,0,...,0.000000,0.000000,,,,,0.000003,0.000003,2.721114e-24,1.408708e-24
3,413037,210346,2008,0,0,0,0,0,0,0,...,0.000000,0.000000,,,,,0.000003,0.000003,3.223694e-25,1.861200e-25
4,44872,76716,2007,0,0,0,0,0,0,1,...,0.066667,0.000000,,,,,0.000048,0.000009,6.856272e-06,6.427559e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,556462,603428,1999,0,0,0,0,0,0,0,...,0.051282,0.000000,,,,,0.000068,0.000011,2.781278e-04,8.142519e-05
99996,535583,477302,1991,0,0,0,0,0,0,0,...,0.047619,0.333333,,,,,0.000056,0.000019,4.840745e-05,3.721918e-04
99997,353302,94400,2008,0,0,0,0,0,0,0,...,0.000000,0.166667,,,,,0.000030,0.000023,2.992129e-05,2.254561e-04
99998,481166,330625,1990,0,0,0,0,0,0,0,...,0.000000,0.000000,,,,,0.000003,0.000003,4.207958e-27,2.975476e-27


In [84]:
# drop columns with more than 5000 nan
data_train = data_train.dropna(axis=1, thresh=5000)
data_test = data_test.dropna(axis=1, thresh=5000)

In [85]:
data_train.head(5)

Unnamed: 0,paper_a,paper_b,year,category_0,category_1,category_2,category_3,category_4,category_5,category_6,...,betweenness_centrality_a,betweenness_centrality_b,closeness_centrality_a,closeness_centrality_b,clustering_coefficient_a,clustering_coefficient_b,pagerank_a,pagerank_b,eigenvector_centrality_a,eigenvector_centrality_b
0,43933,535178,2008,0,0,0,0,0,0,0,...,0.0001928796,8.3e-05,0.051122,0.04801,0.027778,0.0,5.5e-05,1.8e-05,6.074048e-05,1.156455e-05
1,494732,281876,2004,0,0,0,0,0,0,0,...,1.842711e-10,0.0,1.9e-05,1.3e-05,0.0,0.0,3e-06,3e-06,4.207958e-27,2.9754760000000003e-27
2,381063,47297,2000,1,0,0,0,0,0,0,...,9.213553e-10,0.0,2.3e-05,1.7e-05,0.0,0.0,3e-06,3e-06,2.721114e-24,1.408708e-24
3,413037,210346,2008,0,0,0,0,0,0,0,...,5.528132e-10,0.0,2.2e-05,1.5e-05,0.0,0.0,3e-06,3e-06,3.223694e-25,1.8612e-25
4,44872,76716,2007,0,0,0,0,0,0,1,...,7.138653e-05,1.7e-05,0.046427,0.041905,0.066667,0.0,4.8e-05,9e-06,6.856272e-06,6.427559e-07


In [86]:
data_test.head(5)

Unnamed: 0,paper_a,paper_b,year,category_0,category_1,category_2,category_3,category_4,category_5,category_6,...,betweenness_centrality_a,betweenness_centrality_b,closeness_centrality_a,closeness_centrality_b,clustering_coefficient_a,clustering_coefficient_b,pagerank_a,pagerank_b,eigenvector_centrality_a,eigenvector_centrality_b
0,29293,237017,2006,0,0,0,0,0,0,0,...,1.058936e-08,0.0,0.000126,7.6e-05,0.0,0.0,7.8e-05,7.8e-05,1.7774659999999998e-21,1.026221e-21
1,495464,53963,2009,0,0,0,0,0,0,0,...,0.0,3.529786e-09,5.6e-05,8.4e-05,0.0,0.0,6.1e-05,0.00011,1.01036e-23,1.4288640000000002e-23
2,487473,154225,1989,0,1,0,0,0,1,0,...,2.499441e-05,0.0002051617,0.025925,0.028398,0.0,0.0,7.8e-05,0.000194,4.473533e-06,2.99712e-05
3,408586,164311,2008,0,0,0,0,0,0,0,...,3.529786e-09,0.0,8.4e-05,5.6e-05,0.0,0.0,1.2e-05,1.2e-05,1.4288640000000002e-23,1.01036e-23
4,220075,420821,1993,0,0,0,0,0,0,0,...,1.058936e-08,0.0,9.6e-05,6.7e-05,0.0,0.0,1.2e-05,1.2e-05,1.401843e-21,8.093546000000001e-22


---

The model is a Graph Convolutional Network (GCN) that leverages graph-structured data to learn node representations through message-passing, using two graph convolution layers followed by a sigmoid output for binary classification. Despite its ability to capture relationships between nodes, the model performs suboptimally with an accuracy around 0.7305, indicating room for improvement in both architecture and tuning.

In [87]:
X_train, y_train, X_test, y_test = data_train.drop(columns=['label', 'paper_a', 'paper_b']), data_train['label'], data_test.drop(columns=['label','paper_a', 'paper_b']), data_test['label']

In [88]:
import pandas as pd
import torch
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Step 1: Define the function to create edge indices
def create_edge_index(data, num_nodes):
    # Create a mapping from unique paper IDs to numeric IDs
    paper_ids = pd.concat([data['paper_a'], data['paper_b']]).unique()
    paper_id_map = {paper: idx for idx, paper in enumerate(paper_ids)}

    edge_index = []
    for _, row in data.iterrows():
        if row['label'] == 1:  # Add an edge if there's a citation
            paper_a_id = paper_id_map.get(row['paper_a'])
            paper_b_id = paper_id_map.get(row['paper_b'])
            if paper_a_id is not None and paper_b_id is not None:
                # Ensure indices are within bounds
                if paper_a_id < num_nodes and paper_b_id < num_nodes:
                    edge_index.append([paper_a_id, paper_b_id])

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

    # Validate that all indices are within bounds
    if edge_index.numel() > 0 and edge_index.max().item() >= num_nodes:
        raise ValueError("Edge index contains out-of-bound indices!")

    return edge_index

In [89]:
# Step 2: Prepare edge indices
num_nodes_train = X_train.shape[0]
num_nodes_test = X_test.shape[0]

edge_index_train = create_edge_index(data_train_initial, num_nodes=num_nodes_train)
edge_index_test = create_edge_index(data_test_initial, num_nodes=num_nodes_test)


In [90]:
# i want hewre to create a graph for the data_train without the label and add the label as a edge in the graph, then i want to calculate the metrics for the graph and add 9it to the train data 
# Step 3: Create PyG Data objects
data_train = Data(x=torch.tensor(X_train.values, dtype=torch.float),
                  edge_index=edge_index_train, y=torch.tensor(y_train.values, dtype=torch.float),
                  pos_edge_index=None, neg_edge_index=None).to(device)

data_test = Data(x=torch.tensor(X_test.values, dtype=torch.float),
                    edge_index=edge_index_test, y=torch.tensor(y_test.values, dtype=torch.float),
                    pos_edge_index=None, neg_edge_index=None).to(device)

print(data_train)
print(data_test)

Data(x=[100000, 90], edge_index=[2, 48693], y=[100000])
Data(x=[20000, 90], edge_index=[2, 8678], y=[20000])


In [113]:
# Step 4: Define the GCN Model
class GCN(torch.nn.Module):
    def __init__(self, num_node_features):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, 1)  # Binary output

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return torch.sigmoid(x).squeeze()  # Binary probability output

# Step 5: Train the GCN Model
def train_model(model, data, epochs=1000, lr=0.01):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data)
        loss = F.binary_cross_entropy(out, data.y)
        loss.backward()
        optimizer.step()
        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item():.4f}")
    return model

# Step 6: Initialize and train GCN
gcn = GCN(num_node_features=X_train.shape[1])
gcn = gcn.to(device)
#gcn = train_model(gcn, data_train)

In [114]:
# Step 6: Evaluate GCN Model
def evaluate_model(model, data):
    model.eval()
    with torch.no_grad():
        out = model(data)
        predictions = (out > 0.5).float()  # Convert probabilities to binary predictions
        accuracy = (predictions == data.y).sum().item() / len(data.y)
        print(f"Accuracy: {accuracy:.4f}")
        return predictions

print("\nEvaluating GCN:")
#gcn_predictions = evaluate_model(gcn, data_test)


Evaluating GCN:


## improved model

Trying model with an additional convolutional layer, applies dropout after each convolutional layer to prevent overfitting, and incorporates early stopping during training to halt the process when the loss stops improving.

In [93]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ImprovedGCN(nn.Module):
    def __init__(self, num_node_features, num_classes, hidden_units=64, dropout_rate=0.5):
        super(ImprovedGCN, self).__init__()
        
        self.conv1 = GCNConv(num_node_features, hidden_units)
        self.conv2 = GCNConv(hidden_units, hidden_units)
        self.conv3 = GCNConv(hidden_units, num_classes)  # Third layer
        
        self.dropout = nn.Dropout(dropout_rate)
        
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        # First Convolutional Layer
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.dropout(x)  # Apply dropout
        
        # Second Convolutional Layer
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.dropout(x)
        
        # Third Convolutional Layer
        x = self.conv3(x, edge_index)
        
        return torch.sigmoid(x)  # Use sigmoid for binary classification

In [94]:
def train_model(model, data, epochs=100, lr=0.01, patience=10):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
    best_loss = float('inf')
    counter = 0
    
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data).squeeze()  # Squeeze to match the shape of the labels
        loss = F.binary_cross_entropy(out, data.y)
        loss.backward()
        optimizer.step()
        
        # Early stopping check
        if loss.item() < best_loss:
            best_loss = loss.item()
            counter = 0
        else:
            counter += 1
            if counter > patience:
                print(f"Early stopping at epoch {epoch}")
                break
        
        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item():.4f}")
    
    return model


In [95]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

def evaluate_model(model, data):
    model.eval()
    with torch.no_grad():
        out = model(data).squeeze()  # Squeeze to match the labels shape
        pred = (out > 0.5).float()  # Convert to binary predictions (0 or 1)
        
        # Compute accuracy
        accuracy = (pred == data.y).sum().item() / len(data.y)
        print(f"Accuracy: {accuracy:.4f}")
        
        return out, pred

In [115]:
# Initialize and train the Improved GCN
gcn = ImprovedGCN(num_node_features=X_train.shape[1], num_classes=1)
gcn = gcn.to(device)
# gcn = train_model(gcn, data_train, epochs=1000, lr=0.005, patience=15)

In [117]:
print("\nEvaluating GCN:")
#gcn_predictions = evaluate_model(gcn, data_test)


Evaluating GCN:


Not really improved

---

### Trying new method
See if improves if the model is more compplex

In [124]:
train_data = pd.read_pickle('Data_metrics\\data_train_with_metrics.pkl')
test_data = pd.read_pickle('Data_metrics\\data_test_with_metrics.pkl')

In [125]:
train_data, test_data = train_data.drop(columns=['label_a', 'label_b']), test_data.drop(columns=['label_a', 'label_b'])

In [126]:
train_data = train_data.dropna(axis=1, thresh=5000)
test_data = test_data.dropna(axis=1, thresh=5000)

In [127]:
train_data.columns

Index(['paper_a', 'paper_b', 'year', 'category_0', 'category_1', 'category_2',
       'category_3', 'category_4', 'category_5', 'category_6', 'category_7',
       'category_8', 'category_9', 'category_10', 'category_11', 'category_12',
       'category_13', 'category_14', 'category_15', 'category_16',
       'category_17', 'category_18', 'category_19', 'category_20',
       'category_21', 'category_22', 'category_23', 'category_24',
       'category_25', 'category_26', 'category_27', 'category_28',
       'category_29', 'category_30', 'category_31', 'category_32',
       'category_33', 'category_34', 'year_b', 'category_0_b', 'category_1_b',
       'category_2_b', 'category_3_b', 'category_4_b', 'category_5_b',
       'category_6_b', 'category_7_b', 'category_8_b', 'category_9_b',
       'category_10_b', 'category_11_b', 'category_12_b', 'category_13_b',
       'category_14_b', 'category_15_b', 'category_16_b', 'category_17_b',
       'category_18_b', 'category_19_b', 'category_20_b', 

In [133]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

y_train = train_data['label']
y_test = test_data['label']

# Step 2: Preprocess Features
def preprocess_data(df):
    features = []
    for col in df.columns:
        if col not in ['paper_a', 'paper_b', 'label']:
            features.append(df[col].values)
    features = np.column_stack(features)
    return features

X_train = preprocess_data(train_data)
X_test = preprocess_data(test_data)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 3: Create Edge Indices
def create_edge_index(data, num_nodes):
    paper_ids = pd.concat([data['paper_a'], data['paper_b']]).unique()
    paper_id_map = {paper: idx for idx, paper in enumerate(paper_ids)}

    edge_index = []
    for _, row in data.iterrows():
        if row['label'] == 1:  # Add an edge for citation
            paper_a_id = paper_id_map.get(row['paper_a'])
            paper_b_id = paper_id_map.get(row['paper_b'])
            if paper_a_id is not None and paper_b_id is not None:
                if paper_a_id < num_nodes and paper_b_id < num_nodes:
                    edge_index.append([paper_a_id, paper_b_id])

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

    # Validate edge indices
    if edge_index.numel() > 0 and edge_index.max().item() >= num_nodes:
        raise ValueError("Edge index contains out-of-bound indices!")

    return edge_index

num_nodes_train = X_train.shape[0]
num_nodes_test = X_test.shape[0]

edge_index_train = create_edge_index(train_data, num_nodes=num_nodes_train)
edge_index_test = create_edge_index(test_data, num_nodes=num_nodes_test)

# Step 4: Create PyTorch Geometric Data Objects
data_train = Data(
    x=torch.tensor(X_train, dtype=torch.float),
    edge_index=edge_index_train,
    y=torch.tensor(y_train.values, dtype=torch.float)
)

data_test = Data(
    x=torch.tensor(X_test, dtype=torch.float),
    edge_index=edge_index_test,
    y=torch.tensor(y_test.values, dtype=torch.float)
)
data_train = data_train.to(device)
data_test = data_test.to(device)

In [134]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

# Step 1: Define a more complex GCN model with dense layers added
class ComplexGCNWithDense(nn.Module):
    def __init__(self, num_node_features, num_classes, hidden_units=128, dropout_rate=0.5):
        super(ComplexGCNWithDense, self).__init__()

        # Layer 1: First Convolutional Layer
        self.conv1 = GCNConv(num_node_features, hidden_units)
        self.bn1 = nn.BatchNorm1d(hidden_units)  # Batch Normalization
        self.dropout1 = nn.Dropout(dropout_rate)

        # Layer 2: Second Convolutional Layer
        self.conv2 = GCNConv(hidden_units, hidden_units * 2)
        self.bn2 = nn.BatchNorm1d(hidden_units * 2)  # Batch Normalization
        self.dropout2 = nn.Dropout(dropout_rate)

        # Layer 3: Third Convolutional Layer
        self.conv3 = GCNConv(hidden_units * 2, hidden_units * 2)
        self.bn3 = nn.BatchNorm1d(hidden_units * 2)  # Batch Normalization
        self.dropout3 = nn.Dropout(dropout_rate)

        # Layer 4: Fourth Convolutional Layer
        self.conv4 = GCNConv(hidden_units * 2, hidden_units * 2)
        self.bn4 = nn.BatchNorm1d(hidden_units * 2)  # Batch Normalization
        self.dropout4 = nn.Dropout(dropout_rate)

        # Layer 5: Final Convolutional Layer (to reduce to single output node)
        self.conv5 = GCNConv(hidden_units * 2, num_classes)

        # Dense Layers
        self.fc1 = nn.Linear(num_classes, hidden_units * 2)  # Dense layer after GCN layers
        self.bn_fc1 = nn.BatchNorm1d(hidden_units * 2)  # Batch Normalization for FC layer
        self.dropout_fc1 = nn.Dropout(dropout_rate)

        self.fc2 = nn.Linear(hidden_units * 2, hidden_units)  # Dense layer
        self.bn_fc2 = nn.BatchNorm1d(hidden_units)  # Batch Normalization for FC layer
        self.dropout_fc2 = nn.Dropout(dropout_rate)

        self.fc3 = nn.Linear(hidden_units, num_classes)  # Output layer for binary classification

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        # First Layer (GCN + Dropout + BatchNorm)
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.bn1(x)
        x = self.dropout1(x)

        # Second Layer (GCN + Dropout + BatchNorm)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.bn2(x)
        x = self.dropout2(x)

        # Third Layer (GCN + Dropout + BatchNorm)
        x = self.conv3(x, edge_index)
        x = F.relu(x)
        x = self.bn3(x)
        x = self.dropout3(x)

        # Fourth Layer (GCN + Dropout + BatchNorm)
        x = self.conv4(x, edge_index)
        x = F.relu(x)
        x = self.bn4(x)
        x = self.dropout4(x)

        # Fifth Layer: Output (GCN)
        x = self.conv5(x, edge_index)
        
        # Flatten and pass through fully connected (dense) layers
        x = F.relu(self.fc1(x))
        x = self.bn_fc1(x)
        x = self.dropout_fc1(x)
        
        x = F.relu(self.fc2(x))
        x = self.bn_fc2(x)
        x = self.dropout_fc2(x)
        
        x = self.fc3(x)
        return torch.sigmoid(x).squeeze()  # Binary output for classification

# Step 2: Train the Model (same as before)
def train_model(model, data, epochs=1000, lr=0.01, patience=10):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
    best_loss = float('inf')
    counter = 0
    
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data).squeeze()  # Squeeze to match the shape of the labels
        loss = F.binary_cross_entropy(out, data.y)
        loss.backward()
        optimizer.step()
        
        # Early stopping check
        if loss.item() < best_loss:
            best_loss = loss.item()
            counter = 0
        else:
            counter += 1
            if counter > patience:
                print(f"Early stopping at epoch {epoch}")
                break
        
        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item():.4f}")
    
    return model

# Step 3: Initialize and Train the Model
gcn = ComplexGCNWithDense(num_node_features=X_train.shape[1], num_classes=1, hidden_units=128, dropout_rate=0.5)
gcn = gcn.to(device)
gcn = train_model(gcn, data_train)

Epoch 0, Loss: 0.7626
Epoch 10, Loss: 0.6629
Epoch 20, Loss: 0.5370
Epoch 30, Loss: 0.4676
Epoch 40, Loss: 0.4245
Epoch 50, Loss: 0.3854
Epoch 60, Loss: 0.3572
Epoch 70, Loss: 0.3431
Epoch 80, Loss: 0.3316
Epoch 90, Loss: 0.3233
Epoch 100, Loss: 0.3157
Epoch 110, Loss: 0.3135
Epoch 120, Loss: 0.3072
Epoch 130, Loss: 0.3082
Early stopping at epoch 139


In [135]:
X_train.shape[1]

88

In [136]:
X_train.shape[1]

88

In [137]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 8: Evaluate the Model
def evaluate_model(model, data, device):
    model.eval()  # Set the model to evaluation mode
    data = data.to(device)  # Move the data to the appropriate device (GPU or CPU)
    with torch.no_grad():
        # Forward pass to get predictions
        out = model(data)
        predictions = (out >= 0.5).float()  # Apply threshold

        # Convert tensors to lists for sklearn metrics
        y_true = data.y.detach().cpu().tolist()
        y_pred = predictions.detach().cpu().tolist()

        # Compute metrics using sklearn
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

    return accuracy, precision, recall, f1

# Ensure test data is on the correct device
data_test = data_test.to(device)

# Perform evaluation on test data
accuracy, precision, recall, f1 = evaluate_model(gcn, data_test, device)

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")


Test Accuracy: 0.8550
Test Precision: 0.8425
Test Recall: 0.8733
Test F1 Score: 0.8576


The results obtained seems to not be better than the normal models, probably these models need to be tuned better and they may be too simple