The model has already been trained; this notebook loads the saved model from disk and uses it for inference.

In [1]:
import torch
import numpy as np
import torch.nn.functional as F
import torch.nn as nn
from dgl.nn import SAGEConv
import dgl
import dgl.data
import pandas as pd

class GraphSAGE(nn.Module):
    """Two-layer GraphSAGE encoder that produces one embedding per node.

    Both layers are ``SAGEConv`` modules configured with the "mean"
    aggregator. The first maps ``in_feats`` -> ``h_feats`` and the second
    maps ``h_feats`` -> ``h_feats``.
    """

    def __init__(self, in_feats, h_feats):
        """Create the two convolution layers.

        Args:
            in_feats: Size of each input node feature vector.
            h_feats: Size of the hidden and output embeddings.
        """
        super().__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, "mean")
        self.conv2 = SAGEConv(h_feats, h_feats, "mean")

    def forward(self, g, in_feat):
        """Run both layers over graph ``g`` and return the node embeddings.

        Args:
            g: Graph passed unchanged to both convolution layers.
            in_feat: Input node features passed to the first layer.

        Returns:
            Output of the second layer; no activation is applied to it.
        """
        # ReLU sits between the two layers only.
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)
    
# Load the saved model.
# NOTE(security): the checkpoint is a fully pickled nn.Module, so torch.load
# unpickles arbitrary Python objects -- only load files from a trusted source.
model_path = "../model/dgl_model.pt"  # Replace with the actual path to your saved model
# weights_only=False is passed explicitly: PyTorch 2.6+ defaults to
# weights_only=True, which refuses to unpickle a whole custom module such as
# GraphSAGE. (The keyword is available from PyTorch 1.13 onwards.)
model = torch.load(model_path, map_location=torch.device('cpu'), weights_only=False)
# Switch to inference mode; as the cell's last expression this also displays
# the loaded architecture.
model.eval()

GraphSAGE(
  (conv1): SAGEConv(
    (feat_drop): Dropout(p=0.0, inplace=False)
    (fc_neigh): Linear(in_features=224, out_features=16, bias=False)
    (fc_self): Linear(in_features=224, out_features=16, bias=True)
  )
  (conv2): SAGEConv(
    (feat_drop): Dropout(p=0.0, inplace=False)
    (fc_neigh): Linear(in_features=16, out_features=16, bias=False)
    (fc_self): Linear(in_features=16, out_features=16, bias=True)
  )
)

In [2]:
# Build lookup tables between author ID strings and 0-based row positions.
author_ids = pd.read_csv('../data/author_id.csv')

# Author ID -> row position in the CSV (a repeated ID keeps its last position).
author_id_to_number = dict(zip(author_ids['Author'], range(len(author_ids))))

# Row position -> author ID, obtained by inverting the mapping above.
author_num_to_id = {number: author for author, number in author_id_to_number.items()}

In [3]:
def get_top_coauthors(author_number, model, g, author_id_to_number, author_num_to_id, num_top_coauthors=5):
    """Return the authors whose embeddings are most similar to a given author.

    Embeddings are computed by running ``model`` on ``g`` with the node
    features stored in ``g.ndata['feat']``; candidates are ranked by cosine
    similarity to the queried author's embedding.

    Args:
        author_number: 1-based author number (node index + 1).
        model: Callable ``model(g, features)`` returning one embedding per node.
        g: Graph whose ``ndata['feat']`` holds the node features.
        author_id_to_number: Unused; kept for interface compatibility.
        author_num_to_id: Mapping from 0-based node index to author ID.
        num_top_coauthors: Maximum number of authors to return. Values below
            zero are treated as zero; the result never contains more than
            ``number_of_nodes - 1`` entries.

    Returns:
        ``(top_coauthors, likeliness_scores)``: author IDs ordered from most to
        least similar, and the matching cosine similarities as Python floats.

    Raises:
        ValueError: If ``author_number`` is not in ``1..number_of_nodes``.
    """
    with torch.no_grad():
        h = model(g, g.ndata['feat'].type(torch.float32))
        num_authors = h.shape[0]

        # Reject out-of-range numbers explicitly: 0 or a negative value would
        # otherwise wrap around to the end of the tensor via negative indexing.
        if not 1 <= author_number <= num_authors:
            raise ValueError(
                f"author_number must be between 1 and {num_authors}, got {author_number}"
            )
        author_index = author_number - 1  # Convert author number to 0-based index

        # Compute similarity scores between the input author and all authors.
        similarity_scores = torch.cosine_similarity(h[author_index].unsqueeze(0), h, dim=1)

        # Exclude the input author by index rather than assuming it sorts
        # first: another author with an identical embedding ties at 1.0 and
        # could otherwise push the query author into the results.
        ranking = similarity_scores.clone()
        ranking[author_index] = float('-inf')

        k = max(0, min(num_top_coauthors, num_authors - 1))
        # Stable sort on the negated scores gives descending order with ties
        # broken by node index; the masked author always lands last.
        top_indices = np.argsort(-ranking.numpy(), kind='stable')[:k].tolist()

        top_coauthors = [author_num_to_id[i] for i in top_indices]
        likeliness_scores = [float(similarity_scores[i]) for i in top_indices]

        return top_coauthors, likeliness_scores

In [7]:
def main():
    """Prompt for an author number and print that author's top coauthors.

    Loads the graph from ``../data/author_data``, adds self-loops, reads an
    author number from standard input and prints the result of
    ``get_top_coauthors``. Relies on the module-level ``model``,
    ``author_id_to_number`` and ``author_num_to_id`` defined in earlier cells.
    Prints an error message and returns early on invalid input.
    """
    dataset = dgl.data.CSVDataset('../data/author_data')
    g = dataset[0]
    g = dgl.add_self_loop(g)

    # Take the valid range from the loaded graph rather than a hard-coded
    # count, so the check stays correct if the dataset changes.
    num_authors = g.num_nodes()
    invalid_message = f"Invalid author number. Please enter a number between 1 and {num_authors}."

    try:
        input_author_number = int(input(f"Enter an author number (1 to {num_authors}): "))
    except ValueError:
        # Non-numeric input: report it the same way as an out-of-range number.
        print(invalid_message)
        return
    if not 1 <= input_author_number <= num_authors:
        print(invalid_message)
        return

    top_coauthors, likeliness_scores = get_top_coauthors(input_author_number, model, g, author_id_to_number, author_num_to_id)

    print("Input Author Number:", input_author_number)
    print("Top Coauthors:", top_coauthors)
    print("Likeliness Scores:", likeliness_scores)


In [8]:
# Run the interactive lookup; main() prompts for an author number on standard input.
main()

Done loading data from cached files.
Input Author Number: 97
Top Coauthors: ['authorID_36ebe_205bc_dfc49_9a25e_6923f', 'authorID_3f980_7cb9a_e9fb6_c3094_2af61', 'authorID_0a5b0_46d07_f6f97_1b777_6de68', 'authorID_33512_00784_0ced1_bb0aa_b68f4', 'authorID_1253e_9373e_781b7_50026_6caa5']
Likeliness Scores: [0.9611250162124634, 0.9605594873428345, 0.9423330426216125, 0.9376060962677002, 0.9365097284317017]
