In [1]:
import torch
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from ogb.nodeproppred import NodePropPredDataset
import dask.dataframe as dd
from dask.multiprocessing import get
import matplotlib.pyplot as plt
from ogb.linkproppred import Evaluator
import sys
sys.path.append('modelling/')
import random
import os
# Move the process into the repository root before the project-local import further down.
# The relative `sys.path` entry 'modelling/' added above only resolves from that directory.
# NOTE(review): this jumps exactly two directory levels up from the current working
# directory, i.e. it presumably assumes the notebook lives two folders below the repo
# root — confirm; from any other location it will land in the wrong directory.
if os.path.basename(os.getcwd()) != 'link-prediction-in-graphs':
    parent_dir = os.path.dirname(os.path.dirname(os.getcwd()))
    os.chdir(parent_dir)
from comet_ml import Experiment
from modelling.dataset_split.dataset_splitter import Dataset_Splitter

import os
# Echo the working directory so the reader can verify the chdir guard above took effect.
print(f"current working directory: {os.getcwd()}")

current working directory: c:\Users\User\link-prediction-in-graphs


In [2]:
def get_edge_split(dataset_selected = "ogbn-papers100M", num_neg = 1000, seed = None):
    """Load a dataset, build its edge splits and make the train split evaluable.

    The train split is randomly subsampled to the size of the validation split,
    and every sampled train edge receives `num_neg` distinct randomly drawn
    negative target nodes, stored under ``split_edge["train"]["target_node_neg"]``.

    Args:
        dataset_selected: Dataset name forwarded to ``Dataset_Splitter.load_dataset``.
        num_neg: Number of negative targets drawn (without replacement) per train edge.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is used so
            the subsample and the negatives are reproducible. When ``None`` the global
            ``random`` module state is used, exactly as before.

    Returns:
        Tuple ``(split_edge, dataset)``: the (modified) split dictionary and the
        loaded dataset object.

    Raises:
        ValueError: Propagated from ``random.sample`` if the train split is smaller
            than the validation split, or if fewer than `num_neg` candidate nodes exist.
    """
    # `random` (the module) and `random.Random` both expose `.sample`.
    rng = random if seed is None else random.Random(seed)

    ds_split = Dataset_Splitter()
    dataset = ds_split.load_dataset(dataset_selected)
    split_edge, _ = ds_split.get_edges_split(dataset)

    train = split_edge["train"]
    n_samples = len(split_edge["valid"]["source_node"])

    # Keep as many train edges as there are validation edges.
    sampled_rows = rng.sample(range(len(train["source_node"])), k=n_samples)
    train["source_node"] = train["source_node"][sampled_rows]
    train["target_node"] = train["target_node"][sampled_rows]

    # Candidate ids are 0..max inclusive; the previous `range(max)` silently
    # excluded the highest node id from ever being drawn as a negative.
    # NOTE(review): the pool is bounded by the largest *sampled* target id, not by
    # the number of nodes in the graph — confirm this is the intended candidate set.
    num_candidates = int(train["target_node"].max()) + 1
    train["target_node_neg"] = torch.tensor(
        [rng.sample(range(num_candidates), k=num_neg) for _ in range(n_samples)]
    )
    return split_edge, dataset

## Calculate Cosine Similarity and Evaluate

In [3]:
def calc_mrr_cosine(split_edge, dataset, split: str = "valid", batch_size: int = 256):
    """Score edges by cosine similarity of their node features and return the mean MRR.

    For every edge of the chosen split, the positive score is the cosine similarity
    between the source and target node features, and the negative scores are the
    similarities between the source and each of its negative targets. The scores are
    ranked by the OGB ``ogbl-citation2`` evaluator.

    The similarities are computed in vectorised batches with torch instead of one
    Python call per pair, so no dask / multiprocessing round-trip is needed.

    Args:
        split_edge: Mapping ``split -> {"source_node", "target_node", "target_node_neg"}``.
            ``target_node_neg`` must hold the same number of negatives for every edge;
            that number is inferred from the data rather than assumed to be 1000.
        dataset: Object whose first item exposes the node feature matrix as ``.x``.
            Assumes ``.x`` is indexable by node id along its first axis — TODO confirm.
        split: Key of the split to evaluate (e.g. "train", "valid", "test").
        batch_size: Number of edges scored per step. Peak extra memory is roughly
            ``batch_size * num_negatives * feature_dim`` floats.

    Returns:
        The mean of the evaluator's ``mrr_list`` (same value type as before).

    Raises:
        ValueError: If `batch_size` is not positive or the split contains no edges.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    edges = split_edge[split]
    node_features = torch.as_tensor(dataset[0].x)
    source = torch.as_tensor(edges["source_node"]).long().view(-1)
    target = torch.as_tensor(edges["target_node"]).long().view(-1)

    num_edges = source.numel()
    if num_edges == 0:
        raise ValueError(f"split '{split}' contains no edges")

    # One row of negatives per edge; the column count is whatever the split provides.
    target_neg = torch.as_tensor(edges["target_node_neg"]).long().view(num_edges, -1)

    def gather(index):
        """Look up feature rows and make sure they are float32/float64 for the math below."""
        feats = node_features[index]
        if feats.dtype in (torch.float32, torch.float64):
            return feats
        return feats.float()

    cosine = torch.nn.functional.cosine_similarity
    pos_chunks, neg_chunks = [], []
    with torch.no_grad():
        for start in range(0, num_edges, batch_size):
            rows = slice(start, start + batch_size)
            src = gather(source[rows])
            pos = gather(target[rows])
            neg = gather(target_neg[rows])  # one extra leading axis: (edges, negatives, ...)

            pos_chunks.append(cosine(src, pos, dim=-1))
            # Expand (a view, no copy) the source rows so each is paired with all its negatives.
            neg_chunks.append(cosine(src.unsqueeze(1).expand_as(neg), neg, dim=-1))

    # The citation2 evaluator is used on purpose for every dataset: it computes MRR
    # from one positive score and a fixed-width row of negative scores per edge.
    evaluator = Evaluator(name='ogbl-citation2')
    eval_dict = evaluator.eval({
                'y_pred_pos': torch.cat(pos_chunks),
                'y_pred_neg': torch.cat(neg_chunks),
            })
    return eval_dict["mrr_list"].mean()

In [5]:
def calc_and_save_baseline_cosine(dataset_name):
    """Compute the cosine-similarity MRR baseline for one dataset and log it to Comet.

    The train, valid and test MRR are computed first, then logged as
    ``train_mrr`` / ``valid_mrr`` / ``test_mrr`` to a Comet experiment named
    ``cosine_similarity_<dataset_name>``.

    The Comet API key is no longer embedded in the notebook. It is read from the
    ``COMET_API_KEY`` environment variable; when that is unset, ``None`` is passed
    and Comet falls back to its own configuration lookup.

    Args:
        dataset_name: Dataset identifier forwarded to ``get_edge_split``.
    """
    edge_split, dataset = get_edge_split(dataset_name)

    # float() turns the 0-d tensors returned by calc_mrr_cosine into plain numbers.
    metrics = {
        f"{split}_mrr": float(calc_mrr_cosine(edge_split, dataset, split))
        for split in ("train", "valid", "test")
    }

    experiment = Experiment(
        api_key=os.environ.get("COMET_API_KEY"),
        project_name="link-prediction-baselines",
        workspace="swiggy123",
    )
    try:
        experiment.set_name(f"cosine_similarity_{dataset_name}")
        experiment.log_metrics(metrics)
    finally:
        # Always close the experiment, even if logging fails.
        experiment.end()

In [47]:
# Run and log the cosine-similarity baseline for each benchmark dataset, one after another.
for dataset_name in ("ogbn-papers100M", "ogbn-arxiv", "ogbl-citation2"):
    calc_and_save_baseline_cosine(dataset_name)

ogbn-papers100M has been updated.
Creating Splits for dataset ogbn-papers100M
Returning Splits for dataset ogbn-papers100M
