In [2]:
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
import torch
import pandas as pd
from collections import OrderedDict
import numpy as np
import random
import faiss
from sklearn.metrics import precision_score, recall_score, f1_score
# Seed Python's built-in `random` module only; this call does not seed NumPy or torch.
random.seed(42)

In [3]:
# Load the labelled citation network and keep only papers that have both
# a reference list and an abstract.
initial_data = pd.read_csv('/home/student/FinalProject/PaperFeedback/Datasets/acm_citation_network_v8_labeled.csv')
has_references = initial_data['references'].notna()
has_abstract = initial_data['abstract'].notna()
network_data = initial_data[has_references & has_abstract]

# Query documents are drawn from the most recent publication year in the filtered data.
max_year = network_data['year'].max()
print(max_year)
last_year_data = network_data[network_data['year'] == max_year]

2015.0


In [4]:
def load_embeddings(embeddings_path):
    """Read an embeddings CSV and split it into an ID vector and a float32 matrix.

    Args:
        embeddings_path: Path of a CSV file that has an 'id' column; every
            other column is treated as one embedding dimension.

    Returns:
        (ids, embeddings): the 'id' column as a NumPy array, and the remaining
        columns as a float32 NumPy array with one row per CSV row.
    """
    table = pd.read_csv(embeddings_path)
    id_column = table['id'].values
    vector_columns = table.drop(columns=['id'])
    return id_column, vector_columns.values.astype('float32')

In [5]:
# Load the three embedding tables (co-citation, bibliographic coupling, author
# collaboration). Each call returns the CSV's 'id' column and a float32 matrix.
# NOTE(review): later cells index the matrices directly with integer document ids,
# which presumably relies on row i belonging to id i — confirm against the CSVs.
co_cite_ids ,co_cite_embeddings = load_embeddings('/home/student/FinalProject/PaperFeedback/Datasets/co_cite_embeddings.csv')
bibliography_coup_ids ,bibliography_coup_embeddings = load_embeddings('/home/student/FinalProject/PaperFeedback/Datasets/bib_coup_embeddings.csv')
author_collab_ids ,author_collab_embeddings = load_embeddings('/home/student/FinalProject/PaperFeedback/Datasets/author_colab.csv')

In [23]:
# Two-way lookup between the 'index' and 'id' columns of the full dataset.
# Judging by the variable names, 'index' holds the string paper identifier and
# 'id' the integer one — TODO confirm against the CSV schema.
id_mapper_string_to_int = dict(zip(initial_data['index'], initial_data['id']))
id_mapper_int_to_string = dict(zip(initial_data['id'], initial_data['index']))

In [7]:
import os

# The Pinecone API key is a secret and must not be stored in the notebook.
# Export PINECONE_API_KEY before starting the kernel. The key that used to be
# hard-coded here has been exposed and should be rotated.
# When the variable is unset API_KEY is None; the Pinecone client is expected to
# reject a missing key when it is constructed — confirm for the installed version.
API_KEY = os.environ.get('PINECONE_API_KEY')
index_name = 'ann-embeddings'
TOP_K = 100      # ANN candidates requested per query document (before reranking)
TOP_N = 10       # candidates kept per query document after reranking
DOC_NUMBER = 50  # number of query documents sampled for the evaluation

In [8]:
# Pinecone client; `pc` is also passed to rerank_results for the hosted reranker.
pc = Pinecone(api_key=API_KEY)
# Handle to the index named by `index_name`, used for ANN queries and vector fetches.
ann_index = pc.Index(index_name)

In [9]:
# Load the prebuilt FAISS indexes, one per graph embedding space. They are searched
# later with rows of the matching *_embeddings matrix as query vectors.
co_cite_index = faiss.read_index('/home/student/FinalProject/PaperFeedback/Utils/co_cite_index.faiss')
author_collab_index = faiss.read_index('/home/student/FinalProject/PaperFeedback/Utils/author_collab_index.faiss')
bibliography_coup_index= faiss.read_index('/home/student/FinalProject/PaperFeedback/Utils/bibliography_coup_index.faiss')

In [10]:
# Run the sentence encoder on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    print('using GPU')
    device = 'cuda'
else:
    print('using CPU')
    device = 'cpu'

using GPU


In [11]:
# Sentence encoder used to embed query abstracts before the Pinecone query.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Move the encoder to the device chosen above ('cuda' or 'cpu').
model = model.to(device)



## Sample random papers that have references

In [12]:
# Draw DOC_NUMBER query documents from the latest-year papers; the fixed
# random_state makes the sample repeatable.
sampled_documents = last_year_data.sample(n=DOC_NUMBER, random_state=42)

## Find the top-k closest documents for each sampled document

In [13]:
# One entry per sampled query document: the document row and its ANN candidates.
results = []

# Query the Pinecone index with the embedded abstract of every sampled document.
for _, row in sampled_documents.iterrows():
    # Identifier of the query document (the 'index' column); used below to drop
    # the document from its own candidate list.
    document_index = row['index']

    # Embed the abstract with the sentence-transformer model.
    embedded_query = model.encode([row['abstract']])

    # Request one extra match so that TOP_K candidates remain once the query
    # document itself has been removed.
    response = ann_index.query(
        vector=embedded_query.tolist(),
        top_k=TOP_K+1,
        include_values=False,
        include_metadata=True
    )

    # Remove the query document, then cap at TOP_K: if the query document is not
    # among the matches, all TOP_K+1 results would otherwise be kept.
    filtered_matches = [match for match in response['matches'] if match['id'] != document_index][:TOP_K]

    results.append({
        'document': row,
        'closest_documents': filtered_matches
    })

## Rerank documents

In [14]:
def rerank_results(results, pc, top_n=5, max_chars=900):
    """
    Rerank the candidates of every query document with the hosted BGE reranker.

    Args:
    - results (list): One dict per query document with the keys 'document'
      (a mapping that provides 'abstract') and 'closest_documents' (an iterable
      of matches, each providing 'id' and 'metadata').
    - pc (object): Client object exposing `inference.rerank(...)`.
    - top_n (int): Number of top documents to return after reranking.
    - max_chars (int): Maximum number of characters of each candidate abstract
      sent to the reranker. The query abstract is sent untruncated.

    Returns:
    - reranked_results (list): One dict per query document with the keys
      'document' (the unchanged query document) and 'closest_documents'
      (whatever `pc.inference.rerank` returned for that document).
    """

    def limit_characters(text, max_chars=max_chars):
        """Truncate a text to `max_chars` characters; a missing text becomes ''."""
        # Slicing already leaves shorter strings untouched. `or ""` covers a
        # metadata entry whose 'abstract' key exists but holds None.
        return (text or "")[:max_chars]

    reranked_results = []

    for result in results:
        row = result['document']  # The query document
        closest_documents = result['closest_documents']  # Its ANN candidates

        # The reranker only needs an id and a (truncated) text per candidate.
        matches_with_details = [
            {
                'id': match['id'],
                'text': limit_characters(match['metadata'].get('abstract', ""), max_chars=max_chars)
            }
            for match in closest_documents
        ]

        # Rerank the candidates against the query abstract using the BGE reranker.
        reranked_documents = pc.inference.rerank(
            model="bge-reranker-v2-m3",
            query=row['abstract'],
            documents=matches_with_details,
            top_n=top_n,
            return_documents=True
        )

        reranked_results.append({
            'document': row,
            'closest_documents': reranked_documents
        })

    return reranked_results

In [15]:
# Rerank the ANN candidates of every query document and keep the TOP_N best;
# candidate abstracts are truncated to 900 characters before being sent.
reranked_results = rerank_results(results, pc, top_n=TOP_N, max_chars=900)

In [None]:
# Show every query document followed by its reranked candidates, best first.
for reranked_result in reranked_results:
    print("Original Document:", reranked_result['document'])
    print("Reranked Documents:")
    for rank, hit in enumerate(reranked_result['closest_documents'].data, start=1):
        candidate = hit['document']
        print(str(rank)+'. id: '+candidate['id']+'   score: '+str(hit['score'])+'   abstract: '+candidate['text']+'\n')
    print("\n")

## GNN retrieval based on the TOP_N closest documents of each sampled document

In [41]:
# Map every query document (string id) to the integer ids of its reranked candidates.
dict_N_docs_closest_int_ids = {}
for reranked_result in reranked_results:
    doc_id = reranked_result['document']['index']
    reranked_hits = reranked_result['closest_documents'].data
    dict_N_docs_closest_int_ids[doc_id] = [
        id_mapper_string_to_int[hit['document']['id']] for hit in reranked_hits
    ]

In [None]:
k = 3  # number of graph neighbours kept per reranked document

# Per graph: query document id -> OrderedDict(reranked document id -> neighbour ids).
# OrderedDicts keep the reranker's ordering.
dict_N_docs_closest_str_ids_co_cite = OrderedDict()
dict_N_docs_closest_str_ids_author_collab = OrderedDict()
dict_N_docs_closest_str_ids_bibliography_coup = OrderedDict()

# Created here but not filled by this cell.
dict_gnn_items_without_original_ann_docs_co_cite = OrderedDict()
dict_gnn_items_without_original_ann_docs_author_collab = OrderedDict()
dict_gnn_items_without_original_ann_docs_bibliography_coup = OrderedDict()

# (embedding matrix, FAISS index, output mapping) for each of the three graphs.
graph_sources = (
    (co_cite_embeddings, co_cite_index, dict_N_docs_closest_str_ids_co_cite),
    (author_collab_embeddings, author_collab_index, dict_N_docs_closest_str_ids_author_collab),
    (bibliography_coup_embeddings, bibliography_coup_index, dict_N_docs_closest_str_ids_bibliography_coup),
)

for doc_id_str, doc_closest_docs in dict_N_docs_closest_int_ids.items():
    for graph_embeddings, graph_index, graph_output in graph_sources:
        neighbours_by_close_doc = OrderedDict()
        for close_doc in doc_closest_docs:
            # The integer id is used directly as the row number of the embedding matrix.
            query_vector = np.ascontiguousarray(
                graph_embeddings[close_doc].astype('float32').reshape(1, -1)
            )
            # Ask for k+1 hits and drop the first one, which is taken to be the
            # query vector itself.
            _, neighbour_indices = graph_index.search(query_vector, k + 1)
            neighbours_by_close_doc[id_mapper_int_to_string[close_doc]] = [
                id_mapper_int_to_string[item] for item in neighbour_indices[0][1:]
            ]
        graph_output[doc_id_str] = neighbours_by_close_doc

In [61]:
def process_closest_docs(dict_of_specific_index, n_gnn_per_ann=2):
    """Flatten the per-query neighbour mappings into two candidate lists.

    Args:
        dict_of_specific_index: Mapping query id -> mapping of reranked (ANN)
            document id -> list of graph-neighbour ids for that document.
        n_gnn_per_ann: How many leading neighbours of every ANN document go into
            the combined list. Defaults to 2, the previously hard-coded value.

    Returns:
        A tuple of two dicts keyed by query id:
        - combined list: for every ANN document, the document id followed by its
          first `n_gnn_per_ann` neighbours;
        - neighbour-only list: all neighbours of every ANN document, without the
          ANN documents themselves.
        Both lists keep the iteration order of the input and may contain duplicates.
    """
    dict_of_lists_relevant_gnn_and_ann = {}
    dict_of_lists_relevant_only_gnn = {}

    for query_id, neighbours_by_ann_doc in dict_of_specific_index.items():
        gnn_and_ann = []
        only_gnn = []

        for ann_doc_id, neighbour_ids in neighbours_by_ann_doc.items():
            gnn_and_ann.append(ann_doc_id)
            gnn_and_ann.extend(neighbour_ids[:n_gnn_per_ann])
            only_gnn.extend(neighbour_ids)

        dict_of_lists_relevant_gnn_and_ann[query_id] = gnn_and_ann
        dict_of_lists_relevant_only_gnn[query_id] = only_gnn

    return dict_of_lists_relevant_gnn_and_ann, dict_of_lists_relevant_only_gnn


# Flatten the neighbour mappings of each graph into two candidate lists per query
# document: ANN documents mixed with graph neighbours, and graph neighbours only.
dict_of_lists_relevant_gnn_and_ann_co_cite, dict_of_lists_relevant_only_gnn_co_cite = process_closest_docs(dict_N_docs_closest_str_ids_co_cite)
dict_of_lists_relevant_gnn_and_ann_author_collab, dict_of_lists_relevant_only_gnn_author_collab = process_closest_docs(dict_N_docs_closest_str_ids_author_collab)
dict_of_lists_relevant_gnn_and_ann_bibliography_coup, dict_of_lists_relevant_only_gnn_bibliography_coup = process_closest_docs(dict_N_docs_closest_str_ids_bibliography_coup)

In [62]:
# Sanity check: prints the complete neighbour-only mapping for the co-citation
# graph, i.e. every query document with its full candidate list.
print(dict_of_lists_relevant_only_gnn_co_cite)

{'559147bc0cf232eb904fb961': ['53908b4920f70186a0dbacc4', '53908b4920f70186a0dbacc3', '53908b4920f70186a0dbacc7', '5390a37f20f70186a0e6c397', '5390a37f20f70186a0e6c3fb', '5390a37f20f70186a0e6c405', '55323b9145cec66b6f9da2d8', '558ebe1c0cf2c779a647846b', '55323b9245cec66b6f9da2f1', '55915b220cf232eb904fbe47', '559131230cf232eb904fb2db', '55914ec60cf232eb904fbb1c', '5390a80f20f70186a0e971f9', '5390a80f20f70186a0e971fa', '5390a80f20f70186a0e97164', '5390adfd20f70186a0ec6424', '5390adfd20f70186a0ec642a', '5390adfd20f70186a0ec6428', '53909ed120f70186a0e309fe', '53909ed120f70186a0e30965', '53909ed120f70186a0e309fb', '53909ee020f70186a0e3391a', '53909ee020f70186a0e33923', '53909ee020f70186a0e33919', '5390ae2e20f70186a0ec841b', '539087c720f70186a0d57437', '5390ae2e20f70186a0ec83e1', '5390980720f70186a0e01fcf', '5390980720f70186a0e01fd0', '5390980720f70186a0e01fd2'], '55323b8945cec66b6f9da198': ['5390bf1320f70186a0f50d88', '5390bf1320f70186a0f50d2b', '5390bf1320f70186a0f50d8c', '539087d920f7018

In [84]:
import numpy as np
from scipy.spatial.distance import cosine

# Function to calculate cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two 1-D vectors (1 minus SciPy's cosine distance)."""
    return 1 - cosine(vec1, vec2)

# Function to sort embeddings by cosine similarity with the first embedding
def sort_embeddings_and_ids_by_cosine_similarity(doc_id, embeddings):
    """Order document ids by similarity to the first entry of `embeddings`.

    Args:
        doc_id: Id of the query document. Not used by the computation; kept so
            existing callers keep working.
        embeddings: List of (document id, embedding vector) tuples.

    Returns:
        The document ids sorted by decreasing cosine similarity to the embedding
        of the FIRST tuple in `embeddings`. Ties keep their input order. An empty
        input yields an empty list.

    NOTE(review): the reference vector is the first candidate, not the embedding
    of `doc_id`. If ranking against the query document was intended, the caller
    has to supply that embedding — confirm the intent.
    """
    # Nothing to rank; also avoids an IndexError on embeddings[0].
    if not embeddings:
        return []

    reference_embedding = embeddings[0][1]

    # sorted() is stable, so equally similar documents keep their original order.
    sorted_embeddings = sorted(
        embeddings,
        key=lambda x: cosine_similarity(reference_embedding, x[1]),
        reverse=True,
    )

    return [doc for doc, _ in sorted_embeddings]

# Fetch the embeddings of the candidate ids and return the ids sorted by similarity
def extract_embeddings_from_ids(dict_of_lists_relevant_gnn_and_ann, dict_of_lists_relevant_only_gnn,
                                fetch_batch_size=100):
    """Sort every candidate list by embedding similarity.

    For each query document the candidate embeddings are fetched from the
    module-level `ann_index` and handed to
    `sort_embeddings_and_ids_by_cosine_similarity`. Despite the function name,
    sorted document ids are returned, not embeddings.

    Args:
        dict_of_lists_relevant_gnn_and_ann: Mapping query id -> candidate ids
            (ANN documents mixed with graph neighbours).
        dict_of_lists_relevant_only_gnn: Mapping query id -> candidate ids
            (graph neighbours only).
        fetch_batch_size: Maximum number of ids sent in one fetch request.

    Returns:
        Two dicts (same keys as the two inputs) mapping each query id to its
        candidate ids in sorted order. Duplicate candidates are kept.

    Raises:
        KeyError: if a candidate id is missing from the fetch response.
    """

    def _fetch_vectors(doc_ids):
        """Fetch the vectors of the distinct ids in batches; returns id -> vector record."""
        # dict.fromkeys removes duplicates while keeping the first-seen order.
        unique_ids = list(dict.fromkeys(doc_ids))
        vectors = {}
        for start in range(0, len(unique_ids), fetch_batch_size):
            batch = unique_ids[start:start + fetch_batch_size]
            vectors.update(ann_index.fetch(ids=batch)['vectors'])
        return vectors

    def _sorted_ids_per_query(candidates_by_query):
        """Apply fetch-and-sort to every candidate list of one input mapping."""
        sorted_by_query = {}
        for doc_id, relevant_docs_ids in candidates_by_query.items():
            if not relevant_docs_ids:
                # No candidates: nothing to fetch or rank.
                sorted_by_query[doc_id] = []
                continue
            # Batched requests instead of one network round trip per candidate.
            vectors = _fetch_vectors(relevant_docs_ids)
            embeddings = [(doc, vectors[doc]['values']) for doc in relevant_docs_ids]
            sorted_by_query[doc_id] = sort_embeddings_and_ids_by_cosine_similarity(doc_id, embeddings)
        return sorted_by_query

    dict_of_sorted_doc_ids_gnn_and_ann = _sorted_ids_per_query(dict_of_lists_relevant_gnn_and_ann)
    dict_of_sorted_doc_ids_only_gnn = _sorted_ids_per_query(dict_of_lists_relevant_only_gnn)

    return dict_of_sorted_doc_ids_gnn_and_ann, dict_of_sorted_doc_ids_only_gnn



# Sort the candidate lists of each graph by embedding similarity.
# Note: despite the "dict_of_embeddings_*" names, these variables hold lists of
# sorted document ids (that is what extract_embeddings_from_ids returns).
dict_of_embeddings_relevant_gnn_and_ann_co_cite, dict_of_embeddings_relevant_only_gnn_co_cite = extract_embeddings_from_ids(
    dict_of_lists_relevant_gnn_and_ann_co_cite, dict_of_lists_relevant_only_gnn_co_cite)

dict_of_embeddings_relevant_gnn_and_ann_author_collab, dict_of_embeddings_relevant_only_gnn_author_collab = extract_embeddings_from_ids(
    dict_of_lists_relevant_gnn_and_ann_author_collab, dict_of_lists_relevant_only_gnn_author_collab)

dict_of_embeddings_relevant_gnn_and_ann_bibliography_coup, dict_of_embeddings_relevant_only_gnn_bibliography_coup = extract_embeddings_from_ids(
    dict_of_lists_relevant_gnn_and_ann_bibliography_coup, dict_of_lists_relevant_only_gnn_bibliography_coup)


### Evaluate the GNN-based candidate lists

In [99]:
def evaluate_reranker_gnn(dict_results):
    """Compute pooled precision, recall and F1 of retrieved ids against cited references.

    Args:
        dict_results: Mapping query document id -> list of retrieved document ids.

    Returns:
        (precision, recall, f1) computed over one binary label pair per
        (query, document) combination, pooled across all queries. The true
        references are read from the 'references' column of the module-level
        `initial_data`, split on ';'.
    """
    y_true = []
    y_pred = []

    for original_doc_id, retrieved_list in dict_results.items():
        # References actually cited by the query document.
        original_doc = initial_data[initial_data['index'] == original_doc_id]
        actual_references = set(original_doc['references'].iloc[0].split(';'))

        # A set gives constant-time membership tests in the loop below; duplicates
        # in the retrieved list do not matter for these set-based metrics.
        retrieved_ids = set(retrieved_list)

        # Every id that is either cited or retrieved contributes one label pair.
        all_ids = actual_references.union(retrieved_ids)

        for doc_id in all_ids:
            y_true.append(1 if doc_id in actual_references else 0)
            y_pred.append(1 if doc_id in retrieved_ids else 0)

    precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
    recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='binary', zero_division=0)

    return precision, recall, f1

# Mean reciprocal rank (MRR) for the GNN candidate lists
def mean_reciprocal_rank_gnn(dict_results):
    """Mean reciprocal rank of the first cited reference in each retrieved list.

    Args:
        dict_results: Mapping query document id -> ranked list of retrieved ids.

    Returns:
        The mean over all queries of 1 / rank of the first retrieved id that is
        one of the query's references (0 when none is). References come from the
        'references' column of the module-level `initial_data`, split on ';'.
    """
    reciprocal_ranks = []
    for original_doc_id, retrieved_list in dict_results.items():
        original_doc = initial_data[initial_data['index'] == original_doc_id]
        # Parse the reference string once per query instead of once per candidate.
        relevant_ids = set(original_doc['references'].iloc[0].split(';'))

        for i, doc in enumerate(retrieved_list):
            if doc in relevant_ids:
                reciprocal_ranks.append(1 / (i + 1))
                break
        else:
            reciprocal_ranks.append(0)  # No relevant doc found
    return np.mean(reciprocal_ranks)

# Mean average precision (MAP) for the GNN candidate lists
def mean_average_precision_gnn(dict_results):
    """Mean average precision of the retrieved lists against the cited references.

    Args:
        dict_results: Mapping query document id -> ranked list of retrieved ids.

    Returns:
        The mean over all queries of the average precision, where the precision
        values at the ranks of relevant hits are averaged over the number of
        relevant hits that were retrieved (not over all references of the query).
        Queries without any relevant hit contribute 0.

    NOTE(review): a relevant id that appears more than once in a retrieved list
    is counted at each occurrence — confirm whether lists should be de-duplicated.
    """
    average_precisions = []
    for original_doc_id, retrieved_list in dict_results.items():
        num_relevant = 0
        score_sum = 0
        original_doc = initial_data[initial_data['index'] == original_doc_id]
        # Parse the reference string once per query instead of once per candidate.
        relevant_ids = set(original_doc['references'].iloc[0].split(';'))

        for i, doc in enumerate(retrieved_list):
            if doc in relevant_ids:
                num_relevant += 1
                precision_at_k = num_relevant / (i + 1)
                score_sum += precision_at_k
        if num_relevant > 0:
            average_precisions.append(score_sum / num_relevant)
        else:
            average_precisions.append(0)
    return np.mean(average_precisions)

### Evaluation of the original ANN + reranker pipeline

In [102]:
def evaluate_reranker(reranked_results):
    """Pooled precision, recall and F1 of the reranked documents against cited references.

    Args:
        reranked_results: List of dicts with the keys 'document' (provides a
            ';'-separated 'references' string) and 'closest_documents' (object
            whose `.data` items carry the candidate id at ['document']['id']).

    Returns:
        (precision, recall, f1) over one binary label pair per (query, document)
        combination, pooled across all queries.
    """
    truth_flags = []
    prediction_flags = []

    for entry in reranked_results:
        # Ids the query document really cites.
        cited_ids = set(entry['document']['references'].split(';'))
        # Ids proposed by the reranker.
        proposed_ids = {hit['document']['id'] for hit in entry['closest_documents'].data}

        # One label pair for every id that is cited, proposed, or both.
        for candidate_id in cited_ids.union(proposed_ids):
            truth_flags.append(1 if candidate_id in cited_ids else 0)
            prediction_flags.append(1 if candidate_id in proposed_ids else 0)

    scoring_options = {'average': 'binary', 'zero_division': 0}
    precision = precision_score(truth_flags, prediction_flags, **scoring_options)
    recall = recall_score(truth_flags, prediction_flags, **scoring_options)
    f1 = f1_score(truth_flags, prediction_flags, **scoring_options)
    return precision, recall, f1

# Mean reciprocal rank (MRR) for the ANN + reranker results
def mean_reciprocal_rank(results):
    """Mean reciprocal rank of the first cited reference among the reranked documents.

    Args:
        results: List of dicts with the keys 'document' (provides a ';'-separated
            'references' string) and 'closest_documents' (object whose `.data`
            items carry the candidate id at ['document']['id']), best first.

    Returns:
        The mean over all queries of 1 / rank of the first candidate that is one
        of the query's references (0 when none is).
    """
    reciprocal_ranks = []
    for result in results:
        # Split into exact ids. Testing `id in references_string` would be a
        # substring test and could match a fragment of another id.
        relevant_ids = set(result['document']['references'].split(';'))

        for i, doc in enumerate(result['closest_documents'].data):
            if doc['document']['id'] in relevant_ids:
                reciprocal_ranks.append(1 / (i + 1))
                break
        else:
            reciprocal_ranks.append(0)  # No relevant doc found
    return np.mean(reciprocal_ranks)

# Mean average precision (MAP) for the ANN + reranker results
def mean_average_precision(results):
    """Mean average precision of the reranked documents against the cited references.

    Args:
        results: List of dicts with the keys 'document' (provides a ';'-separated
            'references' string) and 'closest_documents' (object whose `.data`
            items carry the candidate id at ['document']['id']), best first.

    Returns:
        The mean over all queries of the average precision, where the precision
        values at the ranks of relevant hits are averaged over the number of
        relevant hits that were retrieved. Queries without a relevant hit
        contribute 0.
    """
    average_precisions = []
    for result in results:
        num_relevant = 0
        score_sum = 0
        # Split into exact ids. Testing `id in references_string` would be a
        # substring test and could match a fragment of another id.
        relevant_ids = set(result['document']['references'].split(';'))

        for i, doc in enumerate(result['closest_documents'].data):
            if doc['document']['id'] in relevant_ids:
                num_relevant += 1
                precision_at_k = num_relevant / (i + 1)
                score_sum += precision_at_k
        if num_relevant > 0:
            average_precisions.append(score_sum / num_relevant)
        else:
            average_precisions.append(0)
    return np.mean(average_precisions)

In [None]:
# Rerank again keeping 30 documents per query, so the ANN-only baseline is
# evaluated on lists of the same length as the GNN-based ones
# (TOP_N = 10 reranked documents x 3 entries each = 30 candidates).
reranked_results_final_ann = rerank_results(results, pc, top_n=30, max_chars=900)

In [112]:
# Collects the metrics of every retrieval variant, keyed by variant name.
final_evaluation_results = {}
metric_names = ('precision', 'recall', 'f1_score', 'mrr', 'map_score')

# Baseline: ANN retrieval followed by the reranker only.
precision, recall, f1 = evaluate_reranker(reranked_results_final_ann)
mrr = mean_reciprocal_rank(reranked_results_final_ann)
map_score = mean_average_precision(reranked_results_final_ann)
final_evaluation_results['ann_reranker'] = dict(
    zip(metric_names, (precision, recall, f1, mrr, map_score))
)
print(f'results for ann_reranker {final_evaluation_results["ann_reranker"]}')

# GNN-based variants: (descriptive name, query id -> sorted candidate ids).
all_dicts = [
    ("dict_of_embeddings_relevant_gnn_and_ann_co_cite",
     dict_of_embeddings_relevant_gnn_and_ann_co_cite),
    ("dict_of_embeddings_relevant_only_gnn_co_cite",
     dict_of_embeddings_relevant_only_gnn_co_cite),
    ("dict_of_embeddings_relevant_gnn_and_ann_author_collab",
     dict_of_embeddings_relevant_gnn_and_ann_author_collab),
    ("dict_of_embeddings_relevant_only_gnn_author_collab",
     dict_of_embeddings_relevant_only_gnn_author_collab),
    ("dict_of_embeddings_relevant_gnn_and_ann_bibliography_coup",
     dict_of_embeddings_relevant_gnn_and_ann_bibliography_coup),
    ("dict_of_embeddings_relevant_only_gnn_bibliography_coup",
     dict_of_embeddings_relevant_only_gnn_bibliography_coup),
]

for name, gnn_dict in all_dicts:
    precision, recall, f1 = evaluate_reranker_gnn(gnn_dict)
    mrr = mean_reciprocal_rank_gnn(gnn_dict)
    map_score = mean_average_precision_gnn(gnn_dict)
    final_evaluation_results[name] = dict(
        zip(metric_names, (precision, recall, f1, mrr, map_score))
    )
    print(f'results for {name} {final_evaluation_results[name]}')



results for ann_reranker {'precision': 0.03933333333333333, 'recall': 0.14285714285714285, 'f1_score': 0.06168322007318348, 'mrr': 0.21151995226995227, 'map_score': 0.16490257957208942}
results for dict_of_embeddings_relevant_gnn_and_ann_co_cite {'precision': 0.02336448598130841, 'recall': 0.0847457627118644, 'f1_score': 0.036630036630036625, 'mrr': 0.17479156399156398, 'map_score': 0.1631167932285579}
results for dict_of_embeddings_relevant_only_gnn_co_cite {'precision': 0.0013351134846461949, 'recall': 0.004842615012106538, 'f1_score': 0.002093144950287807, 'mrr': 0.0018859649122807015, 'map_score': 0.0018859649122807015}
results for dict_of_embeddings_relevant_gnn_and_ann_author_collab {'precision': 0.02334889926617745, 'recall': 0.0847457627118644, 'f1_score': 0.03661087866108787, 'mrr': 0.17464163614163614, 'map_score': 0.16332885632885635}
results for dict_of_embeddings_relevant_only_gnn_author_collab {'precision': 0.00133422281521014, 'recall': 0.004842615012106538, 'f1_score': 