In [1]:
import os

import pandas as pd
import torch
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

In [2]:
# Read the labelled citation-network CSV, then keep only the rows whose
# 'references' field is present.
initial_data = pd.read_csv(
    '/home/student/FinalProject/PaperFeedback/Datasets/acm_citation_network_v8_labeled.csv'
)
has_references = initial_data['references'].notna()
network_data = initial_data[has_references]

In [3]:
# The abstract is the text that gets embedded further down, so rows without
# one are dropped as well.
has_abstract = network_data['abstract'].notna()
network_data = network_data[has_abstract]

In [9]:
# Latest publication year found in the filtered data.
year_column = network_data['year']
max_year = year_column.max()
print(max_year)

2015.0


In [10]:
# Restrict the working set to papers published in the latest year.
is_latest_year = network_data['year'] == max_year
last_year_data = network_data[is_latest_year]

In [11]:
# Non-null count per column for the latest-year subset.
last_year_data.count()

Unnamed: 0          6578
index               6578
title               6578
authors             6506
year                6578
venue               6578
references          6578
abstract            6578
id                  6578
clustered_labels    6578
dtype: int64

In [36]:
# Secrets must not be stored in the notebook: the Pinecone key is read from the
# PINECONE_API_KEY environment variable. The key that was previously hard-coded
# in this cell has been exposed and should be rotated.
API_KEY = os.environ.get('PINECONE_API_KEY')
if not API_KEY:
    print('PINECONE_API_KEY is not set; export it before creating the Pinecone client.')

index_name = 'ann-embeddings'  # name of the Pinecone index to query
TOP_N = 10        # documents kept per query after reranking
TOP_K = 100       # candidates retrieved per query from the vector index
DOC_NUMBER = 300  # number of papers sampled for the evaluation

In [28]:
# Create the Pinecone client from the configured key and open a handle to the
# index named by `index_name`.
pc = Pinecone(api_key=API_KEY)
index = pc.Index(index_name)

In [29]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('using GPU' if device == 'cuda' else 'using CPU')

using GPU


In [30]:
# Load the sentence-embedding model and place it on the selected device.
model = SentenceTransformer('all-MiniLM-L6-v2').to(device)



## Sample random papers that have references

In [None]:
# Draw the evaluation set; the fixed seed makes the same papers come out on every run.
sampled_documents = last_year_data.sample(n=DOC_NUMBER, random_state=42)

# Preview the first rows only (rich display) instead of printing the whole frame.
sampled_documents.head()

## find top k closest documents for each document

In [None]:
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
import pandas as pd

# Reuse the Pinecone `index` handle and the device-placed `model` created in the
# setup cells above; rebuilding them here only repeated the work and dropped the
# explicit device placement of the model.

# One entry per sampled paper: the paper row plus its ranked candidate matches.
results = []

for _, row in sampled_documents.iterrows():
    # Identifier of the query paper, compared against match ids below.
    document_index = row['index']

    # Encoding a single string yields a 1-D vector, so `.tolist()` produces the
    # flat list of floats that the `vector` argument expects.
    embedded_query = model.encode(row['abstract'])

    # Ask for one extra hit because the query paper may be returned as a match.
    response = index.query(
        vector=embedded_query.tolist(),
        top_k=TOP_K + 1,
        include_values=False,
        include_metadata=True,
    )

    # Drop the query paper, then cap at TOP_K so every paper is evaluated on
    # the same number of candidates even when it was not among the matches.
    filtered_matches = [
        match for match in response['matches'] if match['id'] != document_index
    ][:TOP_K]

    results.append({
        'document': row,
        'closest_documents': filtered_matches
    })

## evaluate retrieved top k documents

In [39]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_results(results):
    """Score retrieved candidates against each paper's own reference list.

    Args:
        results: list of dicts with two keys:
            'document' - mapping whose 'references' entry is a ';'-separated
                string of reference ids;
            'closest_documents' - ranked list of matches, each readable as
                ``match['id']``.

    Returns:
        Tuple ``(precision, recall, f1, map_score, mrr_score)``.
        Precision, recall and F1 are computed once over the pooled labels of
        all papers; for each paper the labelled ids are the union of its
        references and its retrieved ids. MAP and MRR are per-paper averages.
        Average precision is normalised by the number of relevant documents
        actually retrieved, not by the size of the reference list.
        Returns five ``None`` values when `results` is empty.
    """
    y_true = []
    y_pred = []
    average_precisions = []
    reciprocal_ranks = []

    for result in results:
        # Normalise the reference tokens: surrounding whitespace and empty
        # tokens (e.g. from a trailing ';') must not count as references.
        actual_references = {
            ref.strip()
            for ref in result['document']['references'].split(';')
            if ref.strip()
        }
        predicted_ids = [match['id'] for match in result['closest_documents']]
        predicted_lookup = set(predicted_ids)  # constant-time membership

        # Label every id that is either a true reference or a retrieved document.
        for doc_id in actual_references.union(predicted_ids):
            y_true.append(1 if doc_id in actual_references else 0)
            y_pred.append(1 if doc_id in predicted_lookup else 0)

        # A single pass over the ranking yields both AP and RR.
        hits = 0
        precision_sum = 0
        first_hit_rank = None
        for rank, doc_id in enumerate(predicted_ids, start=1):
            if doc_id in actual_references:
                hits += 1
                precision_sum += hits / rank
                if first_hit_rank is None:
                    first_hit_rank = rank
        average_precisions.append(precision_sum / hits if hits else 0)
        reciprocal_ranks.append(1 / first_hit_rank if first_hit_rank else 0)

    # Without any result the averages below would divide by zero.
    if not average_precisions:
        print("Error: no results to evaluate.")
        return None, None, None, None, None

    # Debugging: sizes and leading samples of the pooled label vectors.
    # The two lists are appended in lockstep, so their lengths always agree.
    print(f'Size of y_true: {len(y_true)}, Size of y_pred: {len(y_pred)}')
    print(f'y_true samples: {y_true[:10]}')
    print(f'y_pred samples: {y_pred[:10]}')

    precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
    recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='binary', zero_division=0)
    map_score = sum(average_precisions) / len(average_precisions)
    mrr_score = sum(reciprocal_ranks) / len(reciprocal_ranks)

    return precision, recall, f1, map_score, mrr_score

# Score the raw retrieval output collected in `results`.
precision, recall, f1, map_score, mrr_score = evaluate_results(results)

# The evaluator signals an invalid run with None values; report only real scores.
if precision is not None:
    for label, value in (
        ('Precision', precision),
        ('Recall', recall),
        ('F1 Score', f1),
        ('MAP', map_score),
        ('MRR', mrr_score),
    ):
        print(f'{label}: {value:.4f}')


Size of y_true: 32737, Size of y_pred: 32737
y_true samples: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
y_pred samples: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Precision: 0.0295
Recall: 0.2441
F1 Score: 0.0526
MAP: 0.1925
MRR: 0.3029


## Rerank documents

In [40]:
def limit_characters(text, max_chars=900):
    """Return the leading `max_chars` characters of `text`."""
    # Slicing never over-runs: text already within the limit comes back whole.
    return text[:max_chars]

# One entry per query paper: the paper row plus the reranker's response.
reranked_results = []

for result in results:
    row = result['document']  # the query paper

    # Send at most TOP_K candidates to the reranker, even if retrieval kept one extra.
    closest_documents = result['closest_documents'][:TOP_K]

    # The reranker receives each candidate's id and its abstract cut to 900
    # characters; a candidate without an 'abstract' metadata entry contributes
    # an empty text.
    matches_with_details = [
        {
            'id': match['id'],
            'text': limit_characters(match['metadata'].get('abstract', ""), max_chars=900),
        }
        for match in closest_documents
    ]

    # NOTE(review): the query abstract is sent at full length while candidate
    # texts are truncated - confirm the model's input limit cannot be exceeded.
    reranked_documents = pc.inference.rerank(
        model="bge-reranker-v2-m3",
        query=row['abstract'],
        documents=matches_with_details,
        top_n=TOP_N,            # how many reranked documents to return
        return_documents=True,  # include id/text of each returned document
    )

    reranked_results.append({
        'document': row,
        'closest_documents': reranked_documents
    })

# Show each query paper followed by its candidates in the order the reranker returned them.
for entry in reranked_results:
    print("Original Document:", entry['document'])
    print("Reranked Documents:")
    for position, hit in enumerate(entry['closest_documents'].data, start=1):
        candidate = hit['document']
        print(f"{position}. id: {candidate['id']}   score: {hit['score']}   abstract: {candidate['text']}\n")
    print("\n")


Original Document: Unnamed: 0                                                    1655939
index                                        559147bc0cf232eb904fb961
title               Using a Tangible Versus a Multi-touch Graphica...
authors             Joyce Ma, Lisa Sindorf, Isaac Liao, Jennifer F...
year                                                           2015.0
venue               Proceedings of the Ninth International Confere...
references          5390ba3820f70186a0f373ef;5390881820f70186a0d81...
abstract            We describe a study comparing the behavior of ...
id                                                            1655939
clustered_labels                                                    0
Name: 1655939, dtype: object
Reranked Documents:
1. id: 53908b4920f70186a0dbacc5   score: 0.7821637   abstract: This describes the usability and interaction challenges in creating a unique museum exhibit which utilizes real-time compositing, and hides complex computational tasks b

## evaluate retrieved top k documents **after reranking**

In [41]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_reranker(reranked_results):
    """Compute pooled precision, recall and F1 for the reranked candidates.

    Args:
        reranked_results: list of dicts with two keys:
            'document' - mapping whose 'references' entry is a ';'-separated
                string of reference ids;
            'closest_documents' - object whose ``.data`` holds the reranked
                items, each readable as ``item['document']['id']``.

    Returns:
        Tuple ``(precision, recall, f1)`` computed once over the labels of all
        papers; for each paper the labelled ids are the union of its
        references and its reranked ids.
    """
    y_true = []
    y_pred = []

    for reranked_result in reranked_results:
        # Normalise the reference tokens: surrounding whitespace and empty
        # tokens (e.g. from a trailing ';') must not count as references.
        actual_references = {
            ref.strip()
            for ref in reranked_result['document']['references'].split(';')
            if ref.strip()
        }

        predicted_ids = {doc['document']['id'] for doc in reranked_result['closest_documents'].data}

        # Label every id that is either a true reference or a reranked document.
        for doc_id in actual_references.union(predicted_ids):
            y_true.append(1 if doc_id in actual_references else 0)
            y_pred.append(1 if doc_id in predicted_ids else 0)

    precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
    recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='binary', zero_division=0)

    return precision, recall, f1

# Sample code to calculate MRR
def mean_reciprocal_rank(results):
    """Mean reciprocal rank of the first relevant document per query.

    Args:
        results: list of dicts with two keys:
            'document' - mapping whose 'references' entry is a ';'-separated
                string of reference ids;
            'closest_documents' - object whose ``.data`` holds the ranked
                items, each readable as ``item['document']['id']``.

    Returns:
        Average over all queries of ``1 / rank`` of the first ranked item
        whose id is one of the query's references (0 for a query with no
        such item). Returns 0.0 when `results` is empty.
    """
    reciprocal_ranks = []
    for result in results:
        # Compare against the parsed id set; testing `id in references` on the
        # raw string would accept any substring of the joined reference list.
        references = {
            ref.strip()
            for ref in result['document']['references'].split(';')
            if ref.strip()
        }
        reciprocal_rank = 0
        for rank, doc in enumerate(result['closest_documents'].data, start=1):
            if doc['document']['id'] in references:
                reciprocal_rank = 1 / rank
                break
        reciprocal_ranks.append(reciprocal_rank)

    # np.mean of an empty list is NaN (with a warning); report 0.0 instead.
    if not reciprocal_ranks:
        return 0.0
    return np.mean(reciprocal_ranks)

# Sample code to calculate MAP
def mean_average_precision(results):
    """Mean of the per-query average precision over the ranked documents.

    Args:
        results: list of dicts with two keys:
            'document' - mapping whose 'references' entry is a ';'-separated
                string of reference ids;
            'closest_documents' - object whose ``.data`` holds the ranked
                items, each readable as ``item['document']['id']``.

    Returns:
        Average over all queries of the average precision. For one query this
        is the sum of precision@rank at each relevant item divided by the
        number of relevant items found in the ranking (0 when none is found).
        Returns 0.0 when `results` is empty.
    """
    average_precisions = []
    for result in results:
        # Compare against the parsed id set; testing `id in references` on the
        # raw string would accept any substring of the joined reference list.
        references = {
            ref.strip()
            for ref in result['document']['references'].split(';')
            if ref.strip()
        }
        num_relevant = 0
        score_sum = 0
        for rank, doc in enumerate(result['closest_documents'].data, start=1):
            if doc['document']['id'] in references:
                num_relevant += 1
                score_sum += num_relevant / rank  # precision at this rank
        average_precisions.append(score_sum / num_relevant if num_relevant else 0)

    # np.mean of an empty list is NaN (with a warning); report 0.0 instead.
    if not average_precisions:
        return 0.0
    return np.mean(average_precisions)


# Classification-style scores for the reranked candidates.
precision, recall, f1 = evaluate_reranker(reranked_results)
for label, value in (('Precision', precision), ('Recall', recall), ('F1 Score', f1)):
    print(f'{label}: {value:.4f}')

# Ranking-quality scores for the same reranked candidates.
mrr = mean_reciprocal_rank(reranked_results)
map_score = mean_average_precision(reranked_results)
for label, value in (('MAP', map_score), ('MRR', mrr)):
    print(f'{label}: {value:.4f}')


Precision: 0.0843
Recall: 0.0699
F1 Score: 0.0764
MAP: 0.2150
MRR: 0.2388


## Try filtering by year (experimental variants)

In [None]:
# #added year filtering

# from sklearn.metrics import precision_score, recall_score, f1_score

# def evaluate_results(results):
#     # Create lists to hold true labels and predicted labels
#     y_true = []
#     y_pred = []
#     average_precisions = []
#     reciprocal_ranks = []

#     for result in results:
#         # Get the actual references (ensure it's a set) and closest document IDs
#         actual_references = set(result['document']['references'].split(';'))  # Actual references
#         predicted_ids = [match['id'] for match in result['closest_documents']]  # Predicted IDs (list for ranking)
#         main_document_year = result['document']['year']  # Extracting the document's year


#         # Create a combined set of all document IDs (true and predicted)
#         all_ids = actual_references.union(predicted_ids)

#         # Append true positives and false negatives for precision, recall, F1 calculations
#         for doc_id in all_ids:
#             is_append=True
#             doc_year_array= network_data.loc[network_data['index'] == doc_id, 'year'].values
#             if doc_year_array.size > 0:
#                 doc_year = doc_year_array[0]  # Extract the first element
#                 if doc_year > main_document_year: #ensureto consider evaluation only for documents that have year smaller than main doc
#                     is_append=False
#             if is_append==True:
#                 y_true.append(1 if doc_id in actual_references else 0)
#                 y_pred.append(1 if doc_id in predicted_ids else 0)

#         # Calculate Average Precision (AP) for MAP
#         relevant_docs_retrieved = 0
#         score_sum = 0
#         rank=0
#         for doc_id in predicted_ids:
#             is_append=True
#             doc_year_array= network_data.loc[network_data['index'] == doc_id, 'year'].values
#             if doc_year_array.size > 0:
#                 doc_year = doc_year_array[0]  # Extract the first element
#                 if doc_year > main_document_year: #ensure to consider evaluation only for documents that have year smaller than main doc
#                     is_append=False

#             if is_append:
#                 if doc_id in actual_references:
#                     relevant_docs_retrieved += 1
#                     precision_at_k = relevant_docs_retrieved / (rank + 1)
#                     score_sum += precision_at_k
#                 rank+=1
            
#         if relevant_docs_retrieved > 0:
#             average_precisions.append(score_sum / relevant_docs_retrieved)
#         else:
#             average_precisions.append(0)
#         rank=0
#         # Calculate Reciprocal Rank (RR) for MRR
#         for doc_id in predicted_ids:
#             is_append=True
#             doc_year_array= network_data.loc[network_data['index'] == doc_id, 'year'].values
#             if doc_year_array.size > 0:
#                 doc_year = doc_year_array[0]  # Extract the first element
#                 if doc_year > main_document_year: #ensureto consider evaluation only for documents that have year smaller than main doc
#                     is_append=False
#             if is_append:
#                 if doc_id in actual_references:
#                     reciprocal_ranks.append(1 / (rank + 1))
#                     break
#                 else:
#                     rank+=1
#         else:
#             reciprocal_ranks.append(0)  # No relevant doc found

#     # Debugging: Print sizes and samples of y_true and y_pred
#     print(f'Size of y_true: {len(y_true)}, Size of y_pred: {len(y_pred)}')
#     print(f'y_true samples: {y_true[:10]}')  # Print first 10 elements for verification
#     print(f'y_pred samples: {y_pred[:10]}')  # Print first 10 elements for verification

#     # Ensure y_true and y_pred are the same size before calculating metrics
#     if len(y_true) != len(y_pred):
#         print("Error: y_true and y_pred are not the same length.")
#         return None, None, None, None, None

#     # Calculate evaluation metrics
#     precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
#     recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
#     f1 = f1_score(y_true, y_pred, average='binary', zero_division=0)
#     map_score = sum(average_precisions) / len(average_precisions)
#     mrr_score = sum(reciprocal_ranks) / len(reciprocal_ranks)

#     return precision, recall, f1, map_score, mrr_score

# # Assuming results is your list of results from previous queries
# precision, recall, f1, map_score, mrr_score = evaluate_results(results)

# # Print evaluation metrics if valid
# if precision is not None:
#     print(f'Precision: {precision:.4f}')
#     print(f'Recall: {recall:.4f}')
#     print(f'F1 Score: {f1:.4f}')
#     print(f'MAP: {map_score:.4f}')
#     print(f'MRR: {mrr_score:.4f}')


In [None]:
# #added year filtering 
# import numpy as np
# from sklearn.metrics import precision_score, recall_score, f1_score

# def evaluate_reranker(reranked_results):
#     # Initialize lists to hold true and predicted labels for evaluation
#     y_true = []
#     y_pred = []

#     # Loop through each document in the reranked results
#     for reranked_result in reranked_results:
#         # Get the actual references (as a set) from the original document
#         actual_references = set(reranked_result['document']['references'].split(';'))
        
#         # Get the reranked document IDs from the closest documents data
#         predicted_ids = {doc['document']['id'] for doc in reranked_result['closest_documents'].data}

#         # Create a combined set of all document IDs (true and predicted)
#         all_ids = actual_references.union(predicted_ids)

#         main_document_year = reranked_result['document']['year']  # Extracting the document's year


#         # Populate y_true and y_pred for metrics calculation
#         for doc_id in all_ids:
#             is_append=True
#             doc_year_array= network_data.loc[network_data['index'] == doc_id, 'year'].values
#             if doc_year_array.size > 0:
#                 doc_year = doc_year_array[0]  # Extract the first element
#                 if doc_year > main_document_year: #ensureto consider evaluation only for documents that have year smaller than main doc
#                     is_append=False
#             if is_append==True:
#                 y_true.append(1 if doc_id in actual_references else 0)
#                 y_pred.append(1 if doc_id in predicted_ids else 0)

#     # Calculate evaluation metrics
#     precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
#     recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
#     f1 = f1_score(y_true, y_pred, average='binary', zero_division=0)

#     return precision, recall, f1

# # Sample code to calculate MRR
# def mean_reciprocal_rank(results):
#     rank=0
#     reciprocal_ranks = []
#     for result in results:
#         main_document_year = result['document']['year']  # Extracting the document's year
#         # Find the rank of the first relevant document in reranked results
#         for doc in result['closest_documents'].data:
#             is_append=True
#             doc_year_array= network_data.loc[network_data['index'] == doc['document']['id'], 'year'].values
#             if doc_year_array.size > 0:
#                 doc_year = doc_year_array[0]  # Extract the first element
#                 if doc_year > main_document_year: #ensureto consider evaluation only for documents that have year smaller than main doc
#                     is_append=False
#             if is_append:
#                 if doc['document']['id'] in result['document']['references']:
#                     reciprocal_ranks.append(1 / (rank + 1))
#                     break
#                 else:
#                     rank+=1
#         else:
#             reciprocal_ranks.append(0)  # No relevant doc found
#     return np.mean(reciprocal_ranks)


# # Sample code to calculate MAP
# def mean_average_precision(results):
#     rank=0
#     average_precisions = []
#     for result in results:
#         num_relevant = 0
#         score_sum = 0
#         main_document_year = result['document']['year']  # Extracting the document's year
#         #print(f'main doc year {main_document_year}')
#         for doc in result['closest_documents'].data:
#             is_append=True
#             doc_year_array= network_data.loc[network_data['index'] == doc['document']['id'], 'year'].values
#             if doc_year_array.size > 0:
#                 doc_year = doc_year_array[0]  # Extract the first element
#                 #print(f'reranked doc year {doc_year}')
#                 if doc_year > main_document_year: #ensure to consider evaluation only for documents that have year smaller than main doc
#                     is_append=False

#             if is_append:
#                 if doc['document']['id'] in result['document']['references']:
#                     num_relevant += 1
#                     precision_at_k = num_relevant / (rank + 1)
#                     score_sum += precision_at_k
#                 rank+=1
#         if num_relevant > 0:
#             average_precisions.append(score_sum / num_relevant)
#         else:
#             average_precisions.append(0)
#     return np.mean(average_precisions)


# # Assuming reranked_results contains the results from the reranker
# precision, recall, f1 = evaluate_reranker(reranked_results)

# # Print evaluation metrics
# print(f'Precision: {precision:.4f}')
# print(f'Recall: {recall:.4f}')
# print(f'F1 Score: {f1:.4f}')

# #mrr = mean_reciprocal_rank(reranked_results)
# map_score = mean_average_precision(reranked_results)
# print(f'MRR: {mrr:.4f}')
# print(f'MAP: {map_score:.4f}')