In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from typing import List, Tuple, Iterator
import ast
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
import gc
from tqdm import tqdm



In [2]:
df_facts = pd.read_csv('processed_fact_checks_english.csv')

In [3]:
posts = pd.read_csv('posts_with_english_language.csv')

In [4]:
# import nltk
# from nltk.corpus import stopwords

# nltk.download('stopwords')

# # Define the stopwords
# stop_words = set(stopwords.words('english'))

# # Function to remove stopwords from a text
# def remove_stopwords(text):
#     words = text.split()
#     filtered_words = [word for word in words if word.lower() not in stop_words]
#     return ' '.join(filtered_words)

# # Apply the function to the claim column
# df_facts['claim_no_stopwords'] = df_facts['claim'].apply(remove_stopwords)

# # Display the first few rows to verify
# print(df_facts[['claim', 'claim_no_stopwords']].head())

In [5]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
def batched_cosine_similarity(embeddings1: np.ndarray,
                            embeddings2: np.ndarray,
                            batch_size: int = 1000) -> np.ndarray:
    """
    Compute the cosine similarity between two sets of embeddings using batches to avoid memory issues.
    :param embeddings1: The first set of embeddings
    :param embeddings2: The second set of embeddings
    :param batch_size: The batch size to use
    :return: The cosine similarity matrix between the two sets of embeddings
    """
    similarity_matrix = np.zeros((len(embeddings1), len(embeddings2)))

    # Convert to torch tensors for faster computation
    if not torch.is_tensor(embeddings1):
        embeddings1 = torch.from_numpy(embeddings1)
        embeddings2 = torch.from_numpy(embeddings2)

    # Normalize the embeddings
    embeddings1_normalized = torch.nn.functional.normalize(embeddings1, p=2, dim=1)
    embeddings2_normalized = torch.nn.functional.normalize(embeddings2, p=2, dim=1)

    # Process in batches
    for i in tqdm(range(0, len(embeddings1), batch_size), desc="Computing similarities"):
        batch_embeddings1 = embeddings1_normalized[i:i + batch_size]

        # Process second dimension in batches too
        for j in range(0, len(embeddings2), batch_size):
            batch_embeddings2 = embeddings2_normalized[j:j + batch_size]

            # Compute similarity for this batch
            batch_similarity = torch.mm(batch_embeddings1, batch_embeddings2.t())

            # Store in the main similarity matrix
            similarity_matrix[i:i + batch_size, j:j + batch_size] = batch_similarity.cpu().numpy()

        # Clear GPU memory if needed
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return similarity_matrix

In [7]:
def stream_cosine_similarity(embeddings1: np.ndarray,
                           embeddings2: np.ndarray,
                           batch_size: int = 100,
                           top_k: int = 5) -> Iterator[Tuple[np.ndarray, np.ndarray]]:
    """
    Compute the cosine similarity between two sets of embeddings using batches to avoid memory issues.
    :param embeddings1: The first set of embeddings
    :param embeddings2: The second set of embeddings
    :param batch_size: The batch size to use
    :param top_k: The number of top elements to return
    :return: An iterator that yields tuples of arrays with the top k indices and scores for each query
    """
    # Convert to torch tensors and move to GPU if available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if not torch.is_tensor(embeddings2):
        embeddings2 = torch.from_numpy(embeddings2).to(device, dtype=torch.float32)

    # Normalize embeddings2 once
    embeddings2_normalized = torch.nn.functional.normalize(embeddings2, p=2, dim=1)

    # Process first set of embeddings in batches
    for i in tqdm(range(0, len(embeddings1), batch_size), desc="Computing similarities"):
        batch_embeddings1 = embeddings1[i:i + batch_size]

        # Convert and normalize batch
        if not torch.is_tensor(batch_embeddings1):
            batch_embeddings1 = torch.from_numpy(batch_embeddings1).to(device, dtype=torch.float32)
        batch_embeddings1 = torch.nn.functional.normalize(batch_embeddings1, p=2, dim=1)

        # Initialize arrays for top k results
        batch_top_scores = torch.full((len(batch_embeddings1), top_k), float('-inf'), device=device)
        batch_top_indices = torch.zeros((len(batch_embeddings1), top_k), dtype=torch.long, device=device)

        # Process second set in batches
        for j in range(0, len(embeddings2), batch_size):
            batch_embeddings2 = embeddings2_normalized[j:j + batch_size]

            # Compute similarities for this mini-batch
            similarities = torch.mm(batch_embeddings1, batch_embeddings2.t())

            # Update top k for each query in current batch
            for query_idx in range(similarities.shape[0]):
                scores = similarities[query_idx]
                top_scores, top_indices = torch.topk(scores, min(top_k, len(scores)))

                # Merge with existing top k
                all_scores = torch.cat([batch_top_scores[query_idx], top_scores])
                all_indices = torch.cat([batch_top_indices[query_idx], top_indices + j])

                merged_scores, merged_indices = torch.topk(all_scores, top_k)
                batch_top_scores[query_idx] = merged_scores
                batch_top_indices[query_idx] = all_indices[merged_indices]

            # Clear GPU memory
            del similarities
            torch.cuda.empty_cache() if torch.cuda.is_available() else None

        # Convert results to CPU and yield
        yield (
            batch_top_indices.cpu().numpy(),
            batch_top_scores.cpu().numpy()
        )

        # Clear batch GPU memory
        del batch_embeddings1, batch_top_scores, batch_top_indices
        torch.cuda.empty_cache() if torch.cuda.is_available() else None

In [8]:
def retrieve_claims_minilm(smp_texts: List[str],
                         fc_texts: List[str],
                         batch_size: int = 32,
                         top_k: int = 5) -> Tuple[np.ndarray, np.ndarray]:
    """
    Memory-efficient retrieval using MiniLM with streaming similarity computation
    :param smp_texts: The list of social media post texts
    :param fc_texts: The list of fact-check texts
    :param batch_size: The batch size to use for encoding
    :param top_k: The number of top-k results to retrieve
    :return: A tuple with two arrays: the top-k indices and scores for each social media post
    """
    # Load model
    model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)

    print("Encoding social media posts...")
    smp_embeddings = model.encode(
        smp_texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True
    )

    print("Encoding fact checks...")
    fc_embeddings = model.encode(
        fc_texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True
    )

    print("Computing top-k similarities...")
    # Initialize arrays for final results
    all_top_indices = np.zeros((len(smp_texts), top_k), dtype=np.int64)
    all_top_scores = np.zeros((len(smp_texts), top_k), dtype=np.float32)

    # Process in streaming fashion
    current_idx = 0
    for batch_indices, batch_scores in stream_cosine_similarity(
        smp_embeddings,
        fc_embeddings,
        batch_size=batch_size,
        top_k=top_k
    ):
        batch_size = len(batch_indices)
        all_top_indices[current_idx:current_idx + batch_size] = batch_indices
        all_top_scores[current_idx:current_idx + batch_size] = batch_scores
        current_idx += batch_size

        # Force garbage collection
        gc.collect()

    return all_top_indices, all_top_scores

In [9]:
def retrieve_top_k(similarity_matrix: np.ndarray,
                   k: int = 5) -> Tuple[np.ndarray, np.ndarray]:
    """
    Get top K matches for each query
    :param similarity_matrix: The similarity matrix
    :param k: The number of top elements to return
    :return: A tuple with two arrays: the top-k indices and scores for each query
    """
    top_k_indices = np.argpartition(-similarity_matrix, k, axis=1)[:, :k]
    top_k_scores = similarity_matrix[np.arange(similarity_matrix.shape[0])[:, None], top_k_indices]

    # Sort the top K results
    top_k_ordering = np.argsort(-top_k_scores, axis=1)
    top_k_indices = top_k_indices[np.arange(top_k_indices.shape[0])[:, None], top_k_ordering]
    top_k_scores = top_k_scores[np.arange(top_k_scores.shape[0])[:, None], top_k_ordering]

    return top_k_indices, top_k_scores

In [12]:
# produce results
smp_texts = posts['text'].dropna().tolist()
fc_texts = df_facts['claim'].tolist()

all_top_indices, all_top_scores = retrieve_claims_minilm(smp_texts, fc_texts, batch_size=32, top_k=5)
np.savez_compressed('retrieval_results.npz', indices=all_top_indices, scores=all_top_scores)

# Load the results
results = np.load('retrieval_results.npz')
all_top_indices = results['indices']
all_top_scores = results['scores']

Encoding social media posts...


Batches:   0%|          | 0/224 [00:00<?, ?it/s]

Encoding fact checks...


Batches:   0%|          | 0/2680 [00:00<?, ?it/s]

Computing top-k similarities...


Computing similarities: 100%|██████████| 224/224 [36:21<00:00,  9.74s/it]


In [11]:
# def evaluate_retrieval_streaming(smp_texts: List[str],
#                                fc_texts: List[str],
#                                true_pairs: List[Tuple[int, int]],
#                                batch_size: int = 32,
#                                k_values: List[int] = [1, 3, 5, 10]) -> dict:
#     """
#     Evaluate a retrieval system based on streaming similarity computation
#     :param smp_texts: The list of social media post texts
#     :param fc_texts: The list of fact-check texts
#     :param true_pairs: A list of tuples with the true query-document pairs
#     :param batch_size: The batch size to use for encoding
#     :param k_values: The list of K values to evaluate
#     :return: A dictionary with the evaluation metrics
#     """
#     max_k = max(k_values)

#     # Get top-k matches for all queries
#     top_k_indices, top_k_scores = retrieve_claims_minilm(
#         smp_texts,
#         fc_texts,
#         batch_size=batch_size,
#         top_k=max_k
#     )

#     metrics = {}

#     # Calculate metrics
#     mrr = 0
#     success_counts = {k: 0 for k in k_values}

#     for query_idx, relevant_idx in true_pairs:
#         # Find position of relevant document in top-k results
#         try:
#             rank = np.where(top_k_indices[query_idx] == relevant_idx)[0][0] + 1
#             mrr += 1 / rank

#             # Update success@k counts
#             for k in k_values:
#                 if rank <= k:
#                     success_counts[k] += 1
#         except IndexError:
#             # Relevant document not in top-k
#             pass

#     # Compute final metrics
#     metrics['mrr'] = mrr / len(true_pairs)
#     for k in k_values:
#         metrics[f'success@{k}'] = success_counts[k] / len(true_pairs)

#     return metrics

In [None]:
# # Define true_pairs with your actual data
# # Replace with your actual true pairs data
# true_pairs = [
#     (0, 0),  # First query is relevant to first document
#     (1, 1),  # Second query is relevant to second document
#     (2, 2),  # Third query is relevant to third document
#     (3, 3),  # Fourth query is relevant to fourth document
#     (4, 4),  # Fifth query is relevant to fifth document
# ] 

# metrics = evaluate_retrieval_streaming(
#         smp_texts,
#         fc_texts,
#         true_pairs,
#         batch_size=8
#     )

# print("\nResults:")
# for metric, value in metrics.items():
#     print(f"{metric}: {value:.3f}")