# Loading data

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the JSON files opened below are read from under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json

# Folder on the mounted Drive that holds the claim splits and the evidence corpus.
DRIVE_DIR = "/content/drive/My Drive"


def load_json(filename):
    """Read and parse one JSON file stored directly under ``DRIVE_DIR``.

    The file is opened explicitly as UTF-8 so that non-ASCII text in the
    claims/evidence is decoded the same way regardless of the runtime's
    locale default encoding.

    Args:
        filename: Name of the JSON file inside ``DRIVE_DIR``.

    Returns:
        The deserialised JSON content (whatever ``json.load`` produces).
    """
    with open(f"{DRIVE_DIR}/{filename}", encoding="utf-8") as f:
        return json.load(f)


val_data = load_json("val_claims_quantemp.json")
train_data = load_json("train_claims_quantemp.json")
test_data = load_json("test_claims_quantemp.json")
evidence_data = load_json("corpus_evidence_unified.json")

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer, util

# !pip install rank_bm25
# from rank_bm25 import BM25Okapi
# corpus = list(evidence_data.values())
# tokenized_corpus = [doc.split(" ") for doc in corpus]
# bm25 = BM25Okapi(tokenized_corpus)

!pwd
!cp /content/drive/MyDrive/fast_bm25.py /content/fast_bm25.py
from fast_bm25 import BM25
import re

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [None]:
# Preprocessing function
def bm25_preprocess(text):
    """Normalise ``text`` into the lowercase token list used for BM25.

    Every character that is not an ASCII letter, a digit or whitespace is
    deleted (digits are kept), the remainder is lowercased, and the result is
    split on runs of whitespace.

    Args:
        text: Raw string (a claim or an evidence passage).

    Returns:
        list[str]: The tokens, with no empty strings; ``[]`` when nothing
        is left after cleaning.
    """
    # Strip punctuation/symbols only -- digits survive this step.
    # NOTE(review): non-ASCII letters (e.g. umlauts) are stripped as well;
    # confirm that is intended for the non-English claims.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # split() without a separator collapses whitespace runs; split(' ') would
    # emit '' tokens for repeated spaces and keep newlines/tabs inside tokens.
    return text.split()

# Build the BM25 index: one entry per evidence passage, tokenised with bm25_preprocess.
corpus = [*evidence_data.values()]
tokenized_corpus = list(map(bm25_preprocess, corpus))
bm25 = BM25(tokenized_corpus)

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m225.3/227.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)


In [None]:
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the sentence-embedding model that retrieve_evidence uses to re-rank BM25 candidates.
# Despite the variable name, the checkpoint is all-MiniLM-L6-v2, and no separate tokenizer object is created here.

MODEL_NAME_RERANKER = "sentence-transformers/all-MiniLM-L6-v2"
bert_model = SentenceTransformer(MODEL_NAME_RERANKER)

# Function to filter out similar documents based on cosine similarity
def filter_similar_documents(docs, embeddings, threshold=0.75):
    """Greedily drop documents that are near-duplicates of an earlier kept one.

    Documents are visited in the given order. A document is kept unless its
    cosine similarity to a document that was *already kept* is strictly
    greater than ``threshold``. A document that was itself discarded never
    causes later documents to be discarded.

    Args:
        docs: Sequence of documents; ``docs[i]`` corresponds to row ``i`` of
            ``embeddings``.
        embeddings: 2-D tensor (or array-like convertible with
            ``torch.as_tensor``) holding one embedding vector per row.
        threshold: Cosine-similarity cut-off above which two documents are
            treated as duplicates.

    Returns:
        list: The surviving documents, in their original relative order.
    """
    if len(docs) == 0:
        return []

    embeddings = torch.as_tensor(embeddings)
    # L2-normalise the rows so one matrix product gives every pairwise cosine.
    unit = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    # Threshold once and copy to plain Python lists: avoids a tensor element
    # read (and a device sync on GPU) for every (i, j) pair in the loop below.
    too_similar = ((unit @ unit.T) > threshold).cpu().tolist()

    kept = []
    for j in range(len(docs)):
        # Compare only against documents that survived so far.
        if not any(too_similar[i][j] for i in kept):
            kept.append(j)
    return [docs[j] for j in kept]

# Function to re-rank and filter documents
def retrieve_evidence(query, batch_size=16, top_n=100, similarity_threshold=0.75):
    """Retrieve, re-rank and de-duplicate evidence passages for one query.

    Steps:
      1. Take the ``top_n`` BM25 candidates for the preprocessed query.
      2. Embed the query and the candidates with ``bert_model`` and score
         each candidate by cosine similarity to the query.
      3. Sort candidates by that score (highest first), reordering the
         embeddings with the same permutation so that row ``i`` still
         belongs to document ``i``.
      4. Remove near-duplicate candidates with ``filter_similar_documents``.

    Args:
        query: The claim text.
        batch_size: Batch size passed to ``bert_model.encode`` for the
            candidate documents.
        top_n: Number of BM25 candidates to re-rank.
        similarity_threshold: Cosine-similarity cut-off handed to
            ``filter_similar_documents``.

    Returns:
        list[tuple[str, float]]: ``(document, score)`` pairs ordered by
        descending score; ``[]`` when BM25 returns no candidates.
    """
    documents = list(bm25.get_top_n(bm25_preprocess(query), corpus, n=top_n))
    if len(documents) == 0:
        return []

    # Encode query and documents
    query_embedding = bert_model.encode(query, convert_to_tensor=True)
    document_embeddings = bert_model.encode(documents, convert_to_tensor=True, batch_size=batch_size)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    query_embedding = query_embedding.to(device)
    document_embeddings = document_embeddings.to(device)

    # Take row 0 of the similarity matrix instead of squeeze(): squeeze()
    # collapses a single-candidate result to a scalar, whose tolist() is a
    # bare float rather than a list.
    scores = util.pytorch_cos_sim(query_embedding, document_embeddings)[0].tolist()

    # Stable sort of candidate indices by score, highest first.
    order = sorted(range(len(documents)), key=scores.__getitem__, reverse=True)

    # Apply the SAME permutation to documents and embeddings; zipping the
    # unsorted embeddings with the sorted documents would misalign them.
    sorted_docs = [documents[i] for i in order]
    sorted_embeddings = document_embeddings[order]

    doc_to_score = {documents[i]: scores[i] for i in order}

    # Filter similar documents
    final_filtered_docs = filter_similar_documents(
        sorted_docs, sorted_embeddings, threshold=similarity_threshold
    )
    return [(doc, doc_to_score[doc]) for doc in final_filtered_docs]



In [None]:
# Maps each claim string to the list of (document, score) pairs returned by retrieve_evidence.
doc_evidences = {}

In [None]:
# Run retrieval once per validation record, storing the result under the claim text.
for x, sample in enumerate(val_data):
    claim = sample['claim']
    doc_evidences[claim] = retrieve_evidence(claim)

In [None]:
# Column-oriented table: one entry per validation record, keeping only the
# first 20 (document, score) pairs stored for its claim.
data = {"label": [], "claim": [], "evidences": [], "scores": []}
for sample in val_data:
    hits = doc_evidences[sample['claim']][:20]
    data["label"].append(sample['label'].upper())
    data["claim"].append(sample['claim'])
    data["evidences"].append([hit[0] for hit in hits])
    data["scores"].append([hit[1] for hit in hits])

In [None]:
import pandas as pd

# Turn the collected columns into a DataFrame and save it to Drive,
# writing the row index as the first CSV column.
val_top20_csv = '/content/drive/My Drive/val_top_20.csv'
df = pd.DataFrame(data)
df.to_csv(val_top20_csv, index=True)

            label                                              claim  \
0           FALSE  Amit Shah said Narendra Modi sleeps for 24 hou...   
1           FALSE  Video of show Pakistani players celebrating th...   
2            TRUE  Says Dino Rossi "stripped" health care "from 4...   
3           FALSE  Durch einen Vergleich mit den Symptomen einer ...   
4           FALSE  A gun-toting Australian granny blew the testic...   
...           ...                                                ...   
3079        FALSE  Mark Zuckerberg is giving $1,000 away to Faceb...   
3080        FALSE  Joe Biden said Democrats would cheat in 2022 b...   
3081        FALSE  "CNN to permanently close its doors as ratings...   
3082  CONFLICTING  A recent study found "that cities where Uber o...   
3083         TRUE  The Biden administration "published a study co...   

                                              evidences  \
0     [prime minister narendra modi sleeps for only ...   
1     [sep 10, 20