# Loading data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (

In [3]:
import pandas as pd
from datasets import Dataset
from ast import literal_eval

BASE_LOCATION = "/content/drive/My Drive/"
TRAIN_LOCATION = BASE_LOCATION + "train_top_20.csv"
VAL_LOCATION = BASE_LOCATION + "val_top_20.csv"

train_dataset = pd.read_csv(TRAIN_LOCATION, index_col=0)
train_dataset['evidences'] = train_dataset['evidences'].apply(literal_eval)
train_dataset['scores'] = train_dataset['scores'].apply(literal_eval)
train_dataset = Dataset.from_pandas(train_dataset)

In [4]:
import json

data = []
with open('/content/drive/My Drive/decomp.jsonl') as f:
    for line in f:
        data.append(json.loads(line))

with open("/content/drive/My Drive/corpus_evidence_unified.json") as f:
  evidence_data = json.load(f)


In [90]:
train_data[0].keys()

dict_keys(['crawled_date', 'country_of_origin', 'label', 'url', 'lang', 'claim', 'doc', 'taxonomy_label', 'label_original'])

In [5]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer, util

# !pip install rank_bm25
# from rank_bm25 import BM25Okapi
# corpus = list(evidence_data.values())
# tokenized_corpus = [doc.split(" ") for doc in corpus]
# bm25 = BM25Okapi(tokenized_corpus)

!pwd
!cp /content/drive/MyDrive/fast_bm25.py /content/fast_bm25.py
from fast_bm25 import BM25
import re

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-no

In [12]:
# Preprocessing function
def bm25_preprocess(text):
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize
    tokens = text.split(' ')
    return tokens

corpus = list(evidence_data.values())
tokenized_corpus = [bm25_preprocess(doc) for doc in corpus]
bm25 = BM25(tokenized_corpus)

Dropping 204 terms


In [13]:
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Initialize BERT tokenizer and model

MODEL_NAME_RERANKER = "sentence-transformers/all-MiniLM-L6-v2"
bert_model = SentenceTransformer(MODEL_NAME_RERANKER)

# Function to filter out similar documents based on cosine similarity
def filter_similar_documents(docs, embeddings, threshold=0.75):
    cos_sim_matrix = util.pytorch_cos_sim(embeddings, embeddings)
    to_remove = set()
    for i in range(len(docs)):
        for j in range(i + 1, len(docs)):
            if cos_sim_matrix[i, j] > threshold:
                to_remove.add(j)
    filtered_docs = [doc for i, doc in enumerate(docs) if i not in to_remove]
    return filtered_docs

# Function to re-rank and filter documents
def retrieve_evidence(query, batch_size=16):
    documents = bm25.get_top_n(bm25_preprocess(query), corpus, n=100)

    # Encode query and documents
    query_embedding = bert_model.encode(query, convert_to_tensor=True)
    document_embeddings = bert_model.encode(documents, convert_to_tensor=True, batch_size=batch_size)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    query_embedding = query_embedding.to(device)
    document_embeddings = document_embeddings.to(device)
    # Calculate cosine similarity scores
    scores = util.pytorch_cos_sim(query_embedding, document_embeddings).squeeze().tolist()

    # Pair documents with their scores
    reranked = list(zip(documents, scores))

    # Sort documents by BERT score (higher is better)
    reranked.sort(key=lambda x: x[1], reverse=True)

    # Extract the documents and their embeddings
    sorted_docs = [doc for doc, score in reranked]
    sorted_embeddings = torch.stack([embedding for embedding, (doc, score) in zip(document_embeddings, reranked)])


    doc_to_score = {doc: score for (doc, score) in reranked}

    # Filter similar documents
    final_filtered_docs = filter_similar_documents(sorted_docs, sorted_embeddings, threshold=0.75)
    return final_filtered_docs

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:
doc_evidences = {}

In [33]:
for i in range(len(data)):
  if data[i]['claim'] not in doc_evidences:
    e = []
    for j in range(len(data[i]['subquestions'])):
      subquestion = data[i]['subquestions'][j]
      e.append(retrieve_evidence(subquestion)[0:3])
    doc_evidences[data[i]['claim']] = e

In [11]:
original_evidence = dict(zip(train_dataset['claim'], train_dataset['evidences']))

# original_evidence[train_dataset['claim'][0]]

# for i in range(len(train_dataset['claim'])):
#   original_evidence[train_dataset['claim'][i]] = train_dataset['evidences'][i]

["conclusion: nirmala sitharaman didn't claim that govt distributed 35,000 crores led bulbs. from the video fo the speech it is clear that she said 35 crores. nirmala sitharaman didn't claim that govt distributed 35,000 crores led bulbs. from the video of the speech, it is clear that she said 35 crores, not 35000 crores.",
 '7 feb 2017  "the government has distributed 21 crore led bulbs and due to this power bills have been reduced and rs 11,000 crore was saved," the prime ...',
 'jul 5, 2019  approximately 35 crore led bulbs have been distributed under ujala yojana leading to cost saving of 18,341 crores annually.66 pages',
 '2019-07-05  approximately 35 crore led bulbs have been distributed under ujala yojana leading to cost saving of 18,341 crores annually. india is going ...66 puslapiai',
 "in 2016 government of india launched 'ujala led bulb scheme' to lower india's carbon footprint and save electricity, it distributed 370 million led bulbs free, ...",
 "23 apr 2017  prime ministe

In [None]:
combined_evidence = {}

# data[0].keys()

for i, entry in enumerate(data):
  ce = []
  print
  for j, retrieve in enumerate(entry['subquestions_retrieve']):
    if retrieve:
      ce.append(doc_evidences[entry['claim']][j][0])
    else:
      break
  og = 0
  while len(ce) < 5 and len(original_evidence[entry['claim']]) > og:
    ce.append(original_evidence[entry['claim']][og])
    og += 1
  combined_evidence[entry['claim']] = ce

# print(len(original_evidence["Image showing results of opinion polls by India TV for upcoming 2022 Gujarat assembly elections which predicts AAP could win 98 out of 182 seats"]))

In [35]:
len(combined_evidence.keys())

1000

In [16]:
len(doc_evidences.keys())

897

In [106]:
# data[1]['claim']
doc_evidences_2[data[1]['claim']][0]

['18 janv. 2022  the total number of tests. totalenergies caf champions league and caf. confederation cup 2021/2022 ... democratic republic of congo, togo, cte d ...',
 '19 sept. 2023  congo free state, french tat indpendant du congo, former state in africa that occupied almost all of the congo river basin, coextensive with ...',
 '12 oct. 2023  after a lull in cross-border security incidents between the democratic republic of the congo and rwanda between march and july, on 27 july, a ...']

In [97]:
# doc_evidences_2 = doc_evidences
doc_evidences([data['claim']][0])

TypeError: list indices must be integers or slices, not str

In [70]:
final_data = {
  "label": [d['label'].upper() for d in data],
  "claim": [d['claim'] for d in data],
  "subquestions": [d['subquestions'] for d in data],
  "subquestions_retrieve": [d['subquestions_retrieve'] for d in data],
  "claim_evidences": [original_evidence[d['claim']] for d in data],
  "subquestions_evidence": [doc_evidences[d['claim']] for d in data],
  "combined_evidence": [combined_evidence[d['claim']] for d in data]
}

In [78]:
final_data["combined_evidence"][7]

['2023-10-06  discover videos related to how many likes did the video have on tiktok.',
 'the news agency avia.pro obtained exclusive video footage showing how moldovan citizens blocked the movement of the nato military convoy, blocking the road and preventing the western military from advancing in the originally intended direction. citizens of moldova blocked the movement of a convoy of nato military equipment in the area of the settlement novi anenii. a column of military equipment was seen here against the background of mass protests expected on september 18 in moldova against the actions of the government - earlier, the movement of military equipment of the moldovan army was already recorded in a number of regions of moldova. judging by the video footage, we are talking about a convoy of at least 10 pieces of military equipment from nato countries. moreover, which is very remarkable, there are no identification marks on the latter. this only confirms the version that nato forces ca

In [72]:
ds = Dataset.from_dict(final_data)

In [79]:
ds['combined_evidence'][0]

['30, 2019, fns approved florida for early issuance of september benefits if you think you may be eligible for disaster food assistance in',
 'irma had the highest average intensity of all the hurricanes since 2016 and also caused the most people to lose power in the u.s.: about 6.3 million. a category 5 hurricane and the third strongest atlantic hurricane at landfall ever recorded, irma had 35 million people living within 75 miles of its eyes path. irma had the most outsized impact on power outages of all the storms analyzed, and more than 4 million people more were left without power after the storm than would be predicted. hurricane headlines often point to high wind speeds as the culprit for widespread power outages, but this is only part of the equation. our analysis found the single most significant factor in power outage impact is the path of the storm, and specifically the number of people living within 75 miles of the eye of the hurricane as it moves overland.',
 'florida due 

In [77]:
ds.to_json(f"/content/drive/My Drive/dataset-decomposed.jsonl")
ds.to_csv(f"/content/drive/My Drive/dataset-decomposed.csv")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

7503529

In [69]:
import pandas as pd

df = pd.DataFrame(final_data)
df.to_csv('/content/drive/My Drive/dataset-decomposed.csv', index=True)

In [68]:
import json
with open('/content/drive/My Drive/dataset-decomposed.json', 'w') as fp:
    json.dump(final_data, fp)