In [1]:
import torch
import pandas as pd
import json
import transformers
import warnings
from tqdm import tqdm
from datasets import Dataset
from rank_bm25 import BM25Okapi
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, BartTokenizer, BartForSequenceClassification

In [8]:
# Register tqdm's `progress_apply` on pandas objects
tqdm.pandas()

# Prefer the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [19]:
# Dataset locations, relative to the notebook's working directory

# Unified evidence corpus used for retrieval
evidence_path = 'NumTemp-E9C0/data/corpus_evidence_unified.json'

# Claim files, one per split
test_path = 'NumTemp-E9C0/data/raw_data/test_claims_quantemp.json'
val_path = 'NumTemp-E9C0/data/raw_data/val_claims_quantemp.json'
train_path = 'NumTemp-E9C0/data/raw_data/train_claims_quantemp.json'

In [20]:
def extract_claims_and_labels(file_path):
    """
    Load a claims JSON file and return its labels and claims as a DataFrame.

    Parameters:
    file_path (str): Path to a UTF-8 encoded JSON file holding a list of
        objects, each with at least the keys "label" and "claim".

    Returns:
    pd.DataFrame: One row per entry with the columns "label" and "claim"
        (in that order). An empty input list yields an empty DataFrame that
        still carries both columns, so column lookups keep working.

    Raises:
    KeyError: If an entry lacks the "label" or "claim" key.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Keep only the two fields of interest; any other keys are dropped
    records = [{'label': item['label'], 'claim': item['claim']} for item in data]

    # Passing `columns` pins the schema even when `records` is empty
    return pd.DataFrame(records, columns=['label', 'claim'])

# Load the train and validation claims (the test split is not read here)
df_claim_train, df_claim_val = (
    extract_claims_and_labels(split_path) for split_path in (train_path, val_path)
)

In [21]:
# Load the evidence snippets of the unified corpus
with open(evidence_path, mode='r', encoding='utf-8') as evidence_file:
    evidence_data = json.load(evidence_file)

In [38]:
# Cross-encoder that re-ranks the BM25 candidates
RERANKER_CHECKPOINT = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
bert_tokenizer = BertTokenizer.from_pretrained(RERANKER_CHECKPOINT)
bert_model = BertForSequenceClassification.from_pretrained(RERANKER_CHECKPOINT)

# Place the re-ranker on the selected device (GPU when available)
bert_model.to(device)

# Evidence texts, in the iteration order of `evidence_data`
corpus = [*evidence_data.values()]

# Tokenize the entire corpus once
def tokenize_corpus(corpus):
    """Split every document on single spaces and return the token lists in order."""
    token_lists = []
    for document in corpus:
        token_lists.append(document.split(" "))
    return token_lists

# Function to retrieve top-k documents using BM25 and re-rank using BERT
def retrieve_evidence(query, bm25, corpus, k=5, n_candidates=250, batch_size=64):
    """
    Retrieve evidence for a claim: BM25 candidate search, then cross-encoder re-ranking.

    Parameters:
    query (str): The claim text.
    bm25: BM25 index built over the space-tokenized `corpus`; must provide
        `get_top_n(tokenized_query, documents, n=...)`.
    corpus (list[str]): The documents the index was built from, in the same order.
    k (int): Number of re-ranked documents to return.
    n_candidates (int): Number of BM25 candidates passed to the re-ranker.
    batch_size (int): Number of (query, document) pairs scored per forward pass.

    Returns:
    list[str]: Up to `k` documents, ordered by descending re-ranker score.
        Empty when BM25 yields no candidates.

    Notes:
    Uses the module-level `bert_tokenizer`, `bert_model` and `device`.
    Each candidate is encoded together with the query as one text pair.
    Merging separately encoded query and document dicts lets the document's
    `input_ids` overwrite the query's, which would make the score independent
    of the query.
    """
    # BM25 retrieval; the query is tokenized the same way as the corpus
    tokenized_query = query.split(" ")
    bm25_top_k = bm25.get_top_n(tokenized_query, corpus, n=n_candidates)
    if not bm25_top_k:
        return []

    scores = []
    for start in range(0, len(bm25_top_k), batch_size):
        batch_docs = bm25_top_k[start:start + batch_size]

        # Encode (query, document) pairs so the model sees both segments;
        # padding lets the whole batch go through a single forward pass
        encoded = bert_tokenizer(
            [query] * len(batch_docs),
            batch_docs,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt',
        )
        encoded = {key: value.to(device) for key, value in encoded.items()}

        with torch.no_grad():
            logits = bert_model(**encoded).logits

        # The re-ranker emits a single relevance logit per pair
        scores.extend(logits.squeeze(-1).tolist())

    # Re-rank documents based on scores
    ranked_documents = sorted(zip(bm25_top_k, scores), key=lambda pair: pair[1], reverse=True)
    return [doc for doc, _ in ranked_documents[:k]]

In [39]:


def add_top_k_evidences(df, bm25, corpus, k=5):
    """
    Attach the retrieved evidence for every claim to `df`.

    Runs `retrieve_evidence` on each value of the 'claim' column (with a tqdm
    progress bar) and stores the results in a 'top_k_evidences' column.
    The frame is modified in place and also returned.
    """
    def evidences_for(claim):
        return retrieve_evidence(claim, bm25, corpus, k)

    evidences = df['claim'].progress_apply(evidences_for)
    df['top_k_evidences'] = evidences
    return df

# Build the BM25 index once from the space-tokenized corpus
tokenized_corpus = tokenize_corpus(corpus)
bm25 = BM25Okapi(tokenized_corpus)

# Retrieve and re-rank evidence for every train / validation claim.
# NOTE: the recorded run of this cell took roughly 16 h (train) and 4 h (validation).
df_claim_train = add_top_k_evidences(df=df_claim_train, bm25=bm25, corpus=corpus, k=5)
df_claim_val = add_top_k_evidences(df=df_claim_val, bm25=bm25, corpus=corpus, k=5)

100%|███████████████████████████████████████████████████████████████████████████| 9935/9935 [16:14:29<00:00,  5.89s/it]
100%|████████████████████████████████████████████████████████████████████████████| 3084/3084 [4:12:07<00:00,  4.91s/it]


In [2]:
# Save the DataFrames to CSV files (disabled; uncomment to write them again)
#df_claim_train.to_csv('df_claim_train.csv', index=False)
#df_claim_val.to_csv('df_claim_val.csv', index=False)

# Reload both DataFrames from their CSV files
df_claim_train, df_claim_val = (
    pd.read_csv(csv_name) for csv_name in ('df_claim_train.csv', 'df_claim_val.csv')
)

In [3]:
# Encode the textual verdicts as integer class ids
LABEL_TO_ID = {'False':0, 'Conflicting':1, 'True':2}
for claims_frame in (df_claim_train, df_claim_val):
    claims_frame['label'] = claims_frame['label'].replace(LABEL_TO_ID)

In [None]:
# Silence transformers log output below the error level
transformers.logging.set_verbosity_error()

# Wrap the labelled frames as Hugging Face datasets
final_dataset_train, final_dataset_val = (
    Dataset.from_pandas(claims_frame) for claims_frame in (df_claim_train, df_claim_val)
)

# Tokenizer and 3-class sequence classifier from the BART-large MNLI checkpoint
NLI_CHECKPOINT = 'facebook/bart-large-mnli'
tokenizer = BartTokenizer.from_pretrained(NLI_CHECKPOINT)
model = BartForSequenceClassification.from_pretrained(NLI_CHECKPOINT, num_labels=3)

# Tokenize the dataset
def preprocess_function(examples):
    """
    Tokenize a batch as (evidence, claim) text pairs.

    Every pair is truncated and padded to exactly 1024 tokens using the
    module-level `tokenizer`.
    """
    # NOTE(review): when the frames come from the CSV reload, each evidence
    # entry is presumably the string form of a list rather than a list --
    # confirm this is the intended model input.
    evidence_texts = examples['top_k_evidences']
    claim_texts = examples['claim']
    return tokenizer(
        evidence_texts,
        claim_texts,
        max_length=1024,
        padding='max_length',
        truncation=True,
    )

# Tokenize both splits in batches
encoded_dataset_train, encoded_dataset_val = (
    split.map(preprocess_function, batched=True)
    for split in (final_dataset_train, final_dataset_val)
)

Map:   0%|          | 0/9935 [00:00<?, ? examples/s]

Map:   0%|          | 0/3084 [00:00<?, ? examples/s]

In [None]:
import logging

# Raise transformers log output back to the warning level
transformers.logging.set_verbosity(verbosity=logging.WARNING)

In [None]:
# Configuration of the fine-tuning run
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # Logging / checkpoint cadence, in optimizer steps
    logging_steps=10,
    save_steps=5000,
    # Schedule and batch sizes
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # Optimization
    warmup_steps=100,
    weight_decay=0.01,
    # Do not force CPU execution
    use_cpu=False,
)

# Wire the model, the training arguments and the tokenized splits together.
# NOTE(review): the visible training arguments set no evaluation schedule --
# confirm whether `eval_dataset` is meant to be used during training.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=encoded_dataset_val,
    train_dataset=encoded_dataset_train,
)

# Run fine-tuning
trainer.train()