In [None]:
import torch
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import json
import sentencepiece
import rouge
import numpy as np
import transformers
import warnings
from tqdm import tqdm
from datasets import Dataset
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from rank_bm25 import BM25Okapi
from transformers import BertTokenizer,AutoModelForSeq2SeqLM, AutoModelForMaskedLM, EvalPrediction, BertForSequenceClassification, Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, BartTokenizer, BartForSequenceClassification

In [None]:
# Select the compute device: the GPU when CUDA is usable, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Data loading and processing

In [None]:
# Dataset locations, all relative to the NumTemp checkout directory
_DATA_ROOT = 'NumTemp-E9C0/data'
_RAW_CLAIMS_DIR = _DATA_ROOT + '/raw_data'

# Claim files, one per split
train_path = _RAW_CLAIMS_DIR + '/train_claims_quantemp.json'
val_path = _RAW_CLAIMS_DIR + '/val_claims_quantemp.json'
test_path = _RAW_CLAIMS_DIR + '/test_claims_quantemp.json'

# Evidence corpus shared by all splits
evidence_path = _DATA_ROOT + '/corpus_evidence_unified.json'

In [None]:
#helper function to extract and create dataframes

def extract_claims_and_labels(file_path):
    """
    Read a claims JSON file and keep only each record's label and claim text.

    Parameters:
    file_path (str): The path to a JSON file holding a list of claim records;
        every record must provide the keys "label" and "claim".

    Returns:
    pd.DataFrame: One row per record with the columns "label" and "claim",
        in the order the records appear in the file.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    # Project each record down to the two fields used downstream
    rows = []
    for record in records:
        rows.append({'label': record['label'], 'claim': record['claim']})

    return pd.DataFrame(rows)

# Build the train and validation claim frames from their raw JSON files
df_claim_train, df_claim_val = (
    extract_claims_and_labels(split_path) for split_path in (train_path, val_path)
)

In [None]:
# Load the evidence corpus JSON into memory.
# A later cell calls `.values()` on the result, so the file is expected to hold a
# JSON object -- presumably snippet id -> snippet text (TODO confirm against the data).
with open(evidence_path, 'r', encoding='utf-8') as file:
    evidence_data = json.load(file)

In [None]:
# BM25 Evidence Collector: load the model used to re-rank BM25 candidates.
# "facebook/bart-large-mnli" is a BART checkpoint, so it is loaded through the Auto*
# classes, which resolve to the matching BART tokenizer and classification model.
# The BERT-specific classes expect BERT vocabulary files and BERT weights and cannot
# load this checkpoint correctly.
# The `bert_*` names are kept because other cells refer to them.
bert_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
bert_model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")

# Move model to the selected device (GPU when available)
bert_model.to(device)

# Evidence texts, in the order the corpus file stores them
corpus = list(evidence_data.values())

# Whitespace tokenisation for BM25, applied to the whole corpus in one pass
def tokenize_corpus(corpus):
    """
    Split every document on single space characters.

    Parameters:
    corpus (iterable of str): The documents to tokenize.

    Returns:
    list: One list of tokens per document, in input order. The separator is the
        literal " ", so consecutive spaces produce empty-string tokens.
    """
    tokenized_docs = []
    for document in corpus:
        tokenized_docs.append(document.split(" "))
    return tokenized_docs

# Function to retrieve top-k documents using BM25 and re-rank them with the classifier
def _relevance_scores(logits):
    """Collapse classifier logits of shape (batch, num_labels) into one score per row."""
    if logits.shape[-1] == 1:
        # Single-logit head: the logit itself is the score
        return logits.squeeze(-1)

    # Multi-class head (e.g. an NLI model): score with the probability of the class
    # named "entailment" in the model config; fall back to the last class otherwise.
    # NOTE(review): this ranks supporting evidence first -- confirm that refuting
    # evidence should not be ranked equally high for fact-checking.
    label2id = getattr(bert_model.config, "label2id", None) or {}
    target = logits.shape[-1] - 1
    for label, idx in label2id.items():
        if str(label).lower() == "entailment":
            target = int(idx)
            break
    return torch.softmax(logits, dim=-1)[:, target]


def retrieve_evidence(query, bm25, corpus, k=5, n_candidates=250, batch_size=64):
    """
    Retrieve evidence for a claim: BM25 candidate search followed by model re-ranking.

    Parameters:
    query (str): The claim text.
    bm25: A fitted BM25 index exposing `get_top_n(tokens, documents, n=...)`.
    corpus (list of str): The documents the index was built from.
    k (int): Number of re-ranked documents to return.
    n_candidates (int): Number of BM25 candidates handed to the re-ranker.
    batch_size (int): Number of (document, query) pairs scored per forward pass.

    Returns:
    list of str: Up to `k` documents, best score first; empty when BM25 returns nothing.
    """
    # BM25 retrieval (same whitespace tokenisation as `tokenize_corpus`)
    tokenized_query = query.split(" ")
    candidates = bm25.get_top_n(tokenized_query, corpus, n=n_candidates)
    if not candidates:
        return []

    scores = []
    with torch.no_grad():
        for start in range(0, len(candidates), batch_size):
            batch_docs = candidates[start:start + batch_size]

            # Encode each candidate TOGETHER with the query as a sentence pair, so the
            # model actually sees both texts. Evidence goes first and the claim second,
            # the same order `preprocess_function` uses for the classifier.
            encoded = bert_tokenizer(
                batch_docs,
                [query] * len(batch_docs),
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt',
            )
            encoded = {key: value.to(device) for key, value in encoded.items()}

            # One forward pass for the whole batch
            logits = bert_model(**encoded).logits
            scores.extend(_relevance_scores(logits).tolist())

    # Re-rank documents based on scores
    ranked_documents = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)
    return [doc for doc, _ in ranked_documents[:k]]

In [None]:
def add_top_k_evidences(df, bm25, corpus, k=5):
    """
    Add a 'top_k_evidences' column holding the retrieved evidence for every claim.

    Parameters:
    df (pd.DataFrame): Frame with a 'claim' column; it is modified in place.
    bm25: Fitted BM25 index passed through to `retrieve_evidence`.
    corpus (list of str): Documents the index was built from.
    k (int): Number of evidence documents to keep per claim.

    Returns:
    pd.DataFrame: The same frame, with the new column added.
    """
    # `Series.progress_apply` only exists after tqdm has registered itself with pandas
    tqdm.pandas(desc="Retrieving evidence")
    df['top_k_evidences'] = df['claim'].progress_apply(
        lambda claim: retrieve_evidence(claim, bm25, corpus, k)
    )
    return df

# Assuming df_claim_train and df_claim_val are your DataFrames containing the claims and labels
# Tokenize the corpus once for BM25
#tokenized_corpus = tokenize_corpus(corpus)
#bm25 = BM25Okapi(tokenized_corpus)

#df_claim_train = add_top_k_evidences(df_claim_train, bm25, corpus, k=5)
#df_claim_val = add_top_k_evidences(df_claim_val, bm25, corpus, k=5)

In [None]:
# Save the DataFrames to CSV files
#df_claim_train.to_csv('df_claim_train.csv', index=False)
#df_claim_val.to_csv('df_claim_val.csv', index=False)

# Load the DataFrames from CSV files
#df_claim_train = pd.read_csv('df_claim_train.csv')
#df_claim_val = pd.read_csv('df_claim_val.csv')

# Read the pre-computed evidence files; these replace the claim frames built above
df_claim_train, df_claim_val, df_claim_test = (
    pd.read_csv(csv_name)
    for csv_name in ('evidences_train.csv', 'evidences_val.csv', 'evidences_test2.csv')
)

In [None]:
label_encoder = LabelEncoder()

# Learn the label -> integer mapping from the training split only and reuse that
# exact mapping for validation and test. Re-fitting on every split would assign
# different integers to the same class whenever a split lacks one of the classes,
# and would leave `label_encoder` describing only the last split it saw.
# `transform` raises on a label that never occurred in training, which surfaces
# data problems instead of hiding them.
df_claim_train['label'] = label_encoder.fit_transform(df_claim_train['label'])
df_claim_val['label'] = label_encoder.transform(df_claim_val['label'])
df_claim_test['label'] = label_encoder.transform(df_claim_test['label'])

In [None]:
# Remove the 'Unnamed: 0' and 'scores' columns from the train/validation frames.
# `errors='ignore'` plus re-assignment keeps this cell re-runnable: an in-place drop
# raises KeyError on the second execution because the columns are already gone.
_UNUSED_COLUMNS = ['Unnamed: 0', 'scores']
df_claim_val = df_claim_val.drop(columns=_UNUSED_COLUMNS, errors='ignore')
df_claim_train = df_claim_train.drop(columns=_UNUSED_COLUMNS, errors='ignore')
#df_claim_test.drop(columns=['Unnamed: 0', 'scores'], inplace=True)

In [None]:
transformers.logging.set_verbosity_error()
final_dataset_train = Dataset.from_pandas(df_claim_train)
final_dataset_val = Dataset.from_pandas(df_claim_val)
final_dataset_test = Dataset.from_pandas(df_claim_test)

# The tokenizer has to exist before `map` runs below. The notebook otherwise creates
# it only in a later cell, so this cell fails with NameError on a fresh kernel.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-mnli")

# Tokenize the dataset
def preprocess_function(examples):
    """
    Tokenize a batch of (evidence, claim) pairs for the sequence classifier.

    Parameters:
    examples (dict): A batch providing the columns 'evidences' and 'claim'.

    Returns:
    The tokenizer output for the pairs, truncated and padded to 1024 tokens.
    """
    return tokenizer(examples['evidences'], examples['claim'], truncation=True, padding='max_length', max_length=1024)

# All three splits are encoded: the Trainer cell further down references
# `encoded_dataset_train`, which would otherwise be undefined.
encoded_dataset_train = final_dataset_train.map(preprocess_function, batched=True)
encoded_dataset_val = final_dataset_val.map(preprocess_function, batched=True)
encoded_dataset_test = final_dataset_test.map(preprocess_function, batched=True)

# Model Training and Evaluation

In [None]:
# Sanity check: show the token ids of the first encoded example only.
# Displaying the whole column would write every padded sequence into the cell output.
encoded_dataset_test[:1]['input_ids']


In [None]:
# Choose and load tokenizer and model

# BART (MNLI checkpoint), loaded with a 3-label sequence-classification head
model_name = "facebook/bart-large-mnli" 
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Alternative checkpoint "nielsr/nt5-small-rc1" (disabled, kept for reference).
# NOTE(review): it is loaded as a seq2seq model here, so `num_labels` would
# presumably not give it a classification head -- confirm before re-enabling.
#model_name = "nielsr/nt5-small-rc1"
#tokenizer = AutoTokenizer.from_pretrained(model_name)
#model =  AutoModelForSeq2SeqLM.from_pretrained(model_name, num_labels=3)

In [None]:
import logging
transformers.logging.set_verbosity(logging.WARNING)

def compute_metrics(p: EvalPrediction):
    """
    Compute F1 (micro/macro/weighted) and accuracy for an evaluation pass.

    Parameters:
    p (EvalPrediction): `p.predictions` is either an array or a tuple whose first
        element holds the predictions; `p.label_ids` holds the gold labels.

    Returns:
    dict: The keys 'f1 micro', 'f1 macro', 'f1 weighted' and 'accuracy'.
    """
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    preds = np.asarray(preds)
    labels = np.asarray(p.label_ids)

    # Genuine multi-label setup (multi-hot label matrix): keep the thresholded metrics
    if labels.ndim > 1:
        return multi_label_metrics(predictions=preds, labels=labels)

    # Single-label classification. `preprocess_logits_for_metrics` already reduces the
    # logits to class ids; raw (batch, num_labels) logits are reduced here instead.
    # Class ids must NOT go through sigmoid + threshold: sigmoid(0) == 0.5, so every
    # class id would be mapped to the prediction 1.
    if preds.ndim > 1:
        preds = preds.argmax(axis=-1)

    return {
        'f1 micro': f1_score(y_true=labels, y_pred=preds, average='micro'),
        'f1 macro': f1_score(y_true=labels, y_pred=preds, average='macro'),
        'f1 weighted': f1_score(y_true=labels, y_pred=preds, average='weighted'),
        'accuracy': accuracy_score(labels, preds),
    }

def preprocess_logits_for_metrics(logits, labels):
    """
    Reduce model outputs to predicted class ids before the Trainer accumulates them.

    Keeping only the argmax avoids storing full logits (and any extra model outputs)
    for the whole evaluation set.

    Parameters:
    logits: Either the logits tensor of shape (batch, num_labels), or a tuple/list
        whose first element is that tensor.
    labels: The gold labels for the batch; returned unchanged.

    Returns:
    tuple: (pred_ids, labels), where pred_ids has shape (batch,).
    """
    # Only unwrap when the model returned several outputs; indexing a bare tensor
    # with [0] would select the first example instead of the logits.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels
    
def multi_label_metrics(predictions, labels, threshold=0.5):
    """
    Score multi-label predictions: sigmoid, threshold, then F1 and accuracy.

    Parameters:
    predictions: Raw scores, converted to a float tensor before the sigmoid.
    labels: Gold labels, passed to the metric functions as `y_true`.
    threshold (float): Probabilities at or above this value count as 1.

    Returns:
    dict: The keys 'f1 micro', 'f1 macro', 'f1 weighted' and 'accuracy'.
    """
    # Squash the raw scores into probabilities
    probabilities = torch.sigmoid(torch.Tensor(predictions))

    # Binarise: 1.0 where the probability reaches the threshold, 0.0 elsewhere
    y_pred = (probabilities >= threshold).numpy().astype(np.float64)

    # ROC-AUC is intentionally not reported (it was disabled in the original code)
    return {
        'f1 micro': f1_score(y_true=labels, y_pred=y_pred, average='micro'),
        'f1 macro': f1_score(y_true=labels, y_pred=y_pred, average='macro'),
        'f1 weighted': f1_score(y_true=labels, y_pred=y_pred, average='weighted'),
        'accuracy': accuracy_score(labels, y_pred),
    }

In [None]:
#Checkpoint
#checkpoint = './results/checkpoint-2000'

# Hyper-parameters and bookkeeping options for the run
_training_options = dict(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=100,
    weight_decay=0.01,
    logging_steps=10,
    save_steps=1000,
    # Do not force CPU execution
    use_cpu=False,
)
training_args = TrainingArguments(**_training_options)

# Wire model, data and metric hooks into the Trainer
_trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset_train,
    eval_dataset=encoded_dataset_test,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)
trainer = Trainer(**_trainer_options)

# Training is currently disabled; uncomment one of the calls below to run it
#trainer.train()

#if from checkpoint
#trainer.train(resume_from_checkpoint=checkpoint)

In [None]:
#Save or Load a model
#model.save_pretrained('./finetuned_math')
#tokenizer.save_pretrained('./finetuned_bart')

# Load the fine-tuned BART model.
# `device` is not a documented argument of `from_pretrained`; unknown keyword
# arguments are forwarded to the model constructor, which rejects them. Device
# placement is handled by `.to(device)` below.
model = BartForSequenceClassification.from_pretrained('./models/finetuned_BART')

# Load the tokenizer (tokenizers are not placed on a device)
tokenizer = BartTokenizer.from_pretrained('./models/finetuned_BART')

# Move the model to GPU if available; assigning the result keeps the cell from
# echoing the full model repr
model = model.to(device)

In [None]:
#Evaluate a model
# A Trainer keeps the model object it was constructed with. Rebinding the name
# `model` afterwards (for instance by loading a fine-tuned checkpoint) does not
# change what an existing Trainer evaluates. Build the evaluation Trainer from the
# object currently bound to `model` so the metrics describe that model.
eval_trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=encoded_dataset_test,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)
eval_trainer.evaluate()

In [None]:
# Standalone demo: generate an answer to a numerical question about a passage with
# the "nielsr/nt5-small-rc1" checkpoint.
# NOTE: this cell rebinds the notebook-level names `tokenizer` and `model`, replacing
# the BART objects loaded in earlier cells. Re-run those cells before using the
# classifier again.
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Passage the question is asked about
context = """Saint Jean de Brébeuf was a French Jesuit missionary who
travelled to New France in 1625. There he worked primarily with the Huron
for the rest of his life, except for a few years in France from 1629 to
1633. He learned their language and culture, writing extensively about
each to aid other missionaries. In 1649, Br´ebeuf and another missionary
were captured when an Iroquois raid took over a Huron village . Together
with Huron captives, the missionaries were ritually tortured and killed
on March 16, 1649. Br´ebeuf was beatified in 1925 and among eight Jesuit
missionaries canonized as saints in the Roman Catholic Church in 1930."""

question = "How many years did Saint Jean de Brébeuf stay in New France before he went back to France for a few years?"

tokenizer = T5Tokenizer.from_pretrained("nielsr/nt5-small-rc1")
model = T5ForConditionalGeneration.from_pretrained("nielsr/nt5-small-rc1")

# encode context & question
# The prompt is built as "answer_me: <question> context: <context>"
input_text = f"answer_me: {question} context: {context}"
encoded_query = tokenizer(
                    input_text, 
                    return_tensors='pt', 
                    padding='max_length', 
                    truncation=True, 
                    max_length=512)

# generate answer (the generated sequence is capped at 54 tokens)
generated_answer = model.generate(input_ids=encoded_query["input_ids"], 
                                  attention_mask=encoded_query["attention_mask"], 
                                  max_length=54)

# Decode the first (only) generated sequence; special tokens are not stripped here
decoded_answer = tokenizer.decode(generated_answer.numpy()[0])
print("T5 Answer: ", decoded_answer)