In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
import pandas as pd

# Hook tqdm into pandas (enables the progress_* variants of apply).
tqdm.pandas()

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)


# Load the checkpoint with a 3-label classification head, plus its matching tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-large-mnli",
    num_labels=3,
)
tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")

# Move the model onto the selected device.
model.to(device)

In [None]:
import json
from transformers import pipeline
from datasets import load_dataset


# Partly inspired by the oracle code:
# https://colab.research.google.com/drive/1gZJCakmY28cKGMj8B7wd1GUM3r72pdbi?usp=sharing#scrollTo=8eKFjiC3i8Yx

# Load the claim files for the training and validation splits.
train_df = pd.read_json('train_claims_quantemp.json')
val_df = pd.read_json('val_claims_quantemp.json')
#test_df = pd.read_json('test_claims_quantemp.json')

# Rebuild each frame from its records, keeping only the label and the claim text.
train_df = pd.DataFrame(
    [
        {'label': record['label'], 'claim': record['claim']}
        for record in train_df.to_dict(orient='records')
    ]
)
val_df = pd.DataFrame(
    [
        {'label': record['label'], 'claim': record['claim']}
        for record in val_df.to_dict(orient='records')
    ]
)
#test_df = pd.DataFrame([{'label': item['label'], 'claim': item['claim']} for item in test_df.to_dict(orient='records')])

# Old evidence file for bm25 (kept for reference).
#evidence_df = pd.read_json('bm25_top_100_claimdecomp.json')
#evidence_df = pd.DataFrame([{'claim': item['claim'], 'docs': item['docs'], 'scores': item['scores']} for item in evidence_df.to_dict(orient='records')])

# Load the CSV evidence files for training and validation, restricted to the
# three columns used downstream.
evidence_columns = ['claim', 'evidences', 'scores']
train_evidence_df = pd.read_csv('NLP_Group16/evidences_train.csv')[evidence_columns]
val_evidence_df = pd.read_csv('NLP_Group16/evidences_val.csv')[evidence_columns]


# Get top_k relevant evidence for each claim, default is 5
def combine_top_k_evidence(row, top_k = 5):
    """Join the ``top_k`` highest-scoring evidence snippets of one row into a string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with an ``'evidences'``
            entry and a ``'scores'`` entry. Each is expected to hold a list
            serialized as text, such as ``'["doc a", "doc b"]'`` and
            ``'[0.1, 0.9]'``; real lists/tuples are accepted as well.
        top_k: Maximum number of snippets to keep. Defaults to 5.

    Returns:
        The selected snippets ordered by descending score (ties keep their
        original order) and separated by single spaces. An empty string is
        returned when there is no evidence, e.g. for a missing value or ``'[]'``.

    Raises:
        ValueError: If a score cannot be converted to ``float``.
    """
    import ast  # local import so the function does not depend on another cell

    def _parse_serialized_list(value):
        """Return ``(items, parsed_as_literal)`` for a list stored as text."""
        if isinstance(value, (list, tuple)):
            return list(value), True
        if not isinstance(value, str):
            # Missing values (NaN/None) carry no evidence.
            return [], True
        text = value.strip()
        try:
            # A literal parse respects quoting, so a ", " inside a snippet does
            # not split it and evidences stay aligned with their scores.
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return list(parsed), True
        # Fallback for text that is not a valid literal (e.g. unquoted items):
        # the original naive split.
        inner = text.strip('[]')
        return (inner.split(', ') if inner else []), False

    evidences, evidences_are_literal = _parse_serialized_list(row['evidences'])
    raw_scores, _ = _parse_serialized_list(row['scores'])
    scores = [float(score) for score in raw_scores]

    # zip() stops at the shorter list, so unmatched trailing items are ignored.
    ranked_documents = sorted(zip(evidences, scores), key=lambda pair: pair[1], reverse=True)
    top_k_documents = [
        # Literal-parsed items are already unquoted; the fallback path still has
        # to drop the surrounding double quotes itself.
        str(doc) if evidences_are_literal else doc.strip('"')
        for doc, _score in ranked_documents[:top_k]
    ]
    return ' '.join(top_k_documents)

# Build the combined-evidence column for both splits (top_k left at its default).
train_evidence_df['top_k_docs'] = train_evidence_df.apply(combine_top_k_evidence, axis=1)
val_evidence_df['top_k_docs'] = val_evidence_df.apply(combine_top_k_evidence, axis=1)

# The raw evidence and score columns are no longer needed once combined.
train_evidence_df = train_evidence_df.drop(['evidences', 'scores'], axis=1)
val_evidence_df = val_evidence_df.drop(['evidences', 'scores'], axis=1)

# Attach the combined evidence to the train and validation claims; a left join
# keeps every claim, leaving top_k_docs missing where no evidence row matches.
train_df = pd.merge(train_df, train_evidence_df[['claim', 'top_k_docs']], on='claim', how='left')
val_df = pd.merge(val_df, val_evidence_df[['claim', 'top_k_docs']], on='claim', how='left')
#test_df = test_df.merge(evidence_df[['claim', 'top_k_docs']], on='claim', how='left')

In [None]:
def tokenize_data(df, max_length = 256):
    """Tokenize (claim, evidence) pairs and encode their labels as integers.

    Args:
        df: DataFrame with ``'claim'``, ``'top_k_docs'`` and ``'label'`` columns.
            Missing ``'top_k_docs'`` values are replaced by empty strings.
        max_length: Length every encoded pair is truncated/padded to.

    Returns:
        A ``(tokenized_inputs, labels)`` tuple: the tokenizer output (PyTorch
        tensors, including the attention mask) and a list of integer label ids
        in row order.

    Raises:
        KeyError: If a label is not one of ``'False'``, ``'True'``, ``'Conflicting'``.
    """
    label_to_id = {'False': 0, 'True': 1, 'Conflicting': 2}
    tokenized_inputs = tokenizer(
        df['claim'].tolist(),
        df['top_k_docs'].fillna('').tolist(),  # Fill NaN with empty strings
        truncation=True,
        add_special_tokens=True,
        max_length=max_length,
        # `pad_to_max_length=True` is deprecated; this is its documented replacement.
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )
    labels = [label_to_id[label] for label in df['label']]
    return tokenized_inputs, labels

# Tokenize each split once; every call yields (encodings, integer labels).
(train_tokenized, train_labels) = tokenize_data(train_df)
(val_tokenized, val_labels) = tokenize_data(val_df)
#test_tokenized, test_labels = tokenize_data(test_df)

# Quick look at the token ids of the first training example.
print(train_tokenized['input_ids'][0])

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# Custom Dataset class for the tokenized data
class ClaimsDataset(Dataset):
    """Map-style dataset pairing pre-tokenized inputs with their integer labels."""

    def __init__(self, tokenized_inputs, labels):
        """Store the encodings and convert the labels to a tensor.

        Args:
            tokenized_inputs: Mapping providing ``'input_ids'`` and
                ``'attention_mask'`` entries that can be indexed per example.
            labels: Sequence of label ids, one per example.
        """
        self.input_ids, self.attention_mask = (
            tokenized_inputs['input_ids'],
            tokenized_inputs['attention_mask'],
        )
        self.labels = torch.tensor(labels)

    def __len__(self):
        """Number of examples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return the ids, attention mask and label stored at ``index`` as a dict."""
        sample = dict(
            input_ids=self.input_ids[index],
            attention_mask=self.attention_mask[index],
            labels=self.labels[index],
        )
        return sample

# Wrap the tokenized splits so they can be indexed example by example.
train_dataset = ClaimsDataset(train_tokenized, train_labels)
val_dataset = ClaimsDataset(val_tokenized, val_labels)
#test_dataset = ClaimsDataset(test_tokenized, test_labels)

# NOTE: the encodings are intentionally left on the CPU. The datasets above hold
# the CPU tensors, so moving separate copies of `train_tokenized`/`val_tokenized`
# to `device` (as was done here before) only consumed GPU memory without
# affecting anything that reads from the datasets.

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers= 4, pin_memory=True)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers= 4, pin_memory=True)
#test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False, pin_memory=True)

In [None]:
# Print tensors in full instead of in summarized form.
torch.set_printoptions(threshold=torch.inf)
print(train_evidence_df[:3])

# Sanity check on the first three dataset items (single examples, despite the
# "Batch" label in the output): ids and attention mask must share one shape.
for i, batch in enumerate(train_dataset):
    ids_shape = batch['input_ids'].shape
    mask_shape = batch['attention_mask'].shape
    assert ids_shape == mask_shape
    print(f"Batch {i+1}")
    print(batch)
    if i >= 2:
        break

In [None]:
from transformers import Trainer, TrainingArguments, TrainerCallback, TrainerState, TrainerControl, AdamW
from torch.nn import CrossEntropyLoss
from sklearn.utils.class_weight import compute_class_weight
import os
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np

# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated in
# favour of it. `eps` and `weight_decay` are pinned to the values documented as
# the defaults of the transformers implementation, so the optimization
# hyper-parameters stay what they were.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Create the output directory if needed; exist_ok avoids the check-then-create race.
output_dir = './results'
os.makedirs(output_dir, exist_ok=True)
print(f"Output directory is set to: {output_dir}")

# Function to get metrics from the model predictions
def compute_metrics(inputs):
    """Compute overall and per-label classification metrics.

    Args:
        inputs: A ``(predictions, labels)`` pair. ``predictions`` holds one row
            of class scores per sample (the arg-max over axis 1 is used as the
            predicted class id); ``labels`` holds the true class ids.

    Returns:
        dict with ``accuracy``, ``f1_macro``, ``f1_weighted``, ``precision`` and
        ``recall`` (both support-weighted), ``support`` (number of evaluated
        samples) and, for each of ``False``/``True``/``Conflicting``,
        ``<label>_precision``, ``<label>_recall`` and ``<label>_f1_weighted``.
    """
    label_to_id = {'False': 0, 'True': 1, 'Conflicting': 2}

    predictions, labels = inputs
    preds = np.argmax(predictions, axis=1)
    metrics = {}
    # Compute overall accuracy
    metrics['accuracy'] = accuracy_score(labels, preds)
    # Compute macro F1-score and weighted scores for the rest of the metrics.
    # zero_division=0 keeps the default value for undefined scores but makes it
    # explicit instead of emitting a warning.
    _, _, f1_macro_overall, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    metrics['f1_macro'] = f1_macro_overall

    # With an averaging mode set, sklearn returns None as the support, so the
    # fourth value is ignored here and the sample count is reported instead.
    precision_overall, recall_overall, f1_weighted_overall, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    metrics['f1_weighted'] = f1_weighted_overall
    metrics['precision'] = precision_overall
    metrics['recall'] = recall_overall
    metrics['support'] = int(len(labels))

    # Compute metrics for each label. Restricting `labels` to a single class
    # means the 'weighted' average is taken over that one class only.
    for label_name, label_id in label_to_id.items():
        precision, recall, f1_weighted, _ = precision_recall_fscore_support(
            labels, preds, labels=[label_id], average='weighted', zero_division=0
        )
        metrics[f'{label_name}_precision'] = precision
        metrics[f'{label_name}_recall'] = recall
        metrics[f'{label_name}_f1_weighted'] = f1_weighted
    return metrics


training_args = TrainingArguments(
    output_dir=output_dir,  # same directory that was created above ('./results')
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1,
    save_steps = 5000,
    use_cpu=False, # Use GPU for training
    fp16=True,  # Enable mixed precision training
)


# Balanced class weights computed from the training label distribution.
labels = train_df['label'].apply(lambda x: {'False': 0, 'True': 1, 'Conflicting': 2}[x])
# `classes` is documented as an ndarray; a plain list may be rejected by
# scikit-learn's parameter validation.
label_weights = compute_class_weight('balanced', classes=np.array([0, 1, 2]), y=labels)
label_weights = torch.tensor(label_weights, dtype=torch.float).to(device)
print(f"Label weights: {label_weights}")


# Logging predictions and labels to check if the model is learning.
# AI tools were used to help write this callback since the proper syntax was unclear.
# It is not relevant to the results; it only avoids having to run the entire
# model to see whether it is learning.
class PredictionLoggerCallback(TrainerCallback):
    """Print predictions vs. labels for one training batch every ``logging_steps`` steps."""

    def on_step_end(self, args, state, control, **kwargs):
        """Run one no-grad forward pass on a batch and print predictions and labels.

        Args:
            args: Training arguments; ``logging_steps`` and ``device`` are read.
            state: Trainer state; ``global_step`` is read.
            control: Trainer control object, returned unchanged.
            **kwargs: Expected to carry ``model`` and ``train_dataloader``;
                nothing is logged when either is missing.

        Returns:
            The ``control`` object that was passed in.
        """
        # A logging_steps of 0 would make the modulo below raise.
        if not args.logging_steps or state.global_step % args.logging_steps != 0:
            return control

        model = kwargs.get('model')
        data_loader = kwargs.get('train_dataloader')
        # Explicit None checks: truthiness of a DataLoader goes through __len__.
        if model is None or data_loader is None:
            return control

        # NOTE: a fresh iterator is created on every call, so this always draws
        # the first batch of a new pass over the loader.
        batch = next(iter(data_loader), None)
        if batch is None:
            return control

        was_training = model.training
        model.eval()
        try:
            inputs = {key: value.to(args.device) for key, value in batch.items()}
            with torch.no_grad():
                outputs = model(**inputs)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            labels = inputs['labels']

            print(f"Step {state.global_step}:")
            print(f"Predictions: {preds.cpu().numpy()}")
            print(f"Labels: {labels.cpu().numpy()}")
        finally:
            # Put the model back in whichever mode it was in, even on error.
            model.train(was_training)
        return control

# Custom trainer class that accounts for imbalances in label occurrences
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``label_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping; must contain ``"labels"`` and the model inputs.
            return_outputs: When true, return ``(loss, outputs)`` instead of the loss.
            num_items_in_batch: Accepted for compatibility with Trainer versions
                that pass it; not used by this loss.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Keep the class weights on the same device as the logits.
        loss_fct = CrossEntropyLoss(weight=label_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        if return_outputs:
            return loss, outputs
        return loss
        
    
# Callback that prints predictions against labels during training.
log_callback = PredictionLoggerCallback()

# Assemble the class-weighted trainer. Only the optimizer is supplied; the
# scheduler slot of the `optimizers` pair is left as None.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    callbacks=[log_callback],
    optimizers=(optimizer, None),
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Start fine-tuning.
trainer.train()


In [None]:
# Evaluate on the validation split.
results = trainer.evaluate(val_dataset)

# Persist every metric as a "key: value" line in the output directory.
results_file = os.path.join(training_args.output_dir, "evaluation_results.txt")
with open(results_file, "w") as writer:
    writer.writelines(f"{key}: {value}\n" for key, value in results.items())

print(results)


In [None]:
import random

# Run predictions on the validation set to perform qualitative analysis and find failed predictions
predictions_output = trainer.predict(val_dataset)
predictions = np.argmax(predictions_output.predictions, axis=1)
labels = predictions_output.label_ids
# Positional lookups into val_df below rely on one prediction per val_df row.
assert len(predictions) == len(val_df), "predictions and val_df are misaligned"
failed_indices = np.where(predictions != labels)[0]

# Extract the corresponding claims and predictions
failed_claims = val_df.iloc[failed_indices]
failed_predictions = predictions[failed_indices]
failed_labels = labels[failed_indices]

# Randomly select failed claims for qualitative analysis, together with their
# labels and predictions. A dedicated seeded generator makes the selection
# reproducible across runs without touching the global random state.
num_samples = 20
sample_rng = random.Random(42)
random_indices = sample_rng.sample(range(len(failed_indices)), min(num_samples, len(failed_indices)))
selected_failed_claims = failed_claims.iloc[random_indices]
selected_failed_predictions = failed_predictions[random_indices]
selected_failed_labels = failed_labels[random_indices]

qualitative_analysis_df = pd.DataFrame({
    'claim': selected_failed_claims['claim'].values,
    'top_k_docs': selected_failed_claims['top_k_docs'].values,
    'real_label': selected_failed_labels,
    'predicted_label': selected_failed_predictions
})

# Map numerical labels to their string representations
label_map = {0: 'False', 1: 'True', 2: 'Conflicting'}
qualitative_analysis_df['real_label'] = qualitative_analysis_df['real_label'].map(label_map)
qualitative_analysis_df['predicted_label'] = qualitative_analysis_df['predicted_label'].map(label_map)

# Save the qualitative analysis results to a text file. The encoding is explicit
# so claims with non-ASCII characters are written the same way on every platform.
qualitative_analysis_file = os.path.join(training_args.output_dir, "qualitative_analysis.txt")
with open(qualitative_analysis_file, "w", encoding="utf-8") as writer:
    for index, row in qualitative_analysis_df.iterrows():
        writer.write(f"Claim: {row['claim']}\n")
        writer.write(f"Evidence: {row['top_k_docs']}\n")
        writer.write(f"Real Label: {row['real_label']}\n")
        writer.write(f"Predicted Label: {row['predicted_label']}\n")
        writer.write("\n")


In [None]:
# Compare how often each label id occurs in the training labels, the validation
# labels and the model's predictions.
train_label_counts = pd.Series(train_labels).value_counts()
val_label_counts = pd.Series(val_labels).value_counts()
prediction_label_counts = pd.Series(predictions).value_counts()

print(train_label_counts)
print(val_label_counts)
print(prediction_label_counts)