In [1]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
import pandas as pd

tqdm.pandas()
# Use cuda if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli", num_labels=3)
tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
model.to(device)

cuda


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [2]:
import json
from transformers import pipeline
from datasets import load_dataset


#inspired by the oracle code
#https://colab.research.google.com/drive/1gZJCakmY28cKGMj8B7wd1GUM3r72pdbi?usp=sharing#scrollTo=8eKFjiC3i8Yx
# Function to get labels and claims from the dataset

train_df = pd.read_json('train_claims_quantemp.json')
val_df = pd.read_json('val_claims_quantemp.json')
#test_df = pd.read_json('test_claims_quantemp.json')

train_df = pd.DataFrame([{'label': item['label'], 'claim': item['claim']} for item in train_df.to_dict(orient='records')])
val_df = pd.DataFrame([{'label': item['label'], 'claim': item['claim']} for item in val_df.to_dict(orient='records')])
#test_df = pd.DataFrame([{'label': item['label'], 'claim': item['claim']} for item in test_df.to_dict(orient='records')])

#old evidence file for bm25
#evidence_df = pd.read_json('bm25_top_100_claimdecomp.json')
#evidence_df = pd.DataFrame([{'claim': item['claim'], 'docs': item['docs'], 'scores': item['scores']} for item in evidence_df.to_dict(orient='records')])

#load csv evidence files for training and validation
train_evidence_df = pd.read_csv('NLP_Group16/evidences_train.csv')
val_evidence_df = pd.read_csv('NLP_Group16/evidences_val.csv')
train_evidence_df = train_evidence_df[['claim', 'evidences', 'scores']]
val_evidence_df = val_evidence_df[['claim', 'evidences', 'scores']]


# Get top_k relevant evidence for each claim
def combine_top_k_evidence(row, top_k):
    evidences = row['evidences'].strip('[]').split(', ')
    scores = list(map(float, row['scores'].strip('[]').split(', ')))
    ranked_documents = sorted(zip(evidences, scores), key=lambda x: x[1], reverse=True)
    top_k_documents = [doc.strip('"') for doc, score in ranked_documents[:top_k]]
    return ' '.join(top_k_documents)

top_k = 5
# Apply the function to get top_k evidence for each claim
train_evidence_df['top_k_docs'] = train_evidence_df.apply(lambda row: combine_top_k_evidence(row, top_k), axis=1)
val_evidence_df['top_k_docs'] = val_evidence_df.apply(lambda row: combine_top_k_evidence(row, top_k), axis=1)

# Drop docs and scores columns as they are no longer needed
train_evidence_df = train_evidence_df.drop(columns=['evidences', 'scores'])
val_evidence_df = val_evidence_df.drop(columns=['evidences', 'scores'])

# Merge the combined evidence with the train, validation, and test DataFrames
train_df = train_df.merge(train_evidence_df[['claim', 'top_k_docs']], on='claim', how='left')
val_df = val_df.merge(val_evidence_df[['claim', 'top_k_docs']], on='claim', how='left')
#test_df = test_df.merge(evidence_df[['claim', 'top_k_docs']], on='claim', how='left')

In [3]:
def tokenize_data(df, max_length = 256):
    tokenized_inputs = tokenizer(
        df['claim'].tolist(),
        df['top_k_docs'].fillna('').tolist(),  # Fill NaN with empty strings
        truncation=True,
        add_special_tokens=True,
        max_length=max_length,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    labels = df['label'].apply(lambda x: {'False': 0, 'True': 1, 'Conflicting': 2}[x]).tolist()
    return tokenized_inputs, labels

train_tokenized, train_labels = tokenize_data(train_df)
val_tokenized, val_labels = tokenize_data(val_df)
#test_tokenized, test_labels = tokenize_data(test_df)

print(train_tokenized['input_ids'][0])



tensor([    0,  1121,    69,  1229,  1901,     6,   234,  9856,  2331, 33922,
          271,  7243,  1695,    14,     5,  1621,  7664,  1718,     6,   151,
         4963, 10918, 27353,    11,     5,   247,     4,     2,     2,  3865,
        27953,    35,   295,  9856,  2331,   579,  3432,   271,  7243,   399,
           75,  2026,    14,   213, 26390,  7664,  1718,     6,   151, 11398,
         1535,   669, 27353,     4,    31,     5,   569,  9565,     5,  1901,
           24,    16,   699,    14,    79,    26,  1718, 11398,  1535,     4,
          295,  9856,  2331,   579,  3432,   271,  7243,   399,    75,  2026,
           14,   213, 26390,  7664,  1718,     6,   151, 11398,  1535,   669,
        27353,     4,    31,     5,   569,     9,     5,  1901,    24,    16,
          699,    14,    79,    26,  1718, 11398,  1535,    45,  1718,   151,
        11398,  1535,     4,   128,   406, 10668,   428,   193,  1437,    22,
          627,   168,    34,  7664,   733,  4963,   669, 27353, 

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader

class ClaimsDataset(Dataset):
    def __init__(self, tokenized_inputs, labels):
        self.input_ids = tokenized_inputs['input_ids']
        self.attention_mask = tokenized_inputs['attention_mask']
        self.labels = torch.tensor(labels)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

train_dataset = ClaimsDataset(train_tokenized, train_labels)
val_dataset = ClaimsDataset(val_tokenized, val_labels)
#test_dataset = ClaimsDataset(test_tokenized, test_labels)

train_tokenized = {key: val.to(device) for key, val in train_tokenized.items()}
val_tokenized = {key: val.to(device) for key, val in val_tokenized.items()}

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers= 4, pin_memory=False)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers= 4, pin_memory=False)
#test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False, pin_memory=True)

In [5]:
torch.set_printoptions(threshold=torch.inf)
print(train_evidence_df[:3])

for i, batch in enumerate(train_dataset):
    assert batch['input_ids'].shape == batch['attention_mask'].shape
    print(f"Batch {i+1}")
    print(batch)
    if i == 2:  # Only print the first 3 batches
        break

                                               claim  \
0  In her budget speech, Nirmala Sitharaman claim...   
1  Florida residents affected by Hurricane Irma c...   
2  Bill Gates' foundation tested a polio vaccine ...   

                                          top_k_docs  
0  conclusion: nirmala sitharaman didn't claim th...  
1  'florida due to hurricane irma are greater tha...  
2  '490,000 children paralyzed. "bill gates found...  
Batch 1
{'input_ids': tensor([    0,  1121,    69,  1229,  1901,     6,   234,  9856,  2331, 33922,
          271,  7243,  1695,    14,     5,  1621,  7664,  1718,     6,   151,
         4963, 10918, 27353,    11,     5,   247,     4,     2,     2,  3865,
        27953,    35,   295,  9856,  2331,   579,  3432,   271,  7243,   399,
           75,  2026,    14,   213, 26390,  7664,  1718,     6,   151, 11398,
         1535,   669, 27353,     4,    31,     5,   569,  9565,     5,  1901,
           24,    16,   699,    14,    79,    26,  1718, 11398,  

In [6]:
from transformers import Trainer, TrainingArguments, TrainerCallback, TrainerState, TrainerControl, AdamW
from torch.nn import CrossEntropyLoss
from sklearn.utils.class_weight import compute_class_weight
import os
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np

optimizer = AdamW(model.parameters(), lr=2e-5)

output_dir = './results'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
print(f"Output directory is set to: {output_dir}")

def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=1)
    
    metrics = {}

    # Compute macro F1-score
    _, _, f1_macro, _ = precision_recall_fscore_support(labels, preds, average='macro')
    metrics['f1_macro'] = f1_macro

    # Compute overall accuracy
    metrics['accuracy'] = accuracy_score(labels, preds)
    
    # Compute metrics for each label
    for label_name, label_id in {'False': 0, 'True': 1, 'Conflicting': 2}.items():
        precision, recall, f1_weighted, support = precision_recall_fscore_support(labels, preds, labels=[label_id], average='weighted')
        metrics[f'{label_name}_precision'] = precision
        metrics[f'{label_name}_recall'] = recall
        metrics[f'{label_name}_f1_weighted'] = f1_weighted
        metrics[f'{label_name}_support'] = support
    
    return metrics

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=2,
    save_steps = 5000,
    evaluation_strategy='steps',
    eval_steps=100,
    )

label_weights = compute_class_weight(
    class_weight='balanced',
    classes=[0, 1, 2],
    y=train_df['label'].apply(lambda x: {'False': 0, 'True': 1, 'Conflicting': 2}[x])
)
label_weights = torch.tensor(label_weights, dtype=torch.float).to(device)
print(f"Label weights: {label_weights}")


# logging predictions and labels to check if the model is learning
class PredictionLoggerCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % args.logging_steps == 0:
            model = kwargs.get('model')
            dataloader = kwargs.get('train_dataloader')
            
            if model and dataloader:
                model.eval()
                batch = next(iter(dataloader))
                inputs = {key: value.to(args.device) for key, value in batch.items()}
                
                with torch.no_grad():
                    outputs = model(**inputs)
                    logits = outputs.logits
                    preds = torch.argmax(logits, axis=-1)
                    labels = inputs['labels']
                    
                    print(f"Step {state.global_step}:")
                    print(f"Predictions: {preds.cpu().numpy()}")
                    print(f"Labels: {labels.cpu().numpy()}")
                model.train()

        return control

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = CrossEntropyLoss(weight=label_weights)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss
    
log_callback = PredictionLoggerCallback()


trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(optimizer, None),
    compute_metrics=compute_metrics,
    callbacks=[log_callback]
)

trainer.train()


Output directory is set to: ./results




Label weights: tensor([0.5740, 1.8138, 1.4151], device='cuda:0')


dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


  0%|          | 0/1244 [00:00<?, ?it/s]

Step 2:
Predictions: [1 1 1 0 1 1 1 0]
Labels: [0 0 0 2 0 0 0 2]
{'loss': 2.1111, 'learning_rate': 4.0000000000000003e-07, 'epoch': 0.0}
Step 4:
Predictions: [1 0 2 1 1 1 1 1]
Labels: [0 1 1 0 0 0 2 2]
{'loss': 1.6463, 'learning_rate': 8.000000000000001e-07, 'epoch': 0.0}
Step 6:
Predictions: [1 2 0 2 1 0 1 1]
Labels: [2 0 0 0 0 0 0 0]
{'loss': 2.3153, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.0}
Step 8:
Predictions: [0 1 1 2 0 1 0 1]
Labels: [1 0 1 0 0 0 0 1]
{'loss': 1.5396, 'learning_rate': 1.6000000000000001e-06, 'epoch': 0.01}
Step 10:
Predictions: [1 0 1 1 1 1 1 1]
Labels: [0 0 0 1 1 1 1 1]
{'loss': 1.8784, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}
Step 12:
Predictions: [0 2 1 1 1 0 2 1]
Labels: [1 0 1 0 0 2 0 0]
{'loss': 1.3376, 'learning_rate': 2.4000000000000003e-06, 'epoch': 0.01}
Step 14:
Predictions: [1 0 1 2 1 1 1 1]
Labels: [0 0 0 0 2 0 1 0]
{'loss': 1.4406, 'learning_rate': 2.8000000000000003e-06, 'epoch': 0.01}


In [None]:
# Save eval results to file
results = trainer.evaluate(val_dataset)

results_file = os.path.join(training_args.output_dir, "evaluation_results.txt")
with open(results_file, "w") as writer:
    for key, value in results.items():
        writer.write(f"{key}: {value}\n")

print(results)


In [None]:
import random

# Run predictions on the test set and identify failed cases
predictions_output = trainer.predict(val_dataset)
predictions = np.argmax(predictions_output.predictions, axis=1)
labels = predictions_output.label_ids
failed_indices = np.where(predictions != labels)[0]

# Get claims and predictions
failed_claims = val_df.iloc[failed_indices]
failed_predictions = predictions[failed_indices]
failed_labels = labels[failed_indices]

# Randomly select claims for qualitative analysis
num_samples = 20
random_indices = random.sample(range(len(failed_indices)), min(num_samples, len(failed_indices)))

# Extract the randomly selected failed claims, predictions, and the labels they should have had
selected_failed_claims = failed_claims.iloc[random_indices]
selected_failed_predictions = failed_predictions[random_indices]
selected_failed_labels = failed_labels[random_indices]

# DataFrame for qualitative analysis
qualitative_analysis_df = pd.DataFrame({
    'claim': selected_failed_claims['claim'].values,
    'top_k_docs': selected_failed_claims['top_k_docs'].values,
    'real_label': selected_failed_labels,
    'predicted_label': selected_failed_predictions
})

# Mapping numerical labels to their string representations
label_map = {0: 'False', 1: 'True', 2: 'Conflicting'}
qualitative_analysis_df['real_label'] = qualitative_analysis_df['real_label'].map(label_map)
qualitative_analysis_df['predicted_label'] = qualitative_analysis_df['predicted_label'].map(label_map)

# Saving the results to a text file
qualitative_analysis_file = os.path.join(training_args.output_dir, "qualitative_analysis.txt")
with open(qualitative_analysis_file, "w") as writer:
    for index, row in qualitative_analysis_df.iterrows():
        writer.write(f"Claim: {row['claim']}\n")
        writer.write(f"Evidence: {row['top_k_docs']}\n")
        writer.write(f"Real Label: {row['real_label']}\n")
        writer.write(f"Predicted Label: {row['predicted_label']}\n")
        writer.write("\n")


In [None]:
# Just checking if the model is learning
train_label_counts = pd.Series(train_labels).value_counts()
print(train_label_counts)
val_label_counts = pd.Series(val_labels).value_counts()
print(val_label_counts)
prediction_label_counts = pd.Series(predictions).value_counts()
print(prediction_label_counts)