## Llama7B on the squad dataset

The purpose of this is to measure the coherence of an LLM on the Cohence of solving general problems

# Model set up

In [1]:

# import the hugging face transformers library
import wandb
import torch
import os
from transformers import  Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import numpy as np
from collections import Counter
from torch.utils.data import DataLoader



In [2]:
#making sure I am using the gpu
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("Using CPU")

Using GPU: NVIDIA L40S


In [3]:
#Including code to get from token from environment
token= "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"

In [4]:


tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf",token=token)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",token=token)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Ensure the tokenizer has a padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

## Gather dataset

In [6]:
squad_dataset = load_dataset("squad")

### Visualizing the Squad dataset

In [7]:
squad_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [8]:
squad_dataset['train'][0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [9]:
def preprocess_function(examples):
    questions = examples['question']
    contexts = examples['context']

    # Tokenize the questions and contexts
    inputs = tokenizer(
        questions,
        contexts,
        padding="max_length",
        truncation=True,
        max_length=30,
        return_tensors="pt"
    )

    # Extract the answer text and start positions
    answers = examples['answers']
    start_positions = []
    labels = []

    for i in range(len(answers)):
        answer = answers[i]
        start_position = answer['answer_start'][0]
        label = tokenizer.encode(answer['text'][0], add_special_tokens=False)  # Tokenize the answer text
        
        # Truncate or pad the label to match the max_length
        if len(label) > 30:
            label = label[:30]
        else:
            label = label + [tokenizer.pad_token_id] * (30 - len(label))
        
        start_positions.append(start_position)
        labels.append(label)

    inputs['start_positions'] = start_positions
    inputs['labels'] = labels

    return inputs


squad_dataset_validation = squad_dataset['validation']
squad_dataset_train = squad_dataset['train']

# Map preprocessing function to the dataset
encoded_train_dataset = squad_dataset_train.map(preprocess_function, batched=True, remove_columns=squad_dataset_train.column_names)
encoded_validation_dataset = squad_dataset_validation.map(preprocess_function, batched=True, remove_columns=squad_dataset_validation.column_names)


#changing the size of the dataset in order to reduce the errors when having an out of memeory issue.
encoded_validation_dataset = encoded_validation_dataset.select(range(100))
encoded_train_dataset = encoded_train_dataset.select(range(1000))


In [10]:
encoded_validation_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'labels'],
    num_rows: 100
})

In [11]:
encoded_validation_dataset[0]

{'input_ids': [1,
  8449,
  25167,
  3815,
  9875,
  278,
  319,
  8610,
  472,
  5670,
  27207,
  29871,
  29945,
  29900,
  29973,
  1,
  5670,
  27207,
  29871,
  29945,
  29900,
  471,
  385,
  3082,
  5733,
  3748,
  304,
  8161,
  278,
  8064],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'start_positions': 177,
 'labels': [3384,
  369,
  14165,
  3944,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2]}

### Measuring Existing performance of LLama7B

In [12]:
def calculate_metrics(predicted_answer, true_answer):
    predicted_tokens = predicted_answer.lower().split()
    true_tokens = true_answer.lower().split()

    common_tokens = set(predicted_tokens) & set(true_tokens)
    exact_match = int(predicted_tokens == true_tokens)

    if len(predicted_tokens) == 0 or len(true_tokens) == 0:
        f1_score = int(predicted_tokens == true_tokens)
    else:
        precision = len(common_tokens) / len(predicted_tokens)
        recall = len(common_tokens) / len(true_tokens)
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return exact_match, f1_score

In [13]:
#this code below is meant to just check the existing perfomance of llama 7b on the dataset

def evaluate_model(model, tokenizer, dataset, num_samples=100):
    exact_match_scores = []
    f1_scores = []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    current = 0
    for example in dataset["validation"].select(range(num_samples)):
        #print("p1")
        question = example["question"]
        context = example["context"]
        true_answer = example["answers"]["text"][0]

        #print("p2")
        input_text = f"Question: {question}\nContext: {context}\nAnswer:"
        input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

        #print("p3")
        with torch.no_grad():
            output_ids = model.generate(input_ids, max_new_tokens=50, num_return_sequences=1)

        predicted_answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        #print("p4")
        exact_match, f1_score = calculate_metrics(predicted_answer, true_answer)
        exact_match_scores.append(exact_match)
        f1_scores.append(f1_score)
        current+=1
        print("Current value,",current)

    print("p5")
    avg_exact_match = sum(exact_match_scores) / len(exact_match_scores)
    avg_f1_score = sum(f1_scores) / len(f1_scores)
    

    return avg_exact_match, avg_f1_score

num_samples = 10  # Specify the number of samples to evaluate
exact_match, f1_score = evaluate_model(model, tokenizer, squad_dataset, num_samples)
print(f"Exact Match: {exact_match:.4f}")
print(f"F1 Score: {f1_score:.4f}")

Current value, 1
Current value, 2
Current value, 3
Current value, 4
Current value, 5
Current value, 6
Current value, 7
Current value, 8
Current value, 9
Current value, 10
p5
Exact Match: 0.0000
F1 Score: 0.0283


### Sparse Reward Function

In [14]:
def sparse_reward(predictions, references, threshold=0.8):
    f1_scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = pred.split()
        ref_tokens = ref.split()
        common_tokens = set(pred_tokens) & set(ref_tokens)
        precision = len(common_tokens) / len(pred_tokens)
        recall = len(common_tokens) / len(ref_tokens)
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(1 if f1_score >= threshold else 0)
    return torch.tensor(f1_scores)

## Dense Reward Function

In [15]:
#testing new version of the dense reward
# Define the dense reward function
def dense_reward(predictions, references):
    print("predictions are", predictions)
    print("references are",references)
        
    f1_scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = pred.split()
        ref_tokens = ref.split()
        common_tokens = set(pred_tokens) & set(ref_tokens)
        precision = len(common_tokens) / len(pred_tokens)
        recall = len(common_tokens) / len(ref_tokens)
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(f1_score)
    return torch.tensor(f1_scores)

In [16]:
tokenizer.pad_token = tokenizer.eos_token

## Model Fine-tuning

In [17]:
from transformers import Trainer, TrainingArguments
import wandb

def process_predictions_and_references(predictions, references, tokenizer):
    decoded_predictions = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions]
    decoded_references = [tokenizer.decode(ref, skip_special_tokens=True) for ref in references]  # Decode the flattened references
    return decoded_predictions, decoded_references

def dense_reward(predictions, references):
    """Calculate dense rewards based on decoded predictions and text references."""
    f1_scores = []
    for pred_text, ref_text in zip(predictions, references):
        pred_tokens = pred_text.split()
        ref_tokens = ref_text.split()
        common_tokens = set(pred_tokens) & set(ref_tokens)
        precision = len(common_tokens) / len(pred_tokens) if pred_tokens else 0
        recall = len(common_tokens) / len(ref_tokens) if ref_tokens else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(f1_score)
    print(f"F1 scores data type: {type(f1_scores)}")
    print(f"F1 scores element data type: {type(f1_scores[0])}")
    return torch.tensor(f1_scores)

def train_model(model, tokenizer, input_train_dataset, input_validation_dataset, reward_function, reward_type):
    # Initialize WandB with specific configurations
   # wandb.init(project="Coherence", name=f"Training with {reward_type} Reward")
    print("in training model")
    # Define the training arguments
    training_args = TrainingArguments(
        output_dir=f"./results_{reward_type}",
        num_train_epochs=2,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=1000,
        evaluation_strategy="steps",
        logging_steps=500,
        save_steps=1000,
        save_total_limit=2,
        seed=42,
        load_best_model_at_end=True,
        fp16=True,
        report_to=None
        
        #report_to="wandb"
    )
    print("after training args")

    # Define the compute_metrics function
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        print(f"Predictions data type: {type(predictions)}")
        print(f"Labels data type: {type(labels)}")
        decoded_predictions, decoded_labels = process_predictions_and_references(predictions, labels, tokenizer)
        return {"reward": reward_function(decoded_predictions, decoded_labels)}

    print("after compute metrics")
    # Create the trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=input_train_dataset,
        eval_dataset=input_validation_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    print("train datset",input_train_dataset.shape)
    print("eval dataset",input_validation_dataset.shape)
    
    print("train datset",input_train_dataset[0])
    print("eval dataset",input_validation_dataset[0])

    # Train the model
    trainer.train()
    
    print("after train()")

    # Finish the WandB run
    #wandb.finish()

print("starting training")
#encoded_train_dataset = encoded_train_dataset.remove_columns(squad_dataset_train.column_names)
#encoded_validation_dataset = encoded_validation_dataset.remove_columns(squad_dataset_validation.column_names)
# Example usage
train_model(model, tokenizer, encoded_train_dataset, encoded_validation_dataset, dense_reward, "Dense")


starting training
in training model
after training args
after compute metrics
train datset (1000, 4)
eval dataset (100, 4)
train datset {'input_ids': [1, 1763, 6029, 1258, 278, 9167, 6182, 16831, 23244, 2615, 297, 29871, 29896, 29947, 29945, 1, 2595, 4496, 332, 635, 29892, 278, 3762, 756, 263, 11865, 2931, 29889, 2180, 459], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'start_positions': 515, 'labels': [4107, 6209, 328, 2353, 9194, 20397, 681, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]}
eval dataset {'input_ids': [1, 8449, 25167, 3815, 9875, 278, 319, 8610, 472, 5670, 27207, 29871, 29945, 29900, 29973, 1, 5670, 27207, 29871, 29945, 29900, 471, 385, 3082, 5733, 3748, 304, 8161, 278, 8064], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'start_positions': 177, 'labels': [3384, 369, 14165, 3944, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjprivera44[0m ([33mcs7643_jp[0m). Use [1m`wandb login --relogin`[0m to force relogin


OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB (GPU 0; 44.53 GiB total capacity; 42.84 GiB already allocated; 104.31 MiB free; 43.91 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
stop

In [None]:
#top

In [None]:
# Extract a single sample from training and validation datasets
train_sample = encoded_train_dataset[0]
validation_sample = encoded_validation_dataset[0]

# Print out the samples to understand their structure
print("Training Sample:", train_sample)
print("Validation Sample:", validation_sample)

In [None]:
def process_predictions_and_references(predictions, references, tokenizer):
    # Decode predictions (list of token IDs) to text
    decoded_predictions = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions]
    return decoded_predictions, references  # References are already text

# Example usage (assuming some model outputs and references)
predictions = [train_sample['input_ids'], validation_sample['input_ids']]  # Simulated model predictions
references = [train_sample['labels'], validation_sample['labels']]  # Actual references

decoded_predictions, processed_references = process_predictions_and_references(predictions, references, tokenizer)

# Now pass the decoded predictions and the text references to the reward function
rewards = dense_reward(decoded_predictions, processed_references)
print("Computed Rewards:", rewards)

In [None]:
# If using a DataLoader, you can set batch_size directly.
# For datasets library, manually slice the dataset for simplicity
batch_size = 10  # Example batch size
train_batch = encoded_train_dataset[:batch_size]
validation_batch = encoded_validation_dataset[:batch_size]



In [None]:
# Assuming train_batch is a dictionary with keys 'input_ids' and 'labels' each mapping to a list
predictions = train_batch['input_ids']  # Directly access the list of input_ids
references = train_batch['labels']      # Directly access the list of labels


In [None]:
# Example of handling batches correctly
#batch_size = 10
for i in range(0, len(encoded_train_dataset), batch_size):
    # Ensure that the slicing returns the correct format
    batch = encoded_train_dataset[i:i+batch_size]
    if isinstance(batch, dict):
        predictions = batch['input_ids']
        references = batch['labels']
    else:
        predictions = [item['input_ids'] for item in batch]
        references = [item['labels'] for item in batch]

    # Continue processing as before
    decoded_predictions, processed_references = process_predictions_and_references(predictions, references, tokenizer)
    batch_rewards = dense_reward(decoded_predictions, processed_references)
    print(f"Batch {i//batch_size} Computed Rewards:", batch_rewards)


In [None]:


def process_predictions_and_references(predictions, references, tokenizer):
    """Decode predictions to text and pair with references."""
    decoded_predictions = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions]
    return decoded_predictions, references  # References are already text

def dense_reward(predictions, references):
    """Calculate dense rewards based on decoded predictions and text references."""
    f1_scores = []
    for pred_text, ref_text in zip(predictions, references):
        pred_tokens = pred_text.split()
        ref_tokens = ref_text.split()
        common_tokens = set(pred_tokens) & set(ref_tokens)
        precision = len(common_tokens) / len(pred_tokens) if pred_tokens else 0
        recall = len(common_tokens) / len(ref_tokens) if ref_tokens else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(f1_score)
    return torch.tensor(f1_scores)

def evaluate_batch(batch, tokenizer):
    """Evaluate a single batch and return rewards."""
    decoded_predictions, processed_references = process_predictions_and_references(batch['input_ids'], batch['labels'], tokenizer)
    return dense_reward(decoded_predictions, processed_references)

# Assuming batch_size is set and data_loader or dataset is ready
batch_size = 10  # Example batch size
for i in range(0, len(encoded_train_dataset), batch_size):
    batch = encoded_train_dataset[i:i+batch_size]
    batch_rewards = evaluate_batch(batch, tokenizer)
    print(f"Batch {i // batch_size} Computed Rewards:", batch_rewards)


Here is the expected correct output from above

Batch 0 Computed Rewards: tensor([0.0000, 0.1600, 0.0952, 0.0800, 0.2308, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000])
Batch 1 Computed Rewards: tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0833, 0.0000, 0.0000, 0.2222, 0.0000,
        0.0000])
Batch 2 Computed Rewards: tensor([0.0000, 0.0000, 0.0833, 0.0000, 0.0000, 0.1600, 0.0714, 0.0000, 0.0690,
        0.5161])
Batch 3 Computed Rewards: tensor([0.2759, 0.0000, 0.0833, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000])
Batch 4 Computed Rewards: tensor([0.0000, 0.1290, 0.0000, 0.0741, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.3333])
Batch 5 Computed Rewards: tensor([0.0000, 0.0000, 0.0000, 0.0909, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.2609])
Batch 6 Computed Rewards: tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.3810, 0.0000, 0.0000, 0.2609, 0.0000,
        0.0000])
Batch 7 Computed Rewards: tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0741, 0.0000, 0.1176,

In [None]:
#The End