## Llama7B on the squad dataset

The purpose of this is to measure the coherence of an LLM on the Cohence of solving general problems

# Model set up

In [1]:

# import the hugging face transformers library
import wandb
import torch
import os
from transformers import  Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import numpy as np
from collections import Counter
from torch.utils.data import DataLoader



In [2]:
#making sure I am using the gpu
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("Using CPU")

Using GPU: NVIDIA A40


In [3]:
#Including code to get from token from environment
token= ""

In [4]:


tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf",token=token)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",token=token)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Ensure the tokenizer has a padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

## Gather dataset

In [6]:
squad_dataset = load_dataset("squad")

### Visualizing the Squad dataset

In [7]:
squad_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [8]:
squad_dataset['train'][0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [9]:
def preprocess_function(examples):
    questions = examples['question']
    contexts = examples['context']

    # Tokenize the questions and contexts
    inputs = tokenizer(
        questions,
        contexts,
        padding="max_length",
        truncation=True,
        max_length=30,
        return_tensors="pt"
    )

    # Extract the answer text and start positions
    answers = examples['answers']
    start_positions = []
    labels = []

    for i in range(len(answers)):
        answer = answers[i]
        start_position = answer['answer_start'][0]
        labels.append(answer['text'][0])
        start_positions.append(start_position)

    inputs['start_positions'] = start_positions
    inputs['labels'] = labels

    return inputs


squad_dataset_validation = squad_dataset['validation']
squad_dataset_train = squad_dataset['train']

# Map preprocessing function to the dataset
encoded_train_dataset = squad_dataset_train.map(preprocess_function, batched=True, remove_columns=squad_dataset_train.column_names)
encoded_validation_dataset = squad_dataset_validation.map(preprocess_function, batched=True, remove_columns=squad_dataset_validation.column_names)

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [16]:
encoded_validation_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'labels'],
    num_rows: 10570
})

In [17]:
encoded_validation_dataset[0]

{'input_ids': [1,
  8449,
  25167,
  3815,
  9875,
  278,
  319,
  8610,
  472,
  5670,
  27207,
  29871,
  29945,
  29900,
  29973,
  1,
  5670,
  27207,
  29871,
  29945,
  29900,
  471,
  385,
  3082,
  5733,
  3748,
  304,
  8161,
  278,
  8064],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'start_positions': 177,
 'labels': 'Denver Broncos'}

### Measuring Existing performance of LLama7B

In [10]:
def calculate_metrics(predicted_answer, true_answer):
    predicted_tokens = predicted_answer.lower().split()
    true_tokens = true_answer.lower().split()

    common_tokens = set(predicted_tokens) & set(true_tokens)
    exact_match = int(predicted_tokens == true_tokens)

    if len(predicted_tokens) == 0 or len(true_tokens) == 0:
        f1_score = int(predicted_tokens == true_tokens)
    else:
        precision = len(common_tokens) / len(predicted_tokens)
        recall = len(common_tokens) / len(true_tokens)
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return exact_match, f1_score

In [12]:
#this code below is meant to just check the existing perfomance of llama 7b on the dataset

def evaluate_model(model, tokenizer, dataset, num_samples=100):
    exact_match_scores = []
    f1_scores = []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    current = 0
    for example in dataset["validation"].select(range(num_samples)):
        #print("p1")
        question = example["question"]
        context = example["context"]
        true_answer = example["answers"]["text"][0]

        #print("p2")
        input_text = f"Question: {question}\nContext: {context}\nAnswer:"
        input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

        #print("p3")
        with torch.no_grad():
            output_ids = model.generate(input_ids, max_new_tokens=50, num_return_sequences=1)

        predicted_answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        #print("p4")
        exact_match, f1_score = calculate_metrics(predicted_answer, true_answer)
        exact_match_scores.append(exact_match)
        f1_scores.append(f1_score)
        current+=1
        print("Current value,",current)

    print("p5")
    avg_exact_match = sum(exact_match_scores) / len(exact_match_scores)
    avg_f1_score = sum(f1_scores) / len(f1_scores)
    

    return avg_exact_match, avg_f1_score

num_samples = 10  # Specify the number of samples to evaluate
exact_match, f1_score = evaluate_model(model, tokenizer, squad_dataset, num_samples)
print(f"Exact Match: {exact_match:.4f}")
print(f"F1 Score: {f1_score:.4f}")

Current value, 1


Current value, 2
Current value, 3
Current value, 4
Current value, 5
Current value, 6
Current value, 7
Current value, 8
Current value, 9
Current value, 10
p5
Exact Match: 0.0000
F1 Score: 0.0273


### Sparse Reward Function

In [13]:
def sparse_reward(predictions, references, threshold=0.8):
    f1_scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = pred.split()
        ref_tokens = ref.split()
        common_tokens = set(pred_tokens) & set(ref_tokens)
        precision = len(common_tokens) / len(pred_tokens)
        recall = len(common_tokens) / len(ref_tokens)
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(1 if f1_score >= threshold else 0)
    return torch.tensor(f1_scores)

## Dense Reward Function

In [18]:
#changing the reward function to handle tokens instead of text.
def dense_reward(predictions, references, tokenizer):
    f1_scores = []
    for pred, ref in zip(predictions, references):
        # Decode the tokens to strings
        pred_text = tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        ref_text = tokenizer.decode(ref, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        
        pred_tokens = pred_text.split()
        ref_tokens = ref_text.split()
        common_tokens = set(pred_tokens) & set(ref_tokens)
        precision = len(common_tokens) / len(pred_tokens) if len(pred_tokens) > 0 else 0
        recall = len(common_tokens) / len(ref_tokens) if len(ref_tokens) > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(f1_score)
    return torch.tensor(f1_scores)


## Model Fine-tuning

In [26]:
from transformers import Trainer, TrainingArguments
import wandb

def train_model(model, tokenizer, input_train_dataset,input_validation_dataset, reward_function, reward_type):
    # Initialize WandB with specific configurations
    wandb.init(project="Coherence", name=f"Training with {reward_type} Reward")
    
    # Define the training arguments
    training_args = TrainingArguments(
        output_dir=f"./results_{reward_type}",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        evaluation_strategy="steps",
        logging_steps=500,
        save_steps=1000,
        save_total_limit=2,
        seed=42,
        load_best_model_at_end=True,
        report_to="wandb"
    )
    
    # Create the trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=input_train_dataset,
        eval_dataset=input_validation_dataset,
        tokenizer=tokenizer,
        compute_metrics=lambda pred: {"reward": reward_function(pred.predictions, pred.label_ids)}
    )
    
    # Train the model
    trainer.train()
    
    # Finish the WandB run
    wandb.finish()

# Example usage
train_model(model, tokenizer, encoded_train_dataset, encoded_validation_dataset, dense_reward, "Dense")

#now train the sparse reward
#squad_dataset_validation = squad_dataset['validation']
#squad_dataset_train = squad_dataset['train']


VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112841653327148, max=1.0…

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

## Creating mock versions of both for error checking

In [41]:
#testing new version of the dense reward
# Define the dense reward function
def dense_reward(predictions, references):
    f1_scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = pred.split()
        ref_tokens = ref.split()
        common_tokens = set(pred_tokens) & set(ref_tokens)
        precision = len(common_tokens) / len(pred_tokens)
        recall = len(common_tokens) / len(ref_tokens)
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(f1_score)
    return torch.tensor(f1_scores)

In [21]:
import torch

def mock_reward(predictions, references):
    # This simple function will calculate a mock reward based simply on the number of elements in predictions
    # Assuming predictions are lists or similar iterable of outputs from the model
    rewards = []
    for pred in predictions:
        # For simplicity, let's assume a reward of 1.0 for any prediction of length > 0
        reward = 1.0 if len(pred) > 0 else 0.0
        rewards.append(reward)
    return torch.tensor(rewards)


In [22]:
# Simulate some prediction outputs from a model
sample_predictions = [
    torch.tensor([23, 45, 67]),  # Non-empty tensor
    torch.tensor([]),            # Empty tensor
    torch.tensor([5, 8, 2])      # Another non-empty tensor
]

# Simulate some reference data (not used in the reward calculation here)
sample_references = [
    torch.tensor([1]),           # Placeholder data
    torch.tensor([2]),           # Placeholder data
    torch.tensor([3])            # Placeholder data
]


In [23]:
# Call the mock reward function with the sample data
rewards = mock_reward(sample_predictions, sample_references)
print("Rewards computed:", rewards)


Rewards computed: tensor([1., 0., 1.])


In [24]:
#taking a look at the inputs fromt the encoded training and validation data
squad_dataset_train

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
})

In [27]:
encoded_train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'labels'],
    num_rows: 87599
})

In [37]:
# Extract a single sample from training and validation datasets
train_sample = encoded_train_dataset[0]
validation_sample = encoded_validation_dataset[0]

# Print out the samples to understand their structure
print("Training Sample:", train_sample)
print("Validation Sample:", validation_sample)


Training Sample: {'input_ids': [1, 1763, 6029, 1258, 278, 9167, 6182, 16831, 23244, 2615, 297, 29871, 29896, 29947, 29945, 1, 2595, 4496, 332, 635, 29892, 278, 3762, 756, 263, 11865, 2931, 29889, 2180, 459], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'start_positions': 515, 'labels': 'Saint Bernadette Soubirous'}
Validation Sample: {'input_ids': [1, 8449, 25167, 3815, 9875, 278, 319, 8610, 472, 5670, 27207, 29871, 29945, 29900, 29973, 1, 5670, 27207, 29871, 29945, 29900, 471, 385, 3082, 5733, 3748, 304, 8161, 278, 8064], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'start_positions': 177, 'labels': 'Denver Broncos'}


In [38]:
# Simulate a prediction output from the model, for demonstration let's just reuse the input_ids as predictions
simulated_predictions = [train_sample['input_ids'], validation_sample['input_ids']]
references = [train_sample['labels'], validation_sample['labels']]


In [33]:
# Call the mock reward function
rewards = mock_reward(simulated_predictions, references)
print("Computed Rewards:", rewards)


Computed Rewards: tensor([1., 1.])


In [43]:
def process_predictions_and_references(predictions, references, tokenizer):
    # Decode predictions (list of token IDs) to text
    decoded_predictions = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions]
    return decoded_predictions, references  # References are already text

# Example usage (assuming some model outputs and references)
predictions = [train_sample['input_ids'], validation_sample['input_ids']]  # Simulated model predictions
references = [train_sample['labels'], validation_sample['labels']]  # Actual references

decoded_predictions, processed_references = process_predictions_and_references(predictions, references, tokenizer)

# Now pass the decoded predictions and the text references to the reward function
rewards = dense_reward(decoded_predictions, processed_references)
print("Computed Rewards:", rewards)


Computed Rewards: tensor([0, 0])


In [42]:
# Call the mock reward function
rewards = dense_reward(simulated_predictions, references)
print("Computed Rewards:", rewards)


AttributeError: 'list' object has no attribute 'split'

In [None]:
#The End