## Llama-2-7B on the SQuAD dataset

The purpose of this notebook is to measure the coherence of an LLM when solving general problems, using question answering on the SQuAD dataset as the benchmark.

# Model set up

In [1]:
# import the hugging face transformers library
import wandb
import torch
import os
from transformers import  Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import numpy as np
from collections import Counter
from torch.utils.data import DataLoader


In [2]:
# Select the compute device: fall back to the CPU only when CUDA is unavailable,
# and report which device was chosen.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print("Using CPU")
else:
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")

Using GPU: NVIDIA A40


In [3]:
# Read the Hugging Face access token from the environment rather than hardcoding
# it in the notebook (a committed token would leak with the file).
# HF_TOKEN is the variable the Hugging Face libraries look for; when it is unset
# or empty we pass None so that from_pretrained falls back to its default
# credential lookup.
token = os.environ.get("HF_TOKEN") or None

In [4]:


# Both the tokenizer and the weights come from the same checkpoint, so name it once.
_checkpoint = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(_checkpoint, token=token)
model = AutoModelForCausalLM.from_pretrained(_checkpoint, token=token)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Padding needs a pad token; when the tokenizer defines none, reuse the
# end-of-sequence token for that role.
_pad_token_missing = tokenizer.pad_token is None
if _pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token

## Gather dataset

In [6]:
# Load the SQuAD dataset through the Hugging Face `datasets` library.
squad_dataset = load_dataset("squad")

### Visualizing the Squad dataset

In [7]:
# Display the available splits with their columns and row counts.
squad_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [8]:
# Inspect the first training example to see the layout of a single record.
squad_dataset['train'][0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [16]:
def preprocess_function(examples, max_length=30):
    """Tokenize a batch of SQuAD examples and attach answer text and start offsets.

    Written to be used with ``Dataset.map(..., batched=True)``; it relies on the
    notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping with ``'question'``, ``'context'`` and
            ``'answers'`` columns. Each entry of ``'answers'`` is a dict holding
            ``'text'`` and ``'answer_start'`` lists.
        max_length: Length every question/context pair is padded or truncated
            to. Defaults to 30, the value previously hard-coded here.
            NOTE(review): 30 tokens is much shorter than the SQuAD context shown
            earlier in this notebook, so most of each context is truncated away
            — confirm this is intended.

    Returns:
        The tokenizer output for the batch, extended with:
            ``'start_positions'``: first ``answer_start`` value per example, or
                -1 when the example has no answer.
            ``'labels'``: first answer text per example (a raw string, not token
                ids), or ``""`` when the example has no answer.
    """
    questions = examples['question']
    contexts = examples['context']

    # Tokenize each question together with its context as a sequence pair.
    inputs = tokenizer(
        questions,
        contexts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    start_positions = []
    labels = []
    for answer in examples['answers']:
        texts = answer['text']
        starts = answer['answer_start']
        # Guard against examples without any answer instead of raising
        # IndexError on the [0] lookup.
        labels.append(texts[0] if len(texts) > 0 else "")
        start_positions.append(starts[0] if len(starts) > 0 else -1)

    inputs['start_positions'] = start_positions
    inputs['labels'] = labels

    return inputs


# Pull out the two splits so each can be encoded on its own.
squad_dataset_train = squad_dataset['train']
squad_dataset_validation = squad_dataset['validation']

# Run preprocess_function over each split in batches, dropping the original
# columns so only the fields it returns remain.
encoded_train_dataset = squad_dataset_train.map(
    preprocess_function,
    batched=True,
    remove_columns=squad_dataset_train.column_names,
)
encoded_validation_dataset = squad_dataset_validation.map(
    preprocess_function,
    batched=True,
    remove_columns=squad_dataset_validation.column_names,
)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

### Measuring the existing performance of Llama-2-7B

In [11]:
def calculate_metrics(predicted_answer, true_answer):
    """Compute exact-match and token-level F1 between a prediction and a reference.

    Both strings are lower-cased and split on whitespace before comparison.
    Token overlap is counted as a multiset (via ``Counter``), so repeated tokens
    are credited once per matching occurrence. A plain set intersection would
    under-count them and could give an exact match an F1 below 1.

    Args:
        predicted_answer: Answer text produced by the model.
        true_answer: Reference answer text.

    Returns:
        Tuple ``(exact_match, f1_score)``. ``exact_match`` is 1 when the token
        sequences are identical, else 0. ``f1_score`` is in [0, 1]; when either
        side has no tokens it equals ``exact_match`` (1 only if both are empty).
    """
    predicted_tokens = predicted_answer.lower().split()
    true_tokens = true_answer.lower().split()

    exact_match = int(predicted_tokens == true_tokens)

    # Precision/recall are undefined without tokens; fall back to exact match.
    if len(predicted_tokens) == 0 or len(true_tokens) == 0:
        return exact_match, exact_match

    # Multiset intersection: each shared token counts min(pred_count, true_count) times.
    overlap = sum((Counter(predicted_tokens) & Counter(true_tokens)).values())
    if overlap == 0:
        return exact_match, 0

    precision = overlap / len(predicted_tokens)
    recall = overlap / len(true_tokens)
    f1_score = 2 * (precision * recall) / (precision + recall)

    return exact_match, f1_score

In [17]:
import torch

def evaluate_model(model, tokenizer, dataset, num_samples=100):
    """Score a causal LM on the first ``num_samples`` validation examples.

    For each example a ``Question / Context / Answer:`` prompt is built, up to
    50 new tokens are generated, and the generated continuation is compared
    with the first reference answer using ``calculate_metrics``.

    Args:
        model: Causal language model exposing ``to`` and ``generate``.
        tokenizer: Tokenizer matching ``model`` (``encode`` / ``decode``).
        dataset: Mapping with a ``"validation"`` split whose examples have
            ``"question"``, ``"context"`` and ``"answers"`` fields.
        num_samples: Number of validation examples to evaluate; capped at the
            size of the split.

    Returns:
        Tuple ``(avg_exact_match, avg_f1_score)``; ``(0.0, 0.0)`` when no
        example was evaluated.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    validation = dataset["validation"]
    # Never ask for more rows than the split contains.
    num_samples = min(num_samples, len(validation))

    exact_match_scores = []
    f1_scores = []

    for current, example in enumerate(validation.select(range(num_samples)), start=1):
        question = example["question"]
        context = example["context"]
        true_answer = example["answers"]["text"][0]

        input_text = f"Question: {question}\nContext: {context}\nAnswer:"
        input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
        prompt_length = input_ids.shape[-1]

        with torch.no_grad():
            output_ids = model.generate(input_ids, max_new_tokens=50, num_return_sequences=1)

        # generate() returns the prompt followed by the new tokens for a causal
        # LM; score only the continuation, otherwise the question and context
        # are counted as part of the predicted answer.
        generated_ids = output_ids[0][prompt_length:]
        predicted_answer = tokenizer.decode(generated_ids, skip_special_tokens=True)

        exact_match, f1_score = calculate_metrics(predicted_answer, true_answer)
        exact_match_scores.append(exact_match)
        f1_scores.append(f1_score)
        print("Current value,", current)

    # Avoid dividing by zero when nothing was evaluated.
    if not exact_match_scores:
        return 0.0, 0.0

    avg_exact_match = sum(exact_match_scores) / len(exact_match_scores)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    return avg_exact_match, avg_f1_score

# How many validation examples to score in this baseline run.
num_samples = 100

exact_match, f1_score = evaluate_model(
    model=model,
    tokenizer=tokenizer,
    dataset=squad_dataset,
    num_samples=num_samples,
)

# Report both averaged metrics to four decimal places.
print(f"Exact Match: {exact_match:.4f}")
print(f"F1 Score: {f1_score:.4f}")

Current value, 1


Current value, 2
Current value, 3
Current value, 4
Current value, 5
Current value, 6
Current value, 7
Current value, 8
Current value, 9
Current value, 10
Current value, 11
Current value, 12
Current value, 13
Current value, 14
Current value, 15
Current value, 16
Current value, 17
Current value, 18
Current value, 19
Current value, 20
Current value, 21
Current value, 22
Current value, 23
Current value, 24
Current value, 25
Current value, 26
Current value, 27
Current value, 28
Current value, 29
Current value, 30
Current value, 31
Current value, 32
Current value, 33
Current value, 34
Current value, 35
Current value, 36
Current value, 37
Current value, 38
Current value, 39
Current value, 40
Current value, 41
Current value, 42
Current value, 43
Current value, 44
Current value, 45
Current value, 46
Current value, 47
Current value, 48
Current value, 49
Current value, 50
Current value, 51
Current value, 52
Current value, 53
Current value, 54
Current value, 55
Current value, 56
Current value, 57


### Sparse Reward Function

In [None]:
def sparse_reward(predictions, references, threshold=0.8):
    """Binary reward: 1 when the token-level F1 reaches ``threshold``, else 0.

    Strings are split on whitespace and compared case-sensitively. Overlap is
    counted as a multiset so repeated tokens are not under-counted, and empty
    strings are handled without dividing by zero.

    Args:
        predictions: Iterable of predicted answer strings.
        references: Iterable of reference answer strings, paired with
            ``predictions`` by position.
        threshold: Minimum F1 that earns a reward of 1.

    Returns:
        Integer (``torch.long``) tensor with one 0/1 reward per pair.
    """
    rewards = []
    for pred, ref in zip(predictions, references):
        pred_tokens = pred.split()
        ref_tokens = ref.split()

        if not pred_tokens or not ref_tokens:
            # F1 is undefined without tokens: full score only if both are empty.
            f1_score = 1.0 if pred_tokens == ref_tokens else 0.0
        else:
            # Each shared token counts min(pred_count, ref_count) times.
            overlap = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
            precision = overlap / len(pred_tokens)
            recall = overlap / len(ref_tokens)
            f1_score = 2 * (precision * recall) / (precision + recall) if overlap > 0 else 0.0

        rewards.append(1 if f1_score >= threshold else 0)
    return torch.tensor(rewards, dtype=torch.long)

### Dense Reward Function

In [None]:
def dense_reward(predictions, references):
    """Continuous reward: the token-level F1 of each prediction/reference pair.

    Strings are split on whitespace and compared case-sensitively. Overlap is
    counted as a multiset so repeated tokens are not under-counted, and empty
    strings are handled without dividing by zero.

    Args:
        predictions: Iterable of predicted answer strings.
        references: Iterable of reference answer strings, paired with
            ``predictions`` by position.

    Returns:
        ``torch.float32`` tensor with one F1 score in [0, 1] per pair.
    """
    f1_scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = pred.split()
        ref_tokens = ref.split()

        if not pred_tokens or not ref_tokens:
            # F1 is undefined without tokens: full score only if both are empty.
            f1_score = 1.0 if pred_tokens == ref_tokens else 0.0
        else:
            # Each shared token counts min(pred_count, ref_count) times.
            overlap = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
            precision = overlap / len(pred_tokens)
            recall = overlap / len(ref_tokens)
            f1_score = 2 * (precision * recall) / (precision + recall) if overlap > 0 else 0.0

        f1_scores.append(f1_score)
    # Fix the dtype so an all-zero batch does not silently become an int tensor.
    return torch.tensor(f1_scores, dtype=torch.float32)

## Model Fine-tuning

In [None]:
#The End