In [1]:
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoTokenizer, AdamW, AutoModelForCausalLM
import torch

In [2]:
#check if there is a gpu device if so set it, else cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
token = "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"

In [4]:
#bringing in chat version in order to understand question and answer scenarios
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf",token=token,device=device)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",token=token)

#Moving the model to the gpu
model = model.to(device)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
#def tokenize_function(examples):
 #   return tokenizer(examples["text"], padding="max_length", truncation=True)
# Load the SQuAD dataset
raw_datasets = load_dataset("squad")

#tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [6]:
#set up the pad token
tokenizer.pad_token = tokenizer.eos_token

In [7]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation=True,
        padding="max_length",
    )
    return inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names)


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

In [8]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 10570
    })
})

In [9]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while idx < len(sequence_ids) and sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if context_start >= len(offset) or context_end >= len(offset) or offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names)

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [10]:
# Create the training and evaluation datasets
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(1000))
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["validation"]

In [11]:
# Define the training arguments and create the Trainer
training_args = TrainingArguments(
    output_dir="llama2-7b-chat-squad",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjprivera44[0m ([33mcs7643_jp[0m). Use [1m`wandb login --relogin`[0m to force relogin


OutOfMemoryError: CUDA out of memory. Tried to allocate 66.00 MiB (GPU 0; 44.35 GiB total capacity; 43.86 GiB already allocated; 11.75 MiB free; 44.03 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:


#training_args = TrainingArguments("test_trainer")

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=3.0,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=50,  # Assuming 500 training steps, adjust based on your dataset
    weight_decay=0.01,
    logging_dir="test_trainer/runs",
    logging_steps=10,
    eval_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_steps=50,
    learning_rate=5e-05,
    lr_scheduler_type="linear",
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    max_grad_norm=1.0,
    seed=42,
    report_to=["wandb"],
    load_best_model_at_end=True  # Optional: Load the best model at the end of training based on loss
)



In [None]:
optimizer = AdamW(model.parameters(), lr=5e-5)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    optimizers=(optimizer, None)  # Custom optimizer, no scheduler
)


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0085,1.086287
2,0.0004,0.848492
3,0.0811,0.90878


TrainOutput(global_step=375, training_loss=0.057011402130592616, metrics={'train_runtime': 118.7379, 'train_samples_per_second': 25.266, 'train_steps_per_second': 3.158, 'total_flos': 789333166080000.0, 'train_loss': 0.057011402130592616, 'epoch': 3.0})

In [None]:
# new section

In [None]:
from datasets import load_dataset
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments, Trainer

# Load the SQuAD dataset
raw_datasets = load_dataset("squad")

# Load the llama7b model and tokenizer
model_name = "meta-llama/llama-7B-hf-transformers"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",token=token)
tokenizer = AutoTokenizer.from_pretrained(model_name)



OSError: meta-llama/llama-7B-hf-transformers is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:

# Preprocess the SQuAD dataset
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names)

In [None]:

# Create the training and evaluation datasets
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

# Define the training arguments
training_args = TrainingArguments(
    output_dir="llama7b-squad",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

In [None]:
#The end