<a href="https://colab.research.google.com/github/zakcroft/fine-tuning-notebooks/blob/main/Lamini_fine_tuning_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets config transformers[torch] lamini evaluate

In [None]:
# Work from the project folder on Google Drive so that relative paths used later
# (for example the training output directory) resolve inside it.
# NOTE(review): this path only exists once Drive is mounted at /content/drive;
# no mount step is visible in this notebook — confirm it happens beforehand.
%cd /content/drive/My\ Drive/Colab\ Notebooks/deeplearning_fine_tuning

In [None]:
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import logging
import time
import torch
import transformers
import numpy as np
import evaluate

# from utilities import *
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from transformers import Trainer

In [None]:
# List the contents of the current working directory.
!ls
# Configure logging at DEBUG level and create the logger used by later cells.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


In [None]:
# Choose the compute device: CUDA when at least one GPU is visible, CPU otherwise.
device_count = torch.cuda.device_count()
logger.debug("Checking device")
if device_count <= 0:
    # No CUDA device reported: fall back to the CPU.
    print("Select CPU device")
    device = torch.device("cpu")
else:
    print("Select GPU device")
    device = torch.device("cuda")

In [None]:
# Checkpoint used both for the baseline generation and as the fine-tuning start point.
model_name = "EleutherAI/pythia-70m"

# Load the causal LM and place it on the selected device in one expression.
base_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# Load the matching tokenizer and use its EOS token as the padding token so that
# padded batches can be built.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Load the dataset identified by `dataset_path` and pull out its two splits.
dataset_path = "lamini/lamini_docs"
dataset = datasets.load_dataset(dataset_path)
train_dataset, test_dataset = dataset["train"], dataset["test"]

# Preview the first question/answer pair of each split (train first, then test).
for preview_split in (train_dataset, test_dataset):
    for preview_field in ('question', 'answer'):
        print(preview_split[0][preview_field])

In [None]:
def tokenize_function(examples):
    """Tokenize the ``question`` entry of ``examples`` with the notebook's tokenizer.

    Args:
        examples: Mapping with a ``"question"`` entry; a batch of questions
            when called through ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever the tokenizer returns for those questions, requested with
        padding, truncation, PyTorch tensors and a maximum length of 512.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "return_tensors": "pt",
        "max_length": 512,
    }
    return tokenizer(examples["question"], **tokenizer_options)

# Take a single example from each split for a quick smoke test of the base model.
small_train_dataset = dataset["train"].select(range(1))
small_test_dataset = dataset["test"].select(range(1))

print(small_train_dataset[0])
print(small_test_dataset[0])

# Encode the test question and create the tensors directly on the target device.
encoding_dataset = small_test_dataset.map(tokenize_function, batched=True)
input_ids = torch.tensor(encoding_dataset['input_ids'], device=device)
attention_mask = torch.tensor(encoding_dataset['attention_mask'], device=device)

# Ask the base model: the output holds the prompt followed by the generated
# continuation, limited to 512 tokens in total.
base_model_generated_tokens_with_prompt = base_model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=512,
)

# Decode the token ids back to text, dropping special tokens.
generated_text_with_prompt = tokenizer.batch_decode(
    base_model_generated_tokens_with_prompt,
    skip_special_tokens=True,
)
print(generated_text_with_prompt)

In [None]:
# Remove the prompt from the decoded text by slicing off as many characters as
# the question contains, then start a new line after every full stop.
prompt_char_count = len(small_test_dataset[0]['question'])
answer_without_prompt = generated_text_with_prompt[0][prompt_char_count:]
base_model_generated_answer = answer_without_prompt.replace('.', '.\n')

print('Models answer:', base_model_generated_answer)

In [None]:

# Training configuration for fine-tuning the base model.

# -1 disables the step cap, so the run length is governed by `epochs`.
max_steps = -1
epochs = 2
batch_size = 1

# Checkpoints are written under a directory named after the run.
trained_model_name = f"lamini_docs_{max_steps}_steps"
output_dir = trained_model_name

# NOTE(review): with the 230-example training subset selected later in this
# notebook, batch size 1, 4 gradient-accumulation steps and 2 epochs, training
# performs roughly 115 update steps — fewer than the 120-step evaluation/save
# interval below, so evaluation and checkpointing may never trigger. Confirm
# this is intended.
training_args = TrainingArguments(
    # --- Output ---
    output_dir=output_dir,
    overwrite_output_dir=False,  # False: do not overwrite the output directory
    save_steps=120,  # update steps between two checkpoint saves
    save_total_limit=1,

    # --- Run length ---
    num_train_epochs=epochs,
    max_steps=max_steps,  # overrides num_train_epochs unless it is -1

    # --- Optimisation ---
    learning_rate=1e-5,
    optim="adafactor",
    warmup_steps=1,  # warmup steps for the learning-rate scheduler
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,

    # --- Evaluation ---
    evaluation_strategy="steps",
    eval_steps=120,  # update steps between two evaluations
    per_device_eval_batch_size=1,

    # --- Logging ---
    logging_strategy="steps",
    logging_steps=1,
    disable_tqdm=False,  # False: progress bars stay enabled

    # --- Best-model selection (lower eval_loss is better) ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Floating-point operations reported by the model for a single 2048-token input,
# scaled by the number of gradient-accumulation steps.
flops_probe = {"input_ids": torch.zeros((1, 2048))}
model_flops = (
    base_model.floating_point_ops(flops_probe)
    * training_args.gradient_accumulation_steps
)

# Optional diagnostics:
# print(base_model)
# print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
# print("Flops", model_flops / 1e9, "GFLOPs")

In [None]:
# Load the "accuracy" metric through the `evaluate` library; `compute_metrics`
# below calls its `compute` method.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute next-token accuracy for a causal language model evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` carries a trailing
            vocabulary axis (``(..., seq_len, vocab)``); ``labels`` holds the
            token ids aligned with the inputs (``(..., seq_len)``), where
            ``-100`` marks positions that must be ignored.

    Returns:
        The result of the ``accuracy`` metric, computed only over the
        non-ignored token positions.
    """
    logits, labels = eval_pred
    labels = np.asarray(labels)
    predictions = np.argmax(logits, axis=-1)

    # A causal LM's output at position i predicts the token at position i + 1,
    # so drop the last prediction and the first label to line the two up.
    shifted_predictions = predictions[..., :-1]
    shifted_labels = labels[..., 1:]

    # -100 marks padded / ignored label positions; keep them out of the score.
    keep = shifted_labels != -100

    # Boolean indexing also flattens both arrays to 1-D, which is the layout the
    # accuracy metric expects (one integer per prediction/reference).
    return metric.compute(
        predictions=shifted_predictions[keep],
        references=shifted_labels[keep],
    )

In [None]:
from torch.utils.data import Dataset
from transformers import DataCollatorWithPadding

# Collator that pads the tokenized features of each batch using the notebook's tokenizer.
# NOTE(review): confirm that the `labels` column is padded as well before raising
# the training batch size above 1.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)  # relies on `tokenizer` from an earlier cell


class FilteredDataset(Dataset):
    """In-memory dataset that keeps only the items whose ``input_ids`` is non-empty.

    The source dataset is iterated once at construction time and the surviving
    items are stored, in their original order, in ``filtered_data``.
    """

    def __init__(self, original_dataset):
        kept_items = []
        for example in original_dataset:
            # Skip examples that carry no tokens at all.
            if len(example['input_ids']) > 0:
                kept_items.append(example)
        self.filtered_data = kept_items

    def __len__(self):
        return len(self.filtered_data)

    def __getitem__(self, idx):
        return self.filtered_data[idx]
# Use at most 230 examples per split for this quick run; the size is clamped to
# the split length so a smaller split cannot raise an out-of-range error.
small_train_dataset = dataset["train"].select(range(min(230, len(dataset["train"]))))

# Evaluate on the held-out test split. Drawing the evaluation set from the
# training split would score the model on data it was trained on and make the
# eval_loss-based best-model selection meaningless.
# NOTE(review): assumes the test split carries the same pre-tokenized columns
# (e.g. `input_ids`) as the train split — confirm.
small_test_dataset = dataset["test"].select(range(min(230, len(dataset["test"]))))

# Drop examples without any tokens before handing the data to the trainer.
filtered_train_dataset = FilteredDataset(small_train_dataset)
filtered_test_dataset = FilteredDataset(small_test_dataset)

trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=filtered_train_dataset,
    eval_dataset=filtered_test_dataset,
    data_collator=data_collator,  # pads each batch with the tokenizer
    compute_metrics=compute_metrics,
)


In [None]:
# Run fine-tuning; the value returned by `train()` is kept and printed below.
training_output = trainer.train()

print(training_output)

In [None]:
# Save the trained model under "<output_dir>/final".
save_dir = f'{output_dir}/final'

trainer.save_model(save_dir)
print("Saved model to:", save_dir)

# Reload the saved model from that local directory only (local_files_only=True).
finetuned_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)

# Move the reloaded model to the selected device.
finetuned_model_output = finetuned_model.to(device)


In [None]:
# Draw 10 reproducibly shuffled examples from each split.
small_train_dataset = dataset["train"].shuffle(seed=42).select(range(10))
small_test_dataset = dataset["test"].shuffle(seed=42).select(range(10))

# encode
# Batched generation with a decoder-only model needs LEFT padding: generation
# continues from the last position of every row, so right padding would make
# the shorter prompts continue from pad tokens instead of from their own text.
# The previous setting is restored afterwards so other cells are unaffected.
original_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
try:
    encoding_dataset = small_train_dataset.map(tokenize_function, batched=True)
finally:
    tokenizer.padding_side = original_padding_side

input_ids = torch.tensor(encoding_dataset['input_ids']).to(device)
attention_mask = torch.tensor(encoding_dataset['attention_mask']).to(device)

# ask
finetuned_generated_tokens_with_prompt = finetuned_model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=512,
    # Pass the padding id explicitly instead of relying on generate() to fall
    # back to the EOS id for shorter sequences in the batch.
    pad_token_id=tokenizer.pad_token_id,
)

print(finetuned_generated_tokens_with_prompt)

In [None]:
# Turn the generated token ids back into text, dropping special tokens.
finetuned_generated_text_with_prompt = tokenizer.batch_decode(
    finetuned_generated_tokens_with_prompt,
    skip_special_tokens=True,
)

# Show the first sampled training example next to everything the model produced.
first_example = small_train_dataset[0]
print("Question input:", first_example['question'])
print("Correct answer from Lamini docs:", first_example['answer'])
print(finetuned_generated_text_with_prompt)

# Strip the prompt by character count, then break the line after each question mark.
first_generation = finetuned_generated_text_with_prompt[0]
finetuned_generated_answer = first_generation[len(first_example['question']):].replace('?', '?\n')

print('Models answer:', finetuned_generated_answer)

In [None]:
# Side-by-side view of the base model's answer and the fine-tuned model's answer.
# NOTE(review): the base answer was generated for the first row of the test
# split, whereas the fine-tuned answer is for the first row of the shuffled
# training sample, so the two texts respond to different questions — confirm
# this comparison is intended.

# Strip the prompt from the fine-tuned output by character count.
question_char_count = len(small_train_dataset[0]['question'])
finetuned_generated_text_answer = finetuned_generated_text_with_prompt[0][question_char_count:]

# Break the line after every question mark in both answers.
base_model_modified_text = base_model_generated_answer.replace("?", "?\n")
finetuned_modified_text = finetuned_generated_text_answer.replace("?", "?\n")

print(base_model_modified_text, '===============', finetuned_modified_text, sep='\n')