# Fine-tuning GPT-2

In [None]:
# Colab-only step: mount Google Drive under /content/drive/ so the dataset
# paths configured in the next cell can be read and written.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# paths to dataset files stored in Google Drive
DATA_DIR = '/content/drive/MyDrive/KeepCoding/Bootcamp_AI/8.LLMs/Assignment/'

# DATA_DIR carries a trailing slash; strip it before joining so the derived
# paths do not end up with a doubled "//" separator.
_DATA_ROOT = DATA_DIR.rstrip('/')

# instructed dataset (JSON-lines files)
PATH_HYROX_JSON_INSTRUCTED = f'{_DATA_ROOT}/hyrox_json_instructed.jsonl'
PATH_HYROX_JSON_PROMPT_INSTRUCTED = f'{_DATA_ROOT}/hyrox_json_prompt_instructed.jsonl'

# artefacts written by this notebook: split dataset, tokenizer and model
PATH_HYROX_SPLIT_DATASET = f'{_DATA_ROOT}/hyrox_split_dataset'
PATH_HYROX_TOKENIZER_GPT2 = f'{_DATA_ROOT}/hyrox_gpt2_tokenizer'
PATH_HYROX_MODEL_GPT2 = f'{_DATA_ROOT}/hyrox_gpt2_model'

In [None]:
!pip install accelerate bitsandbytes datasets transformers trl -qU

In [None]:
import pandas as pd
from datasets import Dataset
from datasets import DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import GenerationConfig, pipeline, set_seed
from transformers import TrainingArguments
from trl import SFTTrainer


In [None]:
# read the JSON-lines file (one record per line) into a pandas DataFrame
hyrox_df = pd.read_json(path_or_buf=PATH_HYROX_JSON_INSTRUCTED, lines=True)

# wrap the DataFrame as a Hugging Face Dataset
hyrox_dataset = Dataset.from_pandas(df=hyrox_df)

# (rows, columns) of the resulting dataset
hyrox_dataset.shape

In [None]:
# peek at the first record to check its fields and contents
hyrox_dataset[0]

In [None]:
# hold out 20% of the rows; the fixed seed keeps the shuffle (and therefore the
# split) identical across kernel restarts
hyrox_dataset_train_test = hyrox_dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# show the 'train' / 'test' splits produced above
hyrox_dataset_train_test

In [None]:
# cut the held-out 20% in half: its 'train' part is later used as validation and
# its 'test' part as the final test set; the seed keeps this split reproducible
hyrox_dataset_val_test = hyrox_dataset_train_test['test'].train_test_split(test_size=0.5, seed=42)

In [None]:
# show the two halves of the held-out data (used below as 'val' and 'test')
hyrox_dataset_val_test

In [None]:
# gather the three splits under the names used in the rest of the notebook:
# 'train' from the first split, 'val' and 'test' from the two halves of the
# held-out part
split_hyrox_dataset = DatasetDict(
    dict(
        train=hyrox_dataset_train_test['train'],
        val=hyrox_dataset_val_test['train'],
        test=hyrox_dataset_val_test['test'],
    )
)

In [None]:
# show the final train / val / test splits
split_hyrox_dataset

# Training Prompt Creation

Two prompts: one for training, and one for inference that does not include the response.

In [None]:
# Prompt used to build the fine-tuning text: instruction, input and context,
# followed by the expected response and the end-of-sequence marker.
HYROX_TRAINING_PROMPT_TEMPLATE = """\
{bos_token}### Instruction:
{system_message}

### Input:
{input}

### Context:
{context}

### Response:
{response}{eos_token}
"""

# Prompt used at inference time. It is the training prompt cut right before the
# response, so the model continues from "### Response:". It deliberately has no
# {eos_token} placeholder: ending the prompt with the end-of-sequence marker
# would present the model with an already finished sequence. str.format()
# ignores unused keyword arguments, so callers that still pass eos_token keep
# working.
HYROX_INFERENCE_PROMPT_TEMPLATE = """\
{bos_token}### Instruction:
{system_message}

### Input:
{input}

### Context:
{context}

### Response:
"""

# System Message

In [None]:
# Instruction text placed under "### Instruction:" in every prompt.
# NOTE(review): "atheles" looks like a typo for "athletes"; it is left as is
# because this string is part of every prompt the notebook builds.
SYSTEM_MESSAGE = (
    "You are a performance analysis assistant for Hyrox athletes. "
    "Your job is to analyze performance data provided by atheles once they "
    "complete a Hyrox race and generate personalized, insightful feedback "
    "that helps the athlete improve."
)

In [None]:
def create_hyrox_prompt(sample):
  """Render one dataset record as a complete training prompt.

  Reads the 'input', 'context' and 'response' keys of `sample` and returns a
  dict with a single "text" key holding the filled-in training template.
  """
  # NOTE(review): '<|startoftext|>' is presumably not a special token of the
  # stock GPT-2 tokenizer — confirm whether it should be added to the tokenizer.
  template_fields = {
      'bos_token': '<|startoftext|>',
      'eos_token': '<|endoftext|>',
      'system_message': SYSTEM_MESSAGE,
      'input': sample['input'],
      'context': sample['context'],
      'response': sample['response'],
  }

  return {"text" : HYROX_TRAINING_PROMPT_TEMPLATE.format(**template_fields)}

In [None]:
def create_hyrox_prompt_and_response(sample):
  """Build the inference prompt for one record together with its reference answer.

  Reads the 'input', 'context' and 'response' keys of `sample`. Returns a dict
  with "full_prompt" (the filled-in inference template) and "ground_truth"
  (the record's 'response' value, unchanged).
  """
  template_fields = {
      'bos_token': '<|startoftext|>',
      'eos_token': '<|endoftext|>',
      'system_message': SYSTEM_MESSAGE,
      'input': sample['input'],
      'context': sample['context'],
  }
  inference_prompt = HYROX_INFERENCE_PROMPT_TEMPLATE.format(**template_fields)

  return {"full_prompt" : inference_prompt, "ground_truth" : sample['response']}

In [None]:
# first training record before the prompt text is added
split_hyrox_dataset['train'][0]

In [None]:
# try the prompt builder on a single record to inspect the generated "text"
create_hyrox_prompt(split_hyrox_dataset['train'][0])

In [None]:
# apply it to whole dataset
split_hyrox_dataset = split_hyrox_dataset.map(create_hyrox_prompt)

In [None]:
# same training record after mapping, to check the result of create_hyrox_prompt
split_hyrox_dataset['train'][0]

In [None]:
# save the split dataset
split_hyrox_dataset.save_to_disk(PATH_HYROX_SPLIT_DATASET)

<|startoftext|>

### Instruction:

You are a performance analysis assistant for Hyrox athletes. Your job is to analyze performance data provided by atheles once they complete a Hyrox race and generate personalized, insightful feedback that helps the athlete improve.

### Input:

Generate performance feedback based on prediction and true time

### Context:
gender: 0, age: 35-39, total_time: 5438, predicted: 5780, residual: -342, cluster: None\n\n

### Response:\
You outperformed the model's prediction significantly — great job! You are performing in line with your cluster's average strengths.

<|endoftext|>

# Load the model to tune and preprocess it

In [None]:
# checkpoint to fine-tune
model_id = "gpt2"

# tokenizer of that checkpoint; the end-of-sequence token doubles as padding token
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# causal language model loaded from the same checkpoint
model = AutoModelForCausalLM.from_pretrained(model_id)

In [None]:
# resize the model's token embeddings to the tokenizer's length
# NOTE(review): no tokens are added to the tokenizer in the cells above, so this
# presumably leaves the size unchanged — confirm
model.resize_token_embeddings(len(tokenizer))

# Training pipeline

In [None]:
# text-generation pipeline built around the model and tokenizer loaded above
generator = pipeline(task='text-generation', model=model, tokenizer=tokenizer)

# fix the random seed
set_seed(seed=42)

def generate_sample(sample):
  """Generate a response for one dataset record and print it next to the reference.

  Builds the inference prompt for `sample`, runs it through the module-level
  `generator` pipeline and prints the question, the dataset response and the
  model response. Returns nothing.
  """
  # Build the inference prompt together with the expected (ground-truth) response.
  prompt_package = create_hyrox_prompt_and_response(sample)

  # Generation settings. The end-of-sequence id is read from the pipeline's own
  # model, so it stays correct when `generator` is rebuilt around another model.
  generation_config = GenerationConfig(
      max_new_tokens=50,
      do_sample=True,
      top_k=50,
      temperature=1e-4,
      eos_token_id=generator.model.config.eos_token_id,
  )

  # Generate text from the prompt with the settings above.
  generation = generator(prompt_package["full_prompt"], generation_config=generation_config)
  print("---------------")
  print("Question:")
  print(sample["input"])
  print("---------------")
  print("Dataset Response")
  # Print the expected response for comparison.
  print(prompt_package["ground_truth"])
  print("---------------")
  print("Model Response:")
  # Print the model's response with the prompt text removed, so only the newly
  # generated text is shown.
  print(generation[0]["generated_text"].replace(prompt_package["full_prompt"], ""))


In [None]:
# run the model on one test record before any fine-tuning
# NOTE(review): index 20 assumes the test split has at least 21 records — confirm
generate_sample(split_hyrox_dataset["test"][20])

The model is unable to perform the task.

In [None]:
# Training configuration for the fine-tuning run.
training_args = TrainingArguments(
    # where checkpoints and the final model are written
    output_dir=PATH_HYROX_MODEL_GPT2,
    save_total_limit=3,
    # batching: 4 samples per device, gradients accumulated over 4 steps
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # run length is set in optimizer steps rather than epochs
    max_steps=100,
    # optimizer and learning-rate schedule
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_grad_norm=0.3,
    # logging; no external experiment tracker
    logging_steps=10,
    report_to="none",
    # eval_steps only takes effect when a step-based evaluation strategy is
    # enabled, so turn it on (the trainer is given an eval dataset).
    # NOTE(review): `eval_strategy` is the current name of this argument; older
    # transformers releases call it `evaluation_strategy` — confirm against the
    # installed version.
    eval_strategy="steps",
    eval_steps=50,
)

In [None]:
# supervised fine-tuning trainer: trains on the 'train' split and is given the
# 'val' split for evaluation; no tokenizer is passed explicitly
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=split_hyrox_dataset['train'],
    eval_dataset=split_hyrox_dataset['val'],
)

In [None]:
# run the fine-tuning (max_steps in training_args bounds its length)
trainer.train()

In [None]:
# save the fine-tuned model; no path is given, so this presumably writes to the
# output_dir set in training_args (PATH_HYROX_MODEL_GPT2) — confirm
trainer.save_model()

In [None]:
# save the tokenizer to its own directory so it can be reloaded for testing
tokenizer.save_pretrained(PATH_HYROX_TOKENIZER_GPT2)

# Test the model

In [None]:
# reload the tokenizer saved after training
hyrox_gpt2_tokenizer = AutoTokenizer.from_pretrained(PATH_HYROX_TOKENIZER_GPT2)

In [None]:
# reload the model from the training output directory
hyrox_gpt2_model = AutoModelForCausalLM.from_pretrained(PATH_HYROX_MODEL_GPT2)

In [None]:
# rebuild the text-generation pipeline around the reloaded model and tokenizer;
# generate_sample() uses this module-level `generator`
generator = pipeline(
    task='text-generation',
    model=hyrox_gpt2_model,
    tokenizer=hyrox_gpt2_tokenizer,
)

In [None]:
# print generations for up to the first 20 test records; the bound is capped at
# the size of the split so a small test set does not raise IndexError
for i in range(min(20, len(split_hyrox_dataset["test"]))):
    generate_sample(split_hyrox_dataset["test"][i])
    print('--------------------------------------------------------')