In [54]:
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, AutoModelForCausalLM, DataCollatorForLanguageModeling
import numpy as np

# Dataset with context, this might be necessary for model to pickup proper thing to respond to. In the repo there's also a dataset with all questions and answers but I doubt it'd be as useful
dataset = load_dataset("tylercross/platos_socrates")
# dataset = load_dataset("lamini/taylor_swift")
dataset.shape

{'train': (3550, 3)}

In [58]:
from unittest.util import _MAX_LENGTH
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")

def tokenize_function(examples, tokenizer):
    tokenizer.pad_token = tokenizer.eos_token
    max_length = 2048
    # print(len(examples))
    output = []
    for i in range(len(examples['input'])):
        text = examples["input"][i]+examples["output"][i]
        # text = examples["question"][i]+examples["answer"][i]
        output.append(text)
        
    # print(output)
    return tokenizer(output, padding="max_length", return_tensors='pt', truncation=True, max_length=max_length)

tokenized_dataset = dataset['train'].map(tokenize_function, batched=True, batch_size=100, drop_last_batch=True, fn_kwargs={'tokenizer': tokenizer})
# Absloutely necessary otherwise no loss error
tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [45]:
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m")
data = tokenized_dataset.train_test_split(test_size=0.1)
train, test = data['train'], data['test']

In [46]:
batch_size = 2
max_steps = 1
num_train_epochs = 1
training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=max_steps,
  num_train_epochs=num_train_epochs,

  # Batch size for training
  per_device_train_batch_size=batch_size,

  # Directory to save model checkpoints
  output_dir='../models/',

  # Other arguments
  overwrite_output_dir=True, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=120, # Number of update steps between two evaluations
  save_steps=120, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=batch_size, # Batch size for evaluation
  evaluation_strategy="epoch",
  save_strategy="epoch",
  logging_strategy="epoch",
  logging_steps=1,

  
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Parameters for early stopping
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="loss",
  greater_is_better=False,

  # see here, avoid oom https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941/2
  eval_accumulation_steps=10
)



In [47]:
# No clue why but when I switch data to the Taylor Swift dataset, trainer gives  expected sequence of length 30 at dim 1 (got 16), when all other parts of the code are the same
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    # compute_metrics=compute_metrics,
)

trainer.train()

100%|██████████| 10/10 [07:55<00:00, 47.56s/it]
                                             
100%|██████████| 1/1 [05:17<00:00,  8.84s/it]

{'loss': 8.7338, 'grad_norm': 526.0093994140625, 'learning_rate': 0.0, 'epoch': 0.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                 
[A                                          
100%|██████████| 1/1 [06:12<00:00,  8.84s/it]
[A

{'eval_loss': 8.375192642211914, 'eval_runtime': 54.9271, 'eval_samples_per_second': 6.372, 'eval_steps_per_second': 3.186, 'epoch': 0.0}


                                             
100%|██████████| 1/1 [01:23<00:00, 83.07s/it]

{'train_runtime': 83.0647, 'train_samples_per_second': 0.096, 'train_steps_per_second': 0.012, 'train_loss': 8.73377799987793, 'epoch': 0.0}





TrainOutput(global_step=1, training_loss=8.73377799987793, metrics={'train_runtime': 83.0647, 'train_samples_per_second': 0.096, 'train_steps_per_second': 0.012, 'train_loss': 8.73377799987793, 'epoch': 0.0})

In [35]:
def generate_output(test_question, model, max_input_tokens=1000, max_output_tokens=100):
    tokenizer.pad_token = tokenizer.eos_token

    # Tokenize
    input_ids = tokenizer.encode(
          test_question,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens,
  
    )

    # Generate
    device = model.device
    generated_tokens_with_prompt = model.generate(input_ids=input_ids.to(device), max_length=max_output_tokens)

    # Decode
    generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

    # Strip the prompt
    generated_text_answer = generated_text_with_prompt[0][len(test_question):]
    print(generated_text_answer)
    return generated_text_answer

In [14]:
generate_output("""Who was the person""", model=model, max_output_tokens=100)

NameError: name 'generate_output' is not defined

### Lora/Qlora using PEFT: [Good Tutorialx](https://www.youtube.com/watch?v=XpoKB3usmKc&t=1292s)

In [18]:
import peft
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

In [50]:
peft_config =LoraConfig(r=64, lora_alpha=128, lora_dropout=0.0)#, target_modules='lora_A') #target_modules=["q_"]
model = get_peft_model(model, peft_config)

In [51]:
model.print_trainable_parameters()

trainable params: 786,432 || all params: 71,213,056 || trainable%: 1.104336822730933


#TODO (2024-03-18):
1. Read and learn ways to debug transformer https://huggingface.co/learn/nlp-course/chapter8/4?fw=pt
2. Investigate KeyError: 'eval_loss', why base model doesn't have this issue? 
3. Understand Transformer architecture better then come back to this
4. Another model using RAG to retrieve daily practice prompts.
5. in Taylor Swift bot, use DataCollator instead of using a fixed max_length at tokenizer_function and see how it works. (I commented out the max_length=512 step, and this will likely cause downstream training error, can we use collator to fix it?).

In [60]:

# No clue why but when I switch data to the Taylor Swift dataset, trainer gives  expected sequence of length 30 at dim 1 (got 16), when all other parts of the code are the same
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    # compute_metrics=compute_metrics,
)

model.config.use_cache = False
trainer.train()


  0%|          | 0/1 [01:57<?, ?it/s]        

{'loss': 3.9271, 'grad_norm': 11.875868797302246, 'learning_rate': 0.0, 'epoch': 0.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                           
                                             
  0%|          | 0/1 [02:49<?, ?it/s]
[ACheckpoint destination directory ../models/checkpoint-1 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_runtime': 52.7114, 'eval_samples_per_second': 6.64, 'eval_steps_per_second': 3.32, 'epoch': 0.0}


KeyError: 'eval_loss'

In [None]:
def generate_output(test_question, model, max_input_tokens=1000, max_output_tokens=100):
    tokenizer.pad_token = tokenizer.eos_token

    # Tokenize
    input_ids = tokenizer.encode(
          test_question,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens,
  
    )

    # Generate
    device = model.device
    generated_tokens_with_prompt = model.generate(input_ids=input_ids.to(device), max_length=max_output_tokens)

    # Decode
    generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

    # Strip the prompt
    generated_text_answer = generated_text_with_prompt[0][len(test_question):]
    return generated_text_answer

In [None]:
generate_output("""Who was the person""", model=model, max_output_tokens=100)