#### Per https://medium.com/@balci.pelin/llm-finetuning-410e8a2738ef to figure out why can't get my own data to work

In [1]:
from transformers import pipeline, set_seed, Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# model_name = 'lamini/lamini_docs'
dataset_name = "lamini/taylor_swift"
datasets = load_dataset(dataset_name)

In [81]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")

# Why doesn't this work?
def tokenize_function(examples, tokenizer):
    # Due to batching, each list have 4 elements. Try if possible to join on element basis and if training would be faster as a result
    # text = examples['question'][0]+examples['answer'][0]
    text = [examples['question'][i]+examples['answer'][i] for i in range(len(examples['question']))]
    # necessary
    tokenizer.pad_token = tokenizer.eos_token

    tokenized_inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
    )
    # Doesn't seem necessary?
    tokenizer.truncation_side = "left"
    # This is the key, if max_length is not set properly then training will fail. this is item level so max length is dynamic and differ by each row, but why?
    max_length = min(
        tokenized_inputs["input_ids"].shape[1],
        # 2048,
        2048
    )
    # print(text)
    # return tokenizer(examples["text"], padding="max_length", truncation=True)
    return tokenizer(text, return_tensors='pt', padding="max_length", truncation=True, max_length=max_length)



train_dataset = datasets["train"]
tokenized_train = train_dataset.map(tokenize_function, batched=True, batch_size=1, drop_last_batch=True, fn_kwargs={"tokenizer": tokenizer})
test_dataset = datasets["test"]
tokenized_test = test_dataset.map(tokenize_function, batched=True, batch_size=1, drop_last_batch=True, fn_kwargs={"tokenizer": tokenizer})


# tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████| 783/783 [00:00<00:00, 1403.79 examples/s]
Map: 100%|██████████| 87/87 [00:00<00:00, 1406.81 examples/s]


In [64]:
model_name = "EleutherAI/pythia-70m"
model = AutoModelForCausalLM.from_pretrained(model_name)
base_model = AutoModelForCausalLM.from_pretrained(model_name)
model.to('mps')
base_model.to('mps')

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [10]:
import os
# quickly test see if prevent  MPS backend out of memory (MPS allocated: 2.09 GB, other allocations: 15.95 GB, max allowed: 18.13 GB). Tried to allocate 98.25 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

In [65]:
from transformers import TrainingArguments, Trainer

# number of loops, overrides num_train_epochs
max_steps = 800
# Number of training epochs
num_train_epochs=7

# Save model to this direction
steps_or_epochs = f"{max_steps} steps" if max_steps != -1 else f"{num_train_epochs} epochs"
trained_model_name = f"{dataset_name.split('/')[-1]}_{model_name.split('/')[-1]}_{steps_or_epochs}"
save_dir = f'../models/taylorswift/{trained_model_name}/'

# See here
training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,


  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=max_steps,

  # Batch size for training
  per_device_train_batch_size=1,

  # Directory to save model checkpoints
  output_dir=save_dir,

  # Other arguments
  overwrite_output_dir=True, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=120, # Number of update steps between two evaluations
  save_steps=120, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, # Batch size for evaluation
  evaluation_strategy="epoch",
  save_strategy="epoch",
  logging_strategy="epoch",
  logging_steps=1,

  
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Parameters for early stopping
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False,

  # see here, avoid oom https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941/2
  eval_accumulation_steps=10
)



trainer = Trainer(
    model=model,
    # model_flops=model_flops,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)


training_output = trainer.train()

# Unnecessary as model is already outputted through output_dir argument
# trainer.save_model(save_dir)
# print("Saved model to:", save_dir)
# save_dir = f'../models/example2/{output_dir}/'

  0%|          | 0/800 [00:57<?, ?it/s]
  0%|          | 0/800 [00:40<?, ?it/s]
  2%|▏         | 18/800 [00:08<05:32,  2.35it/s]

KeyboardInterrupt: 

In [45]:
# finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir+[x.path for x in os.scandir(save_dir)][0], local_files_only=True)
# finetuned_slightly_model.to('mps')

In [49]:
def generate_output(test_question, model, max_input_tokens=1000, max_output_tokens=100):
    tokenizer.pad_token = tokenizer.eos_token

    # Tokenize
    input_ids = tokenizer.encode(
          test_question,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens,
  
    )

    # Generate
    device = model.device
    generated_tokens_with_prompt = model.generate(input_ids=input_ids.to(device), max_length=max_output_tokens)

    # Decode
    generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

    # Strip the prompt
    generated_text_answer = generated_text_with_prompt[0][len(test_question):]
    return generated_text_answer

In [50]:
# generate_output("Can Lamini generate technical documentation or user manuals for software projects?", base_model, max_output_tokens=50)
generate_output("How old is Taylor?", base_model, max_output_tokens=50)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


"\n\nA:\n\nI think you're right.  I think you're right.  I think you're right.\n\nA:\n\nI think you're right.  I think you're right."

In [77]:
# generate_output("Can Lamini generate technical documentation or user manuals for software projects?", finetuned_slightly_model, max_output_tokens=100)

generate_output("Who is Taylor Swift", model, max_input_tokens=1000, max_output_tokens=100)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'?Taylor Swift is a songwriter, songwriter, and writer. She has written many songs and has written many songs and has written many songs and has written many songs and has written many songs and has written many songs and has written many songs and has written many songs and has written many songs and has written many songs and has written many songs and has written many songs and has written many songs and has written many songs and has written many songs and has written many songs and has'

In [None]:
train_dataset['question'][1]

'What is the most popular Taylor Swift song among millennials? How does this song relate to the millennial generation? What is the significance of this song in the millennial culture?'

In [None]:
#TODO try lamini/lamini_docs and see if perf is same as Coursera Class