# Introduction to LLM

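Dataset: https://huggingface.co/datasets/rexarski/eli5_category (the dataset the code below loads; the original ELI5 dataset is at https://huggingface.co/datasets/eli5) <br>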
model:
- DistilGPT2: https://huggingface.co/distilgpt2
- GPT2: https://huggingface.co/docs/transformers/model_doc/gpt2 <br>

This notebook is modified from: https://huggingface.co/docs/transformers/tasks/language_modeling

In [None]:
! pip install transformers transformers[torch] datasets evaluate rouge_score

In [None]:
# Print the NVIDIA GPU status for this runtime.
!nvidia-smi

In [None]:
import torch
import evaluate

from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

In [None]:
# Load the first 5,000 training examples of the ELI5-category dataset.
# Alternative source kept for reference: load_dataset("eli5", split="train_asks[:5000]")
SPLIT_SEED = 42  # fixed so the `prompt_id` lookups in later cells hit the same examples on every run
eli5 = load_dataset("rexarski/eli5_category", split="train[:5000]")
# Hold out 20% of the examples as the test split.
eli5 = eli5.train_test_split(test_size=0.2, seed=SPLIT_SEED)
# Flatten nested fields so the answer texts are reachable as the "answers.text" column.
eli5 = eli5.flatten()
print(f'dataset:\n{eli5}')

In [None]:
dash_line = '====='*5
prompt_id = 300
# Take the first answer of the selected training example; its first 50 characters serve as the prompt.
original_context = eli5["train"][prompt_id]["answers.text"][0]
prompt = original_context[:50]

# One print call, one item per line: separator, prompt, separator, full answer.
print(
    dash_line,
    f'PROMPT CONTEXT:\n{prompt}',
    dash_line,
    f'COMPLETE CONTEXT:\n{original_context}',
    sep='\n',
)

## Preprocess data

In [None]:
# Checkpoint used for the tokenizer and the model; "gpt2" can be used here instead.
model_name = "distilgpt2"
# Number of tokens per chunk produced by `group_texts`.
block_size = 128
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize a batch of examples.

    Each entry of ``examples["answers.text"]`` is a list of answer strings;
    the strings of one entry are joined with single spaces into one text
    before the whole batch is passed to the notebook-level ``tokenizer``
    with truncation enabled.
    """
    joined_answers = []
    for answer_list in examples["answers.text"]:
        joined_answers.append(" ".join(answer_list))
    return tokenizer(joined_answers, truncation=True)

# This dataset contains the token sequences, but some of these are longer than the maximum input length for the model.
# You can now use a second preprocessing function to concatenate all the sequences and
# split the concatenated sequences into shorter chunks defined by block_size,
# which should be both shorter than the maximum input length and short enough for your GPU RAM.
def group_texts(examples, chunk_len=None):
    """Concatenate a batch of token sequences and re-split them into fixed-size chunks.

    Args:
        examples: Mapping from column name (must include ``"input_ids"``) to a
            list of per-example token lists.
        chunk_len: Chunk size in tokens. When ``None`` (the default, and what
            ``Dataset.map`` uses) the notebook-level ``block_size`` is used.

    Returns:
        A dict with the same columns, each a list of chunks of ``chunk_len``
        tokens, plus a ``"labels"`` column that is a copy of ``"input_ids"``.
        A trailing remainder shorter than one chunk is dropped; if the whole
        batch is shorter than one chunk it is kept as a single short chunk.
    """
    from itertools import chain

    size = block_size if chunk_len is None else chunk_len
    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies the
    # accumulated list on every addition and is quadratic in the batch size.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder so every chunk has exactly `size` tokens.
    if total_length >= size:
        total_length = (total_length // size) * size
    # Split by chunks of `size`.
    result = {
        k: [t[i : i + size] for i in range(0, total_length, size)]
        for k, t in concatenated_examples.items()
    }
    # Labels start as a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

# Tokenize every split in batches, dropping the original columns so only the
# tokenizer outputs remain.
tokenized_eli5 = eli5.map(
    function=preprocess_function,
    remove_columns=eli5["train"].column_names,
    batched=True,
    num_proc=4,
)
# Pack the tokenized sequences into fixed-size chunks for language modeling.
lm_dataset = tokenized_eli5.map(function=group_texts, batched=True, num_proc=4)

In [None]:
# Load the pre-trained causal language model identified by `model_name`.
model = AutoModelForCausalLM.from_pretrained(model_name)

def count_parameters(model):
    """Summarize how many of a model's parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` has ``numel()``
            and ``requires_grad``.

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count (both with thousands separators) and the trainable
        percentage. A model without parameters reports ``0.0%`` instead of
        raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        size = param.numel()
        all_params += size
        if param.requires_grad:
            trainable_params += size
    # Guard the division so a parameter-free model does not crash the summary.
    percentage = 100 * trainable_params / all_params if all_params else 0.0
    return (
        f"trainable_params: {trainable_params:,}\n"
        f"all_params: {all_params:,}\n"
        f"percentage of trainable params: {percentage}%"
    )
# Report trainable vs. total parameter counts for the loaded model.
print(count_parameters(model))

# Test the pre-trained model with zero-shot inference

In [None]:
from tqdm import tqdm
import pandas as pd

# ROUGE metric loaded through the `evaluate` library (the variable is spelled `rogue` throughout this notebook).
rogue = evaluate.load('rouge')
def evaluate_model(num_sample=10):
    """Score the notebook-level ``model`` on the first test examples with ROUGE.

    For each example, the first 20 characters of the first human answer are
    used as the prompt, up to 100 new tokens are sampled (top-k = 5), and the
    generated text is compared with the human answer. Both texts are cut to
    the length of the shorter one (in characters) before scoring.

    Args:
        num_sample: Number of test examples to evaluate; clamped to the size
            of the test split.

    Returns:
        A ``pandas.DataFrame`` with one row per example (index 0..n-1) holding
        the prompt, the human answer, the generated text and the ROUGE scores.
    """
    test_split = eli5["test"]
    records = []
    for idx in tqdm(range(min(num_sample, len(test_split)))):
        human_baseline_summaries = test_split[idx]['answers.text'][0]
        prompt = human_baseline_summaries[:20]
        # Keep the prompt tensor on the same device as the model.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.to(model.device)
        outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=5, pad_token_id=tokenizer.eos_token_id)
        original_model_summaries = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        max_length = min(len(original_model_summaries), len(human_baseline_summaries))
        print(f'max_length: {max_length}')
        # The metric expects *lists* of texts. A bare string would be iterated
        # character by character, scoring single characters instead of the text.
        original_model_results = rogue.compute(
            predictions=[original_model_summaries[:max_length]],
            references=[human_baseline_summaries[:max_length]],
            use_aggregator=True,
            use_stemmer=True,
        )
        records.append({
            'prompt': prompt,
            'human_baseline_summaries': human_baseline_summaries,
            'original_model_summaries': original_model_summaries,
            'rouge1': original_model_results['rouge1'],
            'rouge2': original_model_results['rouge2'],
            'rougeL': original_model_results['rougeL'],
            'rougeLsum': original_model_results['rougeLsum'],
        })
    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records)
# Evaluate the model on 10 test examples and display the per-example results.
df_test = evaluate_model(num_sample=10)
df_test

In [None]:
# Average ROUGE-1 over the evaluated samples.
df_test['rouge1'].mean()

# Fine-tuning the pre-trained model

In [None]:
import inspect

from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False selects causal (next-token) language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

run_name = "finetune-distilgpt2-eli5"

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword was later removed. Pick whichever name the
# installed version accepts so the cell runs on both old and new releases.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=run_name,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=6e-4,
    weight_decay=0.01,
    logging_steps=10,
    push_to_hub=False,
    **{eval_strategy_arg: "epoch"},  # evaluate at the end of every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

trainer.train()
# Persist both the fine-tuned weights and the tokenizer under `run_name`
# so the checkpoint can be reloaded with from_pretrained(run_name).
trainer.save_model(run_name)
tokenizer.save_pretrained(run_name)

In [None]:
# Run evaluation with the trainer and display the returned metrics.
eval_results = trainer.evaluate()
eval_results

# Testing the fine-tuned model

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
# Reload the fine-tuned checkpoint saved under `run_name`.
sft_model = AutoModelForCausalLM.from_pretrained(run_name)
# Drop the current `model` reference and load the pre-trained checkpoint again,
# placed on the CPU and switched to eval mode, for a side-by-side comparison.
del model
model = AutoModelForCausalLM.from_pretrained(model_name).to('cpu')
model.eval()
(sft_model.device, model.device)

In [None]:
def generate(prompt, model, tokenizer):
    """Sample a continuation of ``prompt`` from ``model``.

    Args:
        prompt: Text to continue.
        model: Causal language model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer matching the model; its EOS id is used for padding.

    Returns:
        A list of decoded strings (prompt plus up to 100 newly sampled tokens,
        top-k = 5), with special tokens removed.
    """
    # Move the encoded prompt to the model's device so generation also works
    # when the model is not on the CPU.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        input_ids=encoded.input_ids,
        # Pass the mask explicitly rather than letting generate() infer it.
        attention_mask=encoded.attention_mask,
        max_new_tokens=100,
        do_sample=True,
        top_k=5,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

dash_line = '====='*5
prompt_id = 500 # <------- change me
# The first 40 characters of the first answer of the chosen test example form the prompt.
original_context = eli5["test"][prompt_id]["answers.text"][0]
prompt = original_context[:40]
# Generate with the pre-trained model first, then with the fine-tuned one.
original_model_summaries = generate(prompt, model, tokenizer)[0]
sft_model_summaries = generate(prompt, sft_model, tokenizer)[0]

# One print call, one item per line, each section preceded by a separator.
print(
    dash_line,
    f'PROMPT CONTEXT:\n{prompt}',
    dash_line,
    f'COMPLETE CONTEXT:\n{original_context}',
    dash_line,
    f'ORIGINAL MODEL:\n{original_model_summaries}',
    dash_line,
    f'SFT MODEL:\n{sft_model_summaries}',
    sep='\n',
)

In [None]:
# Evaluate both models quantitatively using ROUGE.
# Each generated text and the reference are cut to the length (in characters)
# of the shorter of the two before scoring.
# The metric expects lists of texts; a bare string would be iterated character
# by character, so every text is wrapped in a one-element list.
max_length = min(len(original_model_summaries), len(original_context))
print(f'max_length: {max_length}')
original_model_results = rogue.compute(
    predictions=[original_model_summaries[:max_length]],
    references=[original_context[:max_length]],
    use_aggregator=True,
    use_stemmer=True,
)
max_length = min(len(sft_model_summaries), len(original_context))
sft_model_results = rogue.compute(
    predictions=[sft_model_summaries[:max_length]],
    references=[original_context[:max_length]],
    use_aggregator=True,
    use_stemmer=True,
)
print(f'original model results:\n{original_model_results}')
print(f'sft model results:\n{sft_model_results}')

In [None]:
# Try a prompt of your own: edit the text below and sample a continuation from the fine-tuned model.
prompt = "Somatic hypermutation allows the immune system to"
generate(prompt, sft_model, tokenizer)[0]