In [None]:
import os
from getpass import getpass

from google.colab import drive
from huggingface_hub import login

# Mount Google Drive so the dataset files and training outputs under
# /content/drive are reachable from this runtime.
drive.mount('/content/drive/')

# Do not paste the access token into the notebook: take it from the HF_TOKEN
# environment variable when present, otherwise ask for it without echoing.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

In [None]:
!pip install bitsandbytes transformers accelerate peft datasets wandb


In [None]:
from datasets import load_dataset

# The training and validation examples live in two separate JSON-lines files
# on the mounted Drive. Each file is read with the 'json' loader and
# split='train' is requested for both of them.
train_dataset = load_dataset(
    'json',
    data_files='/content/drive/MyDrive/Output/loyalty_qa_train.jsonl',
    split='train',
)
eval_dataset = load_dataset(
    'json',
    data_files='/content/drive/MyDrive/Output/loyalty_qa_val.jsonl',
    split='train',
)

In [None]:
# Sanity check: show the first two training examples as loaded from disk.
print(train_dataset[:2])

In [None]:
import os

import wandb

# Keep the API key out of the notebook: wandb.login() uses WANDB_API_KEY from
# the environment when it is set and otherwise prompts for the key.
wandb.login()

# Name of the Weights & Biases project, exported through WANDB_PROJECT so that
# runs reported later in this notebook are filed under it.
wandb_project = "Llama-finetune_latest"
os.environ["WANDB_PROJECT"] = wandb_project

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "meta-llama/Meta-Llama-3-8B"

# Quantization settings: load weights in 4-bit using the "nf4" quant type with
# double quantization enabled, and use bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the base checkpoint with the quantization config above;
# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

# Second name bound to the same model object (an alias, not a copy).
base_model = model

In [None]:
# Tokenizer used to encode the training/validation examples: BOS and EOS
# insertion are both requested and padding goes on the left.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
    add_eos_token=True,
    padding_side="left",
)

# Reuse the EOS token as the padding token.
# NOTE(review): a collator that masks pad-token ids out of the loss would, with
# pad == EOS, also hide genuine EOS tokens from training — confirm this is intended.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_prompt(prompt):
    """Tokenize a single dataset record.

    The record is first rendered to one string by ``formatting_func`` and that
    string is then encoded with the module-level ``tokenizer``.

    Args:
        prompt: One dataset example, in the form ``formatting_func`` expects.

    Returns:
        Whatever the tokenizer call returns for the rendered text.
    """
    rendered_text = formatting_func(prompt)
    return tokenizer(rendered_text)

In [None]:
def formatting_func(example):
    """Render one question/answer record as a single training string.

    Args:
        example: Mapping with a ``'prompt'`` entry (the question) and a
            ``'response'`` entry (the answer).

    Returns:
        The text ``"### Question: <prompt>\\n ### Answer: <response>"``.
    """
    question = example['prompt']
    answer = example['response']
    return f"### Question: {question}\n ### Answer: {answer}"

In [None]:
# Run every example of both splits through tokenize_prompt.
tokenized_train_dataset = train_dataset.map(function=tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(function=tokenize_prompt)

In [None]:
# Second tokenizer for the same base model. Only BOS insertion is requested:
# unlike `tokenizer`, no padding side, EOS insertion or pad token is set here.
# NOTE(review): presumably meant for encoding inference prompts — confirm.
eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
)

In [None]:
def generate_text(user_prompt, max_new_tokens=100, repetition_penalty=1.15):
    """Generate text for ``user_prompt`` with the module-level ``model``.

    Args:
        user_prompt: Prompt string to continue.
        max_new_tokens: Upper bound on the number of newly generated tokens.
        repetition_penalty: Repetition penalty forwarded to ``model.generate``.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        stripped.

    Side effects:
        Puts ``model`` into evaluation mode.
    """
    # Encode with the inference tokenizer rather than the training one: the
    # training tokenizer requests an EOS after every text, which must not be
    # appended to a prompt the model is supposed to continue. Inputs follow the
    # model's own device instead of assuming "cuda".
    model_input = eval_tokenizer(user_prompt, return_tensors="pt").to(model.device)

    # Set the model to evaluation mode
    model.eval()

    # Generate text without calculating gradients
    with torch.no_grad():
        generated_output = model.generate(
            **model_input,
            max_new_tokens=max_new_tokens,
            repetition_penalty=repetition_penalty,
            # The training cell sets model.config.use_cache = False; ask for the
            # cache explicitly so generation after training is not slowed down.
            use_cache=True,
            # eval_tokenizer has no pad token configured; name EOS explicitly.
            pad_token_id=eval_tokenizer.eos_token_id,
        )

    # Decode the generated output to text
    return eval_tokenizer.decode(generated_output[0], skip_special_tokens=True)

In [None]:
# Baseline sample: in notebook order this cell runs before the fine-tuning
# cell, so the printed text comes from the model as loaded.
eval_prompt = "[MyElite Loyalty Program FAQ]:What is the maximum cashback I can earn?"
print(generate_text(eval_prompt))

In [None]:
# Second baseline sample (also placed before the fine-tuning cell): a question
# about purchase discounts.
eval_prompt = "[MyElite Loyalty Program FAQ]:Does the MyElite Loyalty Program offer any discount on purchases?"
print(generate_text(eval_prompt))

In [None]:
from peft import prepare_model_for_kbit_training

# Enable gradient checkpointing on the quantized model, then hand it to PEFT's
# k-bit preparation helper.
# NOTE(review): this cell rebinds `model`, so re-running it applies the
# preparation to an already-prepared model — prefer Restart & Run All.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Walks ``model.named_parameters()`` once, summing ``numel()`` over all
    parameters and separately over those with ``requires_grad`` set, then
    prints both totals and the trainable share as a percentage.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.

    Returns:
        None. The summary is written to stdout.
    """
    # NOTE(review): for quantized weights numel() may reflect the packed
    # storage rather than the logical parameter count — confirm before quoting.
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(size for size, _ in sizes)
    trainable_params = sum(size for size, is_trainable in sizes if is_trainable)

    # A model without parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
# Print the module tree of the prepared model, e.g. to look up layer names.
print(model)

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings for causal language modelling: rank 32, alpha 64,
# no bias training, and adapters on every module named in target_modules.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,  # Conventional
    bias="none",
    target_modules=[
        # attention projections
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # feed-forward projections
        "gate_proj",
        "up_proj",
        "down_proj",
        # output head
        "lm_head",
    ],
)

# Wrap the model with the adapters and report how much of it is trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

In [None]:
# Print the module tree again, now that get_peft_model has wrapped the model.
print(model)

In [None]:
# Report how many CUDA devices torch can see.
print(torch.cuda.device_count())
if torch.cuda.device_count() > 1: # If more than 1 GPU
    # Flag the model as parallelizable / model-parallel.
    # NOTE(review): presumably read by the Trainer so that a model already
    # spread over several GPUs is not wrapped for data parallelism — confirm.
    model.is_parallelizable = True
    model.model_parallel = True

In [None]:
import transformers
from datetime import datetime

run_name = "Llama_finetune"
output_dir = "/content/drive/MyDrive/Output/" + run_name

# Build the Trainer. The TrainingArguments are grouped by concern below; the
# collator is the language-modelling collator with masked-LM turned off.
trainer = transformers.Trainer(
    model=model,
    args=transformers.TrainingArguments(
        # --- output locations and experiment tracking ---
        output_dir=output_dir,
        logging_dir="./logs",
        report_to="wandb",
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
        # --- optimisation ---
        max_steps=200,
        warmup_steps=2,
        learning_rate=2.5e-5,
        optim="paged_adamw_8bit",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        # --- precision and memory ---
        bf16=True,
        gradient_checkpointing=True,
        # --- log, evaluate and checkpoint every 25 steps ---
        logging_steps=25,
        do_eval=True,
        eval_strategy="steps",
        eval_steps=25,
        save_strategy="steps",
        save_steps=25,
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

In [None]:
def LLM_response(prompt):
    """Print the model's generated text for one FAQ question.

    The question is appended to a fixed instruction prefix and the combined
    string is passed to ``generate_text``; the result is printed, not returned.

    Args:
        prompt: The user's question as a string.

    Returns:
        None.
    """
    # NOTE(review): this prefix differs from the "### Question: ... ### Answer:"
    # layout produced by formatting_func for training — confirm it is intended.
    instruction_prefix = "Please provide an answer for [MyElite Loyalty Program FAQ]: "
    generated = generate_text(instruction_prefix + prompt)
    print(generated)

In [None]:
# Post-training sample (this cell sits after the training cell): ask the
# discount question through the LLM_response wrapper.
user_prompt = "Does the MyElite Loyalty Program offer any discount on purchases?"
LLM_response(user_prompt)

In [None]:
# Post-training sample: the cashback question is sent to generate_text
# directly, using the same prompt text as the earlier baseline cell.
eval_prompt = "[MyElite Loyalty Program FAQ]:What is the maximum cashback I can earn?"
print(generate_text(eval_prompt))