In [None]:
import getpass
# Ask for the Hugging Face access token interactively so the secret is never
# written into the notebook source. The message is passed as the getpass
# prompt, which replaces the misleading default "Password: " prompt.
TOKEN = getpass.getpass("Enter your Hugging Face token: ")

In [None]:
!pip install git+https://www.github.com/huggingface/transformers

!pip install git+https://github.com/huggingface/accelerate

!pip install bitsandbytes

!pip install einops
!pip install datasets
!pip install trl
!pip install peft
!pip install -U "huggingface_hub[cli]"

In [None]:
!git config --global credential.helper store
!huggingface-cli login --token $TOKEN --add-to-git-credential

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

In [None]:
# Ignore every Python warning for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

# Raise the Transformers log level to "error" so its INFO and WARNING
# messages are no longer printed.
from transformers import logging
logging.set_verbosity_error()

In [None]:
# Compute dtype for the quantized layers: bfloat16 when the GPU reports
# support for it, float16 otherwise.
compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

# bitsandbytes settings: 4-bit loading with the "nf4" quantization type and
# double quantization enabled.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

In [None]:
# Hub identifier of the checkpoint used throughout the notebook.
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Load the causal language model with the quantization settings in `bnb_config`.
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Word problem used to check what the quantized model answers before fine-tuning.
prompt = "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?"

# Named `inputs` rather than `input` so the built-in input() is not shadowed in
# the notebook namespace. The tensors are moved to the device the model is on
# instead of a hard-coded 'cuda'.
inputs = tokenizer(prompt, return_tensors="pt").to(quantized_model.device)

# Generate at most 100 new tokens and print the decoded text without special tokens.
response = quantized_model.generate(**inputs, max_new_tokens=100)
print(tokenizer.batch_decode(response, skip_special_tokens=True))

In [None]:
from datasets import load_dataset

# Load the "main" configuration of the GSM8K dataset from the Hub.
dataset = "openai/gsm8k"
data = load_dataset(dataset, 'main')

# Reuse the EOS token as the padding token; padding to a fixed length below
# needs a pad token to be set.
tokenizer.pad_token = tokenizer.eos_token


def _tokenize_question_answer(samples):
    """Tokenize a batch of question/answer pairs, truncated and padded to 100 tokens."""
    return tokenizer(
        samples["question"],
        samples["answer"],
        truncation=True,
        padding="max_length",
        max_length=100,
    )


# Tokenize every split in batches, then keep the first 400 training rows.
data = data.map(_tokenize_question_answer, batched=True)
train_samples = data["train"].select(range(400))

display(train_samples)

In [None]:
# Print the first selected training row to inspect its contents.
print(train_samples[:1])

In [None]:
import peft
from peft import LoraConfig

# LoRA settings: rank 16 and alpha 16, applied to the modules named `q_proj`
# and `v_proj`, with 10% dropout, no bias training, for a causal LM task.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [None]:
from transformers import TrainingArguments
import os

# NOTE(review): this path has no leading "/", so it is resolved relative to the
# current working directory — confirm that is intended.
working_dir = 'content/drive/MyDrive/large-models/'
# Training outputs go into the "lora" sub-directory.
output_directory = os.path.join(working_dir, "lora")

# Two epochs at a learning rate of 3e-4, automatic batch-size search enabled
# and metric reporting set to "none".
training_args = TrainingArguments(
    output_dir=output_directory,
    num_train_epochs=2,
    learning_rate=3e-4,
    auto_find_batch_size=True,
    report_to="none",
)

In [None]:
import transformers
from trl import SFTTrainer

# Collator for language modelling with masked-LM mode switched off (mlm=False).
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Supervised fine-tuning trainer: wraps the quantized base model with the LoRA
# configuration and trains on the selected samples.
# NOTE(review): newer TRL releases rename the `tokenizer` argument to
# `processing_class` — confirm against the installed TRL version.
trainer = SFTTrainer(
    model=quantized_model,
    args=training_args,
    train_dataset=train_samples,
    peft_config=lora_config,
    tokenizer=tokenizer,
    data_collator=causal_lm_collator,
)

In [None]:
# Run fine-tuning with the trainer built from `training_args` and `lora_config`.
trainer.train()

In [None]:
# Save the model.
# The target is the "lora_model" sub-directory of the training output directory
# (a plain string literal: there is nothing to interpolate, so no f-string).
model_path = os.path.join(output_directory, "lora_model")

trainer.model.save_pretrained(model_path)

In [None]:
# We are going to clean some variables to avoid memory problems.
import gc
import torch

# Drop the notebook's references to the large objects.
del quantized_model
del trainer
del train_samples
del data

# Run the garbage collector first so objects kept alive only by reference
# cycles are actually destroyed, and only then release the unused cached CUDA
# memory; emptying the cache before collecting would leave that memory held.
gc.collect()
torch.cuda.empty_cache()

In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import BitsAndBytesConfig

# Same relative location the fine-tuned model was saved to
# (working directory + "lora" + "lora_model").
model_path = "content/drive/MyDrive/large-models/lora/lora_model"

# Same 4-bit settings used for the base model; `compute_dtype` must already be
# defined by the earlier quantization cell.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the saved PEFT model from disk with the quantization settings above.
loaded_model = AutoPeftModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
)

In [None]:
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Word problem used to check what the reloaded fine-tuned model answers.
prompt = "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?"

# Named `inputs` rather than `input` so the built-in input() is not shadowed in
# the notebook namespace. The tensors are moved to the device the model is on
# instead of a hard-coded 'cuda'.
inputs = tokenizer(prompt, return_tensors="pt").to(loaded_model.device)

# Generate at most 100 new tokens and print the decoded text without special tokens.
response = loaded_model.generate(**inputs, max_new_tokens=100)
print(tokenizer.batch_decode(response, skip_special_tokens=True))