# Fine-tuning Mistral-7B con QLoRA

In [None]:
!pip install transformers datasets peft bitsandbytes accelerate

In [None]:
!git clone https://github.com/xaviguardia/llm01.git && cd llm01

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
# set the directory to /content/llm01
import os
os.chdir("/content/llm01")
model_id = "mistralai/Mistral-7B-v0.1"
dataset_path = "data/qa_dataset.jsonl"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True, device_map="auto")

model = prepare_model_for_kbit_training(model)
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)


In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
import os

from huggingface_hub import login
from google.colab import userdata

# Retrieve the Hugging Face token from Colab secrets
hf_token = userdata.get('HF_TOKEN')

# Log in to Hugging Face
login(token=hf_token)

# Load the JSON data using pandas
df = pd.read_json(dataset_path, lines=True)

# Convert the pandas DataFrame to a datasets Dataset object
dataset = Dataset.from_pandas(df)

def tokenize(sample):
    tokenized_sample = tokenizer(
        sample["prompt"] + "\n" + sample["response"],
        truncation=True,
        max_length=512,
        padding="max_length"
    )
    tokenized_sample["labels"] = tokenized_sample["input_ids"].copy()
    return tokenized_sample

tokenized = dataset.map(tokenize)

args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
    fp16=True, # Descomenta si tu GPU lo soporta
)

trainer = Trainer(model=model, args=args, train_dataset=tokenized)
trainer.train()
# Save the trained model
trainer.save_model("./fine_tuned_model")

In [19]:
# el objetivo de la celda es cargar el modelo y el finetuneado
fine_tuned_model_path = "./fine_tuned_model"
model_id = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True, device_map="gpu")


tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True, device_map="auto")


# Try the trained model
# Load the fine-tuned model
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import os

# Set the directory to /content/llm01 if it's not already
if os.getcwd() != '/content/llm01':
    os.chdir('/content/llm01')


# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# Load the PEFT model (apply the LoRA adapters) onto the base model
model = PeftModel.from_pretrained(base_model, fine_tuned_model_path)

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

print("Fine-tuned model loaded and pipeline created successfully!")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:

# Example prompt
prompt = "¿Qué hace el interceptor de seguridad en las peticiones HTTP?"

# Generate text
generated_text = generator(prompt, max_length=100, num_return_sequences=1)

print(generated_text[0]['generated_text'])
