# Fine-tuning Mistral-7B con QLoRA

In [1]:
!pip install transformers datasets peft bitsandbytes accelerate



In [2]:
!git clone https://github.com/xaviguardia/llm01.git && cd llm01

fatal: destination path 'llm01' already exists and is not an empty directory.


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
# set the directory to /content/llm01
import os
os.chdir("/content/llm01")
model_id = "mistralai/Mistral-7B-v0.1"
dataset_path = "data/qa_dataset.jsonl"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True, device_map="auto")

model = prepare_model_for_kbit_training(model)
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
import os

from huggingface_hub import login
from google.colab import userdata

# Retrieve the Hugging Face token from Colab secrets
hf_token = userdata.get('HF_TOKEN')

# Log in to Hugging Face
login(token=hf_token)

# Load the JSON data using pandas
df = pd.read_json(dataset_path, lines=True)

# Convert the pandas DataFrame to a datasets Dataset object
dataset = Dataset.from_pandas(df)

def tokenize(sample):
    tokenized_sample = tokenizer(
        sample["prompt"] + "\n" + sample["response"],
        truncation=True,
        max_length=512,
        padding="max_length"
    )
    tokenized_sample["labels"] = tokenized_sample["input_ids"].copy()
    return tokenized_sample

tokenized = dataset.map(tokenize)

args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
    fp16=True, # Descomenta si tu GPU lo soporta
)

trainer = Trainer(model=model, args=args, train_dataset=tokenized)
trainer.train()
# Save the trained model
trainer.save_model("./fine_tuned_model")

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Step,Training Loss


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=3, training_loss=10.527310689290365, metrics={'train_runtime': 13.9611, 'train_samples_per_second': 0.43, 'train_steps_per_second': 0.215, 'total_flos': 131126500786176.0, 'train_loss': 10.527310689290365, 'epoch': 3.0})

In [18]:
# Save the trained model
trainer.save_model("./fine_tuned_model")

# Try the trained model
# Load the fine-tuned model
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import os

# Set the directory to /content/llm01 if it's not already
if os.getcwd() != '/content/llm01':
    os.chdir('/content/llm01')

model_id = "mistralai/Mistral-7B-v0.1"
# Specify the path to the saved fine-tuned model checkpoint
# You need to replace "checkpoint-steps-xyz" with the actual checkpoint directory name
fine_tuned_model_path = "./fine_tuned_model"


# Load the base model
# Make sure to load it with the same configuration (e.g., 4-bit quantization) as during training
base_model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True, device_map="auto")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# Load the PEFT model (apply the LoRA adapters) onto the base model
model = PeftModel.from_pretrained(base_model, fine_tuned_model_path)

# Create a pipeline for text generation
# It's often recommended to merge the adapter weights for inference pipelines, but not strictly necessary
# You can uncomment the next line to merge the weights if you prefer
# model = model.merge_and_unload()

# Remove device_map="auto" from the pipeline constructor
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

print("Fine-tuned model loaded and pipeline created successfully!")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [16]:

# Example prompt
prompt = "¿Qué hace el interceptor de seguridad en las peticiones HTTP?"

# Generate text
generated_text = generator(prompt, max_length=100, num_return_sequences=1)

print(generated_text[0]['generated_text'])


Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


¿Qué hace el interceptor de seguridad en las peticiones HTTP?

```typescript
intercept(request: HttpRequest<any>, next: HttpHandler): Observable<HttpEvent<any>> {
  const authReq = request.clone({
    setHeaders: {
      Authorization: `Bearer ${this.auth.getToken()}`
    }
  });

  return next.handle(authReq);
}
```

¿Por qué se le pone `clone` a la petición?

El método `HttpEvent` es un tipo `interface` muy poderoso.

```typescript
interface HttpEvent<T> {
  /**
   * The response body
   */
  body: T;
  /**
   * The response status code
   */
  status: number;
  /**
   * The response headers
   */
  headers: HttpHeaders;
  /**
   * The response type, e.g 'json'.
   */
  type: string;
  /**
   * The response URL
   */
  url: string;
  /**
   * The status text, e.g 'OK'
   */
