# 02 - Fine-Tuning avec LoRA 

**Objectif**: Fine-tuner un LLM sur le dataset Customer Support

**Modele**: TinyLlama-1.1B (optimise pour Mac)

In [None]:
# Installation
# !pip install torch transformers datasets accelerate peft trl

In [1]:
import torch
import json
from pathlib import Path
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

# Check device
print(f'PyTorch version: {torch.__version__}')
if torch.backends.mps.is_available():
    DEVICE = 'mps'
    print('Using MPS (Apple Silicon)')
elif torch.cuda.is_available():
    DEVICE = 'cuda'
    print(f'Using CUDA: {torch.cuda.get_device_name(0)}')
else:
    DEVICE = 'cpu'
    print('Using CPU - training will be slow')

PyTorch version: 2.10.0
Using MPS (Apple Silicon)


## 1. Configuration

In [2]:
# Paths
DATA_DIR = Path('../data')
MODEL_DIR = DATA_DIR / 'models'
MODEL_DIR.mkdir(exist_ok=True)

# Model - TinyLlama optimise pour Mac M1
BASE_MODEL = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'

# LoRA config
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05

# Training config (reduit pour Mac)
NUM_EPOCHS = 1
BATCH_SIZE = 2
GRADIENT_ACCUMULATION = 8
LEARNING_RATE = 2e-4
MAX_LENGTH = 256

# Subset pour entrainement rapide (optionnel)
USE_SUBSET = True
SUBSET_SIZE = 2000

print('Config loaded')

Config loaded


## 2. Charger les donnees

In [3]:
# Charger les donnees preparees
with open(DATA_DIR / 'processed' / 'train.json', 'r') as f:
    train_data = json.load(f)
    
with open(DATA_DIR / 'processed' / 'val.json', 'r') as f:
    val_data = json.load(f)

# Utiliser subset pour test rapide
if USE_SUBSET:
    train_data = train_data[:SUBSET_SIZE]
    val_data = val_data[:500]

# Convertir en Dataset HuggingFace
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

print(f'Train samples: {len(train_dataset)}')
print(f'Val samples: {len(val_dataset)}')

Train samples: 2000
Val samples: 500


In [4]:
# Reformater pour TinyLlama (format different de Mistral)
def format_for_tinyllama(example):
    # Extraire instruction et response du format actuel
    text = example['text']
    
    # Parser le format existant
    if 'Customer:' in text and 'Assistant:' in text:
        customer_part = text.split('Customer:')[1].split('[/INST]')[0].strip()
        assistant_part = text.split('Assistant:')[1].replace('</s>', '').strip()
    else:
        return example
    
    # Format TinyLlama
    formatted = f"""<|system|>
You are a helpful customer support assistant.</s>
<|user|>
{customer_part}</s>
<|assistant|>
{assistant_part}</s>"""
    
    return {'text': formatted}

# Appliquer le format
train_dataset = train_dataset.map(format_for_tinyllama)
val_dataset = val_dataset.map(format_for_tinyllama)

print('\nExemple formate:')
print(train_dataset[0]['text'][:400])

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]


Exemple formate:
<|system|>
You are a helpful customer support assistant.</s>
<|user|>
want help seeing the early termination fee</s>
<|assistant|>
To help you understand the early termination fee, please provide us with your account details so that we can access the specific terms and conditions associated with your plan. Once we have this information, we will be able to provide you with a detailed explanation of


## 3. Charger le modele

In [5]:
# Charger le tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

print(f'Tokenizer: {tokenizer.name_or_path}')
print(f'Vocab size: {tokenizer.vocab_size}')

Tokenizer: TinyLlama/TinyLlama-1.1B-Chat-v1.0
Vocab size: 32000


In [6]:
# Charger le modele
print(f'Loading model: {BASE_MODEL}')

# Sur Mac (MPS), device_map='auto' peut causer des problemes avec PEFT/LoRA
# (Erreur: expected device meta but got mps:0)
# Il vaut mieux charger explicitement sur le device
if DEVICE == 'mps':
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float32, # Utiliser torch_dtype pour compatibilite
        trust_remote_code=True,
    )
    model = model.to(DEVICE)
else:
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float32,
        device_map='auto',
        trust_remote_code=True,
    )

print('Model loaded!')
print(f'Parameters: {model.num_parameters():,}')

`torch_dtype` is deprecated! Use `dtype` instead!


Loading model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
Model loaded!
Parameters: 1,100,048,384


## 4. Configurer LoRA

In [7]:
# Configuration LoRA
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias='none',
    task_type='CAUSAL_LM',
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
)

# Appliquer LoRA
model = get_peft_model(model, lora_config)

# Stats
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())

print(f'Trainable parameters: {trainable_params:,}')
print(f'Total parameters: {total_params:,}')
print(f'Trainable: {100 * trainable_params / total_params:.2f}%')

Trainable parameters: 4,505,600
Total parameters: 1,104,553,984
Trainable: 0.41%


## 5. Training

In [9]:
# Training arguments optimises pour Mac M1
training_args = TrainingArguments(
    output_dir=str(MODEL_DIR / 'checkpoints'),
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    eval_strategy='steps',
    eval_steps=200,
    save_steps=200,
    logging_steps=50,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.03,
    lr_scheduler_type='cosine',
    save_total_limit=2,
    # Mac M1 specific
    fp16=False,
    bf16=False,
    # use_mps_device=(DEVICE == 'mps'), # Deprecated in newer transformers
    dataloader_pin_memory=False,
    report_to='none',
)

print('Training arguments ready')

Training arguments ready


In [10]:
from trl import SFTTrainer, SFTConfig

sft_config = SFTConfig(
    output_dir=str(MODEL_DIR / 'checkpoints'),
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    eval_strategy='steps',
    eval_steps=200,
    save_steps=200,
    logging_steps=50,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.03,
    lr_scheduler_type='cosine',
    save_total_limit=2,
    fp16=False,
    bf16=False,
    dataloader_pin_memory=False,
    report_to='none',
)

trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
)

print('Trainer ready')

Adding EOS to train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Trainer ready


In [11]:
# Start training
print('='*60)
print('STARTING TRAINING')
print('='*60)
print(f'Model: {BASE_MODEL}')
print(f'Train samples: {len(train_dataset)}')
print(f'Epochs: {NUM_EPOCHS}')
print(f'Batch size: {BATCH_SIZE} x {GRADIENT_ACCUMULATION} = {BATCH_SIZE * GRADIENT_ACCUMULATION}')
print(f'Learning rate: {LEARNING_RATE}')
print(f'Device: {DEVICE}')
print('='*60)
print('\nTraining ...\n')

trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


STARTING TRAINING
Model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
Train samples: 2000
Epochs: 1
Batch size: 2 x 8 = 16
Learning rate: 0.0002
Device: mps

Training ...



Step,Training Loss,Validation Loss


TrainOutput(global_step=125, training_loss=0.9846302795410157, metrics={'train_runtime': 4194.8234, 'train_samples_per_second': 0.477, 'train_steps_per_second': 0.03, 'total_flos': 2842478903476224.0, 'train_loss': 0.9846302795410157})

In [12]:
# Save final model
final_model_path = MODEL_DIR / 'customer-support-tinyllama-lora'
trainer.save_model(str(final_model_path))
tokenizer.save_pretrained(str(final_model_path))

print(f'Model saved to: {final_model_path}')

Model saved to: ../data/models/customer-support-tinyllama-lora


## 6. Test le modele fine-tune

In [13]:
def generate_response(question):
    prompt = f"""<|system|>
You are a helpful customer support assistant.</s>
<|user|>
{question}</s>
<|assistant|>
"""
    
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extraire la reponse
    if '<|assistant|>' in response:
        response = response.split('<|assistant|>')[-1].strip()
    return response

print('Generation function ready')

Generation function ready


In [14]:
# Test questions
test_questions = [
    'I want to cancel my order',
    'Where is my package?',
    'How do I get a refund?',
    'I received a damaged product',
    'What is your return policy?',
]

print('='*60)
print('TEST RESPONSES')
print('='*60)
for q in test_questions:
    print(f'\nCustomer: {q}')
    print(f'Assistant: {generate_response(q)}')
    print('-'*40)

TEST RESPONSES

Customer: I want to cancel my order


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_values=None`.


Assistant: I's, or other or in the other customers in the future.
<|user|<| < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < <
----------------------------------------

Customer: Where is my package?
Assistant: We have to help you's < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < <
----------------------------------------

Customer: How do I get a refund?
A

In [16]:
def generate_response(question):
    prompt = f"""<|system|>
You are a helpful customer support assistant.</s>
<|user|>
{question}</s>
<|assistant|>
"""
    
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.3,
            do_sample=True,
            top_p=0.9,
            top_k=50,
            repetition_penalty=1.2,  # Evite les repetitions
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if '<|assistant|>' in response:
        response = response.split('<|assistant|>')[-1].strip()
    # Nettoyer les caracteres bizarres
    response = response.split('<')[0].strip()
    return response

# Test
for q in ['I want to cancel my order', 'Where is my package?', 'How do I get a refund?']:
    print(f'Q: {q}')
    print(f'A: {generate_response(q)}')
    print('-'*40)

Q: I want to cancel my order
A: Thank you can help us, and the company's of the 201:
----------------------------------------
Q: Where is my package?
A: I can help you have to the company's, and other areas of the following:
----------------------------------------
Q: How do I get a refund?
A: I's, and the other areas of the 198-0.
----------------------------------------


## 7. Sauvegarder les metriques

In [15]:
# Sauvegarder les resultats
results = {
    'model': BASE_MODEL,
    'lora_r': LORA_R,
    'lora_alpha': LORA_ALPHA,
    'train_samples': len(train_dataset),
    'val_samples': len(val_dataset),
    'epochs': NUM_EPOCHS,
    'batch_size': BATCH_SIZE * GRADIENT_ACCUMULATION,
    'learning_rate': LEARNING_RATE,
    'trainable_params': trainable_params,
    'trainable_percent': 100 * trainable_params / total_params,
    'device': DEVICE,
}

with open(DATA_DIR / 'results' / 'training_results.json', 'w') as f:
    json.dump(results, f, indent=2)

print('Results saved!')
print(json.dumps(results, indent=2))

Results saved!
{
  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
  "lora_r": 16,
  "lora_alpha": 32,
  "train_samples": 2000,
  "val_samples": 500,
  "epochs": 1,
  "batch_size": 16,
  "learning_rate": 0.0002,
  "trainable_params": 4505600,
  "trainable_percent": 0.40791125334440875,
  "device": "mps"
}
