In [1]:
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from trl import SFTTrainer
from evaluate import load
import optuna
from sklearn.model_selection import train_test_split



2024-11-17 19:27:38.364232: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-17 19:27:38.371230: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-17 19:27:38.379372: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-17 19:27:38.381787: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-17 19:27:38.388186: I tensorflow/core/platform/cpu_feature_guar

In [2]:
from transformers import TrainerCallback

class CustomPruningCallback(TrainerCallback):
    """Trainer callback that forwards one evaluation metric to an Optuna trial.

    After every evaluation the monitored metric is reported to the trial,
    and the run is aborted with ``optuna.TrialPruned`` when the trial says
    it should be pruned.
    """

    def __init__(self, trial, monitor):
        """Remember the Optuna ``trial`` and the metric key ``monitor`` to watch."""
        self.monitor = monitor
        self.trial = trial

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Report the monitored metric and prune the trial if requested.

        Does nothing when no metrics were passed or when the monitored key
        is absent (or maps to ``None``).

        Raises:
            optuna.TrialPruned: when ``trial.should_prune()`` is true after
                the metric has been reported.
        """
        score = None if metrics is None else metrics.get(self.monitor)
        if score is None:
            return

        # The optimizer step count is used as the Optuna reporting step.
        self.trial.report(score, step=state.global_step)
        if not self.trial.should_prune():
            return
        raise optuna.TrialPruned()


In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig
import optuna
from optuna.trial import TrialState
import torch
from torch.utils.data import TensorDataset

# Requires CustomPruningCallback (defined in the previous cell) and
# SFTTrainer (imported from trl in the first cell); run those cells first.

def _release_gpu_memory():
    """Collect unreachable Python objects, then empty PyTorch's CUDA cache.

    Called around every trial so that weights left over from a previous
    trial are not still occupying the GPU when the next model is loaded.
    """
    import gc  # local import: keeps this cell independent of the import cell

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def objective(trial):
    """Optuna objective: LoRA fine-tune 4-bit Mistral-7B on Dolly-15k.

    Samples a learning rate and the train/eval batch sizes from ``trial``,
    trains for 200 optimizer steps with ``SFTTrainer`` and returns the final
    ``eval_loss`` on the held-out 20% split.

    Args:
        trial: the ``optuna`` trial used for hyperparameter suggestions and,
            through ``CustomPruningCallback``, for intermediate reporting.

    Returns:
        The ``eval_loss`` value from ``trainer.evaluate()``.

    Raises:
        optuna.TrialPruned: propagated out of ``trainer.train()`` when the
            pruning callback stops the trial early.
    """
    # Hyperparameter suggestions
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [1, 2, 4, 8])
    per_device_eval_batch_size = trial.suggest_categorical('per_device_eval_batch_size', [1, 2, 4, 8])

    # A previous trial that was pruned or failed left via an exception, so its
    # model may only have become collectable after that exception was handled.
    # Clean up before loading a new model.
    _release_gpu_memory()

    # Load dataset and tokenizer
    # NOTE(review): trust_remote_code=True was dropped on the assumption that
    # this dataset needs no custom loading script -- confirm.
    dataset = load_dataset("databricks/databricks-dolly-15k")
    dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)
    model_name = "mistralai/Mistral-7B-v0.3"
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        add_eos_token=True,
        use_fast=True,
        padding_side='left'
    )
    # Reuse EOS as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    def format_conversation(examples):
        """Join context, instruction and response per row and tokenize the batch."""
        conversations = [
            f"{context} {instruction} {response}"
            for context, instruction, response in zip(
                examples['context'], examples['instruction'], examples['response']
            )
        ]
        # Plain Python lists are enough here; set_format() below takes care of
        # handing torch tensors to the trainer.
        return tokenizer(conversations, truncation=True, max_length=512, padding="max_length")

    # Tokenize the dataset and drop the original text fields in the same pass
    tokenized_dataset = dataset.map(
        format_conversation,
        batched=True,
        remove_columns=['instruction', 'response', 'context', 'category'],
    )

    # Ensure the format is PyTorch-friendly
    tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

    # LoRA configuration
    peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=4,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=[
            'k_proj', 'q_proj', 'v_proj', 'o_proj',
            'gate_proj', 'down_proj', 'up_proj'
        ]
    )

    # Training arguments
    # NOTE(review): eval_steps=10 runs a full pass over the eval split every
    # 10 optimizer steps; consider a larger interval or a smaller eval subset.
    output_dir = f"./v2_mistral7b_results_trial_{trial.number}"
    training_arguments = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="steps",
        do_eval=True,
        optim="paged_adamw_8bit",
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=2,
        per_device_eval_batch_size=per_device_eval_batch_size,
        log_level="debug",
        logging_steps=10,
        learning_rate=learning_rate,
        eval_steps=10,
        max_steps=200,
        save_steps=25,
        warmup_steps=10,
        lr_scheduler_type="linear",
    )

    # Bound before the try block so the finally clause can always delete them.
    model = None
    trainer = None
    try:
        # Model preparation: 4-bit NF4 weights with bfloat16 compute
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto"
        )
        model = prepare_model_for_kbit_training(model)
        model.config.pad_token_id = tokenizer.pad_token_id

        # Initialize trainer with pruning
        trainer = SFTTrainer(
            model=model,
            train_dataset=tokenized_dataset['train'],
            eval_dataset=tokenized_dataset['test'],
            peft_config=peft_config,
            tokenizer=tokenizer,
            args=training_arguments,
            # The dataset is already tokenized above.
            dataset_kwargs={'skip_prepare_dataset': True},
            callbacks=[CustomPruningCallback(trial, "eval_loss")]
        )

        # Train and evaluate
        trainer.train()
        eval_results = trainer.evaluate()
        return eval_results['eval_loss']
    finally:
        # The recorded run failed loading the next trial's model ("Some
        # modules are dispatched on the CPU or the disk") right after trial 0
        # finished, presumably because that trial's weights were still on the
        # GPU. Drop our references and release the memory before returning.
        del trainer, model
        _release_gpu_memory()

# Run the Optuna study
if __name__ == "__main__":
    # The objective returns the evaluation loss, so lower is better.
    study = optuna.create_study(direction='minimize')
    # gc_after_trial=True makes Optuna run a garbage collection after every
    # trial. Each trial loads a fresh model, and the recorded run failed on
    # the model load that followed trial 0.
    study.optimize(objective, n_trials=10, gc_after_trial=True)

    # Output the best hyperparameters
    print("Best hyperparameters: ", study.best_params)
    print("Best evaluation loss: ", study.best_value)


[I 2024-11-17 19:27:39,832] A new study created in memory with name: no-name-45b77805-1773-4a7f-a687-69350b8b91c8


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
max_steps is given, it will override any value given in num_train_epochs
Currently training with a batch size of: 8
***** Running training *****
  Num examples = 12,008
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 200
  Number of trainable parameters = 20,971,520
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
10,1.7708,1.89062
20,1.6653,1.807968
30,1.572,1.758353
40,1.5195,1.72657
50,1.5939,1.704474
60,1.5613,1.6898
70,1.5547,1.680195
80,1.5357,1.669519
90,1.5627,1.659294
100,1.6161,1.65237



***** Running Evaluation *****
  Num examples = 3003
  Batch size = 1

***** Running Evaluation *****
  Num examples = 3003
  Batch size = 1
Saving model checkpoint to ./v2_mistral7b_results_trial_0/checkpoint-25
loading configuration file config.json from cache at /home/usuario/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.3/snapshots/d8cadc02ac76bd617a919d50b092e59d2d110aff/config.json
Model config MistralConfig {
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.45.2",
 

[I 2024-11-18 05:55:45,981] Trial 0 finished with value: 1.6238725185394287 and parameters: {'learning_rate': 1.955043800235954e-05, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 1, 'lora_r': 8, 'lora_alpha': 128, 'lora_dropout': 0.19461913403003125}. Best is trial 0 with value: 1.6238725185394287.
loading file tokenizer.model from cache at /home/usuario/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.3/snapshots/d8cadc02ac76bd617a919d50b092e59d2d110aff/tokenizer.model
loading file tokenizer.json from cache at /home/usuario/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.3/snapshots/d8cadc02ac76bd617a919d50b092e59d2d110aff/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/usuario/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.3/snapshots/d8cadc02ac76bd617a919d50b092e59d2d110aff/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/usuario/.cache/hu

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 