In [None]:
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
!pip install optuna
!pip install optuna-integration
!pip install triton


In [None]:
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
import optuna
import pandas as pd
import datasets
import time
import gc

In [67]:
def clear_memory():
    gc.collect()
    torch.cuda.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        torch.cuda.ipc_collect()
    try:
        del model
    except:
        pass
    try:
        del tokenizer
    except:
        pass
    try:
        del trainer
    except:
        pass
clear_memory()

In [None]:
max_seq_length = 512 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "google/gemma-2-9b-it",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = "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"
)

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass



In [68]:
tokenizer = []
model = []

In [None]:
from datasets import load_dataset
dataset = load_dataset("twright8/gem_training")
dataset['train'] = dataset['train'].map(formatting_prompts_func, batched = True,)
dataset['test'] = dataset['test'].map(formatting_prompts_func, batched = True,)

clear_memory()

In [None]:
print(dataset['train'])

In [65]:
from transformers import EarlyStoppingCallback
import math
import wandb
wandb.init(mode="disabled")

def make_modelz(lora_alpha,lora_dropout,r):
    clear_memory()
    max_seq_length = 512 # Choose any! We auto support RoPE Scaling internally!
    dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "google/gemma-2-9b-it",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        token = "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"
    )
    model = FastLanguageModel.get_peft_model(
    model,
    r = r, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = lora_alpha,
    lora_dropout = lora_dropout, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    )
    return model, tokenizer


def objective(trial):
    early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=5,early_stopping_threshold=0.01)
    
    clear_memory()

    lora_alpha = trial.suggest_int("lora_alpha", 16, 256)
    lora_dropout = trial.suggest_float("lora_dropout", 0.15, 0.5)
    r = trial.suggest_categorical("r", [0.25, 0.5, 1, 2])
    r = float(r) * lora_alpha
    r = int(math.ceil(r))
    model, tokenizer = make_modelz(lora_alpha,lora_dropout,r)
    weight_decay= trial.suggest_float("weight_decay", 1e-4, 1e-1, log=True)
    warmup_steps= trial.suggest_int("warmup_steps", 2, 10)
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-3, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 2, 8)
    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = dataset['train'],
        eval_dataset = dataset['test'],
        dataset_text_field = "text",
        max_seq_length = max_seq_length,
        dataset_num_proc = 2,
        packing = False, # Can make training 5x faster for short sequences.
        callbacks=[early_stopping_callback],

        args = TrainingArguments(
            per_device_train_batch_size = 2,
            gradient_accumulation_steps = 4,
            warmup_steps = warmup_steps,
            learning_rate = learning_rate,
            fp16 = not is_bfloat16_supported(),
            eval_strategy = "steps",#
            save_strategy = "steps",
            save_steps=50,
            eval_steps=50,
            load_best_model_at_end = True,
            bf16 = is_bfloat16_supported(),
            logging_steps = 1,
            optim = "adamw_8bit",
            weight_decay = weight_decay,
            lr_scheduler_type = "linear",
            seed = 3407,
            output_dir = "outputs",
            gradient_checkpointing=True,
        ),
    )
    try:
        del model
    except:
        pass
    trainer_stats = trainer.train()
    time.sleep(20)

    eval_result = trainer.evaluate()
    print(eval_result)
    clear_memory()
    time.sleep(20)
    print("sleepy")
    return eval_result['eval_loss']

In [66]:
from optuna.pruners import HyperbandPruner

pruner = HyperbandPruner()
study = optuna.create_study(direction='minimize', pruner=pruner)

study.optimize(objective, n_trials=10, show_progress_bar=True, gc_after_trial=True)

# Get the best hyperparameters
best_hyperparameters = study.best_params
print(best_hyperparameters)

# Plot parameter importances
plot_param_importances(study)

[I 2024-07-16 16:24:22,354] A new study created in memory with name: no-name-972c7b93-8d98-446a-bcbb-3e1acddda600


  0%|          | 0/10 [00:00<?, ?it/s]

==((====))==  Unsloth: Fast Gemma2 patching release 2024.7
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.1.2. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
[W 2024-07-16 16:24:25,720] Trial 0 failed with parameters: {'lora_alpha': 20, 'lora_dropout': 0.48622698773072837, 'r': 1} because of the following error: ValueError('Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. ').
Traceback (most recent call last):
  File "/opt/

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 