<h2><b>Notebook MultiGPU WeniGPT</b></h2>


<h3><b>Plano de treinamento para WeniGPT</b></h3>
---

Criar padrão para Wandb (salvar template para experimentos) <br>
Como rodar com múltiplas GPUs com bom tamanho de batch;<br>
3.1 Escolher o dataset;<br>
3.2 Utilizar hiperparâmetros que já utilizamos;<br>
3.3 Critério de limite será pelo valor do max_steps ao invés de diminuir a quantidade de dados;<br>
3.4 Utilizar auto_batch e outros parâmetros de GPU; gradient_accumulation<br>
3.5 Definir valor do max_steps para modelo controle e variações com parâmetros;<br>
Primeira rodada de experimentos: o que queremos modificar primeiro e por quê? Qual nosso objetivo? É definir qual hiperparâmetro?<br>
4.1 Conjunto pequeno de hiperparâmetros para testar:<br>
- otimizador: AdamW, Adafactor<br>
- learning_rate: 2e-4, 2e-5<br>
- scheduler: cosine, constant_with_warmup<br>
- Lora-rank: 16, 8/32<br>
- top_k: 50, 10/<br>
- top_p: 1, 0.5<br>
- sliding_window: 4096, 2048<br>
- temperature: 1, 0.1

---

# 2) Importing Dependencies

In [None]:
#@title imports

from transformers import TrainingArguments, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, GPTQConfig, DataCollatorWithPadding
from peft import LoraConfig, get_peft_model, PeftConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training
from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub.utils import enable_progress_bars
from accelerate import Accelerator
from huggingface_hub import HfApi
from trl import SFTTrainer
from typing import Dict
from typing import Any
import huggingface_hub
import pandas as pd
import transformers
import accelerate
import deepspeed
import evaluate
import datetime
import locale
import wandb
import torch
import time
import os
import gc

# Replace locale.getpreferredencoding so that it always reports UTF-8.
# NOTE(review): presumably a workaround for notebook shell escapes failing
# under a non-UTF-8 locale - confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"
# NOTE(review): this only sets an attribute on the torch.utils.checkpoint
# module; nothing in this notebook reads it back - confirm it has any effect.
torch.utils.checkpoint.use_reentrant=True

In [None]:
#@title utils | Clear cache + execution time

class ClearCache():
    """
    Context manager that empties the PyTorch CUDA cache around a block of code.

    ``torch.cuda.empty_cache()`` is called once when the ``with`` block is
    entered and once more when it is left, including when the block raises.

    Example:
    ```
    with ClearCache():
        # GPU-heavy code goes here
    # the CUDA cache has been emptied again at this point
    ```

    """

    def __enter__(self):
        """
        Empty the CUDA cache and return the manager itself.

        Returning ``self`` means ``with ClearCache() as cache:`` binds the
        manager instead of ``None``.

        :return: this ``ClearCache`` instance
        """
        torch.cuda.empty_cache()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Empty the CUDA cache again when the ``with`` block ends.

        :param exc_type: Exception type, if the block raised
        :param exc_val: Exception instance, if the block raised
        :param exc_tb: Traceback, if the block raised
        :return: ``False`` so that an exception raised in the block propagates

        """
        torch.cuda.empty_cache()
        return False


class EasyDict(dict):
    """Dictionary whose items can also be read, written and deleted as attributes.

    A missing key surfaces as ``AttributeError`` for every attribute
    operation, so ``getattr(d, name, default)``, ``hasattr(d, name)`` and
    ``del d.name`` behave the way they do on ordinary objects.
    """

    def __getattr__(self, name: str) -> Any:
        # Only consulted after regular attribute lookup has failed, so dict
        # methods such as ``keys`` or ``items`` are never shadowed by items.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    def __setattr__(self, name: str, value: Any) -> None:
        self[name] = value

    def __delattr__(self, name: str) -> None:
        # Translate the dict error so callers see the attribute protocol's
        # exception type rather than a KeyError.
        try:
            del self[name]
        except KeyError:
            raise AttributeError(name) from None


def execution_time(func):
    """
    Decorator that prints how long each call to ``func`` took.

    The wrapped function keeps its own ``__name__``, ``__doc__`` and other
    metadata, and its return value is passed through unchanged. Nothing is
    printed when ``func`` raises; the exception propagates as-is.

    Args:
        func (callable): The function to time.
    Returns:
        callable: A wrapper that times the call, prints the elapsed seconds
        rounded to three decimals, and returns the result of ``func``.

    Example usage:
    @execution_time
    def my_function():
        # Your code here

    """
    # Imported locally so this block does not depend on a file-level import.
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is not affected by
        # system clock adjustments while the call is running.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        print(f"Execution time for {func.__name__}: {round(elapsed, 3)} seconds")
        return result

    return wrapper

def clear_memory_cache():
    """
    Collect garbage and, when CUDA is available, release cached GPU memory.

    Steps, in order:
    1. ``gc.collect()`` runs first so that unreachable objects are destroyed
       before the cache is emptied.
    2. ``torch.cuda.reset_peak_memory_stats()`` resets the peak-memory counters.
    3. ``torch.cuda.empty_cache()`` empties the CUDA caching allocator.
    4. The number of objects found by the garbage collector is printed.

    Steps 2 and 3 are skipped when ``torch.cuda.is_available()`` is false, so
    the function can also be called on a machine without a GPU.

    Example usage:
    ```
    clear_memory_cache()
    ```
    """
    collected = gc.collect()
    if torch.cuda.is_available():
        # The former reset_max_memory_allocated() call is dropped here:
        # NOTE(review): PyTorch documents it as a deprecated wrapper around
        # reset_peak_memory_stats() - confirm against the installed version.
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.empty_cache()
    print(f"Cleared memory: {collected}")



In [None]:
#@title cuda infos
# `!export VAR=...` runs in a throw-away subshell and never reaches this
# kernel, so the variables are set on os.environ instead. They are set before
# the CUDA queries below.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

print("Cuda is available: ", torch.cuda.is_available())
# get_device_capability() is only queried when a CUDA device is available.
if torch.cuda.is_available():
    print("Cuda device capability: ", torch.cuda.get_device_capability())
#print(f"Cuda visible devices: ", os.environ["CUDA_VISIBLE_DEVICES"])

#device_index = 0
#device = torch.device(f'cuda:{device_index}' if torch.cuda.is_available() else 'cpu')

In [None]:
import os

# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid - confirm
# it should stay enabled for real training runs.
os.environ.update(CUDA_LAUNCH_BLOCKING='1')

In [None]:
#@title nvidia-smi
# Notebook shell escape: runs the `nvidia-smi` command and shows its output in the cell.
!nvidia-smi

In [None]:
#@title Training parameters

# Single source of configuration for the notebook: model/dataset identifiers,
# LoRA and quantization settings, trainer options and credentials.
# It is wrapped in EasyDict at the end so entries can be read as attributes.
training_arguments = {
    'model_base_repository_id': "HuggingFaceH4/zephyr-7b-beta",
    'hub_model_id': "Weni/WeniGPT-2.1.1-zephyr-7b-beta-experiments-LLM-Base-1.0.1",
    'dataset_id': "Weni/LLM-Base-1.0.1-negative_reduction-context_expantion",
    'folder_name': "zephyr7bbeta",
    'description': 'experiments',

    # Dataset
    'dataset_text_field': "prompt",
    'context_field': "",
    'instruction_field': "instruction",
    'target_field': "chosen_response",
    'train_dataset':"train",
    'eval_dataset':"test",

    # HuggingFace
    # Read from the environment; a missing HUB_TOKEN raises KeyError here.
    'hub_token': os.environ['HUB_TOKEN'],
    'push_to_hub': True,
    'hub_strategy': 'all_checkpoints',

    # Wandb
    'report_to': 'wandb',
    # Read from the environment; a missing WANDB_TOKEN raises KeyError here.
    'wandb_token': os.environ['WANDB_TOKEN'],

    # Lora
    'bits': 4 ,
    'use_exllama': True,
    'device_map': "auto",
    'use_cache': False,
    'lora_r': 16,
    'lora_alpha': 16,
    'lora_dropout': 0.05,
    'bias': "none",
    'target_modules': ["q_proj", "v_proj"],
    'task_type': "CAUSAL_LM",

    # Bits and bytes
    'load_in_4bit':True,
    # NOTE(review): 'use_4bit' does not look like a BitsAndBytesConfig option - confirm it is needed.
    'use_4bit':True,
    'bnb_4bit_use_double_quant':True,
    'bnb_4bit_quant_type':"nf4",
    'bnb_4bit_compute_dtype': torch.float16,

    # Training Args
    # NOTE(review): 8096 looks like a typo for 8192 - confirm the intended context length.
    'max_seq_length':  8096,
    'num_train_epochs': 5,
    'per_device_train_batch_size':  1,
    'per_device_eval_batch_size': 1,
    'gradient_accumulation_steps': 2,
    'gradient_checkpointing': True,
    'optimizer': "paged_adamw_32bit",
    'learning_rate':  2e-4,
    'logging_steps': 50,
    # Reassigned later in the notebook before training starts.
    'max_steps':10,
    'fp16': True,
    'packing': True,
    'lr_scheduler_type': "cosine",
    'pretraining_tp': 1,
    'mlm':False,
    'save_strategy': "epoch",
    'evaluation_strategy': "epoch",
    'load_best_model_at_end': True,
    'metric_for_best_model': 'eval_loss',
    'greater_is_better': False,
    'prediction_loss_only':True,
    'save_safetensors': True,
    'max_grad_norm': 0.3,
    'warmup_ratio': 0.03,
    'weight_decay': 0.01,
    'neftune_noise_alpha':5,
    'torch_dtype': torch.float16,
    'save_total_limit': 5,

    # Tokenizer
    'padding': True,
    'padding_side': 'left',
    'add_eos_token':True,
    'trust_remote_code': True,
    'use_auth_token':True,

    # Misc
    'disable_tqdm': False,
    'include_inputs_for_metrics':True,

    # GPU config

    # NOTE(review): no code in this notebook reads the 'deepspeed' entry - confirm where it is consumed.
    'deepspeed': {
        'gradient_accumulation_steps': 1,
        'gradient_clipping': 1.0,
        'offload_optimizer_device': 'cpu',
        'offload_param_device': 'cpu',
        'zero3_init_flag': True,
        'zero3_save_16bit_model': False,
        'zero_stage': 3
    },
}

# Only the top level becomes an EasyDict; the nested 'deepspeed' value stays a plain dict.
training_arguments = EasyDict(training_arguments)

In [None]:
@execution_time
def login_hugging_face_hub(token):
    """
    Authenticate this session against the Hugging Face Hub.

    The token is handed to ``huggingface_hub.login`` directly instead of being
    interpolated into a ``huggingface-cli`` shell command. That keeps the
    secret off the command line and lets the function run outside IPython.

    Parameters:
        token (str): Hugging Face access token.

    Example:
        login_hugging_face_hub("your_token_here")
    """
    huggingface_hub.login(token=token)

login_hugging_face_hub(training_arguments.hub_token)

# 3) Load Dataset

In [None]:
def format_instruction(sample):
    """
    Write the full training prompt into ``sample['prompt']`` and return the row.

    The prompt is the fixed Portuguese instruction text followed by the row's
    ``context``, ``question`` and ``resposta`` values, wrapped in ``<s>`` and
    ``</s>``. The row is modified in place; the same object is returned.
    """
    context = sample['context']
    question = sample['question']
    answer = sample['resposta']

    pieces = (
        "<s>Você é um médico tratando um paciente com amnésia. ",
        "Para responder as perguntas do paciente, você irá ler um texto anteriormente para se contextualizar. ",
        "Se você trouxer informações desconhecidas, fora do texto lido, poderá deixar o paciente confuso. ",
        "Se o paciente fizer uma questão sobre informações não presentes no texto, ",
        "você precisa responder 'Desculpe, não possuo essa informação', ",
        "pois se tentar responder, pode trazer informações que não ajudarão o paciente recuperar sua memória.",
        f"\n\nTEXTO: {context}.",
        f"\n\nPERGUNTA: {question}.",
        "\nLembre, se não estiver no texto ou não souber a resposta, ",
        "responda especificamente 'Desculpe, não possuo essa informação'. ",
        "Precisamos ajudar o paciente.",
        f"\n\nRESPOSTA: {answer}</s>",
    )
    sample['prompt'] = "".join(pieces)
    return sample

@execution_time
def load_dataset_and_split(dataset_id, column_target_name, sample=None, seed=55, test_size=0.1):
    """
    Load a dataset, shuffle its ``train`` split, optionally subsample it, split
    it into train/test and add the ``prompt`` column to both parts.

    Parameters:
    - dataset_id (str): The ID of the dataset to load.
    - column_target_name (str): Name of the answer column; it is removed after
      the prompt has been built, together with "question", "context", "id"
      and "correct_ans".
    - sample (int | float, optional): Subsample of the shuffled ``train`` split.
      A float up to 1 is a fraction of the rows; any other number is a row
      count, capped at the number of rows available. ``None`` keeps everything.
    - seed (int, optional): Seed for both the shuffle and the train/test split,
      so repeated runs produce the same partitions (default is 55).
    - test_size (float, optional): The proportion of rows to put in the test split (default is 0.1).

    Returns:
    - The object returned by ``train_test_split``, indexable by 'train' and
      'test', with both splits mapped through ``format_instruction``.

    Raises:
    - ValueError: if ``sample`` is zero or negative.
    """
    train_split = load_dataset(dataset_id)['train'].shuffle(seed=seed)

    if sample is not None:
        if sample <= 0:
            raise ValueError("sample must be a positive fraction or row count")
        # Size the subsample from the rows of the train split; len() of the
        # loaded dataset object was used before, which is not the row count.
        total_rows = len(train_split)
        if isinstance(sample, float) and sample <= 1:
            sample_size = int(total_rows * sample)
        else:
            sample_size = min(int(sample), total_rows)
        train_split = train_split.select(range(sample_size))

    dataset = train_split.train_test_split(test_size=test_size, seed=seed)

    drop_columns = [column_target_name, "question", "context", "id", "correct_ans"]
    for split_name in ('train', 'test'):
        dataset[split_name] = dataset[split_name].map(
            format_instruction, num_proc=8, remove_columns=drop_columns
        )

    return dataset

In [None]:
# Load the configured dataset; 'resposta' is passed as the answer column that
# is dropped once the prompt column has been built.
dataset = load_dataset_and_split(training_arguments.dataset_id, 'resposta', test_size=0.1)
# Bare expression: shows the dataset as the cell output.
dataset

In [None]:
num_gpus = 1
# Training rows divided by (GPUs * per-device batch size * gradient accumulation steps).
training_arguments.max_steps = int(len(dataset['train']) / (num_gpus * training_arguments.per_device_train_batch_size * training_arguments.gradient_accumulation_steps))
# NOTE(review): this immediately overwrites the value computed on the previous
# line - presumably a smoke-test setting; confirm before a full run.
training_arguments.max_steps = 10

In [None]:
#@title checking the dataset output
# Index the row first so only one example is read.
# NOTE(review): the previous column-first form presumably materialized the
# whole 'prompt' column before indexing - confirm for the installed datasets version.
dataset['train'][0]['prompt']

# 4) Functions dedicated to preprocessing, training, and model storage

In [None]:
# Run the cache-clearing helper ten times in a row.
for _ in range(10):
    clear_memory_cache()

In [None]:
#@title Functions

@execution_time
def load_model_and_tokenizer(model_base_repository_id, quantized=True, quantization_type=None, dataset=None):
    """
    Load the base model and its tokenizer using the global ``training_arguments``.

    Parameters:
    - model_base_repository_id (str): Hub ID of the base model.
    - quantized (bool): Whether to load the model with a quantization config.
    - quantization_type (str): "bits_and_bytes" or "gptq"; only read when
      ``quantized`` is true.
    - dataset: Required for "gptq" only. ``dataset['train']['prompt']`` is
      passed to ``GPTQConfig`` as its ``dataset`` argument.

    Returns:
    - Tuple[AutoModelForCausalLM, AutoTokenizer]: Model and tokenizer.

    Raises:
    - ValueError: for an unsupported ``quantization_type``, or when "gptq" is
      requested without ``dataset``.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        model_base_repository_id,
        padding=training_arguments.padding,
        # Was misspelled `max_lenght`.
        # NOTE(review): `model_max_length` is taken to be the intended
        # tokenizer option - confirm.
        model_max_length=training_arguments.max_seq_length,
        trust_remote_code=training_arguments.trust_remote_code,
    )

    tokenizer.add_eos_token = training_arguments.add_eos_token
    # The EOS token is reused as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = training_arguments.padding_side

    # Options shared by every loading path.
    model_kwargs = {
        'use_cache': training_arguments.use_cache,
        'device_map': training_arguments.device_map,
        'torch_dtype': training_arguments.torch_dtype,
    }

    if quantized:
        if quantization_type == "bits_and_bytes":
            # `use_4bit` is no longer forwarded.
            # NOTE(review): it does not appear to be a BitsAndBytesConfig
            # field - confirm against the installed transformers version.
            model_kwargs['quantization_config'] = BitsAndBytesConfig(
                load_in_4bit=training_arguments.load_in_4bit,
                bnb_4bit_use_double_quant=training_arguments.bnb_4bit_use_double_quant,
                bnb_4bit_quant_type=training_arguments.bnb_4bit_quant_type,
                bnb_4bit_compute_dtype=training_arguments.bnb_4bit_compute_dtype,
            )
        elif quantization_type == "gptq":
            if dataset is None:
                raise ValueError("GPTQ quantization requires the `dataset` argument")
            # Uses the `dataset` parameter; the code previously referenced an
            # undefined name (`dataset_for_gptq`).
            model_kwargs['quantization_config'] = GPTQConfig(
                bits=training_arguments.bits,
                dataset=dataset['train']['prompt'],
                use_exllama=training_arguments.use_exllama,
                tokenizer=tokenizer,
            )
        else:
            raise ValueError("Tipo de quantização não suportado")

    model = AutoModelForCausalLM.from_pretrained(model_base_repository_id, **model_kwargs)

    return model, tokenizer

@execution_time
def configure_and_prepare_model(model):
    """
    Prepare a loaded model for k-bit training and wrap it with a LoRA adapter.

    ``use_cache`` and ``pretraining_tp`` are copied from the global
    ``training_arguments`` onto ``model.config``, gradient checkpointing is
    enabled, and the model goes through ``prepare_model_for_kbit_training``
    before ``get_peft_model`` applies the LoRA configuration.

    Returns:
        tuple: (PEFT-wrapped model, the ``LoraConfig`` that was applied).
    """
    for option in ("use_cache", "pretraining_tp"):
        setattr(model.config, option, training_arguments[option])
    model.gradient_checkpointing_enable()
    prepared = prepare_model_for_kbit_training(model)

    lora_options = {
        "r": training_arguments.lora_r,
        "lora_alpha": training_arguments.lora_alpha,
        "lora_dropout": training_arguments.lora_dropout,
        "bias": training_arguments.bias,
        "task_type": training_arguments.task_type,
        "target_modules": training_arguments.target_modules,
    }
    peft_config = LoraConfig(**lora_options)

    return get_peft_model(prepared, peft_config), peft_config

def configure_and_prepare_model_for_BitsBytes(model, peft_config):
    """
    Wrap ``model`` with an existing ``peft_config``.

    ``use_cache`` and ``pretraining_tp`` are copied from the global
    ``training_arguments`` onto ``model.config`` and gradient checkpointing is
    enabled before ``get_peft_model`` is applied. Unlike
    ``configure_and_prepare_model`` this does not call
    ``prepare_model_for_kbit_training``.

    Returns:
        The PEFT-wrapped model.
    """
    for option in ("use_cache", "pretraining_tp"):
        setattr(model.config, option, training_arguments[option])
    model.gradient_checkpointing_enable()
    return get_peft_model(model, peft_config)

@execution_time
def build_model_name(hub_model_id,  dataset, num_train_epochs, per_device_train_batch_size,
                     folder_name=None, description=None):
    """
    Build the run name and the local and Drive paths used to store the model.

    The name has the form
    ``<dd-mm-yy>-<repo id with '/' replaced by '-'>_<description>-<rows>_epochs-<n>_batch_<b>``.

    Args:
        hub_model_id (str): Repository ID embedded in the name.
        dataset: Dataset whose size is embedded in the name. For a dict of
            splits the row counts of all splits are added up; otherwise
            ``len(dataset)`` is used.
        num_train_epochs (int): Number of training epochs.
        per_device_train_batch_size (int): Per-device batch size.
        folder_name (str, optional): Local parent directory. Defaults to
            ``training_arguments.folder_name``.
        description (str, optional): Free-text tag. Defaults to
            ``training_arguments.description``.

    Returns:
        tuple: ``(dir_model_name, drive_model_name)``, the local path and the
        shared-drive path for this run.
    """
    # Explicit parameters replace the reliance on module-level variables;
    # the configuration supplies the defaults.
    if folder_name is None:
        folder_name = training_arguments.folder_name
    if description is None:
        description = training_arguments.description

    # len() of a dict of splits is the number of splits, not the number of rows.
    if isinstance(dataset, dict):
        num_rows = sum(len(split) for split in dataset.values())
    else:
        num_rows = len(dataset)
    # Thousands separator rendered as '.', e.g. 12.345.
    dataset_size = "{:,.0f}".format(num_rows).replace(",", ".")

    today = datetime.date.today().strftime("%d-%m-%y")
    repo_slug = hub_model_id.replace('/', '-')
    model_name = (
        f"{today}-{repo_slug}_{description}-{dataset_size}"
        f"_epochs-{num_train_epochs}_batch_{per_device_train_batch_size}"
    )

    dir_model_name = './' + folder_name + '/' + model_name
    # Built from the run name directly; the earlier fixed-width slice
    # (`dir_model_name[11:]`) only removed the prefix for one folder-name length.
    drive_model_name = '/content/drive/Shareddrives/ModelosdeIA/Modelos/Zephyr/' + model_name

    return dir_model_name, drive_model_name

@execution_time
def login_hugging_face_hub(token):
    """
    Authenticate this session against the Hugging Face Hub.

    The token is handed to ``huggingface_hub.login`` directly instead of being
    interpolated into a ``huggingface-cli`` shell command. That keeps the
    secret off the command line and lets the function run outside IPython.

    Parameters:
        token (str): Hugging Face access token.

    Example:
        login_hugging_face_hub("your_token_here")
    """
    huggingface_hub.login(token=token)

@execution_time
def push_to_hub(model, tokenizer, huggingface_model_name):
    """
    Upload a model and then its tokenizer to one Hugging Face Hub repository.

    Best effort: any exception raised while pushing is printed and swallowed.
    If the model push fails, the tokenizer push is not attempted.

    Args:
        model: Object exposing ``push_to_hub``; pushed first.
        tokenizer: Object exposing ``push_to_hub``; pushed second.
        huggingface_model_name (str): Target repository name on the Hub.

    Returns:
        None

    Example Usage:
    >>> push_to_hub(model, tokenizer, 'my-awesome-model')
    """
    try:
        for artifact in (model, tokenizer):
            artifact.push_to_hub(
                huggingface_model_name,
                use_auth_token=training_arguments.use_auth_token,
            )
    except Exception as error:
        print("An error occurred:", error)

@execution_time
def create_huggingface_repository(repository_id, first_commit_message):
    """
    Create a private repository on Hugging Face.

    Args:
        repository_id (str): The ID of the repository to create.
        first_commit_message (str): Accepted for backward compatibility; it is
            not used by this function.

    Returns:
        None

    Example Usage:
    >>> create_huggingface_repository("my-repo", "Initial commit")
    """
    # A client is created here because no module-level `api` object is defined
    # in this notebook. `private=True` matches the documented contract; the
    # repository was previously created with the default visibility.
    HfApi().create_repo(repo_id=repository_id, private=True)

@execution_time
def train_model(model, tokenizer, dataset, peft_config,  dir_model_name, training_arguments):
    """
    Fine-tune ``model`` with ``SFTTrainer``, evaluate it and save the result.

    Args:
        model: Model to train.
        tokenizer: Tokenizer used by the data collator; also saved with the model.
        dataset: Mapping of split name to dataset. The splits named by
            ``training_arguments.train_dataset`` and
            ``training_arguments.eval_dataset`` are used, falling back to
            'train' and 'test' when those entries are missing.
        peft_config: PEFT configuration forwarded to ``SFTTrainer``.
        dir_model_name (str): Output directory; checkpoints go to
            ``<dir_model_name>/checkpoints/``.
        training_arguments: Dict-like configuration with attribute access.

    Returns:
        The value returned by ``trainer.evaluate()``. It used to be discarded.
    """
    # Honor the split names from the configuration instead of hard-coded keys.
    train_split = dataset[training_arguments.get('train_dataset', 'train')]
    eval_split = dataset[training_arguments.get('eval_dataset', 'test')]

    trainer_args = TrainingArguments(
        disable_tqdm=training_arguments['disable_tqdm'],
        num_train_epochs=training_arguments['num_train_epochs'],
        per_device_train_batch_size=training_arguments['per_device_train_batch_size'],
        per_device_eval_batch_size=training_arguments['per_device_eval_batch_size'],
        gradient_accumulation_steps=training_arguments['gradient_accumulation_steps'],
        gradient_checkpointing=training_arguments['gradient_checkpointing'],
        optim=training_arguments['optimizer'],
        learning_rate=training_arguments['learning_rate'],
        logging_steps=training_arguments['logging_steps'],
        max_steps=training_arguments['max_steps'],
        fp16=training_arguments['fp16'],
        lr_scheduler_type=training_arguments['lr_scheduler_type'],
        save_strategy=training_arguments['save_strategy'],
        evaluation_strategy=training_arguments['evaluation_strategy'],
        load_best_model_at_end=training_arguments['load_best_model_at_end'],
        metric_for_best_model=training_arguments['metric_for_best_model'],
        greater_is_better=training_arguments['greater_is_better'],
        prediction_loss_only=training_arguments['prediction_loss_only'],
        save_safetensors=training_arguments['save_safetensors'],
        save_total_limit=training_arguments['save_total_limit'],
        report_to=training_arguments['report_to'],
        max_grad_norm=training_arguments['max_grad_norm'],
        warmup_ratio=training_arguments['warmup_ratio'],
        weight_decay=training_arguments['weight_decay'],
        hub_model_id=training_arguments['hub_model_id'],
        push_to_hub=training_arguments['push_to_hub'],
        hub_strategy=training_arguments['hub_strategy'],
        hub_token=training_arguments['hub_token'],
        output_dir=f"{dir_model_name}/checkpoints/",
    )

    trainer = SFTTrainer(
        model=model,
        max_seq_length=training_arguments.max_seq_length,
        neftune_noise_alpha=training_arguments.neftune_noise_alpha,
        train_dataset=train_split,
        eval_dataset=eval_split,
        dataset_text_field=training_arguments.dataset_text_field,
        peft_config=peft_config,
        packing=training_arguments.packing,
        args=trainer_args,
        data_collator=transformers.DataCollatorForLanguageModeling(
            tokenizer,
            mlm=training_arguments.mlm),
    )
    trainer.train()
    eval_results = trainer.evaluate()
    trainer.save_model(dir_model_name)
    tokenizer.save_pretrained(dir_model_name)
    clear_memory_cache()
    return eval_results

@execution_time
def main(hub_model_id, model, tokenizer, data, token,
         dir_model_name, peft_config, training_arguments):
    """
    Run the full pipeline: log in, train, push the adapter, merge and push again.

    Args:
        hub_model_id (str): Target Hub repository for both pushes. When it is
            empty or None, ``training_arguments.hub_model_id`` is used.
        model: Model to train.
        tokenizer: Tokenizer saved and pushed with the model.
        data: Dataset splits passed to ``train_model``.
        token (str): Hugging Face access token.
        dir_model_name (str): Directory where the trained model is saved and
            from which it is reloaded for merging.
        peft_config: PEFT configuration passed to ``train_model``.
        training_arguments: Dict-like configuration with attribute access.
    """
    # The parameter used to be ignored in favor of the configuration value.
    target_repo = hub_model_id or training_arguments.hub_model_id

    for _ in range(3):
        clear_memory_cache()

    login_hugging_face_hub(token)

    train_model(model, tokenizer, data, peft_config,  dir_model_name,  training_arguments)

    push_to_hub(model, tokenizer, target_repo)

    # Reload the saved adapter in half precision and merge it.
    reloaded = AutoPeftModelForCausalLM.from_pretrained(
        dir_model_name,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
    )
    merged_model = reloaded.merge_and_unload()

    # NOTE(review): the merged model is written to the same directory and
    # pushed to the same repository as the adapter - confirm that is intended.
    merged_model.save_pretrained(dir_model_name)
    tokenizer.save_pretrained(dir_model_name)

    push_to_hub(merged_model, tokenizer, target_repo)



In [None]:
#@title get dir name to save model

# Copy the naming inputs from the configuration into notebook-level variables.
# NOTE(review): other cells and functions may read these names from the
# notebook namespace - check before renaming or removing any of them.
folder_name = training_arguments.folder_name
model_base_repository_id = training_arguments.model_base_repository_id
description = training_arguments.description
epochs = training_arguments.num_train_epochs
per_device_train_batch_size = training_arguments.per_device_train_batch_size
# Self-assignment: leaves `dataset` unchanged.
dataset = dataset

# The base model repository ID is passed as the first argument of build_model_name.
dir_model_name, drive_model_name = build_model_name(model_base_repository_id, dataset, epochs, per_device_train_batch_size)
# The braces hold a tuple, so both paths are printed as a tuple.
print(f"model_complete_name: {drive_model_name, dir_model_name}")


In [None]:
#@title login huggingface hub

# Authenticate with the token from the configuration.
login_hugging_face_hub(training_arguments.hub_token)

# Run the cache-clearing helper five times in a row.
for _ in range(5):
    clear_memory_cache()
  

# 9) Hyperparameters Search

- definir limites de hiperparâmetros
- criar lista de espaço
- iterar em lista de espaços
- atualizar hiperparâmetros com base no espaço
- inicializar o wandb para os experimentos
- inicializar o otimizador com base nos hiperparâmetros atualizados
- inicializar o scheduler com base nos hiperparâmetros atualizados
- atualizar variáveis tipo dir name, etc
- chamar função de treino nova
- model_config
- fechar wandb

In [None]:
def train_model(model, tokenizer, dataset, peft_config,  dir_model_name, optimizer, scheduler, training_arguments):
    """
    Fine-tune ``model`` with ``SFTTrainer`` using a caller-supplied optimizer
    and learning-rate scheduler.

    Args:
        model: Model to train.
        tokenizer: Tokenizer used by the data collator.
        dataset: Mapping of split name to dataset. The splits named by
            ``training_arguments.train_dataset`` and
            ``training_arguments.eval_dataset`` are used, falling back to
            'train' and 'test' when those entries are missing.
        peft_config: PEFT configuration forwarded to ``SFTTrainer``.
        dir_model_name (str): Output directory; checkpoints go to
            ``<dir_model_name>/checkpoints/``.
        optimizer: Optimizer instance passed to the trainer.
        scheduler: Scheduler instance passed to the trainer.
        training_arguments: Dict-like configuration with attribute access.

    Returns:
        None
    """
    # Evaluate on the configured split. The notebook sets `eval_dataset` to
    # 'eval' before the sweep; the previous hard-coded 'test' key made
    # evaluation run on the rows that are kept for inference.
    train_split = dataset[training_arguments.get('train_dataset', 'train')]
    eval_split = dataset[training_arguments.get('eval_dataset', 'test')]

    trainer_args = TrainingArguments(
        disable_tqdm=training_arguments['disable_tqdm'],
        num_train_epochs=training_arguments['num_train_epochs'],
        per_device_train_batch_size=training_arguments['per_device_train_batch_size'],
        per_device_eval_batch_size=training_arguments['per_device_eval_batch_size'],
        gradient_accumulation_steps=training_arguments['gradient_accumulation_steps'],
        gradient_checkpointing=training_arguments['gradient_checkpointing'],
        max_steps=training_arguments['max_steps'],
        fp16=training_arguments['fp16'],
        evaluation_strategy="steps",
        load_best_model_at_end=training_arguments['load_best_model_at_end'],
        metric_for_best_model=training_arguments['metric_for_best_model'],
        greater_is_better=training_arguments['greater_is_better'],
        prediction_loss_only=training_arguments['prediction_loss_only'],
        save_safetensors=training_arguments['save_safetensors'],
        save_total_limit=training_arguments['save_total_limit'],
        report_to=training_arguments['report_to'],
        max_grad_norm=training_arguments['max_grad_norm'],
        warmup_ratio=training_arguments['warmup_ratio'],
        weight_decay=training_arguments['weight_decay'],
        hub_model_id=training_arguments['hub_model_id'],
        push_to_hub=training_arguments['push_to_hub'],
        hub_strategy=training_arguments['hub_strategy'],
        hub_token=training_arguments['hub_token'],
        output_dir=f"{dir_model_name}/checkpoints/",
        include_inputs_for_metrics=True,
        # Save and log every 50 steps.
        save_strategy="steps",
        save_steps=50,
        logging_steps=50,
    )

    trainer = SFTTrainer(
        model=model,
        max_seq_length=training_arguments.max_seq_length,
        neftune_noise_alpha=training_arguments.neftune_noise_alpha,
        train_dataset=train_split,
        eval_dataset=eval_split,
        dataset_text_field=training_arguments.dataset_text_field,
        peft_config=peft_config,
        packing=training_arguments.packing,
        optimizers=(optimizer, scheduler),
        args=trainer_args,
        data_collator=transformers.DataCollatorForLanguageModeling(
            tokenizer,
            mlm=training_arguments.mlm),
    )

    trainer.train()

def choose_optimizer(optim_string, learning_rate, params):
    """
    Build the optimizer named by ``optim_string`` for ``params``.

    Args:
        optim_string (str): "AdamW" or "Adafactor".
        learning_rate (float): Learning rate handed to the optimizer.
        params: Iterable of parameters to optimize.

    Returns:
        The optimizer instance.

    Raises:
        ValueError: if ``optim_string`` is not a supported name. ValueError is
            a subclass of the ``Exception`` raised before, so existing
            ``except Exception`` handlers still catch it.
    """
    if optim_string == "AdamW":
        # NOTE(review): transformers.AdamW is presumably deprecated in favor
        # of torch.optim.AdamW - confirm against the pinned transformers version.
        return transformers.AdamW(params, lr=learning_rate)
    if optim_string == "Adafactor":
        # Adafactor rejects an explicit lr while relative_step is left at its
        # default, so relative-step mode and parameter scaling are turned off.
        return transformers.Adafactor(
            params,
            lr=learning_rate,
            relative_step=False,
            scale_parameter=False,
        )
    raise ValueError("Unknown optimizer")

def choose_scheduler(scheduler_string, optimizer, num_steps, warmup_ratio):
    """
    Build the learning-rate scheduler named by ``scheduler_string``.

    Args:
        scheduler_string (str): "cosine" or "constant_with_warmup".
        optimizer: Optimizer the scheduler is attached to.
        num_steps (int): Total number of training steps.
        warmup_ratio (float): Fraction of ``num_steps`` used for warmup; the
            warmup length is ``int(warmup_ratio * num_steps)``.

    Returns:
        The scheduler instance.

    Raises:
        ValueError: if ``scheduler_string`` is not a supported name. ValueError
            is a subclass of the ``Exception`` raised before, so existing
            ``except Exception`` handlers still catch it.
    """
    num_warmup = int(warmup_ratio * num_steps)
    if scheduler_string == "cosine":
        return transformers.get_cosine_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=num_warmup,
            num_training_steps=num_steps,
        )
    if scheduler_string == "constant_with_warmup":
        # The constant schedule takes no total step count; passing
        # num_training_steps made this branch fail with a TypeError.
        return transformers.get_constant_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=num_warmup,
        )
    raise ValueError("Unknown scheduler")
    

In [None]:
# Log in to Weights & Biases with the key from the WANDB_TOKEN environment
# variable; a missing variable raises KeyError.
wandb.login(key = os.environ['WANDB_TOKEN'])
# One Accelerator, configured to log to wandb, is shared by the later cells.
accelerator = Accelerator(log_with="wandb")

In [None]:
# transformers.utils.logging.set_verbosity_debug()
# import logging
# LOGLEVEL = os.environ.get('LOGLEVEL', 'DEBUG').upper()
# logging.basicConfig(level=LOGLEVEL)

In [None]:
from sklearn.model_selection import ParameterGrid

In [None]:
# Candidate values for each hyperparameter in the sweep.
# The 'lora_rank' key is read as hiperparams['lora_rank'] in the sweep loop.
wandb_args = {
    'learning_rate': [2e-4, 2e-5],
    'lora_rank': [16, 32],
    'optimizer': ['AdamW', 'Adafactor'],
    'lr_scheduler_type':['cosine', 'constant_with_warmup']
}

# Expand the grid into a list of dicts, one per combination.
hyper_space = list(ParameterGrid(wandb_args))
# Bare expression: shows the list as the cell output.
hyper_space

In [None]:
import pandas as pd

In [None]:
# dictionary of the hyperparameters we are going to study
# hyperparameter space = https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ParameterGrid.html

# define the test and eval datasets
quantidade_exemplos = 10
seed = 44

# Split the current 'test' split again: `quantidade_exemplos` rows become the
# new 'test' split and the remaining rows become 'eval'.
# NOTE(review): running this cell twice splits the already reduced 'test'
# split again - confirm it is only executed once per session.
dataset_aux = dataset['test'].train_test_split(test_size=quantidade_exemplos, seed=seed)
dataset['eval'] = dataset_aux['train']
dataset['test'] = dataset_aux['test']
# Record the name of the evaluation split in the configuration.
training_arguments.eval_dataset = 'eval'

In [None]:
# Bare expression: shows the dataset (now including the 'eval' split) as the cell output.
dataset

In [None]:
# atualizar hiperparametros com base no espaço
def _tracker_config(arguments):
    """Return a copy of the run configuration that is safe to send to a tracker.

    Credentials are left out and values that are not plain Python data
    (for example torch dtypes) are converted to strings.
    """
    secret_keys = {"hub_token", "wandb_token"}
    plain_types = (bool, int, float, str, list, tuple, dict, type(None))
    return {
        key: value if isinstance(value, plain_types) else str(value)
        for key, value in arguments.items()
        if key not in secret_keys
    }


def run(training_arguments, dataset):
    """
    Train and evaluate one hyperparameter combination and log its answers.

    Steps: build the output paths, load and prepare the model, create the
    optimizer and scheduler from the current hyperparameters, start the
    tracker, train, generate an answer for every prompt in
    ``dataset['test']`` and log them as a W&B table under the key "Teste".

    Args:
        training_arguments: Dict-like configuration with attribute access,
            already updated with this run's hyperparameters.
        dataset: Mapping of split name to dataset; must contain 'test'.
    """
    run_name = f"{training_arguments.optimizer}_{training_arguments.learning_rate}_{training_arguments.lr_scheduler_type}_{training_arguments.lora_r}"

    base_dir_name, drive_model_name = build_model_name(
        training_arguments.model_base_repository_id,
        dataset,
        training_arguments.num_train_epochs,
        training_arguments.per_device_train_batch_size,
    )
    # One directory per combination: the base name does not contain the
    # hyperparameters, so every run would otherwise share the same checkpoints.
    dir_model_name = f"{base_dir_name}/{run_name}"
    print(f"model_complete_name: {drive_model_name, dir_model_name}")

    # load model and tokenizer
    model, tokenizer = load_model_and_tokenizer(training_arguments.model_base_repository_id, quantized=True, quantization_type='bits_and_bytes')
    model, peft_config = configure_and_prepare_model(model)

    # build the optimizer and scheduler from the current hyperparameters;
    # only trainable parameters are handed to the optimizer
    trainable_params = [param for param in model.parameters() if param.requires_grad]
    optimizer = choose_optimizer(training_arguments.optimizer, training_arguments.learning_rate, trainable_params)
    scheduler = choose_scheduler(training_arguments.lr_scheduler_type, optimizer, training_arguments.max_steps, training_arguments.warmup_ratio)

    model,  dataset = accelerator.prepare(model, dataset)

    # start the tracker for this combination
    kwargs = {
        "notes": 'Experimentos de hiperparametros',
        "group": f"{training_arguments.folder_name}_{training_arguments.max_steps}_bitsandbytes",
        "name": run_name,
        "entity":"weni"
    }

    accelerator.init_trackers(
        project_name = 'Experimentos de hiperparametros',
        # The raw configuration holds the Hub and W&B tokens; send a redacted copy.
        config = _tracker_config(training_arguments),
        init_kwargs={"wandb": kwargs}
    )

    # train with the eval split, optimizer, scheduler and updated arguments
    train_model(model, tokenizer, dataset, peft_config,  dir_model_name, optimizer, scheduler, training_arguments)

    # run inference on the test split
    model = accelerator.unwrap_model(model).merge_and_unload()
    model.eval()
    answers = []
    for text in dataset['test']['prompt']:
        # Keep everything up to the answer marker so the model writes the answer.
        # NOTE(review): the tokenizer is configured with add_eos_token=True -
        # confirm no EOS token is appended to these inference prompts.
        prompt = text.split("\n\nRESPOSTA:")[0] + "\n\nRESPOSTA:"
        # The inputs must be on the same device as the model for generate().
        encoding = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)
        with torch.no_grad():
            outputs = model.generate(**encoding, max_new_tokens=500,
                top_k=10,
                typical_p=0.95,
                temperature=0.5,
                top_p=0.95,
                num_return_sequences=1,
                repetition_penalty=1.03,
                do_sample=True)

        answers.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

    # add the answers as a table to the wandb log
    dataframe = pd.DataFrame(answers)

    wandb_table = wandb.Table(dataframe=dataframe)
    wandb.log({"Teste": wandb_table})


# Sweep: apply each hyperparameter combination to the configuration and run it.
for hiperparams in hyper_space:
    training_arguments.optimizer = hiperparams['optimizer']
    training_arguments.lr_scheduler_type = hiperparams['lr_scheduler_type']
    training_arguments.learning_rate = hiperparams['learning_rate']
    training_arguments.lora_r = hiperparams['lora_rank']

    try:
        run(training_arguments, dataset)
    finally:
        # Close the tracker and release GPU memory even when a run fails, so
        # the next combination starts from a clean state.
        accelerator.end_training()
        clear_memory_cache()

