<a href="https://colab.research.google.com/github/vinidiol/descmerc/blob/main/fine_tuning_teenytinyllama_RAFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **SLIM-RAFT - Simplified Logical Intelligent Model for RAFT**

For more info check here: https://github.com/yurifacanha/ncmrag

# Fine-Tuning TeenyTinyLlama for NCM Code

The original code can be checked here: https://github.com/Nkluge-correa/TeenyTinyLlama/tree/main/Fine-tuning

## Install & Import

In [None]:
! pip install transformers==4.38.0
!pip install pyyaml==6.0.1
!pip install datasets==2.16.1
!pip install wandb==0.16.2
!pip install codecarbon==2.3.3
!pip install huggingface_hub==0.20.2
!pip install accelerate==0.26.1
!pip install sentencepiece==0.1.99
!pip install flash-attn==2.5.8
!pip install autoawq==0.1.7


Collecting transformers==4.38.0
  Downloading transformers-4.38.0-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers==4.38.0)
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m98.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0.19.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.41.0
    Uninstalling transformers-4.41.0:
      Successfully uninstalled transformers-4.41.0
Successfully installed tokenizers-0.15.2 transformers-4.38.0
Collecting datasets==2.16.1
  Downloading

In [None]:

# Send signal 9 (SIGKILL on Linux) to this very process, which terminates the
# running interpreter immediately.
# NOTE(review): presumably meant to force the notebook runtime to restart so the
# packages installed above are imported at their pinned versions — confirm.
import os
os.kill(os.getpid(), 9)

## Functions

In [None]:
from typing import Optional
from dataclasses import dataclass, field

@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.

    Every field stores a human-readable description under the `help` key of its
    metadata. Each description is a single `str` (adjacent string literals are
    concatenated by the parser), so it can be handed to tools that expect text.
    """

    base_model: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The base model to use in the fine-tuning."
            )
        },
    )

    model_id: Optional[str] = field(
        default=None,
        metadata={
            # No commas between the literals: a comma here would turn the help
            # text into a tuple instead of one string.
            "help": (
                "The model id of the model to train. "
                "Options are: `160m` and `460m`."
            )
        },
    )

    use_fast: Optional[bool] = field(
        default=True,
        metadata={"help": "Whether to use the fast method of tokenization."},
    )

    output_hidden_states: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to return all hidden-states (i.e., all hidden-states for all layers)."},
    )

    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )

    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )

    trust_remote_code: bool = field(
        default=False,
        metadata={
            # Each fragment ends with a space so the concatenated sentence
            # reads correctly.
            "help": (
                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
                "execute code present on the Hub on your local machine."
            )
        },
    )

    low_cpu_mem_usage: bool = field(
        default=False,
        metadata={
            "help": (
                "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
                "set True will benefit LLM loading time and RAM consumption."
            )
        },
    )

    boi_token: Optional[str] = field(
        default='<instruction>',
        metadata={"help": "The 'beginning of instruction' token"},
    )

    eoi_token: Optional[str] = field(
        default='</instruction>',
        metadata={"help": "The 'end of instruction' token"},
    )

    chat_template: Optional[str] = field(
        default=None,
        metadata={"help": "The chat template to use."},
    )

    attn_implementation: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Whether to use the optimized implementation of attention. "
                "Option is `None` or `flash_attention_2`."
            )
        },
    )

@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Note that `validation_split_percentage` keeps its historical name for
    backward compatibility but holds a fraction (e.g. `0.05`), not a whole-number
    percentage, which is why it is typed as a float.
    """

    dataset_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )

    dataset_split: Optional[str] = field(
        default=None,
        metadata={"help": "The dataset split to use."},
    )

    # Typed as float: the default is 0.05 and the value is meant to be a
    # fraction of the data, so an `int` annotation would misdescribe it.
    validation_split_percentage: Optional[float] = field(
        default=0.05,
        metadata={"help": "The fraction (between 0 and 1) of the train set used as validation set in case there's no validation split"},
    )

    max_prompt_length: Optional[int] = field(
        default=100,
        metadata={"help": "The maximum length of the prompt when performing DPO training."},
    )

    max_length: Optional[int] = field(
        default=2048,
        metadata={
            "help": (
                "Optional input sequence length after tokenization. "
                "The training dataset will be truncated in block of this size for training. "
            )
        },
    )

    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )

    sanity_check: Optional[bool] = field(
        default=False,
        metadata={"help": "If set, will run training on a small portion of the dataset."},
    )

@dataclass
class ExtraArguments:
    """
    Miscellaneous run settings that belong to neither the model nor the data
    (logger name, Weights & Biases options, sampling cadence, mixed precision).
    """

    # Identifier given to the logger.
    logger_name: Optional[str] = field(default=None, metadata=dict(help="The name of the logger to use."))

    # W&B API key; left as `None` by default.
    wandb_token: Optional[str] = field(default=None, metadata=dict(help="The token to use for logging to wandb."))

    # Interval, in steps, for W&B logging.
    wandb_log_steps: Optional[int] = field(default=1, metadata=dict(help="The number of steps to log to wandb."))

    # Interval, in steps, between sample generations.
    sample_every: Optional[int] = field(
        default=100,
        metadata=dict(help="The number of steps between each time the model generates samples."),
    )

    # Mixed-precision mode as a string flag.
    mixed_precision: Optional[str] = field(
        default='no',
        metadata=dict(help="Whether to use mixed precision or not ('no', 'fp16')."),
    )

## Parameters

In [None]:

import os

# Run configuration. Each top-level key holds the keyword arguments for one
# argument group: model, data, training and extra settings.
#
# SECURITY: the Hugging Face access token is read from the `HF_TOKEN`
# environment variable instead of being written into the notebook. Never commit
# a literal token; any token that has been committed must be revoked and
# re-issued. When the variable is unset the value is `None`.
param = {
    'model_args': {
        'base_model': "nicholasKluge/TeenyTinyLlama-160m",
        'model_id': "160m",
        'use_fast': True,
        'output_hidden_states': False,
        'cache_dir': None,
        'model_revision': "main",
        'trust_remote_code': False,
        'low_cpu_mem_usage': False,
        'boi_token': "<instruction>",
        'eoi_token': "</instruction>",
        # Jinja chat template. `elif` must be a bare keyword inside `{% %}`;
        # quoting it makes the template invalid.
        'chat_template': "{{bos_token}}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<instruction>' + message['content'].strip() + '</instruction>'}}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
        # Alternative: "flash_attention_2".
        'attn_implementation': "eager",
    },
    'data_args': {
        'dataset_name': "yurifacanha/NCM-RAFT",
        # Alternative split for a quick run on a slice: "train[:1%]".
        'dataset_split': "train",
        # Fraction of the data held out for validation.
        'validation_split_percentage': 0.1,
        'max_length': 2048,
        'preprocessing_num_workers': None,
        'sanity_check': False,
    },
    'training_args': {
        'output_dir': "checkpoints",
        'num_train_epochs': 2,
        'do_train': True,
        'do_eval': True,
        'per_device_train_batch_size': 4,
        'per_device_eval_batch_size': 4,
        'gradient_accumulation_steps': 1,
        'gradient_checkpointing': False,
        'weight_decay': 0.01,
        'learning_rate': 0.00001,
        'adam_epsilon': 0.00000001,
        'lr_scheduler_type': "cosine",
        'warmup_steps': 1000,
        'seed': 42,
        'dataloader_pin_memory': True,
        # Secret comes from the environment, never from source control.
        'hub_token': os.environ.get("HF_TOKEN"),
        'push_to_hub': True,
        'hub_model_id': "yurifacanha/TeenyTinyLlama-160m-Chat-fine-tuning",
    },
    'extra_args': {
        'logger_name': "TeenyTinyLlama",
        'wandb_token': None,
        'wandb_log_steps': 1,
        'sample_every': 5000,
        'mixed_precision': 'no',
    },
}

## Training and Saving the new model

In [None]:




import os
import sys
import time
import math
import yaml
import torch
import wandb
import random
import logging
import argparse
from tqdm import tqdm
import datasets
from datasets import load_dataset
from tokenizers import AddedToken
from codecarbon import EmissionsTracker
from torch.utils.data import DataLoader
import huggingface_hub
from huggingface_hub import create_repo, HfApi

import transformers
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    default_data_collator,
    get_scheduler,
    GenerationConfig,
)

from accelerate import Accelerator, DistributedType
from accelerate.logging import get_logger
from accelerate.utils import set_seed

# These PyTorch backend flags (not environment variables) allow TF32 in matmul and cuDNN operations,
# which improves performance on modern Ampere GPUs (e.g., A100). `TF32` mode only works on Ampere GPUs!
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

def main(data):

    # Separate the arguments for the model, data, training, and extra arguments (wandb, accelerate, etc.).
    # You can check the `specifications.py` file to see the structure of the arguments.
    model_args = ModelArguments(**data['model_args'])
    data_args = DataTrainingArguments(**data['data_args'])
    training_args = TrainingArguments(**data['training_args'])
    extra_args = ExtraArguments(**data['extra_args'])

    # We are going to be using the `accelerate` library, which provides the `Accelerator` class
    # that can be used to handle device placement and distributed training.
    accelerator = Accelerator(
        mixed_precision=extra_args.mixed_precision,
        gradient_accumulation_steps=training_args.gradient_accumulation_steps,
        project_dir=training_args.output_dir)

        # Create a directory to save the logs and the model checkpoints.
    os.makedirs(training_args.output_dir, exist_ok=True)

    # Set the logger.
    # Nothing fancy here, just a simple logger.
    logger = get_logger(extra_args.logger_name)

    # Create configurations for the logger.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    # We are setting the verbosity of the `datasets`, `transformers` and `huggingface_hub` libraries
    # to `error` to avoid unnecessary logs.
    datasets.utils.logging.set_verbosity_error()
    transformers.utils.logging.set_verbosity_error()
    huggingface_hub.utils.logging.set_verbosity_error()

    # Log the status of the accelerator on all processes.
    logger.info(accelerator.state, main_process_only=False)

    # Set seed before initializing model.
    # This is important to ensure synchronization of the random number generators across all processes.
    if training_args.seed is not None:
        set_seed(training_args.seed)

    # We are using the `accelerator.wait_for_everyone()` method to ensure that all processes
    # have finished the previous steps before moving on to the next one.
    # Documentation: https://huggingface.co/docs/accelerate/v0.27.2/en/package_reference/accelerator#synchronicity-control
    accelerator.wait_for_everyone()

    # Create a HuggingFace repository if needed (only the main process should do this).
    if accelerator.is_main_process:
        if training_args.push_to_hub and training_args.hub_token is not None:
            if training_args.hub_model_id is not None:
                create_repo(
                    repo_id=training_args.hub_model_id,
                    token=training_args.hub_token,
                    repo_type="model",
                    exist_ok=True,
                    private=True)

            else:
                raise ValueError("No model id provided. Try running with `hub_model_id=your-user-name/your-model-name`")

    # Load the fine-tuning dataset.
    if data_args.dataset_name is not None:

        dataset = load_dataset(
            data_args.dataset_name,
            split=data_args.dataset_split,
            token=None,
            cache_dir=model_args.cache_dir,
        )


        # Make a list of prompts to serve as seeds for generation.
        seeds = [model_args.boi_token + x[0]['content'] + model_args.eoi_token for x in dataset.select(range(100))['conversations']]

        # Shuffle the dataset.
        dataset = dataset.shuffle(seed=training_args.seed)

        # Sanity check: use only the first 100 examples
        if data_args.sanity_check:
            dataset = dataset.select(range(100))

            logger.info(f"Sanity check: using only the first 100 examples")

        logger.info(f"Loaded dataset: {data_args.dataset_name} | Split: {data_args.dataset_split} | Number of examples: {len(dataset):,}")

    else:

        raise ValueError("No dataset provided. Try running with `dataset_name=nicholasKluge/instruct-aira-dataset`")

    if model_args.base_model is not None:

        # Now we are going to load the configuration, model and tokenizer from the HuggingFace Hub.
        # According to the documentation, the `from_pretrained` methods guarantee that only one local process can concurrently
        # download the model/tokenizer from the HuggingFace Hub.
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.base_model,
            **{
                "cache_dir": model_args.cache_dir,
                "use_fast": model_args.use_fast,
                "revision": model_args.model_revision,
                "token": training_args.hub_token,
                "trust_remote_code": model_args.trust_remote_code,
            }
        )

        # Add special tokens.
        special_tokens_dict = {
            "additional_special_tokens": [
                AddedToken(model_args.boi_token, lstrip=False, rstrip=False, normalized=True, single_word=False),
                AddedToken(model_args.eoi_token, lstrip=False, rstrip=False, normalized=True, single_word=False),
            ]
        }

        tokenizer.add_special_tokens(special_tokens_dict)



        logger.info(f"Special tokens added to the tokenizer: {tokenizer.all_special_tokens}")

        # Add chat template to the tokenizer.

        template = """{{bos_token}}{% for message in messages %}
        {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
        {% endif %}
        {% if message['role'] == 'user' %}
        {{ '<instruction>' + message['content'].strip() + '</instruction>'}}
        {% elif message['role'] == 'assistant' %}
        {{ message['content'].strip() + eos_token}}
        {% else %}
        {{ raise_exception('Only user and assistant roles are supported!') }}
        {% endif %}
        {% endfor %}"""


        tokenizer.chat_template = template

        # Load the configuration of the `base_model`
        configuration = AutoConfig.from_pretrained(
            model_args.base_model,
            **{
                "cache_dir": model_args.cache_dir,
                "revision": model_args.model_revision,
                "token": training_args.hub_token,
                "trust_remote_code": model_args.trust_remote_code,
                "output_hidden_states": model_args.output_hidden_states,
            }
        )
        # Load the pretrained model to be fine-tuned
        model = AutoModelForCausalLM.from_pretrained(
                model_args.base_model,
                config=configuration,
                cache_dir=model_args.cache_dir,
                revision=model_args.model_revision,
                token=training_args.hub_token,
                trust_remote_code=model_args.trust_remote_code,
                low_cpu_mem_usage=model_args.low_cpu_mem_usage,
                attn_implementation=model_args.attn_implementation,
            )

        # Resize the token embeddings of the model to match the tokenizer.
        model.resize_token_embeddings(len(tokenizer))

        # Add new `name_or_path` to the model config.
        if training_args.hub_model_id is not None:
            model.config.name_or_path = training_args.hub_model_id

        # Gradient checkpointing can be enabled to reduce the memory usage during training.
        # However, this will slow down the training process by about 20%.
        if training_args.gradient_checkpointing:
            model.gradient_checkpointing_enable()
            model.config.use_cache = False

        logger.info(f"Model to train (base architecture): {model_args.base_model}")

    else:
        raise ValueError("No base model provided. Try running with `base_model=gpt2`")

    # Create a formatted chat column.
    dataset = dataset.map(lambda x: {"formatted_conversations": tokenizer.apply_chat_template(x["conversations"], tokenize=False, add_generation_prompt=False)})

    # Tokenize the dataset.
    column_names = dataset.column_names

    # Tokenize all texts in the dataset.
    def tokenize_function(examples):
        return tokenizer(examples['formatted_conversations'],
            add_special_tokens=False,
            truncation=True,
            max_length=data_args.max_length,
            padding="max_length",
            )

    with accelerator.main_process_first():
        dataset = dataset.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=True,
            desc="Running tokenizer on every text in dataset",
        )

    # Add a column named `labels` which is a copy of the `input_ids` column.
    with accelerator.main_process_first():
        dataset = dataset.map(
            lambda examples: {"labels": examples["input_ids"]},
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=True,
            desc="Adding labels to the dataset",
        )

    # Split the dataset into train and validation sets.
    if training_args.do_eval and data_args.validation_split_percentage is not None:

        logger.info("Splitting the dataset into train and validation sets...")

        dataset = dataset.train_test_split(test_size=data_args.validation_split_percentage)

        logger.info(f"Train set size: {len(dataset['train']):,} | Validation set size: {len(dataset['test']):,}")

    else:

        logger.info(f"Using the whole dataset for training. Training set size: {len(dataset):,}")

    # Create the Training DataLoader.
    if training_args.do_train and training_args.do_eval:
        if "train" not in dataset:
            raise ValueError("`do_train=True` requires a train dataset")
        train_dataset = dataset["train"]
        train_dataloader = DataLoader(
            train_dataset,
            shuffle=True,
            collate_fn=default_data_collator,
            batch_size=training_args.per_device_train_batch_size,
            pin_memory=training_args.dataloader_pin_memory,
        )

        # Create the Evaluation DataLoader.
        if "test" not in dataset:
            raise ValueError("`do_eval=True` requires a validation dataset")
        eval_dataset = dataset["test"]
        eval_dataloader = DataLoader(
            eval_dataset,
            collate_fn=default_data_collator,
            batch_size=training_args.per_device_eval_batch_size,
            pin_memory=training_args.dataloader_pin_memory,
        )

    elif training_args.do_train and not training_args.do_eval:
        train_dataset = dataset
        train_dataloader = DataLoader(
            train_dataset,
            shuffle=True,
            collate_fn=default_data_collator,
            batch_size=training_args.per_device_train_batch_size,
            pin_memory=training_args.dataloader_pin_memory,
        )

    # Now, we create our Optimizer. First, we will split weights in two groups,
    # one with weight decay and the other not.
    # These strings `["bias", "layer_norm.weight"]` represent parameter names that should not be subject to weight decay during optimization.
    # Weight decay is a regularization technique used during training to prevent overfitting by penalizing large weights.
    no_decay = ["bias", "layer_norm.weight"]

    # The first dictionary corresponds to parameters with weight decay (regularization) applied to them (non-bias and non-layer_norm.weight parameters).
    # The second dictionary corresponds to parameters without weight decay (regularization) applied to them (bias and layer_norm.weight parameters).
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": training_args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    # We are using the `AdamW` optimizer, which is a variant of the Adam optimizer.
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
        lr=training_args.learning_rate,
        eps=training_args.adam_epsilon,
    )

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / training_args.gradient_accumulation_steps)

    # Set max_steps
    training_args.max_steps = int(num_update_steps_per_epoch * training_args.num_train_epochs)

    # Our scheduler will start with a warmup phase, where the learning rate will increase linearly from 0 to the initial learning rate
    # over the first n `num_warmup_steps` of the training steps. Then, the learning rate will decrease following the cosine function.
    # If the shape of the learning rate curve is not according to what we expect, there is something wrong with (probably) the `num_training_steps` parameter.
    lr_scheduler = get_scheduler(
        name=training_args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=training_args.warmup_steps,
        num_training_steps=training_args.max_steps,
    )

    # We are preparing everything with `accelerator`. The `prepare` method will handle the device
    # placement and distributed training.
    if training_args.do_train and training_args.do_eval:

        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
            model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
        )

    else:

        model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
            model, optimizer, train_dataloader, lr_scheduler
        )

    # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
    if accelerator.distributed_type == DistributedType.TPU:
        model.tie_weights()

    # Now, we need to recalculate our total training steps as the size of the training dataloader may have changed.
    # This change (I believe) is due to distributed training, where the dataset is split among the
    # different processes.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / training_args.gradient_accumulation_steps)
    training_args.max_steps = training_args.num_train_epochs * num_update_steps_per_epoch
    training_args.num_train_epochs = math.ceil(training_args.max_steps / num_update_steps_per_epoch)

    # Now, we want to log the training process to the Weights & Biases platform. We need to initialize the `wandb`
    # logger and then log the training process to the platform. Since we are using distributed training, we
    # need to ensure that only the main process logs the training process to the platform.
    if accelerator.is_main_process:

        if extra_args.wandb_token is not None:

            # Login to wandb.
            wandb.login(key=extra_args.wandb_token)

            # Initialize wandb.
            wandb.init(
                project=extra_args.logger_name,
                notes="Fine tuning TeenyTinyLlama",
                tags=["Alignment", "Fine-tuning", "Energy Consumption", "Language Modeling", "Portuguese"],
                config=data,#all_kwargs,
                name=f"""{extra_args.logger_name.lower()}-{model_args.model_id}-Chat-{time.strftime("%d-%m-%Y")}""",
            )

    # We would also like to track the energy consumption of the training process. We are going to use the `codecarbon` library
    # to do this. We need to initialize the `EmissionsTracker` and then track the energy consumption of the training process.
    tracker = EmissionsTracker(
        project_name=extra_args.logger_name,
        log_level="critical", # set to "critical" to silence codecarbon
        output_dir=training_args.output_dir,
        output_file=f"emissions_{accelerator.process_index}.csv",
        tracking_mode='machine'
    )

    logger.info(f'Geo Location: ISO: {tracker._geo.country_iso_code} | Country: {tracker._geo.country_name} | Region : {tracker._geo.region}')

    # Initialize the HuggingFace Hub API.
    if training_args.push_to_hub and training_args.hub_token is not None:
        if training_args.hub_model_id is not None:
            api = HfApi(token=training_args.hub_token)

    # The total batch size is calculated by multiplying the number of samples in `per_device_train_batch_size`
    # by the number of processes in the accelerator.
    total_batch_size = training_args.per_device_train_batch_size * accelerator.num_processes * training_args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {training_args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {training_args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {training_args.max_steps}")

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(training_args.max_steps), disable=not accelerator.is_local_main_process, unit=" samples", desc="Training")
    completed_steps = 0
    starting_epoch = 0

    # Update the progress_bar if load from checkpoint.
    progress_bar.update(completed_steps)

    # Start training loop and activate codecarbon tracking.
    tracker.start()

    for epoch in range(starting_epoch, training_args.num_train_epochs):

        logger.info(f'Beginning epoch {epoch + 1} of {training_args.num_train_epochs}')

        # Set the model to training mode.
        model.train()
        total_loss = 0

        # Iterate over the batches of data in the current epoch.
        for step, batch in enumerate(train_dataloader, start=1):
            with accelerator.accumulate(model):

                # Forward pass the batch through the model and get the loss.
                outputs = model(**batch)
                loss = outputs.loss

                # Add the loss to the total loss.
                total_loss += loss.detach().float()

                # We only want to log the loss to wandb from the main process.
                if accelerator.is_main_process:
                    if (step) % extra_args.wandb_log_steps == 0 and extra_args.wandb_token is not None:
                        wandb.log({
                            "loss": loss.detach().float().item(),
                            # Log the learning rate to wandb (this is how we can monitor the learning rate during training).
                            "lr": lr_scheduler.get_last_lr()[0],
                            })

                # Backward pass and update optimizer.
                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            # Update the progress bar. The `accelerator.sync_gradients` method is used to synchronize the gradients across all processes.
            # Hence, the progress bar is updated only when all processes have finished the current step.
            if accelerator.sync_gradients:
                progress_bar.update(1)
                completed_steps += 1

            accelerator.wait_for_everyone()

            # Generate text from the model every `sample_every ` steps.
            if accelerator.is_main_process:

                if completed_steps % extra_args.sample_every == 0 and not step == 0:

                    model.config.use_cache = True

                    try:

                        model.eval()

                        if accelerator.is_main_process:

                            # Sample a string from the `seeds` and generate text from the model.
                            inputs = tokenizer(random.choice(seeds), return_tensors="pt").to('cuda:0')

                            sample_outputs = model.generate(**inputs,
                                                do_sample=True,
                                                top_k=50,
                                                max_new_tokens=150,
                                                top_p=0.50,
                                                repetition_penalty=1.2,
                                                num_return_sequences=5)

                            model.config.use_cache = False

                            texts = []

                            for i, sample_output in enumerate(sample_outputs):
                                texts.append(tokenizer.decode(sample_output))

                            # Log the samples to the main process terminal.
                            for text in texts:
                                logger.info(f"Samples (Epoch: {epoch + 1} | Step: {step}): {text}")

                            # Log the samples to wandb.
                            if extra_args.wandb_token is not None:

                                training_samples = wandb.Table(columns=[f"Samples (Epoch: {epoch + 1} | Step: {step})"])
                                for text in texts:
                                    training_samples.add_data(text)
                                wandb.log({f"Samples (Epoch: {epoch + 1} | Step: {step})": training_samples})

                    except Exception as e:
                        logger.warning(f"Error while generating samples: {e}")
                        model.config.use_cache = False

                    model.train()

        # Evaluate the model at the end of each epoch if `do_eval=True`
        if training_args.do_eval:
            model.eval()
            losses = []
            logger.info(f"Running evaluation at the end of Epoch {epoch + 1}.")
            for step, batch in enumerate(tqdm(eval_dataloader, total=len(eval_dataloader), position=0, leave=True, disable=not accelerator.is_local_main_process, unit=" samples",  desc="Validation")):
                with torch.no_grad():
                    outputs = model(**batch)

                loss = outputs.loss
                losses.append(accelerator.gather_for_metrics(loss.repeat(training_args.per_device_eval_batch_size)))

            losses = torch.cat(losses)
            try:
                eval_loss = torch.mean(losses)
                perplexity = math.exp(eval_loss)
            except OverflowError:
                eval_loss = torch.mean(losses)
                perplexity = float("inf")

            logger.info(f"Epoch {epoch + 1} | Perplexity: {perplexity} | Average Training Loss: {total_loss.item() / completed_steps} | Evaluation Loss: {eval_loss} | Total Energy Consumption: {tracker._total_energy.kWh}")

            # Only the main process should log the validation metrics to wandb.
            if accelerator.is_main_process:

                if extra_args.wandb_token is not None:

                        wandb.log({
                            "eval_loss": eval_loss,
                            "perplexity": perplexity,
                            "avg_train_loss": total_loss.item() / completed_steps,
                            "total_energy_consumption": tracker._total_energy.kWh,
                        })

        else:
            logger.info(f"Epoch {epoch + 1} | Average Training Loss: {total_loss.item() / completed_steps} | Total Energy Consumption: {tracker._total_energy.kWh}")

            if accelerator.is_main_process:

                if extra_args.wandb_token is not None:

                        wandb.log({
                            "avg_train_loss": total_loss.item() / completed_steps,
                            "total_energy_consumption": tracker._total_energy.kWh,
                        })

        # Save the model checkpoint at the end of each epoch.
        accelerator.wait_for_everyone()

        output_dir = f"epoch_{epoch + 1}"

        if training_args.output_dir is not None:
            # Join the output directory with the current checkpoint directory.
            output_dir = os.path.join(training_args.output_dir, output_dir)
        # Save the model checkpoint.
        accelerator.save_state(output_dir)

        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(
            training_args.output_dir,
            is_main_process=accelerator.is_main_process,
            save_function=accelerator.save
        )
        tokenizer.save_pretrained(output_dir)

        # Save the `generation_config` file
        generation_config = GenerationConfig(
            bos_token_id=tokenizer.bos_token_id,
            sep_token_id=tokenizer.sep_token_id,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            unk_token_id=tokenizer.unk_token_id,
            max_new_tokens=model.config.max_position_embeddings,
            min_length=0,
            do_sample=True,
            use_cache=False,
            renormalize_logits=True,
            top_k=30,
            top_p=0.3,
            temperature=0.3,
            repetition_penalty=1.2,
        )

        generation_config.save_pretrained(output_dir)

        tracker.flush()

    # Stop codecarbon tracking.
    tracker.stop()
    logger.info("Training complete!")
    if accelerator.is_main_process:
        if extra_args.wandb_token is not None:
            wandb.finish()

    # Upload the final emissions file to the Hub.
    if training_args.push_to_hub and training_args.hub_token is not None:
        if training_args.hub_model_id is not None:

            try:

                api.upload_file(
                    path_or_fileobj=f"{training_args.output_dir}/emissions_{accelerator.process_index}.csv",
                    path_in_repo=f"emissions_{accelerator.process_index}.csv",
                    repo_id=f"{training_args.hub_model_id}"
                )

                logger.info(f"Final emissions file pushed to the hub!")

            except Exception as e:
                logger.warning(f"Error while uploading emissions file to Hub: {e}")

    accelerator.wait_for_everyone()

    # Save the final checkpoint at the end of training and push it to the Hub.
    if training_args.output_dir is not None:

        output_dir = os.path.join(training_args.output_dir, "final-checkpoint")
        accelerator.save_state(output_dir)
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(
            output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
        )
        tokenizer.save_pretrained(output_dir)

    if accelerator.is_main_process:

        if training_args.push_to_hub and training_args.hub_token is not None:
            if training_args.hub_model_id is not None:

                # Here we are going to push the model checkpoint to the HuggingFace Hub in a try-except block.
                # If the push to the Hub fails, we will log a warning.
                try:

                    # Push the final checkpoint to the Hub.
                    api.upload_folder(
                        repo_id=f"{training_args.hub_model_id}",
                        folder_path=output_dir,
                    )

                    logger.info(f"Final model pushed to the hub!")

                except Exception as e:
                    logger.warning(f"Error while uploading checkpoint to Hub: {e}")

# Launch the run: `main` performs the per-epoch evaluation and checkpointing,
# saves a final checkpoint, and optionally uploads artifacts to the Hub.
# NOTE(review): `param` is defined earlier in the notebook — presumably the
# configuration consumed by `main`; confirm against its definition.
main(param)

Map:   0%|          | 0/41587 [00:00<?, ? examples/s]

Running tokenizer on every text in dataset:   0%|          | 0/41587 [00:00<?, ? examples/s]

Adding labels to the dataset:   0%|          | 0/41587 [00:00<?, ? examples/s]

Validation: 100%|██████████| 1040/1040 [01:41<00:00, 10.25 samples/s]
Validation: 100%|██████████| 1040/1040 [01:41<00:00, 10.26 samples/s]


model.safetensors:   0%|          | 0.00/650M [00:00<?, ?B/s]

optimizer.bin:   0%|          | 0.00/1.30G [00:00<?, ?B/s]

scheduler.bin:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

random_states_0.pkl:   0%|          | 0.00/14.3k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

Training: 100%|██████████| 18714/18714 [1:31:50<00:00,  3.40 samples/s]
