# Installing Dependencies

In [None]:
! pip install "torch==2.0.1" datasets transformers trl peft accelerate bitsandbytes auto-gptq optimum

In [None]:
# Mount Google Drive at /content/gdrive/ so later cells can use paths inside it.
from google.colab import drive
drive.mount('/content/gdrive/')

In [None]:
%cd gdrive/MyDrive/Colab Notebooks/crs-ptbr/

In [None]:
# Authenticate this notebook session against the Hugging Face Hub.
from huggingface_hub import notebook_login
notebook_login()

# Importing Dependencies

In [None]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, TrainingArguments
from trl import SFTTrainer
import pandas as pd

# Chatbot Config

In [None]:
model_name = "crs_ptbr_zephyr_7B_beta_8bit_corpusv2"


class Config:
    '''
    Static settings read by the training, inference and evaluation cells.

    Every value is a plain class attribute, so it can be read from the class
    itself or from an instance.
    '''

    # --- Run identity and output location ---
    MODEL_NAME = model_name
    OUTPUT_DIR = f"models/CHECKPOINT_{model_name}"
    PUSH_TO_HUB = False

    # --- Base checkpoint, tokenizers and data ---
    MODEL_ID = "TheBloke/zephyr-7B-beta-GPTQ"
    MODEL_REVISION = "gptq-8bit-32g-actorder_True"
    TOKENIZER_ID = "HuggingFaceH4/zephyr-7b-alpha"
    DATA_PATH = "data/v2/train.parquet"
    DATASET_TEXT_FIELD = "sample"

    # --- GPTQ loading options ---
    BITS = 8
    DISABLE_EXLLAMA = True
    DEVICE_MAP = "auto"
    USE_CACHE = False

    # --- LoRA adapter ---
    LORA_R = 16
    LORA_ALPHA = 16
    LORA_DROPOUT = 0.05
    BIAS = "none"
    TARGET_MODULES = ["q_proj", "v_proj"]
    TASK_TYPE = "CAUSAL_LM"

    # --- Optimisation ---
    BATCH_SIZE = 64
    GRAD_ACCUMULATION_STEPS = 1
    OPTIMIZER = "paged_adamw_32bit"
    LR = 2e-4
    LR_SCHEDULER = "cosine"
    NUM_TRAIN_EPOCHS = 3
    FP16 = True

    # --- Logging, evaluation and checkpointing cadence ---
    LOGGING_STEPS = 30
    EVAL_STRATEGY = "steps"
    EVAL_STEPS = 30
    SAVE_STRATEGY = "steps"
    SAVE_STEPS = 70
    SAVE_TOTAL_LIMIT = 2  # author's note: two because of load_best_model

    # --- SFT trainer options ---
    MAX_SEQ_LENGTH = 512
    PACKING = False

In [None]:
config = Config()

# Zephyr Trainer

In [None]:
class ZephyrTrainer:
    '''
    Fine-tunes the GPTQ Zephyr checkpoint named in Config with LoRA adapters

    Attributes:
    config: Config instance holding every path and hyper-parameter used below
    tokenizer: tokenizer loaded from config.MODEL_ID, with EOS reused as padding
    data: raw DataFrame read by create_dataset (None until it has been called)
    model: fine-tuned model; only set once train() has finished
    '''

    def __init__(self):

        '''
        Loads the configuration and the tokenizer required in the training loop
        '''

        self.config = Config()
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.MODEL_ID)
        # EOS doubles as the padding token.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        # Filled by create_dataset; lets prepare_model reuse an already-read frame.
        self.data = None

    def create_dataset(self, type="train"):

        '''
        Reads the parquet file at config.DATA_PATH and wraps its text column in a Dataset

        Parameters:
        type: currently unused; kept so existing calls keep working

        Returns:
        processed_data: Dataset holding only the config.DATASET_TEXT_FIELD column
        '''

        data = pd.read_parquet(self.config.DATA_PATH)

        # Keep the raw frame around: prepare_model reads the text column from it.
        self.data = data

        return Dataset.from_pandas(data[[self.config.DATASET_TEXT_FIELD]])

    def prepare_model(self, load_path=False):

        '''
        Loads the GPTQ checkpoint and attaches lora modules to the model

        Parameters:
        load_path: optional path to a state dict written with torch.save; when given,
                   it is loaded into the LoRA-wrapped model before returning

        Returns:
        model - Model ready for finetuning
        peft_config - LoRA Adapter config
        '''

        # Only hit the disk when the data has not been read yet (train() reads it first).
        if self.data is None:
            self.create_dataset()

        # NOTE(review): the requested revision looks pre-quantized, in which case the
        # calibration `dataset` may be unnecessary -- confirm before removing.
        gptq_config = GPTQConfig(
            bits=self.config.BITS,
            disable_exllama=self.config.DISABLE_EXLLAMA,
            tokenizer=self.tokenizer,
            dataset=self.data[self.config.DATASET_TEXT_FIELD].tolist()
        )

        model = AutoModelForCausalLM.from_pretrained(
            self.config.MODEL_ID,
            quantization_config=gptq_config,
            device_map=self.config.DEVICE_MAP,
            revision=self.config.MODEL_REVISION
        )

        model.config.use_cache = self.config.USE_CACHE
        model.config.pretraining_tp = 1
        model.gradient_checkpointing_enable()
        model = prepare_model_for_kbit_training(model)

        peft_config = LoraConfig(
            r=self.config.LORA_R,
            lora_alpha=self.config.LORA_ALPHA,
            lora_dropout=self.config.LORA_DROPOUT,
            bias=self.config.BIAS,
            task_type=self.config.TASK_TYPE,
            target_modules=self.config.TARGET_MODULES
        )

        model = get_peft_model(model, peft_config)

        if load_path:
            # The file is expected to hold the state dict of a model built exactly like this one.
            model.load_state_dict(torch.load(load_path))

        return model, peft_config

    def set_training_arguments(self):

        '''
        Sets the arguments for the training loop in TrainingArguments class

        Returns:
        training_arguments: TrainingArguments built from the values in config
        '''

        # load_best_model_at_end is intentionally not enabled: no evaluation strategy or
        # eval dataset is configured, and TrainingArguments rejects that flag unless the
        # evaluation and save strategies match.
        training_arguments = TrainingArguments(
            output_dir=self.config.OUTPUT_DIR,
            per_device_train_batch_size=self.config.BATCH_SIZE,
            gradient_accumulation_steps=self.config.GRAD_ACCUMULATION_STEPS,
            optim=self.config.OPTIMIZER,
            learning_rate=self.config.LR,
            lr_scheduler_type=self.config.LR_SCHEDULER,
            save_strategy=self.config.SAVE_STRATEGY,
            logging_steps=self.config.LOGGING_STEPS,
            num_train_epochs=self.config.NUM_TRAIN_EPOCHS,
            fp16=self.config.FP16,
            push_to_hub=self.config.PUSH_TO_HUB,
            save_steps=self.config.SAVE_STEPS,
            save_total_limit=self.config.SAVE_TOTAL_LIMIT,
        )

        return training_arguments

    def train(self):

        '''
        Trains the model on the specified dataset in config

        The trained model is stored on self.model; nothing is returned.
        '''

        data = self.create_dataset()
        model, peft_config = self.prepare_model()
        training_args = self.set_training_arguments()

        trainer = SFTTrainer(
            model=model,
            train_dataset=data,
            peft_config=peft_config,
            dataset_text_field=self.config.DATASET_TEXT_FIELD,
            args=training_args,
            tokenizer=self.tokenizer,
            packing=self.config.PACKING,
            max_seq_length=self.config.MAX_SEQ_LENGTH
        )
        trainer.train()

        self.model = trainer.model

In [None]:
# data = pd.read_parquet("data/train/train.parquet")
# config = Config()
# processed_data = Dataset.from_pandas(data[[config.DATASET_TEXT_FIELD]])

In [None]:
# Entry point: build the trainer and run fine-tuning.
# `config` and `zephyr_trainer` are left in the notebook namespace and are
# read again by the save cell that follows.
if __name__ == "__main__":
    config = Config()
    zephyr_trainer = ZephyrTrainer()
    zephyr_trainer.train()

In [None]:
# Save model
# Writes the state dict of the trained model to models/<MODEL_NAME>/model.pth,
# creating the directory first if it does not exist yet.
import os
os.makedirs(f'models/{config.MODEL_NAME}', exist_ok=True)
torch.save(zephyr_trainer.model.state_dict(), f'models/{config.MODEL_NAME}/model.pth')

# from google.colab import runtime
# runtime.unassign()

In [None]:
# Load Model
# Rebuilds the LoRA-wrapped model and loads a previously saved state dict into it.
# NOTE(review): this hard-coded path ('zephyr-7b-experiment') differs from the
# models/{config.MODEL_NAME}/model.pth location written by the save cell -- confirm it is intended.
zephyr_trainer = ZephyrTrainer()
model, _ = zephyr_trainer.prepare_model(load_path='models/zephyr-7b-experiment/model.pth')

In [None]:
# End runtime colab
# from google.colab import runtime
# runtime.unassign()

### Inference

In [None]:
# Pick the model used for inference: reload it from disk when LOAD_PATH is set,
# otherwise reuse the model kept on the trainer from the training cell.
# As written LOAD_PATH is always a non-empty string, so the `else` branch only
# runs if LOAD_PATH is manually replaced by a falsy value (e.g. None or '').
LOAD_PATH = f'models/{config.MODEL_NAME}/model.pth' # path to model

if LOAD_PATH:
  zephyr_trainer = ZephyrTrainer()
  model, _ = zephyr_trainer.prepare_model(load_path=LOAD_PATH)

else:
  model = zephyr_trainer.model

In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import GenerationConfig
from transformers import AutoTokenizer
import torch
import time

config = Config()
# Two tokenizers are loaded: `tokenizer_model` (from MODEL_ID) encodes prompts and
# decodes outputs, while `tokenizer_template` (from TOKENIZER_ID) is only used by
# get_inference to render the chat template.
tokenizer_model = AutoTokenizer.from_pretrained(config.MODEL_ID)
tokenizer_template = AutoTokenizer.from_pretrained(config.TOKENIZER_ID)

# Generation settings used as the default of get_inference.
generation_config = GenerationConfig(
    do_sample=True,
    top_k=1,  # NOTE(review): presumably makes sampling effectively greedy -- confirm
    temperature=0.1,
    max_new_tokens=120,
    pad_token_id=tokenizer_model.eos_token_id,
    repetition_penalty = 2.0
)


def get_inference(
    text,
    model,
    tokenizer_model=tokenizer_model,
    tokenizer_template=tokenizer_template,
    generation_config=generation_config,
):
    '''
    Print the model answer to a user message, followed by the elapsed seconds

    Parameters:
    text: user message placed after the fixed system prompt
    model: model whose generate method is called
    tokenizer_model: tokenizer used to encode the prompt and decode the output
    tokenizer_template: tokenizer whose chat template renders the conversation
    generation_config: generation config passed to model.generate

    Returns:
    None - the decoded output and the timing are printed
    '''
    started_at = time.time()

    # Conversation = fixed system prompt + the user's message.
    system_message = {
        "role": "system",
        "content": "Você é um chatbot para indicação de filmes. Responda de maneira educada sugestões de filmes para os usuários.",
    }
    user_message = {"role": "user", "content": text}

    # Render the conversation to plain text first, then tokenize it separately.
    prompt = tokenizer_template.apply_chat_template(
        [system_message, user_message],
        tokenize=False,
    )
    encoded = tokenizer_model(prompt, return_tensors="pt").to("cuda")

    generated = model.generate(**encoded, generation_config=generation_config)

    print(tokenizer_model.decode(generated[0], skip_special_tokens=True))
    print(time.time() - started_at)

In [None]:
get_inference("qual seu nome?", model)

### Evaluation

In [None]:
# Pick the model used for evaluation: reload it from disk when LOAD_PATH is set,
# otherwise reuse the model kept on the trainer from the training cell.
# As written LOAD_PATH is always a non-empty string, so the `else` branch only
# runs if LOAD_PATH is manually replaced by a falsy value (e.g. None or '').
LOAD_PATH = f'models/{config.MODEL_NAME}/model.pth' # path to model

if LOAD_PATH:
  zephyr_trainer = ZephyrTrainer()
  model, _ = zephyr_trainer.prepare_model(load_path=LOAD_PATH)

else:
  model = zephyr_trainer.model

In [None]:
# Generation settings for evaluation. This rebinds the module-level
# `generation_config`; generate_eval, defined right below, captures this object
# as its default argument.
# NOTE(review): top_k=0 presumably disables top-k filtering so that only top_p applies -- confirm.
generation_config = GenerationConfig(
    do_sample=True,
    temperature=0.1,
    top_p=0.25,
    top_k=0,
    max_new_tokens=512,
    repetition_penalty=1.1,
    eos_token_id=tokenizer_model.eos_token_id,
    pad_token_id=tokenizer_model.eos_token_id,
)

def generate_eval(
    text_eval,
    model,
    tokenizer_model=tokenizer_model,
    tokenizer_template=tokenizer_template,
    generation_config=generation_config,
):
    '''
    Return the completion and time of the inference

    Parameters:
    text_eval: text to be completed
    model: model to be used
    tokenizer_model: tokenizer used to encode the prompt and decode the output
    tokenizer_template: not used by this function; kept so the signature stays unchanged
    generation_config: generation config to be used

    Returns:
    (completion, elapsed_seconds) - completion is the decoded text that follows the
    last assistant marker, or the whole decoded text when no marker is present
    '''
    st_time = time.time()
    inputs = tokenizer_model(text_eval, return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, generation_config=generation_config)

    # The decoded sequence contains the prompt as well; keep only the text after the
    # last assistant marker.
    completion = tokenizer_model.decode(outputs[0], skip_special_tokens=True).split('\n<|assistant|>\n')[-1]

    return (completion, time.time() - st_time)

In [None]:
# Data frame used by the evaluation cells; they read its 'instruction' column.
test_data = pd.read_parquet("data/v2/test_dev.parquet")

In [None]:
# Smoke test: run generate_eval on a single instruction and print prompt and result.
dummy_index = 3
text_eval = test_data['instruction'].iloc[dummy_index]
# NOTE(review): dummy_data takes row 0 rather than row `dummy_index` and is not
# used anywhere in this cell -- confirm whether it is still needed.
dummy_data = test_data.iloc[0]
output = generate_eval(text_eval, model)

print(text_eval)

# `output` is the (completion, elapsed_seconds) tuple returned by generate_eval.
print(output)

In [None]:
import locale
# Make locale.getpreferredencoding always report UTF-8.
# NOTE(review): presumably a workaround so the shell command that follows runs in
# this notebook environment -- confirm it is still required.
locale.getpreferredencoding = lambda: "UTF-8"

!pip install fuzzywuzzy[speedup]

In [None]:
import re
from fuzzywuzzy import fuzz

def regex_movies(text):
    '''
    Extract the distinct movie mentions written as "Title (YYYY)" from a text

    A title is one or more words that each start with an ASCII capital letter
    followed by lowercase ASCII letters, then a space and a four-digit year in
    parentheses.

    Parameters:
    text: text to search

    Returns:
    list of unique matches, in order of first appearance
    '''
    regex = r'\b[A-Z][a-z]+(?:\s[A-Z][a-z]+)*\s\(\d{4}\)'
    movies = re.findall(regex, text)

    # dict.fromkeys removes duplicates while keeping first-seen order, so the result
    # is the same on every run (a set would return the matches in arbitrary order).
    return list(dict.fromkeys(movies))

def get_similarity(text_a, text_b):
    '''
    Return the fuzz.ratio score between two texts

    Parameters:
    text_a: first text
    text_b: second text
    '''
    return fuzz.ratio(text_a, text_b)

def compute_metrics(data_eval, model, threshold=80):
    '''
    Generate a completion for one evaluation row and score the movies it mentions

    Parameters:
    data_eval: row (or mapping) with the keys 'instruction' and 'expected_movies'
    model: model passed to generate_eval
    threshold: a predicted/expected pair counts as a hit when its similarity
               score is strictly greater than this value

    Returns:
    dict with 'has_movie' (1 if any movie was found in the completion, else 0),
    'hits' (number of predicted/expected pairs above the threshold),
    'inference_time' and 'completion'
    '''
    expected_movies = data_eval['expected_movies']
    print(expected_movies)
    # A row without expected movies may hold None; treat it as "nothing expected"
    # instead of failing when iterating below.
    if expected_movies is None:
        expected_movies = []

    completion, inference_time = generate_eval(data_eval['instruction'], model)

    # Has Movie
    pred_movies = regex_movies(completion)
    print(pred_movies)

    # Hits: every (predicted, expected) pair above the threshold counts once.
    hits = 0
    for pred_movie in pred_movies:
        for expected_movie in expected_movies:
            # Score each pair a single time and reuse it for the test and the log line.
            score = get_similarity(pred_movie, expected_movie)
            if score > threshold:
                print(score)
                hits += 1

    return {
        'has_movie': 1 if pred_movies else 0,
        'hits': hits,
        'inference_time': inference_time,
        'completion': completion
    }

# Score every row of the evaluation frame loaded above (`test_data`); the name
# `data_eval` only exists as a parameter of compute_metrics, not at module level.
model_eval = test_data.copy()
# compute_metrics returns four values per row, so four target columns are needed;
# assigning them to only three columns makes pandas raise a length-mismatch error.
metric_columns = ['has_movie', 'hits', 'inference_time', 'completion']
metrics = model_eval.apply(lambda x: compute_metrics(x, model), axis=1, result_type='expand')
model_eval[metric_columns] = metrics[metric_columns]