# Installing Dependencies

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Nov 16 22:19:09 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    42W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cutoff this notebook uses to call a runtime "high-RAM".
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('Not using a high-RAM runtime')

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
! pip install "torch==2.0.1" "datasets==2.14.6" "transformers==4.34.1" "trl==0.7.2" "peft==0.5.0" "accelerate==0.24.0" "bitsandbytes==0.41.1" "auto-gptq==0.4.2" "optimum==1.13.2"

Collecting torch==2.0.1
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.14.6
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.34.1
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m121.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl==0.7.2
  Downloading trl-0.7.2-py3-none-any.whl (124 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.0/124.0 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.5.0
  Downloading peft-0.5.0-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.

In [None]:
# Mount Google Drive under /content/gdrive/ so the project folder in MyDrive
# can be used as the working directory.
from google.colab import drive
drive.mount('/content/gdrive/')

MessageError: ignored

In [None]:
%cd gdrive/MyDrive/Colab Notebooks/crs-ptbr/

/content/gdrive/MyDrive/Colab Notebooks/crs-ptbr


In [None]:
# Authenticate against the Hugging Face Hub through the interactive login widget
# (a later cell pushes the trained model to the Hub).
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Importing Dependencies

In [None]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, TrainingArguments
from trl import SFTTrainer
import pandas as pd

# Chatbot Config

In [None]:
model_name = "cesar_ptbr_newlr"


class Config:
  """Constants for the fine-tuning run: model ids, data path, LoRA and trainer settings."""

  # -- Run identity and output locations --
  MODEL_NAME = model_name
  OUTPUT_DIR = f"models/CHECKPOINT_{model_name}"  # passed to TrainingArguments as output_dir
  PUSH_TO_HUB = False

  # -- Base checkpoint and tokenizers --
  MODEL_ID = "TheBloke/zephyr-7B-beta-GPTQ"
  MODEL_REVISION = "gptq-8bit-32g-actorder_True"
  TOKENIZER_ID = "HuggingFaceH4/zephyr-7b-alpha"  # tokenizer whose chat template formats inference prompts
  DEVICE_MAP = "auto"
  USE_CACHE = False  # copied onto model.config.use_cache before training

  # -- GPTQ loading options (forwarded to GPTQConfig) --
  BITS = 8
  DISABLE_EXLLAMA = True

  # -- Training data --
  DATA_PATH = "data/v2/train.parquet"
  DATASET_TEXT_FIELD = "sample"  # DataFrame column given to SFTTrainer as the text field
  MAX_SEQ_LENGTH = 512
  PACKING = False

  # -- LoRA adapter (forwarded to LoraConfig) --
  LORA_R = 16
  LORA_ALPHA = 16
  LORA_DROPOUT = 0.05
  BIAS = "none"
  TARGET_MODULES = ["q_proj", "v_proj"]
  TASK_TYPE = "CAUSAL_LM"

  # -- Optimisation --
  BATCH_SIZE = 64  # per-device train batch size
  GRAD_ACCUMULATION_STEPS = 1
  OPTIMIZER = "paged_adamw_32bit"
  LR = 5e-07  # previous value kept for reference: 2e-4
  LR_SCHEDULER = "cosine"
  NUM_TRAIN_EPOCHS = 5
  FP16 = True

  # -- Logging, checkpointing, evaluation --
  LOGGING_STEPS = 30
  SAVE_STRATEGY = "steps"
  SAVE_STEPS = 70
  SAVE_TOTAL_LIMIT = 2 # two because of load_best_model
  # The trainer arguments that would read the two EVAL_* values are currently commented out.
  EVAL_STRATEGY = "steps"
  EVAL_STEPS = 30

In [None]:
# Notebook-level Config instance; later cells read paths and names from it.
config = Config()

# Zephyr Trainer

In [None]:
class ZephyrTrainer:
    '''
    Fine-tunes the GPTQ checkpoint named in Config with LoRA adapters through trl's SFTTrainer.

    Attributes:
    config: Config instance holding every path and hyperparameter used below
    tokenizer: Tokenizer loaded from config.MODEL_ID, padding with the EOS token
    debugging: When True, only the first 8 rows of the dataset are used
    data: DataFrame read by create_dataset (None until it has been called)
    trainer: SFTTrainer built by train (only set once train has been called)
    model: Model taken from the trainer after training (only set once train has been called)
    '''

    def __init__(self, debugging=False):

        '''
        Loads the configuration and the tokenizer.

        Args:
        debugging: If True, create_dataset keeps only the first 8 rows for quick dry runs

        Initialized:
        config: Parameters required for the trainer to create and process dataset, train and save model finally
        tokenizer: Tokenizer required in training loop
        '''

        self.config = Config()
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.MODEL_ID)
        # Padding reuses the EOS token.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.debugging = debugging
        # Filled by create_dataset so prepare_model can reuse an already loaded frame.
        self.data = None

    def create_dataset(self, type="train"):

        '''
        Reads the parquet file at config.DATA_PATH and wraps its text column in a Dataset

        Args:
        type: Unused; kept so existing callers that pass it keep working

        Returns:
        processed_data: Dataset holding only the config.DATASET_TEXT_FIELD column
        '''

        data = pd.read_parquet(self.config.DATA_PATH)

        if self.debugging:
            data = data.head(8)

        # Keep the full frame around: prepare_model reads the text column from it.
        self.data = data

        processed_data = Dataset.from_pandas(data[[self.config.DATASET_TEXT_FIELD]])

        return processed_data

    def prepare_model(self, load_path=False):

        '''
        Loads the quantized checkpoint and attaches LoRA modules to it

        Args:
        load_path: Optional path to a state dict written with torch.save; when given it is
                   loaded into the LoRA-wrapped model

        Returns:
        model - Model ready for finetuning
        peft_config - LoRA Adapter config
        '''

        # Only hit the disk when the data has not been loaded yet (train() loads it first).
        if self.data is None:
            self.create_dataset()

        # The load-time warning from transformers says that, for an already quantized
        # checkpoint, only loading attributes of this config are applied and the rest is ignored.
        gptq_config = GPTQConfig(
            bits=self.config.BITS,
            disable_exllama=self.config.DISABLE_EXLLAMA,
            tokenizer=self.tokenizer,
            dataset=self.data[self.config.DATASET_TEXT_FIELD].tolist(),
        )

        model = AutoModelForCausalLM.from_pretrained(
            self.config.MODEL_ID,
            quantization_config=gptq_config,
            device_map=self.config.DEVICE_MAP,
            revision=self.config.MODEL_REVISION,
        )

        model.config.use_cache = self.config.USE_CACHE
        model.config.pretraining_tp = 1
        model.gradient_checkpointing_enable()
        model = prepare_model_for_kbit_training(model)

        peft_config = LoraConfig(
            r=self.config.LORA_R,
            lora_alpha=self.config.LORA_ALPHA,
            lora_dropout=self.config.LORA_DROPOUT,
            bias=self.config.BIAS,
            task_type=self.config.TASK_TYPE,
            target_modules=self.config.TARGET_MODULES,
        )

        model = get_peft_model(model, peft_config)

        if load_path:
            # NOTE: torch.load unpickles the file; only load checkpoints written by this notebook.
            model.load_state_dict(torch.load(load_path))

        return model, peft_config

    def set_training_arguments(self):

        '''
        Sets the arguments for the training loop in TrainingArguments class

        Returns:
        training_arguments: TrainingArguments built from the Config values
        '''

        # Evaluation (EVAL_STEPS / EVAL_STRATEGY / load_best_model_at_end) is intentionally
        # not configured here.
        training_arguments = TrainingArguments(
            output_dir=self.config.OUTPUT_DIR,
            per_device_train_batch_size=self.config.BATCH_SIZE,
            gradient_accumulation_steps=self.config.GRAD_ACCUMULATION_STEPS,
            optim=self.config.OPTIMIZER,
            learning_rate=self.config.LR,
            lr_scheduler_type=self.config.LR_SCHEDULER,
            save_strategy=self.config.SAVE_STRATEGY,
            logging_steps=self.config.LOGGING_STEPS,
            num_train_epochs=self.config.NUM_TRAIN_EPOCHS,
            fp16=self.config.FP16,
            push_to_hub=self.config.PUSH_TO_HUB,
            save_steps=self.config.SAVE_STEPS,
            save_total_limit=self.config.SAVE_TOTAL_LIMIT,
        )

        return training_arguments

    def train(self):

        '''
        Trains the model on the specified dataset in config

        Stores the SFTTrainer on self.trainer and the trained model on self.model.
        '''

        data = self.create_dataset()
        model, peft_config = self.prepare_model()
        training_args = self.set_training_arguments()

        self.trainer = SFTTrainer(
            model=model,
            train_dataset=data,
            peft_config=peft_config,
            dataset_text_field=self.config.DATASET_TEXT_FIELD,
            args=training_args,
            tokenizer=self.tokenizer,
            packing=self.config.PACKING,
            max_seq_length=self.config.MAX_SEQ_LENGTH,
        )
        self.trainer.train()

        self.model = self.trainer.model

In [None]:
# data = pd.read_parquet("data/train/train.parquet")
# config = Config()
# processed_data = Dataset.from_pandas(data[[config.DATASET_TEXT_FIELD]])

In [None]:
# Train
# `config` is re-created for the cells below; the trainer builds its own Config internally.
config = Config()
zephyr_trainer = ZephyrTrainer(debugging=False)
zephyr_trainer.train()

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Downloading (…)der_True/config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. disable_exllama, use_cuda_fp16, max_input_length) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.


Downloading model.safetensors:   0%|          | 0.00/8.17G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Map:   0%|          | 0/9005 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
30,1.7383
60,1.7289
90,1.7122
120,1.7126
150,1.6923
180,1.6825
210,1.6676
240,1.6541
270,1.6546
300,1.6408


NameError: ignored

In [None]:
# Upload the model held by the SFTTrainer to the given Hugging Face Hub repository.
zephyr_trainer.trainer.model.push_to_hub('matheusrdgsf/cesar-ptbr')

adapter_model.bin:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/matheusrdgsf/cesar-ptbr/commit/95444e3589fb3f35cad15a2a5fff630b9c3e40d1', commit_message='Upload model', commit_description='', oid='95444e3589fb3f35cad15a2a5fff630b9c3e40d1', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Save model
import os

# Directory named after the run, and the state-dict file written inside it.
model_dir = f'./models/{config.MODEL_NAME}'
state_path = f'./models/{config.MODEL_NAME}/model.pth'

print(model_dir)
os.makedirs(model_dir, exist_ok=True)
torch.save(zephyr_trainer.model.state_dict(), state_path)

models/crs_ptbr_zephyr_7B_beta_8bit_corpusv2


In [None]:
# Also export the trained model with save_pretrained into a sibling directory.
zephyr_trainer.model.save_pretrained(f'./models/{config.MODEL_NAME}_save_pretrained/')

In [None]:
# The trainer lives on the ZephyrTrainer instance (no notebook-level `trainer` exists),
# and save_model has to be called; without an argument it writes to the output_dir
# given in the training arguments.
zephyr_trainer.trainer.save_model()

In [None]:
# from google.colab import runtime
# runtime.unassign()

### Inference

In [None]:
LOAD_PATH = f'models/{config.MODEL_NAME}/model.pth' # path to model

# With an empty LOAD_PATH the model trained in this session is reused; otherwise a
# fresh trainer rebuilds the model and loads the saved state dict into it.
if not LOAD_PATH:
  model = zephyr_trainer.model

else:
  zephyr_trainer = ZephyrTrainer()
  model, _ = zephyr_trainer.prepare_model(load_path=LOAD_PATH)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


Downloading (…)der_True/config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. use_exllama, exllama_config, use_cuda_fp16, max_input_length) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.
ERROR:auto_gptq.nn_modules.qlinear.qlinear_exllama:exllama_kernels not installed.


ImportError: ignored

In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import GenerationConfig
from transformers import AutoTokenizer
import torch
import time

# Config instance for the inference and evaluation cells.
config = Config()
# tokenizer_model encodes prompts and decodes generations; tokenizer_template is only
# used for its chat template when the prompt text is built.
tokenizer_model = AutoTokenizer.from_pretrained(config.MODEL_ID)
tokenizer_template = AutoTokenizer.from_pretrained(config.TOKENIZER_ID)
# model = zephyr_trainer.trainer.model

# Sampling is enabled but constrained by a low temperature and a small top_p;
# the EOS token id doubles as the padding id.
# NOTE(review): top_k=0 presumably disables top-k filtering - confirm.
generation_config = GenerationConfig(
    do_sample=True,
    temperature=0.1,
    top_p=0.25,
    top_k=0,
    max_new_tokens=512,
    repetition_penalty=1.1,
    eos_token_id=tokenizer_model.eos_token_id,
    pad_token_id=tokenizer_model.eos_token_id,
)


def get_inference(
    text,
    model,
    tokenizer_model=tokenizer_model,
    tokenizer_template=tokenizer_template,
    generation_config=generation_config,
):
    """Generate a movie recommendation for `text` and print it with the elapsed time.

    Args:
        text: User message placed in the "user" turn of the chat prompt.
        model: Model exposing `generate`; inputs are moved to `model.device`
            when that attribute exists, otherwise to "cuda".
        tokenizer_model: Tokenizer used to encode the prompt and decode the output.
        tokenizer_template: Tokenizer whose chat template renders the prompt text.
        generation_config: GenerationConfig forwarded to `model.generate`.

    Returns:
        None. The reply and the elapsed seconds are printed.
    """
    st_time = time.time()

    # add_generation_prompt=True makes the prompt end with the assistant header, so the
    # model starts answering right away (same call as in the pipeline demo cell).
    prompt = tokenizer_template.apply_chat_template(
        [
            {
                "role": "system",
                "content": "Você é um chatbot para indicação de filmes. Responda em português e de maneira educada sugestões de filmes para os usuários.",
            },
            {"role": "user", "content": text},
        ],
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer_model(prompt, return_tensors="pt").to(getattr(model, "device", "cuda"))

    outputs = model.generate(**inputs, generation_config=generation_config)

    # The generated sequence starts with the prompt tokens; decode only what follows so
    # that multi-line replies are printed in full instead of just their last line.
    prompt_length = inputs["input_ids"].shape[1]
    completion = tokenizer_model.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    print(completion.strip())
    print(time.time() - st_time)

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, pipeline

# Baseline for comparison: the Hub checkpoint is loaded directly into a text-generation
# pipeline, without the weights trained above. `tokenizer` is also passed to
# get_inference as tokenizer_model in the cells below.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
pipe = pipeline("text-generation", model="TheBloke/zephyr-7B-beta-GPTQ", device_map="auto")

In [None]:
# Chat with a system instruction and a single user question, sent through the pipeline.
messages = [
    {
        "role": "system",
        "content": "Você é um chatbot para indicação de filmes. Responda em português e de maneira educada sugestões de filmes para os usuários.",
    },
    {"role": "user", "content":"Qual o melhor filme do mundo?"},
]
st_time = time.time()
# Render the chat into prompt text that ends with the assistant header.
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
# generated_text is printed as returned by the pipeline, followed by the elapsed seconds.
print(outputs[0]["generated_text"])
print(time.time() - st_time)

<|system|>
Você é um chatbot para indicação de filmes. Responda em português e de maneira educada sugestões de filmes para os usuários.</s>
<|user|>
Qual o melhor filme do mundo?</s>
<|assistant|>
Eu não posso dizer que existe um melhor filme do mundo, pois a preferência pessoal e o gosto são sujeitos. Cada pessoa tem seus próprios critérios para escolher o filme mais favorável. Então, eu sugiro que você especifique o gênero, ano ou outros detalhes para que eu possa fornecer sugestões mais precisas.
4.4559900760650635


In [None]:
# Smoke test: genre, runtime and actor constraints in a single request.
get_inference("Recomende um filme de ação de 2h com ator Samuel Jackson.", model, tokenizer_model=tokenizer)

Um excelente filme de ação que você pode considerar é "Snakes on a Plane" (2006), estrelado por Samuel L. Jackson. Com uma duração aproximada de 1h e 45min, este filme combina ação, suspense e humor, e conta a história de um advogado (interpretado por Jackson) que se vê obrigado a proteger um avião da agressão de serpentes venenosas liberadas intencionalmente a bordo. É um filme divertido e memorável, que vale a pena assistir!
7.952854633331299


In [None]:
# Smoke test: a children's audience combined with theme preferences.
get_inference("Indique um filme para crianças que gostam de carros e ação.", model, tokenizer_model=tokenizer)

Um excelente filme para crianças que gostam de carros e ação é "Cars 3" (2017). Nele, o velho amigo de Lightning McQueen, Doc Hudson, retorna como mentor para ajudar o corajoso automóvel a superar as dificuldades na carreira e voltar às pistas com força renovada. Com uma história inspiradora, personagens memoráveis e sequências de ação emocionantes, "Cars 3" é uma opção divertida e educacional para os menores amantes de carros.
8.092994689941406


In [None]:
# Smoke test: an elderly audience with a short-runtime preference.
get_inference("Qual filme indicado para idosos com preferência de duração curta?", model, tokenizer_model=tokenizer)


Um bom filme indicado para idosos com preferência de duração curta é "The Best Exotic Marigold Hotel" (2011). Este filme britânico-indiano tem uma duração aproximada de 124 minutos, mas a história se desenrola em duas partes distintas, sendo que cada parte pode ser vista separadamente, o que permite que o espectador escolha apenas a parte mais curta, que dura cerca de 65 minutos. O filme conta com um elenco estelar composto por Dame Judi Dench, Bill Nighy, Maggie Smith e Tom Wilkinson, entre outros, e trata sobre um grupo de idosos britânicos que decidem mudar-se para um hotel de luxo na Índia, buscando novas oportunidades e aventuras na velhice. É um filme divertido, encantador e com uma mensagem inspiradora sobre a vida, o amor e a amizade.
13.194370746612549


In [None]:
# Smoke test: open-ended question without any constraint.
get_inference("Qual o melhor filme do mundo?", model, tokenizer_model=tokenizer)



Eu não posso opinar ou escolher um filme como o "melhor" do mundo, pois a percepção de qualidade cinematográfica varia de pessoa para pessoa. No entanto, alguns dos clássicos mais reconhecidos e premiados da história do cinema incluem títulos como "Cidadão Kane", "Apocalypse Now", "2001: Uma Odisseia no Espaço", "Casablanca", "Lawrence dos Arábios", "O Último Mohicano", "Gone with the Wind", "Singin' in the Rain", "Sunset Boulevard", "Vertigo", "The Godfather", "Jaws", "Star Wars", "Raiders of the Lost Ark", "Blade Runner", "Schindler's List", "Forrest Gump", "Titanic", "The Matrix", "Memento", "Inception", "Mad Max: Fury Road", "Parasite" e "La La Land". Espero que essas sugestões ajude você a encontrar um filme que seja agradável para você!
14.420314073562622


### Evaluation

In [None]:
# Generation settings for evaluation; generate_eval below binds this object as the
# default value of its generation_config parameter. The values match the
# generation_config defined for the inference helper earlier in the notebook.
generation_config = GenerationConfig(
    do_sample=True,
    temperature=0.1,
    top_p=0.25,
    top_k=0,
    max_new_tokens=512,
    repetition_penalty=1.1,
    eos_token_id=tokenizer_model.eos_token_id,
    pad_token_id=tokenizer_model.eos_token_id,
)

def generate_eval(
    text_eval,
    model,
    tokenizer_model=tokenizer_model,
    tokenizer_template=tokenizer_template,
    generation_config=generation_config,
    dev = True
):
    """Generate a completion for an already formatted prompt.

    Args:
        text_eval: Prompt text, tokenized as-is (no chat template is applied here).
        model: Model exposing `generate`.
        tokenizer_model: Tokenizer used for encoding and decoding.
        tokenizer_template: Accepted for signature parity with get_inference; not used.
        generation_config: GenerationConfig forwarded to `model.generate`.
        dev: When True, print the elapsed seconds.

    Returns:
        The decoded text that follows the last '\\n<|assistant|>\\n' marker, or the
        whole decoded text when the marker is absent.
    """
    started_at = time.time()

    encoded = tokenizer_model(text_eval, return_tensors="pt").to("cuda")
    generated = model.generate(**encoded, generation_config=generation_config)

    decoded = tokenizer_model.decode(generated[0], skip_special_tokens=True)
    # Keep only the final piece after splitting on the assistant header.
    *_, completion = decoded.split('\n<|assistant|>\n')

    if dev:
        print(time.time() - started_at)

    return completion

In [None]:
# Evaluation examples; the cells below read its 'instruction' and 'expected_movies' columns.
test_data = pd.read_parquet("data/v2/test_dev.parquet")

In [None]:
dummy_index = 3
# Take the prompt and the reference row from the SAME test example, so that the metrics
# cell compares the completion against the expected movies that belong to this prompt.
dummy_data = test_data.iloc[dummy_index]
text_eval = dummy_data['instruction']
output = generate_eval(text_eval, model)

print(text_eval)

print(output)

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

!pip install fuzzywuzzy[speedup]

Collecting fuzzywuzzy[speedup]
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Collecting python-levenshtein>=0.12 (from fuzzywuzzy[speedup])
  Downloading python_Levenshtein-0.23.0-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.23.0 (from python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading Levenshtein-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (169 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.4/169.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=3.1.0 (from Levenshtein==0.23.0->python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading rapidfuzz-3.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fuzzywuzzy, rapidfuzz, Levenshtein, python-levenshtein
Successfully installed Levenshtein-0.23.0 fuzzyw

In [None]:
import re
from fuzzywuzzy import fuzz

def regex_movies(text):
    """Extract distinct "Title (YYYY)" mentions from `text`.

    A title is one or more capitalised, letters-only words followed by a four-digit
    year in parentheses. Duplicates are dropped and the first-occurrence order is
    kept, so the result is deterministic.

    Args:
        text: Text to scan.

    Returns:
        List of unique matches in order of first appearance (empty if none).
    """
    regex = r'\b[A-Z][a-z]+(?:\s[A-Z][a-z]+)*\s\(\d{4}\)'
    movies = re.findall(regex, text)

    # dict.fromkeys removes duplicates while preserving order; set() order is arbitrary.
    return list(dict.fromkeys(movies))

def get_similarity(text_a, text_b):
    """Return the `fuzz.ratio` similarity score between the two strings."""
    return fuzz.ratio(text_a, text_b)

def compute_metrics(data_eval, completion, threshold=80):
    """Score a completion against the expected movies of one evaluation row.

    Args:
        data_eval: Row-like object; its 'expected_movies' entry is iterated.
        completion: Generated text from which movie mentions are extracted.
        threshold: A (predicted, expected) pair counts as a hit when its
            fuzz.ratio score is strictly greater than this value.

    Returns:
        Dict with 'has_movie' (1 if any movie was extracted, else 0) and
        'hits' (number of predicted/expected pairs above the threshold).
    """
    expected_movies = data_eval['expected_movies']
    print(expected_movies)

    # Has Movie
    pred_movies = regex_movies(completion)
    print(pred_movies)

    # Hits: every predicted title is compared with every expected title.
    hits = 0
    for pred_movie in pred_movies:
        for expected_movie in expected_movies:
            score = fuzz.ratio(pred_movie, expected_movie)
            if score <= threshold:
                continue
            print(score)
            hits += 1

    # TODO: save a csv with data_eval['context'], data_eval['response'], completion,
    # inference time, has_movie and hits.

    return {
        'has_movie': int(bool(pred_movies)),
        'hits': hits,
    }



ModuleNotFoundError: ignored

In [None]:
# Score the sample completion against the selected reference row.
compute_metrics(dummy_data, output)

['Hot Tub Time Machine' 'Super Troopers (2001)' 'Identity Thief (2013)'
 'The Heat  (2013)']
['The Wedding Singer (1998)', 'Little Nicky (2000)']


{'has_movie': 1, 'hits': 0}