# Bibliotecas

In [6]:
%%capture
!pip install unsloth

!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [14]:
import os
import unicodedata
from unsloth import FastLanguageModel,is_bfloat16_supported
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
from sklearn.metrics import classification_report
import json
import time
import re

# Modelo

In [5]:
# Loading configuration for the base model.
# max_seq_length is reused further down when the SFTTrainer is built.
max_seq_length = 2048
# NOTE(review): None presumably lets Unsloth choose the dtype automatically - confirm in the Unsloth docs.
dtype = None
# Forwarded to from_pretrained; the checkpoint name below is itself a "bnb-4bit" variant.
load_in_4bit = True

# Returns the model together with its tokenizer; both are used by every later cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.10.7: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu124. CUDA = 8.9. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
==((====))==  Unsloth 2024.10.7: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu124. CUDA = 8.9. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

Unsloth: We fixed a gradient accumulation bug, but it seems like you don't have the latest transformers version!
Please update transformers, TRL and unsloth via:
`pip install --upgrade --no-cache-dir --no-deps unsloth transformers git+https://github.com/huggingface/trl.git`
Unsloth: We fixed a gradient accumulation bug, but it seems like you don't have the latest transformers version!
Please update transformers, TRL and unsloth via:
`pip install --upgrade --no-cache-dir --no-deps unsloth transformers git+https://github.com/huggingface/trl.git`


In [7]:
# Wrap the loaded model with PEFT/LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    # Attention projections (q/k/v/o) and MLP projections (gate/up/down) receive adapters.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    # NOTE(review): "unsloth" presumably selects Unsloth's own gradient-checkpointing mode - confirm in the Unsloth docs.
    use_gradient_checkpointing = "unsloth",
    # Same value as the `seed` passed to TrainingArguments later in the notebook.
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2024.10.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


# Dados FineTune

In [8]:
%%capture
!gdown 15KPTyHXkPKLDFI9e3WzfKfWbVxXuTQdt
!gdown 16uS6ngWfL8va1M1s16NYqrVv0tYC_zy6

In [9]:
# Prompt template shared by training and inference. It has three positional
# `{}` slots that are filled, in order, with the instruction, the input text and
# the output (the output slot is passed as "" at inference time).
prompt = """You are an advanced language model tasked with extracting relevant information from a given input based on the provided instruction. Ensure that your response is precise, concise, and formatted according to the expected output.

### Instruction:
{}

### Input:
{}

### Output:
{}"""

In [10]:
# End-of-sequence token appended to every training example.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Build the training text for a batch of examples.

    Args:
        examples: Mapping with parallel sequences under the keys
            "instruction", "input" and "output".

    Returns:
        A dict with a single key "text" holding one string per example: the
        prompt template filled with the instruction, input and output,
        followed by EOS_TOKEN.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            prompt.format(instruction, source_text, answer) + EOS_TOKEN
            for instruction, source_text, answer in rows
        ],
    }


# NOTE(review): this file is presumably one of the two downloads fetched by the gdown cell above - confirm.
path = '/content/train_finetune_lenerbr.json'
# Load the JSON file as the "train" split.
dataset = load_dataset('json', data_files={"train": path}, split="train")
# Add the "text" column produced by formatting_prompts_func (called on batches of rows).
dataset = dataset.map(formatting_prompts_func, batched = True,)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/3484 [00:00<?, ? examples/s]

In [11]:
# Wall-clock start of the training timer; read back after trainer.train() to report the elapsed time.
start_time = time.time()

In [12]:
# Supervised fine-tuning on the "text" column built by formatting_prompts_func.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    args = TrainingArguments(
        # 2 examples per device x 4 accumulation steps = total batch size of 8 (as reported in the training log).
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        # Training stops after 60 optimizer steps; the logged run reports epoch ~0.14,
        # i.e. only a fraction of the 3,484 examples is seen.
        max_steps = 60,
        learning_rate = 2e-4,
        # Exactly one of fp16/bf16 is enabled, depending on bfloat16 support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)
trainer.train()

Map:   0%|          | 0/3484 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,484 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!
`pip install --upgrade --no-cache-dir --no-deps unsloth transformers git+https://github.com/huggingface/trl.git`


Step,Training Loss
1,1.8357
2,1.8557
3,1.8848
4,1.9594
5,1.8427
6,1.726
7,1.5764
8,1.4217
9,1.2947
10,1.0651


TrainOutput(global_step=60, training_loss=0.6546945378184319, metrics={'train_runtime': 296.2473, 'train_samples_per_second': 1.62, 'train_steps_per_second': 0.203, 'total_flos': 9351021907230720.0, 'train_loss': 0.6546945378184319, 'epoch': 0.1377726750861079})

In [13]:
# Elapsed wall-clock time since `start_time` was recorded, in seconds.
end_time = time.time()
training_time = end_time - start_time
# Split the seconds into hours / minutes / seconds for display.
hours, rem = divmod(training_time, 3600)
minutes, seconds = divmod(rem, 60)

# The values are floats, so they are truncated to whole numbers when printed.
print(f"Tempo total de treino: {int(hours)} horas, {int(minutes)} minutos e {int(seconds)} segundos")

Tempo total de treino: 0 horas, 5 minutos e 0 segundos


In [15]:
# Wall-clock start of the inference timer; read back after generation to report the elapsed time.
start_time_inference = time.time()

In [18]:
# Switch the fine-tuned model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Tokenize a single prompt: the shared template filled with the extraction
# instruction, one sample text, and an empty output slot for the model to complete.
inputs = tokenizer(
[
    prompt.format(
        """\n
        Extract relevant information from the provided text based on the following predefined entities. Each entity corresponds to a specific description that defines the nature of the information to be extracted. Organize the extracted data in a structured JSON format, ensuring that each field contains only relevant information according to its description. If a specific entity is not found in the text, omit it from the output. The expected entities and their descriptions are:
        Relevant Entities:
        ORGANIZACAO: The name of an organization, company, or institution mentioned in the text.
        PESSOA: The name of an individual person identified in the text.
        TEMPO: Any reference to a specific period of time, such as a date, year, or timeframe.
        LOCAL: Locations mentioned in the text, such as cities, states, or landmarks.
        LEGISLACAO: References to legislation or legal documents cited in the text.
        JURISPRUDENCIA: References to jurisprudence or court decisions that are cited in the text.\n""",
        "Assunto Recurso de Reconsideração interposto por Carlos Aureliano Motta de Souza ( ex-Diretor-Geral do Superior Tribunal Militar ) contra decisão que julgou suas contas irregulares e o condenou em débito e ao pagamento de multa em razão de irregularidades nas obras de construção do prédio da 1ª Circunscrição Judiciária Militar no Rio de Janeiro. LIQUIDAÇÃO IRREGULAR DE DESPESA E DANO AO ERÁRIO DECORRENTE DE PAGAMENTOS ANTECIPADOS PARA EXECUÇÃO DAS OBRAS DO EDIFÍCIO-SEDE DA 1ª CIRCUNSCRIÇÃO JUDICIÁRIA MILITAR NA ILHA DO GOVERNADOR-RJ.",
        "",  # expected-output slot - left blank so the model generates it
    )
], return_tensors="pt").to("cuda")

# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from transformers import TextStreamer
# Stream tokens to the cell output as they are generated (the captured output also echoes the prompt).
text_streamer = TextStreamer(tokenizer)
response_tokens = model.generate(**inputs, streamer=text_streamer, max_new_tokens=1024)

<|begin_of_text|>You are an advanced language model tasked with extracting relevant information from a given input based on the provided instruction. Ensure that your response is precise, concise, and formatted according to the expected output.

### Instruction:


        Extract relevant information from the provided text based on the following predefined entities. Each entity corresponds to a specific description that defines the nature of the information to be extracted. Organize the extracted data in a structured JSON format, ensuring that each field contains only relevant information according to its description. If a specific entity is not found in the text, omit it from the output. The expected entities and their descriptions are:
        Relevant Entities:
        ORGANIZACAO: The name of an organization, company, or institution mentioned in the text.
        PESSOA: The name of an individual person identified in the text.
        TEMPO: Any reference to a specific period of ti

In [19]:
# Elapsed wall-clock time since `start_time_inference` was recorded, in seconds.
# The timer starts in its own cell, so the figure also covers for_inference(),
# tokenization and any idle time between running the cells.
end_time_inference = time.time()
inference_time = end_time_inference - start_time_inference
# Split the seconds into hours / minutes / seconds for display.
hoursi, remi = divmod(inference_time, 3600)
minutesi, secondsi = divmod(remi, 60)

# The values are floats, so they are truncated to whole numbers when printed.
print(f"Tempo de inferência: {int(hoursi)} horas, {int(minutesi)} minutos e {int(secondsi)} segundos")

Tempo de inferência: 0 horas, 2 minutos e 27 segundos


OUTPUT ESPERADO        
        
        "output": {
            "ORGANIZACAO": [
                "1ª CIRCUNSCRIÇÃO JUDICIÁRIA MILITAR",
                "Superior Tribunal Militar",
                "1ª Circunscrição Judiciária Militar no Rio de Janeiro"
            ],
            "LOCAL": [
                "ILHA DO GOVERNADOR-RJ"
            ],
            "PESSOA": [
                "Carlos Aureliano Motta de Souza"
            ]
        }

In [66]:
# Normalization helper: accents, letter case and punctuation would otherwise
# make equivalent entities compare as different strings.
def normalize_text(text):
    """Return a canonical form of `text` for entity comparison.

    The text is decomposed with NFKD and every non-ASCII character (including
    the detached accent marks) is dropped; the result is lowercased, stripped
    of anything that is neither a word character nor whitespace, and runs of
    whitespace are collapsed to a single space with the ends trimmed.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_lower = decomposed.encode('ASCII', 'ignore').decode('utf-8').lower()
    without_punctuation = re.sub(r'[^\w\s]', '', ascii_lower)
    return re.sub(r'\s+', ' ', without_punctuation).strip()

In [67]:
def compare_entities(expected, generated):
    """Count exact matches between two entity lists after normalization.

    Both lists are normalized with normalize_text and compared as sets, so
    entities that normalize to the same string count once.

    Returns:
        A tuple (matched, missing, extra): the number of normalized entities
        present in both lists, only in `expected`, and only in `generated`.
    """
    expected_set = {normalize_text(item) for item in expected}
    generated_set = {normalize_text(item) for item in generated}

    common = expected_set & generated_set
    return (
        len(common),
        len(expected_set - common),
        len(generated_set - common),
    )

In [68]:
def compare_outputs(expected, generated):
    """Print a per-entity-type comparison of expected vs. generated entities.

    For every key of `expected`, the entities of both sides are normalized
    with normalize_text and each expected entity is matched against the
    generated ones. Two entities match when they are equal or when one is
    contained in the other. A generated entity is not consumed by a match, so
    it may satisfy several expected entities and the match count can exceed
    the number of generated entities.

    Args:
        expected: Dict mapping an entity type to a list of expected strings.
            A value of None is treated as an empty list.
        generated: Dict mapping an entity type to a list of generated strings.
            A missing key or a value of None is treated as an empty list.
            Keys that are absent from `expected` are ignored.

    Returns:
        None. The report is written to stdout.
    """
    for key, expected_entities in expected.items():
        # dict.get only falls back to its default when the key is absent, so a
        # key present with a None value (e.g. "TEMPO": None) must be coerced
        # explicitly; otherwise iterating over it raises TypeError.
        expected_entities = expected_entities or []
        generated_entities = generated.get(key) or []

        # Normalize the values before comparing them.
        expected_normalized = [normalize_text(entity) for entity in expected_entities]
        generated_normalized = [normalize_text(entity) for entity in generated_entities]

        # Track the matches and whatever is left unmatched on each side.
        correspondencias = []
        nao_correspondidos_esperados = expected_normalized.copy()  # starts with every expected entity
        nao_correspondidos_gerados = generated_normalized.copy()  # starts with every generated entity

        # Compare each expected entity with all generated ones.
        for expected_entity in expected_normalized:
            for generated_entity in generated_normalized:
                is_equal = expected_entity == generated_entity
                # The empty string is a substring of every string, so an entity
                # that normalizes to "" must not match by containment.
                is_contained = bool(expected_entity and generated_entity) and (
                    expected_entity in generated_entity
                    or generated_entity in expected_entity
                )
                if is_equal or is_contained:
                    correspondencias.append(expected_entity)
                    if expected_entity in nao_correspondidos_esperados:
                        nao_correspondidos_esperados.remove(expected_entity)
                    if generated_entity in nao_correspondidos_gerados:
                        nao_correspondidos_gerados.remove(generated_entity)
                    # The first matching generated entity settles this expected entity.
                    break

        print(f"Entidade: {key}")
        print(f"Entidades Correspondentes: {len(correspondencias)}")
        print(f"Esperado e não encontrado: {nao_correspondidos_esperados}")
        print(f"Gerado e não correspondente: {nao_correspondidos_gerados}")
        print(f"Total Esperado: {len(expected_entities)}, Total Gerado: {len(generated_entities)}")
        print("-----")


In [71]:
# Hand-written sample of a model answer; entity types without any entity are set to None.
generated_output = {
    "JURISPRUDENCIA": None,
    "LEGISLACAO": None,
    "LOCAL": ["Rio de Janeiro", "Ilha do Governador"],
    "ORGANIZACAO": ["Superior Tribunal Militar", "1ª Circunscrição Judiciária Militar"],
    "PESSOA": ["Carlos Aureliano Motta de Souza"],
    "TEMPO": None
}

# Reference entities for the same sample text.
expected_output = {
    "ORGANIZACAO": [
        # The first and third entries differ only in letter case and accents,
        # so normalize_text maps both to the same string.
        "1ª CIRCUNSCRIÇÃO JUDICIÁRIA MILITAR",
        "Superior Tribunal Militar",
        "1ª Circunscrição Judiciária Militar"
    ],
    "LOCAL": [
        "ILHA DO GOVERNADOR",
        "Rio de Janeiro"
    ],
    "PESSOA": [
        "Carlos Aureliano Motta de Souza"
    ]
}

In [72]:
# Print the per-entity-type report; only the keys of expected_output are iterated.
compare_outputs(expected_output, generated_output)

Entidade: ORGANIZACAO
Entidades Correspondentes: 3
Esperado e não encontrado: []
Gerado e não correspondente: []
Total Esperado: 3, Total Gerado: 2
-----
Entidade: LOCAL
Entidades Correspondentes: 2
Esperado e não encontrado: []
Gerado e não correspondente: []
Total Esperado: 2, Total Gerado: 2
-----
Entidade: PESSOA
Entidades Correspondentes: 1
Esperado e não encontrado: []
Gerado e não correspondente: []
Total Esperado: 1, Total Gerado: 1
-----
