# Fine-Tuning Llama 3.2 3B Instruct Model

Using the XSum Dataset containing 37,500 documents and human-generated summaries (30,000 train, 3,750 test, 3,750 validation), the Large Language Model (LLM) will be fine-tuned on this dataset to improve its performance in text summarization tasks.

The goal of the fine-tuning process is to improve the text summarization performance of the base Large Language Model (LLM) based on a set of evaluation metrics.

The fine-tuning process will make use of Parameter-Efficient Fine-Tuning (PEFT), which will incorporate LoRa (Low-Rank Adaptation) to fine-tune a small number of model parameters instead of all of the model's parameters, allowing us to save computational and storage costs due to resource constraints.

In [1]:
# %%capture
# !pip install unsloth
# # Also get the latest nightly Unsloth!
# !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
import torch

from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import SFTTrainer
from transformers import TrainingArguments

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
max_seq_length = 8192 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = "XXXXXXXXXXXXXXXXXXXXXXXXXX"
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA GeForce RTX 4070. Max memory: 11.994 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


  self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False)


In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.12.4 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [5]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Summarize the following text.

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    # instructions = examples["instruction"]
    inputs       = examples["document"]
    outputs      = examples["summary"]
    texts = []
    for input, output in zip(inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_from_disk
dataset = load_from_disk("../datasets/xsum_dataset.hf")
dataset = dataset.map(formatting_prompts_func, batched = True)
dataset

Map: 100%|██████████| 30000/30000 [00:00<00:00, 133359.82 examples/s]
Map: 100%|██████████| 3750/3750 [00:00<00:00, 85174.37 examples/s]
Map: 100%|██████████| 3750/3750 [00:00<00:00, 96004.10 examples/s]


DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id', 'text'],
        num_rows: 30000
    })
    test: Dataset({
        features: ['document', 'summary', 'id', 'text'],
        num_rows: 3750
    })
    validation: Dataset({
        features: ['document', 'summary', 'id', 'text'],
        num_rows: 3750
    })
})

In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainingArg = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    num_train_epochs = 1, # Set this for 1 full training run.
    max_steps = -1,
    learning_rate = 2e-4,
    fp16 = not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none", # Use this for WandB etc
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset['train'],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = None,
    packing = False, # Can make training 5x faster for short sequences.
    args = trainingArg,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|██████████| 30000/30000 [00:06<00:00, 4647.77 examples/s]


In [7]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4070. Max memory = 11.994 GB.
2.768 GB of memory reserved.


In [8]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 30,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 3,750
 "-____-"     Number of trainable parameters = 24,313,856
  0%|          | 1/3750 [00:18<19:04:38, 18.32s/it]

{'loss': 10.6546, 'grad_norm': 1.4817416667938232, 'learning_rate': 4e-05, 'epoch': 0.0}


  0%|          | 2/3750 [00:21<9:50:52,  9.46s/it] 

{'loss': 11.8614, 'grad_norm': 1.9751536846160889, 'learning_rate': 8e-05, 'epoch': 0.0}


  0%|          | 3/3750 [00:27<8:08:24,  7.82s/it]

{'loss': 11.4919, 'grad_norm': 2.108494281768799, 'learning_rate': 0.00012, 'epoch': 0.0}


  0%|          | 4/3750 [00:30<6:05:08,  5.85s/it]

{'loss': 11.2709, 'grad_norm': 2.0180323123931885, 'learning_rate': 0.00016, 'epoch': 0.0}


  0%|          | 5/3750 [00:34<5:23:43,  5.19s/it]

{'loss': 10.7881, 'grad_norm': 1.8707059621810913, 'learning_rate': 0.0002, 'epoch': 0.0}


  0%|          | 6/3750 [00:39<5:14:31,  5.04s/it]

{'loss': 10.1647, 'grad_norm': 1.5650972127914429, 'learning_rate': 0.00019994659546061417, 'epoch': 0.0}


  0%|          | 7/3750 [00:43<4:56:06,  4.75s/it]

{'loss': 9.9987, 'grad_norm': 1.2413358688354492, 'learning_rate': 0.00019989319092122832, 'epoch': 0.0}


  0%|          | 8/3750 [00:47<4:52:27,  4.69s/it]

{'loss': 9.8313, 'grad_norm': 1.202725887298584, 'learning_rate': 0.00019983978638184245, 'epoch': 0.0}


  0%|          | 9/3750 [00:51<4:38:06,  4.46s/it]

{'loss': 10.3148, 'grad_norm': 1.311071753501892, 'learning_rate': 0.0001997863818424566, 'epoch': 0.0}


  0%|          | 10/3750 [00:56<4:43:24,  4.55s/it]

{'loss': 8.5222, 'grad_norm': 1.432337760925293, 'learning_rate': 0.00019973297730307076, 'epoch': 0.0}


  0%|          | 11/3750 [00:59<4:13:06,  4.06s/it]

{'loss': 9.4552, 'grad_norm': 1.789084553718567, 'learning_rate': 0.00019967957276368492, 'epoch': 0.0}


  0%|          | 12/3750 [01:04<4:25:42,  4.26s/it]

{'loss': 9.7957, 'grad_norm': 2.4022693634033203, 'learning_rate': 0.00019962616822429908, 'epoch': 0.0}


KeyboardInterrupt: 

In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        dataset['train']['document'][0], # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs)
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nSummarize the following text.\n\n### Input:\nGreig Laidlaw kicked three penalties and converted tries by Charlie Sharples and Richard Hibbard to put the Cherry and Whites 23-9 up at the break.Myler converted James Wilson\'s try to reduce the margin to seven points.Media playback is not supported on this deviceDan Murphy then went in for the hosts, Wilson grabbed a second and Myler and Laidlaw traded penalties before Manoa crucially touched down.Northampton\'s Premiership lead was cut to nine points, with Exeter Chiefs thrashing bottom side London Welsh 74-19 to go second.The draw saw Gloucester slip to ninth, 11 points adrift of fourth-placed Saracens, who face fellow play-off hopefuls Wasps on Sunday.Once-capped England fly-half Myler marked his 148th and club-record setting Premiership app

In [None]:
model.push_to_hub("woshityj/llama_3.2_3B_Instruct_bnb_finetuned", token = "XXXXXXXXXXXXXXXXXXXX") # Online saving
tokenizer.push_to_hub("woshityj/llama_3.2_3B_Instruct_bnb_finetuned", token = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXX") # Online saving

100%|██████████| 1/1 [00:04<00:00,  4.66s/it]


Saved model to https://huggingface.co/woshityj/llama_3.2_3B_Instruct_bnb_finetuned


100%|██████████| 1/1 [00:02<00:00,  2.70s/it]


## Evaluation Metrics

The evaluation metrics that will be used to evaluate the text summarization performance of the Large Language Models (LLMs) are:
1. METEOR (Metric for Evaluation of Translation with Explicit Ordering)
2. ROUGE-N (Recall-Oriented Understudy for Gisting Evaluation)
3. BERTScore
4. BLEU (BiLingual Evaluation Understudy)
5. G-Eval
6. FactCC

In [None]:
import os
import torch
import pandas as pd

from datasets import load_from_disk
from transformers.utils import is_flash_attn_2_available
from unsloth import FastLanguageModel

import evaluate

os.environ["HUGGINGFACEHUB_API_TOKEN"] = "XXXXXXXXXXXXXXXXXXXX"

device = "cuda"
torch.cuda.empty_cache()

dataset = load_from_disk("../datasets/xsum_dataset.hf")
dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 30000
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 3750
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 3750
    })
})

In [None]:
def load_peft_model():
    
    if (is_flash_attn_2_available() and (torch.cuda.get_device_capability(0)[0] >= 8)):
        attn_implementation = "flash_attention_2"
    else:
        attn_implementation = "sdpa"
    
    print(f"[INFO] Using attention implementation: {attn_implementation}")

    model_id = "woshityj/llama_3.2_3B_Instruct_bnb_finetuned"
    print(f"[INFO] Using model_id: {model_id}")

    peft_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_id,
        max_seq_length = 8192,
        dtype = None,
        load_in_4bit = True,
        token = "XXXXXXXXXXXXXXXXXXXXXXXX"
    )

    peft_model.to(device)

    return peft_model, tokenizer

peft_model, tokenizer = load_peft_model()

[INFO] Using attention implementation: sdpa
[INFO] Using model_id: woshityj/llama_3.2_3B_Instruct_bnb_finetuned
==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA GeForce RTX 4070. Max memory: 11.994 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [7]:
dataset = dataset['test']

articles = dataset['document'][0:50]
human_summaries = dataset['summary'][0:50]
generated_summaries = []

for idx, article in enumerate(articles):
    prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Summarize the following text.

### Input:
{article}

### Response:
"""
    input_ids = tokenizer(prompt, return_tensors='pt').to("cuda")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_ids = input_ids.to(device)
    human_baseline_text_output = human_summaries[idx]
    FastLanguageModel.for_inference(peft_model)
    peft_model_output = peft_model.generate(**input_ids, max_new_tokens = 8192, temperature = 0.1)
    prompt_length = input_ids['input_ids'].shape[1]
    peft_model_text_output = tokenizer.decode(peft_model_output[0][prompt_length:], skip_special_tokens = True)
    generated_summaries.append(peft_model_text_output)

zipped_summaries = list(zip(human_summaries, generated_summaries))

In [8]:
df = pd.DataFrame(zipped_summaries, columns = ['Human Summary', 'Generated Summary'])
df

Unnamed: 0,Human Summary,Generated Summary
0,A 22-year-old man has been charged with causin...,A 22-year-old man has been charged with causin...
1,A Shropshire charity has designated October 'B...,A charity is trying to dispel the myth that bl...
2,Now that Hurricane Junior has blown through Wa...,The Trump Tower meeting between Donald Trump J...
3,The new leader of Kensington and Chelsea Counc...,The new leader of Kensington and Chelsea Counc...
4,"The index of the UK's biggest 100 companies, t...",The FTSE 100 index has fallen by 4.67% in a da...
5,"A ""river of filth"", a spate of gorse fires, an...",The papers this week are full of stories about...
6,Four Welsh MPs are standing for election as ch...,Four MPs are vying to become the new chairs of...
7,A French court has jailed 35 porters at the co...,A French auction house has been ordered to pay...
8,"Investors must be quoted an ""all-in fee"" to ma...",The UK's financial regulator has announced a s...
9,North Korean leader Kim Jong-il is paying his ...,North Korean leader Kim Jong-il has left his h...


### METEOR (Metric for Evaluation of Translation with Explicit Ordering)

In [9]:
meteor = evaluate.load("meteor")

peft_model_meteor_results = meteor.compute(
    predictions = generated_summaries,
    references = human_summaries[0:len(generated_summaries)]
)

print(peft_model_meteor_results)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


{'meteor': 0.3190848229643983}


### ROUGE-N (Recall-Oriented Understudy for Gisting Evaluation)

In [11]:
rouge = evaluate.load("rouge")

peft_model_rouge_results = rouge.compute(
    predictions = generated_summaries,
    references = human_summaries[0:len(generated_summaries)],
    use_aggregator = True,
    use_stemmer = True
)

print(peft_model_rouge_results)

{'rouge1': 0.3820379985871919, 'rouge2': 0.16165955441775515, 'rougeL': 0.30418705127334356, 'rougeLsum': 0.3037050720068172}


### BERTScore

In [9]:
from statistics import mean

bert_score = evaluate.load("bertscore")

peft_model_bert_score_results = bert_score.compute(
    predictions = df['Generated Summary'],
    references = df['Human Summary'][0:len(df['Generated Summary'])],
    lang = "en"
)

print(mean(peft_model_bert_score_results['precision']))

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.909839152097702


### BLEU (BiLingual Evaluation Understudy)

In [14]:
bleu_score = evaluate.load("bleu")

peft_model_bleu_score_results = bleu_score.compute(
    predictions = generated_summaries,
    references = human_summaries[0:len(generated_summaries)]
)

print(peft_model_bleu_score_results)

Downloading builder script: 100%|██████████| 5.94k/5.94k [00:00<?, ?B/s]
Downloading extra modules: 4.07kB [00:00, ?B/s]                       
Downloading extra modules: 100%|██████████| 3.34k/3.34k [00:00<?, ?B/s]

{'bleu': 0.11133221754705928, 'precisions': [0.40307101727447214, 0.14818548387096775, 0.0881104033970276, 0.06053811659192825], 'brevity_penalty': 0.8333165886033752, 'length_ratio': 0.8457792207792207, 'translation_length': 1042, 'reference_length': 1232}





### G-Eval

In [2]:
df.to_pickle("./results/geneteated_summaries.pkl")

NameError: name 'df' is not defined

In [1]:
import os
import torch
import pandas as pd

from datasets import load_from_disk
from transformers.utils import is_flash_attn_2_available
from unsloth import FastLanguageModel

import evaluate

os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

device = "cuda"
torch.cuda.empty_cache()

dataset = load_from_disk("../datasets/xsum_dataset.hf")
dataset

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 30000
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 3750
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 3750
    })
})

In [2]:
df = pd.read_pickle("./results/geneteated_summaries.pkl")

In [3]:
import transformers
import torch
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

from deepeval.models import DeepEvalBaseLLM
from pydantic import BaseModel
from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import (
    build_transformers_prefix_allowed_tokens_fn,
)
import json

torch.cuda.empty_cache()

class CustomLlama3_8B(DeepEvalBaseLLM):
    def __init__(self):
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )

        model_4bit = AutoModelForCausalLM.from_pretrained(
            "meta-llama/Llama-3.2-3B-Instruct",
            device_map="auto",
            quantization_config=quantization_config,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            "meta-llama/Llama-3.2-3B-Instruct"
        )

        self.model = model_4bit
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        model = self.load_model()

        pipeline = transformers.pipeline(
            "text-generation",
            model = model,
            tokenizer = self.tokenizer,
            use_cache = True,
            device_map = "auto",
            max_length = 8192,
            do_sample = True,
            top_k = 5,
            num_return_sequences = 1,
            eos_token_id = self.tokenizer.eos_token_id,
            pad_token_id = self.tokenizer.pad_token_id,
        )

        parser = JsonSchemaParser(schema.schema())
        prefix_function = build_transformers_prefix_allowed_tokens_fn(pipeline.tokenizer, parser)

        output_dict = pipeline(prompt, prefix_allowed_tokens_fn = prefix_function)
        output = output_dict[0]["generated_text"][len(prompt) :]
        json_result = json.loads(output)

        return schema(**json_result)

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        return self.generate(prompt, schema)

    def get_model_name(self):
        return "Llama-3 8B"

mistral_7b = CustomLlama3_8B()

  _ = torch.tensor([0], device=i)
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]


In [9]:
import transformers
import torch
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import (
    build_transformers_prefix_allowed_tokens_fn,
)
from deepeval.models import DeepEvalBaseLLM
from pydantic import BaseModel
import json

torch.cuda.empty_cache()

class Mistral7B(DeepEvalBaseLLM):
    def __init__(self):
        quantization_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_compute_dtype = torch.float16,
            bnb_4bit_quant_type = "nf4",
            bnb_4bit_use_double_quant = True,
        )

        model_4bit = AutoModelForCausalLM.from_pretrained(
            "meta-llama/Llama-3.2-3B-Instruct",
            device_map="auto",
            quantization_config=quantization_config,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            "meta-llama/Llama-3.2-3B-Instruct"
        )

        self.model = model_4bit
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        model = self.load_model()

        device = "cuda" # the device to load the model onto
        pipeline = transformers.pipeline(
            "text-generation",
            model = model,
            tokenizer = self.tokenizer,
            use_cache = True,
            device_map = "auto",
            max_length = 8192,
            do_sample = True,
            top_k = 5,
            num_return_sequences = 1,
            eos_token_id = self.tokenizer.eos_token_id,
            pad_token_id = self.tokenizer.pad_token_id,
        )

        parser = JsonSchemaParser(schema.schema())
        prefix_function = build_transformers_prefix_allowed_tokens_fn(pipeline.tokenizer, parser)

        output_dict = pipeline(prompt, prefix_allowed_tokens_fn = prefix_function)
        output = output_dict[0]["generated_text"][len(prompt) :]
        json_result = json.loads(output)

        return schema(**json_result)

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        return self.generate(prompt, schema)

    def get_model_name(self):
        return "Llama-3.2 3B"

mistral_7b = Mistral7B()

# mistral_7b = CustomLlama3_3B()
# model = AutoModelForCausalLM.from_pretrained("unsloth/mistral-7b-instruct-v0.1-bnb-4bit")
# tokenizer = AutoTokenizer.from_pretrained("unsloth/mistral-7b-instruct-v0.1-bnb-4bit")

# mistral_7b = Mistral7B(model=model, tokenizer=tokenizer)
# print(mistral_7b.generate("Write me a joke"))

Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.78s/it]


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from deepeval.models.base_model import DeepEvalBaseLLM
import torch

from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import (
    build_transformers_prefix_allowed_tokens_fn,
)
from pydantic import BaseModel


from unsloth import FastLanguageModel
import json

torch.cuda.empty_cache()

class CustomMistral7B(DeepEvalBaseLLM):
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
    
    def load_model(self):
        return self.model
    
    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        model = self.load_model()

        model_inputs = self.tokenizer(prompt, return_tensors="pt").to("cuda")

        parser = JsonSchemaParser(schema.schema())
        prefix_function = build_transformers_prefix_allowed_tokens_fn(self.tokenizer, parser)


        model_inputs = model_inputs.to("cuda")
        FastLanguageModel.for_inference(model)
        generated_ids = model.generate(**model_inputs, max_new_tokens = 8192, prefix_allowed_tokens_fn = prefix_function)
        prompt_length = model_inputs['input_ids'].shape[1]
        output = self.tokenizer.decode(generated_ids[0][prompt_length:])
        json_result = json.loads(output)

        return schema(**json_result) 
        # output = self.tokenizer.decode(generated_ids[0])
        # print(output)
        # json_result = json.loads(output)

        # return schema(**json_result)
    
    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        return self.generate(prompt, schema)
    
    def get_model_name(self):
        return "Mistral 7B"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-instruct-v0.1-bnb-4bit",
    max_seq_length = 8192,
    dtype = None,
    load_in_4bit = True)

mistral_7b = CustomMistral7B(model=model, tokenizer=tokenizer)


==((====))==  Unsloth 2024.12.4: Fast Mistral patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA GeForce RTX 4070. Max memory: 11.994 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


  _ = torch.tensor([0], device=i)


In [None]:
import deepeval
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

deepeval.login_with_confident_api_key("XXXXXXXXXXX")

OPENAI_API_KEY = "XXXXXXXXXXXXXXXXX"
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

coherence_metrics = GEval(
    name = "Coherence",
    # criteria = "You will be given one summary written for a news article. Your task is to rate the summary on one metric. Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing and refer to it as needed.",
    evaluation_steps = [
        "Read the news article carefully and identify the main topic and key points.",
        "Read the summary and compare it to the news article. Check if the summary covers the main topic and key points of the news article, and if it presents them in a clear and logical order.",
        "Assign a score for coherence on a scale of 1 to 5, where 1 is the lowest and 5 is the highest based on the evaluation criteria."
    ],
    model = "gpt-4o-mini",
    evaluation_params = [LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)

In [14]:
from deepeval.test_case import LLMTestCase


dataset = load_from_disk("../datasets/xsum_dataset.hf")

test_case = LLMTestCase(
    input = dataset['test'][0],
    actual_output = df['Generated Summary'][0],
)

coherence_metrics.measure(test_case)

KeyboardInterrupt: 

### Summarization Metric

In [7]:
from deepeval import evaluate
from deepeval.metrics import SummarizationMetric
from deepeval.test_case import LLMTestCase
import json

test_case = LLMTestCase(input = dataset['test'][0], actual_output = df['Generated Summary'][0])
metric = SummarizationMetric(
    threshold = 0.5,
    model = mistral_7b,
    n = 5
)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

evaluate([test_case], [metric])

JSONDecodeError: Extra data: line 5 column 2 (char 162)