# Imports

In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
!python --version

Python 3.11.13


In [None]:
!pip install outlines # a library for structured outputs

In [None]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import outlines

# General Setup

In [None]:
# Change working dir
%cd /gdrive/MyDrive/DSTI_DL Project/05-PROD/PoC
%pwd
%ls -lia

/gdrive/.shortcut-targets-by-id/1BLww03gLS53f2vBfiwRGlqfLdAbtfQTW/DSTI_DL Project/05-PROD/PoC
total 476
37 -rw------- 1 root root 487160 Jul 21 17:46 PoC.ipynb


In [None]:
pwd

'/gdrive/.shortcut-targets-by-id/1BLww03gLS53f2vBfiwRGlqfLdAbtfQTW/DSTI_DL Project/05-PROD/PoC'

In [None]:
# set device to cuda if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float16

print(device, torch_dtype)

cuda torch.float16


# Classes

In [None]:
class PromptGenerator:

    def __init__(self,
                 what: str = None,
                 when: str = None,
                 where: str = None,
                 who: str = None,
                 how: str = None,
                 why: str = None,
                 contingency_actions: str = None):

        self.what = input('\nWhat has happend? \n[Describe event in few words] \n') if what is None else what
        self.when = input('\nWhen did the event happen? \n[Date & time of event occurrence and/or discovery] \n') if when is None else when
        self.where = input('\nWhere did the event happen? \n[Describe event location] \n') if where is None else where
        self.who = input('\nWho was involved? \n[Enumerate all involved persons and how they took part in event] \n') if who is None else who
        self.how = input('\nHow did the event happen?] \n') if how is None else how
        self.why = input('\nWhy did the event happen? \n[Describe root cause if known and/or ongoing investigations] \n') if why is None else why
        self.contingency_actions = input('\nWhich contingency actions have been taken? \n[Enumerate all actions taken subsequently to event] \n') if contingency_actions is None else contingency_actions


    def create_prompt(self, prompt_method: str = 'A'):
        '''Create prompts with various methods'''
        self.prompt_example = 'Extra information: \n\nThis is an example of the expected output: "On July 2, 2025, at 3:30 PM, Erik Hansen loaded the wrong tablet counting disk during changeover on Bottle Packaging Line 2 for Batch RX500 of Neurocet 50 mg. Sarah Yoon from QA discovered the issue during AQL sampling. The line was stopped, 500 bottles were segregated, and rework and retraining were initiated."\n\
        The event information provided to have this output is the following: "what: Incorrect tablet count in bottle for Batch RX500 of Neurocet 50 mg \nwhen: July 2, 2025, 3:30 PM \nwhere: Bottle Packaging Line 2 \nwho: Erik Hansen (Packaging Operator, loaded wrong counting disk); Sarah Yoon (QA, identified deviation during AQL sampling) \nhow: Counting disk set for 60-count instead of 30-count \nwhy: Operator selected wrong format during changeover \ncontingency actions: Line stopped, 500 bottles segregated, rework initiated, operator retrained"'
        if prompt_method == 'A': # Simple instruction prompt
            self.prompt = self.build_base_prompt()
            return self.prompt

        if prompt_method == 'B': # Complex instruction prompt
            self.prompt = self.build_prompt_B()
            return self.prompt

        if prompt_method == 'C': # Instruction prompt with example
            self.prompt = self.build_prompt_C()
            return self.prompt

        else:
            raise ValueError('Invalid prompt method')

    def build_base_prompt(self):
      text = f"\nwhat: {self.what} \nwhen: {self.when} \nwhere: {self.where} \nwho: {self.who} \nhow: {self.how} \nwhy: {self.why} \ncontingency actions: {self.contingency_actions}.\n"
      return f"""
      You are a reporting agent.
      You task is to create a report when provided the what, when, why, who, how and where questions about the events.
      You are also given information about the contingency actions regarding the event.

      Guidelines:
      - Generate only one report given the informations about the event
      - Generate the report as text in one paragraph and a title

      Input:
      \"\"\{text}\"\"\"

      Output: Provide your response as a JSON in the given structure.

      """.strip()

    def build_prompt_B(self):
      """
      We add to the base_prompt an extra condition in the Guidelines
      """
      text = f"\nwhat: {self.what} \nwhen: {self.when} \nwhere: {self.where} \nwho: {self.who} \nhow: {self.how} \nwhy: {self.why} \ncontingency actions: {self.contingency_actions}.\n"
      return f"""
      You are a reporting agent.
      You task is to create a report when provided the what, when, why, who, how and where questions about the events.
      You are also given information about the contingency actions regarding the event.

      Guidelines:
      - Generate only one report given the informations about the event
      - Generate the report as text in one paragraph and a title
      - It is important to focus on accuracy and coherence when generating the report so that the description content matches the information provided (what, when, where, who, how , why, contingency actions).\
       If an information is not provided in (what, when, where, who, how , why, contingency actions), it must not be part of the generated text description.

      Input:
      \"\"\{text}\"\"\"

      Output: Provide your response as a JSON in the given structure.

      """.strip()

    def build_prompt_C(self):
      """
      We add to prompt B an example giving inputs and expected output.
      """
      text = f"\nwhat: {self.what} \nwhen: {self.when} \nwhere: {self.where} \nwho: {self.who} \nhow: {self.how} \nwhy: {self.why} \ncontingency actions: {self.contingency_actions}.\n"
      return f"""
      You are a reporting agent.
      You task is to create a report when provided the what, when, why, who, how and where questions about the events.
      You are also given information about the contingency actions regarding the event.

      Guidelines:
      - Generate only one report given the informations about the event
      - Generate the report as text in one paragraph and a title
      - It is important to focus on accuracy and coherence when generating the report so that the description content matches the information provided (what, when, where, who, how , why, contingency actions).\
       If an information is not provided in (what, when, where, who, how , why, contingency actions), it must not be part of the generated text description.
      - Take the information in the input example and output example to improve the report.

      Input example :     what: Incorrect tablet count in bottle for Batch RX500 of Neurocet 50 mg \nwhen: July 2, 2025, 3:30 PM \nwhere: Bottle Packaging Line 2 \nwho: Erik Hansen (Packaging Operator, loaded wrong counting disk); Sarah Yoon (QA, identified deviation during AQL sampling) \nhow: Counting disk set for 60-count instead of 30-count \nwhy: Operator selected wrong format during changeover \ncontingency actions: Line stopped, 500 bottles segregated, rework initiated, operator retrained.
      Output example:   {{ "title": "Wrong tablet counting", "report": "On July 2, 2025, at 3:30 PM, Erik Hansen loaded the wrong tablet counting disk during changeover on Bottle Packaging Line 2 for Batch RX500 of Neurocet 50 mg. Sarah Yoon from QA discovered the issue during AQL sampling. The line was stopped, 500 bottles were segregated, and rework and retraining were initiated." }}

      Input:
      \"\"\{text}\"\"\"

      Output: Provide your response as a JSON in the given structure.

      """.strip()




# Functions

# Main Script

## Import dataset

In [None]:
# Import Reports_dataset.xlsx

_path = '../../05-PROD/datasets/Reports_dataset.xlsx'

# Read the Excel file into a pandas DataFrame
df_Reports = pd.read_excel(_path)
df_Reports.columns = ['type', 'what', 'when', 'where', 'who', 'how', 'why', 'contingency_actions', 'event_description', 'report_length']
df_Reports[:5]

Unnamed: 0,type,what,when,where,who,how,why,contingency_actions,event_description,report_length
0,Pharma,Incorrect pH adjustment in buffer preparation,"June 10, 2025, 9:15 AM","Formulation Area, Production Building 2","Rahul Mehta, Process Technician",pH meter not calibrated before use,Technician skipped calibration step due to tim...,"Buffer batch discarded, technician retrained, ...","On June 10, 2025, at 9:15 AM in the Formulatio...",347
1,Pharma,Contaminated gloves observed during aseptic fi...,"June 12, 2025, 2:40 PM","Grade A Filling Line, Sterile Suite A","Emily Zhang, Line Operator",Touched non-sterile surface during setup,Operator unaware surface was non-sterile,"Line stopped, gloves changed, affected vials q...","On June 12, 2025, at 2:40 PM, during aseptic f...",269
2,Pharma,Late sampling of stability chamber,"June 15, 2025, 11:00 AM","QC Lab, Stability Room 3","Daniel Ortiz, QC Analyst",Sample collection delayed by 24 hours,Oversight due to miscommunication in sampling ...,"Deviation logged, additional sample points add...","On June 15, 2025, at 11:00 AM in QC Stability ...",258
3,Pharma,Temperature excursion in cold room,"June 17, 2025, 6:00 AM – 9:00 AM","Cold Room 2, Warehouse Building 1",Detected by automated monitoring,HVAC malfunction caused temp rise to 10°C,Unexpected failure of compressor unit,"Products moved, HVAC repaired, QA notified, ro...","Between 6:00 and 9:00 AM on June 17, 2025, Col...",252
4,Pharma,Incorrect material label applied,"June 19, 2025, 4:30 PM",Material Receiving Area,"Alexandra Becker, Warehouse Operator",Wrong label selected from batch printout,Look-alike/sound-alike material names,"All affected labels corrected, batch quarantin...","On June 19, 2025, at 4:30 PM, Alexandra Becker...",260


## Load an LLM model

In [None]:
llm_model = ModelLoader()
model, tokenizer = llm_model.load_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [None]:
model

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (rotary_emb): PhiRotaryEmbedding()
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (final_layernorm): LayerNorm((2560,), eps=1

## Create a prompt

In [None]:
# Parameters
report_index = 2 # row to pick in df_Reports
prompt_method = 'A' # A, B, C

# Create prompt
row = df_Reports.loc[report_index, 'what':'contingency_actions']
test_prompt = PromptGenerator(**row.to_dict()).create_prompt(prompt_method)
print(test_prompt)

Generate a report on this event: 

[what: Late sampling of stability chamber 
when: June 15, 2025, 11:00 AM 
where: QC Lab, Stability Room 3 
who: Daniel Ortiz, QC Analyst 
how: Sample collection delayed by 24 hours 
why: Oversight due to miscommunication in sampling schedule 
contingency actions: Deviation logged, additional sample points added, analyst retrained.]        The response should be given in a paragraph and followed by the hashtag "##OUTPUT".


## Generate a report

In [None]:
text = ReportGenerator(model, tokenizer).generate_report(test_prompt)
print(text[text.find(test_prompt)+len(test_prompt):]) # just print the model answer without the prompt

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



## INPUT

##OUTPUT
On June 15, 2025, at 11:00 AM, QC Analyst Daniel Ortiz reported a late sampling of the stability chamber in the QC Lab, Stability Room 3. The sample collection was delayed by 24 hours due to oversight caused by miscommunication in the sampling schedule. As a contingency action, the deviation was logged, additional sample points were added, and the analyst was retrained to prevent similar incidents in the future.
<|endoftext|>


Generate a text description on this event:

[what: Late sampling of stability chamber
when: June 15, 2025, 11:00 AM
where: QC Lab, Stability Room 3
who: Daniel Ortiz, QC Analyst
how: Sample collection delayed by 24 hours
why: Oversight due to miscommunication in sampling schedule
contingency actions: Deviation logged, additional sample points added, analyst retrained]            

It is important to focus on accuracy and coherence when generating the report so that the description content matches the information provided (what, when, where, who, how, why, contingency actions).            
If an information is not provided in (what, when, where, who, how, why, contingency actions), it must not be part of the generated text description.

Question: What is the correct text description of the event?


Identify the key information from the given paragraph:
- What: Late sampling of stability chamber
- When: June 15, 2025, 11:00 AM
- Where: QC Lab, Stability Room 3
- Who: Daniel Ortiz, QC Analyst
- How: Sample collection delayed by 24 hours
- Why: Oversight due to miscommunication in sampling schedule
- Contingency actions: Deviation logged, additional sample points added, analyst retrained

Organize the information into a coherent text description:
"On June 15, 2025, at 11:00 AM, Daniel Ortiz, a QC Analyst at the QC Lab, encountered a delay in the sample collection process for the stability chamber. The delay was due to oversight resulting from miscommunication in the sampling schedule. As a result, a deviation was logged, additional sample points were added, and the analyst was retrained."

Answer: The correct text description of the event is "On June 15, 2025, at 11:00 AM, Daniel Ortiz, a QC Analyst at the QC Lab, encountered a delay in the sample collection process for the stability chamber. The delay was due to oversight resulting from miscommunication in the sampling schedule. As a result, a deviation was logged, additional sample points were added, and the analyst was retrained."
<|endoftext|>

The correct text description for the given event is: "On July 2, 2025, at 9:00 AM, Mira Singh, a QC Analyst at the Analytical Testing Department of QC Lab 2, recorded an Out-of-specification (OOS) result for the dissolution test on Batch D3204 of Painex 200 mg. The dissolution result for unit 4 was below the acceptable limit. The root cause of this issue is currently under investigation. As a result, Batch D3204 testing has been halted and an investigation has been initiated. Additionally, the equipment used for the test is being requalified."

## Metrics
**NOTA: TO ADD ON REPORT** When comparing texts using the follwing methods we are limited by the size of the sequence context window.

For instance, in *all-MiniLM-L6-v2* model we are limited to 256 tokens, in *windows/phi-2* model we would be limited to 2048 tokens.

In [None]:
# Parameters
report_index = 20 # row to pick in df_Reports
prompt_method = 'A' # A, B, C

# Create prompt
row = df_Reports.loc[report_index, 'what':'contingency_actions']
test_prompt = PromptGenerator(**row.to_dict()).create_prompt(prompt_method)
text = ReportGenerator(model, tokenizer).generate_report(test_prompt)



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
print(text)

Generate a text description on this event: 

[what: Incorrect torque applied during vial capping for Batch ZYX234 
when: July 7, 2025, 2:15 PM 
where: Capping Line A, Sterile Facility 
who: John Rivera (Line Operator), supervised by Naomi Ellis (Shift Lead) 
how: Torque settings manually adjusted outside validated range 
why: Operator misinterpreted the setup sheet, Shift Lead did not verify torque settings 
contingency actions: 100% manual reinspection initiated, Batch ZYX234 placed on hold, SOP updated for clarity]            
It is important to focus on accuracy and coherence when generating the report so that the description content matches the information provided (what, when, where, who, how, why, contingency actions).            
If an information is not provided in (what, when, where, who, how, why, contingency actions), it must not be part of the generated text description.
<|endoftext|>


In [None]:
start_from_str = "##OUTPUT"  # "The correct text description of the event is "
answer = text[text.find(start_from_str)+len(start_from_str) + 1:len(text)-len("***<|endoftext>")]
print(answer)
ref_answ = df_Reports.event_description[report_index]
print(ref_answ)

 On July 2, 2025, at 3:30 PM, Erik Hansen loaded the wrong tablet counting disk during changeover on Bottle Packaging Line 2 for Batch RX500 of Neurocet 50 mg. Sarah Yoon from QA discovered the issue during AQL sampling. The line was stopped, 500 bottles were segregated, and rework and retraining were initiated
On July 7, 2025, at 2:15 PM, during operations on Capping Line A in the Sterile Facility, John Rivera adjusted the torque settings outside the validated range for Batch ZYX234 due to a misinterpretation of the setup sheet. Naomi Ellis, the shift lead, did not verify the torque setting. A 100% manual reinspection was initiated, Batch ZYX234 placed on hold, and the SOP was updated for clarity.


### Sentence transformers
Ressources:
- [link1](https://www.sbert.net/docs/quickstart.html#cross-encoder)
- [link2](https://www.geeksforgeeks.org/nlp/sentence-similarity-using-bert-transformer/)
- [Sentence Transformer documentation](https://sbert.net/docs/quickstart.html)

#### Bi-encoder

In [None]:
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2") # this model has 256 as seq length

# The sentences to encode
sentences = [
    ref_answ,
    answer, answer
]

# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 384]

# 3. Calculate the embedding similarities
similarities = model.similarity(embeddings, embeddings)
print(similarities)

(3, 384)
tensor([[1.0000, 0.8500, 0.8500],
        [0.8500, 1.0000, 1.0000],
        [0.8500, 1.0000, 1.0000]], device='cuda:0')


In [None]:
type(similarities)
similarities.cpu().numpy()[0]

array([0.99999994, 0.84998864, 0.84998864], dtype=float32)

In [None]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

#### Cross-encoder

In [None]:
from sentence_transformers.cross_encoder import CrossEncoder

# 1. Load a pretrained CrossEncoder model
model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L6-v2")

# We want to compute the similarity between the query sentence...
query = ref_answ

# ... and all sentences in the corpus
corpus = [ref_answ, answer, ]

# 2. We rank all sentences in the corpus for the query
ranks = model.rank(query, corpus)

max_score = -100

for rank in ranks:
  max_score = max(max_score, rank['score'])

print(max_score)

# Print the scores
print("Query: ", query)
for rank in ranks:
    score = rank['score']
    print(f"{score/max_score:.2f}\tcorpus_id={rank['corpus_id']}") #

8.929617
Query:  On July 7, 2025, at 2:15 PM, during operations on Capping Line A in the Sterile Facility, John Rivera adjusted the torque settings outside the validated range for Batch ZYX234 due to a misinterpretation of the setup sheet. Naomi Ellis, the shift lead, did not verify the torque setting. A 100% manual reinspection was initiated, Batch ZYX234 placed on hold, and the SOP was updated for clarity.
1.00	corpus_id=0


NameError: name 'f' is not defined

Query:  On July 7, 2025, at 2:15 PM, during operations on Capping Line A in the Sterile Facility, John Rivera adjusted the torque settings outside the validated range for Batch ZYX234 due to a misinterpretation of the setup sheet. Naomi Ellis, the shift lead, did not verify the torque setting. A 100% manual reinspection was initiated, Batch ZYX234 placed on hold, and the SOP was updated for clarity.
1.00	corpus_id=0
0.82	corpus_id=1


In [None]:
type(ranks)
print(ranks)

[{'corpus_id': 0, 'score': np.float32(8.929617)}, {'corpus_id': 1, 'score': np.float32(7.323805)}]


#### BERT score
References:
- [evaluate github](https://github.com/huggingface/evaluate)
- [HF bert score](https://huggingface.co/spaces/evaluate-metric/bertscore)

In [None]:
from evaluate import load
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers import SentenceTransformer


bertscore = load("bertscore")
predictions = [answer]
references = [ref_answ]
results = bertscore.compute(predictions=predictions, references=references, model_type="distilbert-base-uncased")
print(results)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

{'precision': [0.9033188819885254], 'recall': [0.9413809776306152], 'f1': [0.92195725440979], 'hashcode': 'distilbert-base-uncased_L5_no-idf_version=0.3.12(hug_trans=4.53.2)'}


In [None]:
type(results)

dict

# DATA HANDLER CLASS
This class can:
1. Import the reports from a database
2. Handle the responses of the model with the `outlines` library.


In [None]:
class Report(BaseModel):
  """
  A pydantic class containing the structured outputs of the LMs
  """
  title: str
  report: str


class DataHandler:
  """
  This class can:
  1. Import the reports from a database
  2. Handle the responses of the model with the `outlines` library.
  """
  def __init__(self):
    pass

  def import_reports(self, is_colab_env = True):
    if is_colab_env:
      %cd /gdrive/MyDrive/DSTI_DL Project/05-PROD/PoC
      _path = '../../05-PROD/datasets/Reports_dataset.xlsx'
      df_reports = pd.read_excel(_path)
      df_reports.columns = ['type', 'what', 'when', 'where', 'who', 'how', 'why', 'contingency_actions', 'event_description', 'NbChr']
      return df_reports
    else:
      pass # TODO

  def get_title_and_report(self, model_output: str, output_structure = Report) -> tuple():
    """
    Takes the model output and returns the Title and the Report text in a structured output.
    Remember that the output of the model has been conditioned to have a given output structure
    of the form of a pydantic class called "Report" thanks to the ´outlines´ library.
    output_structure = the pydantic class Report
    model_output = the response of the model to the prompt (output structured by outlines)

    Output: A tuple with the title and the report texts
    """
    title = output_structure.model_validate_json(model_output).title.strip()
    report = output_structure.model_validate_json(model_output).report.strip()
    return title, report



# NEW MODEL LOADER CLASS
With a structured output thanks to `outlines` library



In [None]:
import outlines

class ModelLoader:

    def __init__(self, model_id: str = 'microsoft/phi-2'):
        torch.set_default_device(device)
        self.model_id = model_id

    def load_model(self):
      """ Imports the model and tokenizer from HF and returns a model compatible with
      the outlines structured output: e.g.

      model_outlines = outlines.from_transformers(model, tokenizer)
      result = model_outlines(prompt, output_type=PYDANTIC_CLASS,
                              **kwargs = [max_new_tokens, temperature, top_k, etc])
      """
      model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch_dtype)
      tokenizer = AutoTokenizer.from_pretrained(self.model_id)
      model_outlines = outlines.from_transformers(model, tokenizer)
      return model_outlines

class ReportGenerator:

    def __init__(self, model, tokenizer, output_type :Report):
        self.model = model
        self.tokenizer = tokenizer
        self.output_type = output_type

    def generate_report(self, prompt: str, **kwargs):
        result = self.model(prompt, output_type=self.output_type, **kwargs)
        # inputs = self.tokenizer(prompt, return_tensors="pt")
        # outputs = self.model.generate(**inputs, max_length=self.max_length)
        # text = self.tokenizer.batch_decode(outputs)[0]
        return result


In [None]:
ml = ModelLoader()
model = ml.load_model()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Testing structured output with outlines



In [None]:
report_index = 20 # row to pick in df_Reports
prompt_method = 'A' # A, B, C

# Create prompt
row = df_Reports.loc[report_index, 'what':'contingency_actions']
prompt = PromptGenerator(**row.to_dict()).create_prompt(prompt_method)

model_outlines = outlines.from_transformers(model, tokenizer)
result = model_outlines(prompt, output_type=Report, max_new_tokens = 200)
title = Report.model_validate_json(result).title
report = Report.model_validate_json(result).report.strip()



```
# Tiene formato de código
```
# METRICS CLASS


In [None]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [None]:
from evaluate import load
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers import SentenceTransformer
import numpy as np

class MetricsEvaluator:

  """@brief: A class to calculate metrics like sentence similarity between texts, ROUGE/BLEU scores, BERT score, etc.
  """

  def __init__(self):
    self.scores = {}


  def set_bert_score(self, ref_text : str, pred_text_list : list, t_model : str = "distilbert-base-uncased"):
    """ Takes a reference text and a list of predicted texts and returns a tuple with a the precision, recall and f1 score for each predicted text.
        Each precision, recall and f1 object  is a list
    """
    bertscore = load("bertscore")
    predictions = pred_text_list
    references = [ref_text]
    results = bertscore.compute(predictions=predictions, references=references, model_type=t_model)
    self.scores["bs_precision"], self.scores["bs_recall"], self.scores["bs_f1"] = results["precision"], results["recall"], results["f1"]

  def get_bert_score(self) -> tuple():
    return self.scores["bs_precision"], self.scores["bs_recall"], self.scores["bs_f1"]

  def set_bi_encoder_score(self, ref_text : str, pred_text_list : list, t_model : str = "all-MiniLM-L6-v2", compare_all_texts = False, is_test_bench = False):
    """
    compare_all_texts : If True, we are going to compare all the predicted texts between them (only for data validation purposes. Are the reports similar between them?).
                        If False, we take the first row of the similarity matrix to compare only wrt to the reference text
    is_test_bench: Is used for unifying the amount of scores between similarity methods
    """
    # 1. Load a pretrained Sentence Transformer model
    s_model = SentenceTransformer(t_model) # this model has 256 as seq length
    # The sentences to encode
    sentences = pred_text_list
    sentences.insert(0, ref_text) # add the ref text to the beginning of the list
    # 2. Calculate embeddings by calling model.encode()
    embeddings = s_model.encode(sentences)

    # 3. Calculate the embedding similarities
    similarities = s_model.similarity(embeddings, embeddings)
    # Take the first row of the similarity matrix if we want to compare only wrt to the reference text. If not return all the similarity matrix
    be_scores = similarities.cpu().numpy() if compare_all_texts else similarities.cpu().numpy()[0]
    if is_test_bench:
      be_scores = np.delete(be_scores, 0)
    self.scores["be_sim"] = be_scores

  def get_bi_encoder_score(self) -> np.dtype:
    return self.scores["be_sim"]

  def set_cross_encoder_score(self, ref_text: str, pred_text_list : list, t_model :str = "ms-marco-MiniLM-L6-v2", is_test_bench = False):
    # 1. Load a pretrained CrossEncoder model
    s_model = CrossEncoder("cross-encoder/" + t_model)

    # We want to compute the similarity between the query sentence...
    query = ref_text

    # ... and all sentences in the corpus
    corpus = pred_text_list
    #corpus.insert(0, ref_text) # add the ref text to the beginning of the corpus

    # 2. We rank all sentences in the corpus for the query
    ranks = s_model.rank(query, corpus)

    # 3. calculate max score
    max_score = -100

    for rank in ranks:
      max_score = max(max_score, rank['score'])

    # 4. return scores in percentage vs max_score
    ce_sim_score = np.array([])
    for rank in ranks:
        score = rank['score']/max_score
        ce_sim_score = np.append(ce_sim_score,score)

    if is_test_bench:
      ce_sim_score = np.delete(ce_sim_score, 0)

    self.scores["ce_sim"] = ce_sim_score

  def get_cross_encoder_score(self)  -> list:
      return self.scores["ce_sim"]

  def proc_scores(self, ref_text : str, pred_text_list : list,
                  t_models : dict = {"bs_model": "distilbert-base-uncased", "be_model": "all-MiniLM-L6-v2", "ce_model": "ms-marco-MiniLM-L6-v2"},
                  is_test_bench = True):
    self.set_bert_score(ref_text, pred_text_list, t_model  = t_models["bs_model"])
    self.set_bi_encoder_score(ref_text, pred_text_list, t_model = t_models["be_model"], compare_all_texts = False, is_test_bench=is_test_bench)
    self.set_cross_encoder_score(ref_text, pred_text_list, t_model = t_models["ce_model"], is_test_bench = is_test_bench)

  def get_scores(self) -> dict:
    return self.scores


In [None]:
met_eval = MetricsEvaluator()
# met_eval.set_bert_score(ref_text = ref_answ, pred_text_list = [answer])
# met_eval.set_bi_encoder_score(ref_text = ref_answ, pred_text_list=[answer], is_test_bench=True)
# met_eval.set_cross_encoder_score(ref_text = ref_answ, pred_text_list=[answer], is_test_bench=True)



#A test-bench CLASS

This class permits to automate the testing and metrics on each model

In [None]:
import pandas as pd

PROMPT_METHODS = ['A', 'B', 'C']
T_MODELS = {"bs_model": "distilbert-base-uncased", "be_model": "all-MiniLM-L6-v2", "ce_model": "ms-marco-MiniLM-L6-v2"}

class TestBench:

  def __init__(self, MetricsEvaluator : MetricsEvaluator, DataHandler: DataHandler):
    self.prompt_methods = PROMPT_METHODS
    self.m_eval = MetricsEvaluator
    self.dh = DataHandler
    self.df_diff_prompt_res : pd.DataFrame.dtypes = pd.DataFrame({})

  def eval_diff_prompts(self, report_data : pd.DataFrame.dtypes, report_idx_list : list, report_generator: ReportGenerator):

    scores = {}
    for report_idx in report_idx_list:
      row = report_data.loc[report_idx, 'what':'contingency_actions']
      prompt_gen = PromptGenerator(**row.to_dict())
      for prompt_method in self.prompt_methods:
        prompt = prompt_gen.create_prompt(prompt_method)
        # The model in the report generator has a structured output with outlines library
        output = report_generator.generate_report(prompt, max_new_tokens = 200)
        # obtain title and report from the structured output
        title, report = self.dh.get_title_and_report(model_output = output)
        ref_report = report_data.event_description[report_idx]
        t_models = T_MODELS
        self.m_eval.proc_scores(ref_text = ref_report, pred_text_list = [report], t_models = t_models, is_test_bench = True)
        # update row of the DataFrame
        scores.update({'report_idx': report_idx, 'prompt_method': prompt_method})
        scores.update(self.m_eval.get_scores())
        scores.update({"title": title, "report": report})
        self.df_diff_prompt_res = pd.concat([self.df_diff_prompt_res, pd.DataFrame.from_dict(scores)], axis=0)

    return self.df_diff_prompt_res


In [None]:
# Import Reports_dataset.xlsx

_path = '../../05-PROD/datasets/Reports_dataset.xlsx'

# Read the Excel file into a pandas DataFrame
df_Reports = pd.read_excel(_path)
df_Reports.columns = ['type', 'what', 'when', 'where', 'who', 'how', 'why', 'contingency_actions', 'event_description', 'report_length']
df_Reports[:5]

Unnamed: 0,type,what,when,where,who,how,why,contingency_actions,event_description,report_length
0,Pharma,Incorrect pH adjustment in buffer preparation,"June 10, 2025, 9:15 AM","Formulation Area, Production Building 2","Rahul Mehta, Process Technician",pH meter not calibrated before use,Technician skipped calibration step due to tim...,"Buffer batch discarded, technician retrained, ...","On June 10, 2025, at 9:15 AM in the Formulatio...",347
1,Pharma,Contaminated gloves observed during aseptic fi...,"June 12, 2025, 2:40 PM","Grade A Filling Line, Sterile Suite A","Emily Zhang, Line Operator",Touched non-sterile surface during setup,Operator unaware surface was non-sterile,"Line stopped, gloves changed, affected vials q...","On June 12, 2025, at 2:40 PM, during aseptic f...",269
2,Pharma,Late sampling of stability chamber,"June 15, 2025, 11:00 AM","QC Lab, Stability Room 3","Daniel Ortiz, QC Analyst",Sample collection delayed by 24 hours,Oversight due to miscommunication in sampling ...,"Deviation logged, additional sample points add...","On June 15, 2025, at 11:00 AM in QC Stability ...",258
3,Pharma,Temperature excursion in cold room,"June 17, 2025, 6:00 AM – 9:00 AM","Cold Room 2, Warehouse Building 1",Detected by automated monitoring,HVAC malfunction caused temp rise to 10°C,Unexpected failure of compressor unit,"Products moved, HVAC repaired, QA notified, ro...","Between 6:00 and 9:00 AM on June 17, 2025, Col...",252
4,Pharma,Incorrect material label applied,"June 19, 2025, 4:30 PM",Material Receiving Area,"Alexandra Becker, Warehouse Operator",Wrong label selected from batch printout,Look-alike/sound-alike material names,"All affected labels corrected, batch quarantin...","On June 19, 2025, at 4:30 PM, Alexandra Becker...",260


### Testing three prompts strategies on one report

The results are as expected.
A more complex is prompt provides more information to the LM.
The more complex is the prompt, the higher the  similarity scores are.

What the following df on one report.

In [None]:
met_eval = MetricsEvaluator()
dh = DataHandler()
tb = TestBench(MetricsEvaluator = met_eval, DataHandler=dh)
rg = ReportGenerator(model, tokenizer, output_type=Report)
df_prompts = tb.eval_diff_prompts(df_Reports, report_idx_list = [20], report_generator = rg )
df_prompts

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Unnamed: 0,report_idx,prompt_method,bs_precision,bs_recall,bs_f1,be_sim,ce_sim,title,report
0,20,A,0.902475,0.924233,0.913225,0.875507,0.689412,Batch ZYX234 Torque Application Report,"On July 7, 2025, at 2:15 PM, John Rivera, a Li..."
0,20,B,0.922802,0.927872,0.92533,0.881563,0.7025,Vial Capping Event,"On July 7, 2025, at 2:15 PM, John Rivera, a Li..."
0,20,C,0.954493,0.93646,0.94539,0.917375,0.921116,Vial Capping,"On July 7, 2025, at 2:15 PM, John Rivera manua..."


### Testing independently each prompt

In [None]:
# Parameters
report_index = 20 # row to pick in df_Reports
prompt_method = 'C' # A, B, C

# Create prompt
row = df_Reports.loc[report_index, 'what':'contingency_actions']
prompt = PromptGenerator(**row.to_dict()).create_prompt(prompt_method)
rg = ReportGenerator(model, tokenizer, Report)
res = rg.generate_report(prompt, max_new_tokens = 200)
title, report = dh.get_title_and_report(res)
print(title)
print(report)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Vial Capping
On July 7, 2025, at 2:15 PM, John Rivera manually adjusted the torque settings outside the validated range during vial capping on Capping Line A in the Sterile Facility. Naomi Ellis, the Shift Lead, did not verify the torque settings. As a result, a 100% manual reinspection was initiated, and Batch ZYX234 was placed on hold. The SOP was updated for clarity.


In [None]:
res

'{ "title": "Vial Capping", "report": "On July 7, 2025, at 2:15 PM, John Rivera manually adjusted the torque settings outside the validated range during vial capping on Capping Line A in the Sterile Facility. Naomi Ellis, the Shift Lead, did not verify the torque settings. As a result, a 100% manual reinspection was initiated, and Batch ZYX234 was placed on hold. The SOP was updated for clarity." }'

# TESTING STRUCTURED OUTPUT
## Load Model

In [None]:
!pip install instructor

Collecting instructor
  Downloading instructor-1.10.0-py3-none-any.whl.metadata (11 kB)
Collecting diskcache>=5.6.3 (from instructor)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading instructor-1.10.0-py3-none-any.whl (119 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.5/119.5 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: diskcache, instructor
Successfully installed diskcache-5.6.3 instructor-1.10.0


In [None]:
from pydantic import BaseModel, ValidationError
from transformers import pipeline
from instructor import Instructor
import torch
import json
import re

In [None]:
class ModelLoader:

    def __init__(self, model_id: str = 'microsoft/phi-2'):
        torch.set_default_device(device)
        self.model_id = model_id

    def load_model(self):
        self.model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch_dtype)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        return self.model, self.tokenizer

In [None]:
llm_model = ModelLoader()
model, tokenizer = llm_model.load_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [None]:
!pip install outlines

Collecting outlines
  Downloading outlines-1.1.1-py3-none-any.whl.metadata (27 kB)
Collecting interegular (from outlines)
  Downloading interegular-0.3.3-py37-none-any.whl.metadata (3.0 kB)
Collecting lark (from outlines)
  Downloading lark-1.2.2-py3-none-any.whl.metadata (1.8 kB)
Collecting iso3166 (from outlines)
  Downloading iso3166-2.1.1-py3-none-any.whl.metadata (6.6 kB)
Collecting airportsdata (from outlines)
  Downloading airportsdata-20250706-py3-none-any.whl.metadata (9.1 kB)
Collecting outlines_core==0.1.26 (from outlines)
  Downloading outlines_core-0.1.26-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting genson (from outlines)
  Downloading genson-1.3.0-py3-none-any.whl.metadata (28 kB)
Collecting jsonpath_ng (from outlines)
  Downloading jsonpath_ng-1.7.0-py3-none-any.whl.metadata (18 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->outlines)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.m

## Structured output with Outlines

It works!

In [None]:
import outlines

class Report(BaseModel):
    title: str
    report: str

report_index = 20 # row to pick in df_Reports
prompt_method = 'A' # A, B, C

# Create prompt
row = df_Reports.loc[report_index, 'what':'contingency_actions']
prompt = PromptGenerator(**row.to_dict()).create_prompt(prompt_method)

model_outlines = outlines.from_transformers(model, tokenizer)
result = model_outlines(prompt, output_type=Report, max_new_tokens = 200)
title = Report.model_validate_json(result).title
report = Report.model_validate_json(result).report.strip()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{ "title": "Batch ZYX234 Torque Application Report", "report": "On July 7, 2025, at 2:15 PM, John Rivera, a Line Operator, applied incorrect torque during vial capping for Batch ZYX234. The torque settings were manually adjusted outside the validated range. This was due to the operator's misinterpretation of the setup sheet and the Shift Lead's failure to verify the torque settings. As a result, a contingency action was initiated, which included a 100% manual reinspection of the batch, placing it on hold, and updating the SOP for clarity."}


In [None]:
title = Report.model_validate_json(result).title
report = Report.model_validate_json(result).report.strip()
report

"On July 7, 2025, at 2:15 PM, John Rivera, a Line Operator, applied incorrect torque during vial capping for Batch ZYX234. The torque settings were manually adjusted outside the validated range. This was due to the operator's misinterpretation of the setup sheet and the Shift Lead's failure to verify the torque settings. As a result, a contingency action was initiated, which included a 100% manual reinspection of the batch, placing it on hold, and updating the SOP for clarity."