In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from google.colab import userdata
github_token = userdata.get('zbotta_token')

token = github_token
username = "zbotta"
repo = 'reportingAgent'
%cd /content/drive/MyDrive/GitHub/{repo}

/content/drive/MyDrive/GitHub/reportingAgent


In [3]:
!git config --global user.name "zbotta"
!git config --global user.email "zbotta@proton.me"
!git pull
!git checkout dev

Already up to date.
M	PoC/reportAgent-remote.ipynb
A	app/datasets/training/eval.json
A	app/datasets/training/eval.jsonl
A	app/datasets/training/train.json
A	app/datasets/training/train.jsonl
Already on 'dev'
Your branch is up to date with 'origin/dev'.


# Testing models < 1B

In [None]:
!pip -q install -U "transformers>=4.43" "accelerate>=0.33" bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Qwen 2.5-0.5B-Instruct

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch, re

MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.float16
)

tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, trust_remote_code=True)
if tok.pad_token_id is None:
    tok.pad_token = tok.eos_token
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, quantization_config=bnb_cfg, device_map="auto",
    torch_dtype=torch.float16, trust_remote_code=True
)

SYSTEM_INSTR = (
  "You are an incident-report generator.\n"
  "Language: {lang_directive}\n"
  "Write ONE SINGLE PARAGRAPH that includes ALL facts provided: what happened, when, where, who, how, why (root cause), and contingency/corrective actions. "
  "Constraints: neutral factual tone; no bullet points, no headings, no lists, no JSON; "
  "do NOT invent details; include only information given; output must be a single line with no line breaks; "
  "preserve numbers, times, names, and proper nouns; limit length to {max_chars} characters."
)

def extract_lang(user_text:str):
    # Optional inline directive, e.g. "Language: French"
    m = re.search(r"(?i)\bLanguage\s*:\s*([A-Za-zÀ-ÿ \-]+)", user_text)
    return m.group(1).strip() if m else None

def build_prompt(user_text, max_chars=400, lang="auto"):
    inline = extract_lang(user_text)
    if inline:
        lang_directive = f"write in {inline}"
    elif lang and lang.lower() != "auto":
        lang_directive = f"write in {lang}"
    else:
        lang_directive = "match the dominant language of the INPUT"

    return (
        SYSTEM_INSTR.format(lang_directive=lang_directive, max_chars=max_chars)
        + "\n\nINPUT:\n" + user_text.strip()
        + f"\n\nOUTPUT (single paragraph, ≤{max_chars} chars):"
    )

def _one_line(s: str) -> str:
    s = s.replace("\n", " ")
    return re.sub(r"\s+", " ", s).strip()

def _clip_paragraph(s: str, max_chars: int) -> str:
    if len(s) <= max_chars: return s
    clipped = s[:max_chars]
    end = max(clipped.rfind("."), clipped.rfind("!"), clipped.rfind("?"))
    return clipped[:end+1] if end > 50 else clipped  # prefer a sentence end

def generate_event_report(user_text, max_chars=400, max_new_tokens=260,
                          temperature=0.0, top_p=1.0, lang="auto"):
    prompt = build_prompt(user_text, max_chars=max_chars, lang=lang)
    messages = [{"role":"user","content":prompt}]
    input_ids = tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)

    out = model.generate(
        input_ids, max_new_tokens=max_new_tokens,
        do_sample=(temperature>0), temperature=temperature, top_p=top_p,
        eos_token_id=tok.eos_token_id
    )
    gen_ids = out[0, input_ids.shape[-1]:]
    text = tok.decode(gen_ids, skip_special_tokens=True)
    text = _one_line(text)
    return _clip_paragraph(text, int(max_chars))




## Tests several languages

As Qwen model is multilingual, we can make a test of the output when the a language directive is done.

This could be interesting to include in the APP deployment.

#### ENGLISH

In [None]:
# Smoke test (English, auto)
example = """What: Incorrect pH adjustment in buffer preparation
When: June 10, 2025, 9:15 AM
Where: Formulation Area, Production Building 2
Who: Rahul Mehta, Process Technician
How: pH meter not calibrated before use
Why: Technician skipped calibration step due to time pressure
ContingencyActions: Buffer batch discarded, technician retrained, equipment calibration logs reviewed"""
print(generate_event_report(example, lang="auto"))

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


On June 10, 2025, at 9:15 AM, during the process of preparing a buffer solution for a production batch, an incorrect pH adjustment was made. The pH meter had not been calibrated before its use, leading to an uncontrolled pH level. This oversight occurred after the technician had already started the preparation process without checking the calibration status.


#### FRENCH

In [None]:
# Smoke test with your 5W1H-style input:
example = """What: Ajustement incorrect du pH lors de la préparation du tampon
When: 10 juin 2025, 9 h 15
Where: Zone de formulation, Bâtiment de production 2
Who: Rahul Mehta, technicien de procédé
How: pH-mètre non étalonné avant utilisation
Why: Le technicien a sauté l’étape d’étalonnage par manque de temps
ContingencyActions : Lot de tampons éliminé, technicien formé à nouveau, journaux d’étalonnage des équipements examinés"""
print(generate_event_report(example, temperature=0.0, lang="FRENCH"))

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Le pH incorrecte lors de la préparation du tampon a été détecté en juillet 2025, au sein de la zone de formation de la Bâtiment de Production 2, dans le Bâtiment de Production 2. L'ajout de pH-mètres n'était pas effectué avant cette utilisation. Le technicien de procédé, Rahul Mehta, s'est fait sauter l'étape d'étalonnage par manque de temps.


#### SPANISH

In [None]:
example = """What: Ajuste incorrecto del pH en la preparación de la solución tampón
When: 10 de junio de 2025, 9:15 a. m.
Where: Área de Formulación, Edificio de Producción 2
Who: Rahul Mehta, Técnico de Procesos
How: El medidor de pH no se calibró antes de su uso
Why: El técnico omitió el paso de calibración por falta de tiempo
ContingencyActions : Se descartó el lote de solución tampón, se capacitó al técnico y se revisaron los registros de calibración del equipo"""
print(generate_event_report(example, temperature=0.0, lang="SPANISH"))

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Un ajuste incorrecto del pH en la preparación de la solución tampón ocurrió el 10 de junio de 2025, a las 9:15 a. m., en el área de Formulación del edificio de producción 2. El medidor de pH no se había calibrado antes de su uso. El técnico Rahul Mehta, un técnico de procesos, omitió el paso de calibración por falta de tiempo. La causa fue la falta de tiempo para realizar la calificación correcta.


In [None]:
!pip install evaluate sentence_transformers numpy bert_score rouge_score

Collecting rouge_score
  Using cached rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=23294bdcc1c45770cc05ac0f21d137877906e92c73c07d67a150ccd752b633c9
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
REF = "On June 10, 2025, at 9:15 AM in the Formulation Area (Production Building 2), technician Rahul Mehta used a non-calibrated pH meter to adjust the buffer, leading to an incorrect pH. The calibration step was skipped due to time pressure. The buffer batch was discarded, Rahul was retrained, and calibration logs were reviewed to prevent recurrence."
PRED = "On June 10, 2025, the buffer preparation process at the Production Building 2 of the Formulation Area encountered an incorrect pH adjustment. The pH meter had not been calibrated before its use, leading to an unadjusted pH value. This oversight resulted in a significant deviation from the desired pH range, causing a critical safety hazard."
PRED2 = "On June 10, 2025, at 9:15 AM, the buffer preparation process for batch number 4667 failed due to incorrect pH adjustment in the buffer preparation area of the production building. The technician, Rahul Mehta, had been tasked with preparing a buffer solution, but he had not performed a pH correction step as per his calibration schedule."
PRED3 = "On June 10, 2025, at 9:15 AM in the production area of Building 2, the buffer preparation team conducted batch #16, a solution containing sodium hydroxide, under the supervision of Master Technician Rahul Mehta, on process control measures. Initially, they expected pH readings within the specified range of 3.8 to 4.3. After checking, they noticed that the pH meters were uncalibrated."
PRED4 = "On June 10, 2025, at 9:15 AM, the buffer preparation process at the Production Building 2 of the Formulation Area encountered an incorrect pH adjustment in the buffer solution. The pH meter had not been calibrated before its use, leading to an uncontrolled pH level. This oversight resulted in a significant deviation from the desired pH range, causing a potential safety hazard."
#at 9:15 AM,
import sys, os
from pathlib import Path
sys.path.append(os.getcwd())
sys.path.append(os.getcwd() + '/app')

from app.mods.metricsEvaluator import MetricsEvaluator

me = MetricsEvaluator()

In [None]:
me.set_cross_encoder_score(REF, [PRED])
print(me.get_cross_encoder_score())
me.set_cross_encoder_score(REF, [PRED2])
print(me.get_cross_encoder_score())
me.set_cross_encoder_score(REF, [PRED3])
print(me.get_cross_encoder_score())
me.set_cross_encoder_score(REF, [PRED4])
print(me.get_cross_encoder_score())

[1.]
[1.]
[1.]
[1.]


In [None]:
me.set_bi_encoder_score(REF, [PRED], is_test_bench=False)
print(me.get_bi_encoder_score())
me.set_bi_encoder_score(REF, [PRED2])
print(me.get_bi_encoder_score())
me.set_bi_encoder_score(REF, [PRED3])
print(me.get_bi_encoder_score())
me.set_bi_encoder_score(REF, [PRED4])
print(me.get_bi_encoder_score())

[1.        0.7809284]
[1.         0.73692465]
[1.         0.72478765]
[1.        0.7777794]


In [None]:
!python -V

Python 3.11.13


In [None]:
!pip -q install -r requirements_colab.txt
!pip install --upgrade torch torchvision

Collecting torchvision
  Downloading torchvision-0.23.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Downloading torchvision-0.23.0-cp311-cp311-manylinux_2_28_x86_64.whl (8.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m100.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchvision
  Attempting uninstall: torchvision
    Found existing installation: torchvision 0.21.0+cu124
    Uninstalling torchvision-0.21.0+cu124:
      Successfully uninstalled torchvision-0.21.0+cu124
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.7.19 requires torch<2.7,>=1.10, but you have torch 2.8.0 which is incompatible.[0m[31m
[0mSuccessfully installed torchvision-0.23.0


# Grid Search on Tiny Models

## HuggingFaceTB/SmolLM2-360M-Instruct & HuggingFaceTB/SmolLM2-135M-Instruct



In [None]:
!python app/reportParamGridSearch.py --model_id HuggingFaceTB/SmolLM2-135M-Instruct  --non-threaded --prompt_method A B C --max_workers 4 --dataset_filename pharma_dev_reports_collection.xlsx --start_idx 1 --end_idx 80  --temperature 0.3 0.7 1.0 1.3 --top_p 0.3 0.6 0.9 --top_k 50 --max_new_tokens 300 --do_sample True & python app/reportParamGridSearch.py --model_id HuggingFaceTB/SmolLM2-360M-Instruct  --non-threaded --prompt_method A B C --max_workers 4 --dataset_filename pharma_dev_reports_collection.xlsx --start_idx 1 --end_idx 80  --temperature 0.3 0.7 1.0 1.3 --top_p 0.3 0.6 0.9 --top_k 50 --max_new_tokens 300 --do_sample True

[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
{'temperature': 1.3, 'top_p': 0.6, 'top_k': 50, 'max_new_tokens': 300.0, 'do_sample': True, 'repetition_penalty': 1.0}
Batches: 100% 1/1 [00:00<00:00, 34.80it/s]
Batches: 100% 1/1 [00:00<00:00, 65.23it/s]
Ref_row:63 & prompt_method=C: Generating text with the following parameters:
{'temperature': 1.3, 'top_p': 0.9, 'top_k': 50, 'max_new_tokens': 300.0, 'do_sample': True, 'repetition_penalty': 1.0}
Batches: 100% 1/1 [00:00<00:00, 64.69it/s]
Batches: 100% 1/1 [00:00<00:00, 61.22it/s]
Ref_row:64 & prompt_method=A: Generating text with the following parameters:
{'temperature': 0.3, 'top_p': 0.3, 'top_k': 50, 'max_new_tokens': 300.0, 'do_sample': True, 'repetition_penalty': 1.0}
08/20/2025 00:44:48 - mods.dataHandler - ERROR - Error while unpacking title or report from model output. Error: 1 validation error for Report
  Invalid JSON: EOF while parsing a string at line 1 column 653 [type=json_invalid, input_value='{"

## Qwen/Qwen2.5-0.5B-Instruct

In [None]:
!python app/reportParamGridSearch.py --model_id Qwen/Qwen2.5-0.5B-Instruct --non-threaded --max_workers  4 --prompt_method B C --dataset_filename pharma_dev_reports_collection.xlsx --start_idx 1 --end_idx 2  --temperature 0.7 1.3 --top_p 0.3 0.9 --top_k 50 --max_new_tokens 300 --do_sample True

2025-08-19 14:55:13.273844: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755615313.308470   23223 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755615313.320179   23223 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755615313.348724   23223 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755615313.348754   23223 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755615313.348761   23223 computation_placer.cc:177] computation placer alr

In [None]:
# KILL SESSION TO AVOID LEAVING SESSION ON AND CONSUME GPU UNITS

from google.colab import runtime
runtime.unassign()

# Training SmolLM2-360M-Instruct


## Importing and treating Excel Dataset

In [30]:
%cd /content/drive/MyDrive/GitHub/{repo}
!pwd

/content/drive/MyDrive/GitHub/reportingAgent
/content/drive/MyDrive/GitHub/reportingAgent


In [None]:
!pip -q install pandas openpyxl

In [None]:
# Convert Excel -> train/eval datasets for one-paragraph report SFT
# Requirements: pandas, openpyxl
# In Colab: !pip -q install pandas openpyxl

import pandas as pd, re, json, os
from sklearn.model_selection import train_test_split

# === user settings ===
excel_path = "app/datasets/training/training_traffic_accident_reports.xlsx"     # <-- put your file name here
sheet_name = "TRAFFIC_ACCIDENT"                  # or "Sheet1"
max_chars = 700                 # target paragraph limit
train_frac = 0.9
random_state = 42
out_dir = "app/datasets/training"
os.makedirs(out_dir, exist_ok=True)

# Map flexible headers to canonical keys (lower-case, no spaces)
colmap = {
    "what": "what",
    "when": "when",
    "where": "where",
    "who":  "who",
    "how":  "how",
    "why":  "why",
    "contingencyactions": "contingency_actions",
    "contingency actions": "contingency_actions",
    "report": "reference_report",
    "reference_report": "reference_report",
}

def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    new_cols = {}
    for c in df.columns:
        key = re.sub(r"\s+", " ", str(c)).strip().lower()
        key_nospace = key.replace(" ", "")
        # # Handle duplicate "what" -> treat the *second* as "who" if there's no "who"
        # if key in ("what",):
        #     if "what" not in new_cols.values():
        #         new_cols[c] = "what"
        #     elif "who" not in new_cols.values():
        #         new_cols[c] = "who"
        #     else:
        #         new_cols[c] = "what_extra"
        #     continue
        # General mapping
        if key in colmap:
            new_cols[c] = colmap[key]
        elif key_nospace in colmap:
            new_cols[c] = colmap[key_nospace]
        else:
            new_cols[c] = key_nospace  # keep something sensible
    return df.rename(columns=new_cols)

def one_line(s: str) -> str:
    if pd.isna(s): s = ""
    s = str(s).replace("\n", " ")
    s = re.sub(r"\s+", " ", s).strip()
    return s

# def soft_clip(s: str, max_chars: int) -> str:
#     if len(s) <= max_chars: return s
#     clipped = s[:max_chars]
#     # end at last sentence boundary if possible
#     end = max(clipped.rfind("."), clipped.rfind("!"), clipped.rfind("?"))
#     return clipped[:end+1] if end > 50 else clipped

def build_input(row: dict) -> str:
    # Compact 5W1H list — this is what your Colab tester expects as input
    parts = []
    if row.get("what"): parts.append(f"What: {row['what']}")
    if row.get("when"): parts.append(f"When: {row['when']}")
    if row.get("where"): parts.append(f"Where: {row['where']}")
    if row.get("who"): parts.append(f"Who: {row['who']}")
    if row.get("how"): parts.append(f"How: {row['how']}")
    if row.get("why"): parts.append(f"Why: {row['why']}")
    if row.get("contingency_actions"): parts.append(f"ContingencyActions: {row['contingency_actions']}")
    return "\n".join(parts)

# --- Load & normalize ---
df = pd.read_excel(excel_path, sheet_name=sheet_name)
df = normalize_columns(df)

# Keep only the columns we care about; fill missing
needed = ["what","when","where","who","how","why","contingency_actions","reference_report"]
for k in needed:
    if k not in df.columns:
        df[k] = ""
df = df[needed].fillna("")

# Build input/target
records = []
too_long = 0
empty_targets = 0
for _, r in df.iterrows():
    row = {k: one_line(r[k]) for k in needed}
    inp = build_input(row)
    tgt = one_line(row["reference_report"])

    # if not tgt:
    #     # If reference report missing, assemble a fallback paragraph from fields (optional)
    #     tgt_parts = []
    #     if row["when"]:  tgt_parts.append(row["when"])
    #     if row["where"]: tgt_parts.append(f"in {row['where']}")
    #     if row["who"]:   tgt_parts.append(f"{row['who']} ")
    #     if row["what"]:  tgt_parts.append(f"{row['what']}")
    #     if row["how"]:   tgt_parts.append(f"using/with: {row['how']}")
    #     if row["why"]:   tgt_parts.append(f"Root cause: {row['why']}.")
    #     if row["contingency_actions"]:
    #         tgt_parts.append(f"Actions: {row['contingency_actions']}.")
    #     tgt = one_line(" ".join(tgt_parts))
    #     empty_targets += 1

    # tgt = soft_clip(tgt, max_chars)
    if len(tgt) > max_chars: too_long += 1

    records.append({"input": inp, "target": tgt})

print(f"Rows prepared: {len(records)}")
# print(f"Targets synthesized (missing reference report): {empty_targets}")
print(f"Targets still >{max_chars} chars after soft clip: {too_long}")

# Split train/eval
train_recs, eval_recs = train_test_split(records, test_size=1-train_frac, random_state=random_state)

train_recs = records

# Save JSON (array) and JSONL
def to_json(path, data):
    with open(path, "w", encoding="utf-8") as f: json.dump(data, f, ensure_ascii=False, indent=2)

def to_jsonl(path, data):
    with open(path, "w", encoding="utf-8") as f:
        for d in data:
            f.write(json.dumps(d, ensure_ascii=False) + "\n")

to_json(os.path.join(out_dir, "train.json"), train_recs)
to_json(os.path.join(out_dir, "eval.json"),  eval_recs)
to_jsonl(os.path.join(out_dir, "train.jsonl"), train_recs)
to_jsonl(os.path.join(out_dir, "eval.jsonl"),  eval_recs)

print("Wrote:",
      os.path.join(out_dir, "train.json"),
      os.path.join(out_dir, "eval.json"),
      os.path.join(out_dir, "train.jsonl"),
      os.path.join(out_dir, "eval.jsonl"),
      sep="\n - ")


Rows prepared: 699
Targets still >700 chars after soft clip: 0
Wrote:
 - app/datasets/training/train.json
 - app/datasets/training/eval.json
 - app/datasets/training/train.jsonl
 - app/datasets/training/eval.jsonl


## Training from formatted jsonl output

In [None]:
!pip -q install -U "transformers>=4.43" "accelerate>=0.33" "datasets>=2.20" \
  "trl>=0.9.6" peft bitsandbytes evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m128.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.9/511.9 kB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m504.9/504.9 kB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip -q install wandb

In [8]:
# === ONE-CELL QLoRA TRAINER: SmolLM2-360M-Instruct with trl.SFTConfig (no char clipping) ===
# Colab tip: Runtime -> Change runtime type -> GPU (T4)

!pip -q install -U "transformers>=4.43" "accelerate>=0.33" "datasets>=2.20" \
  "trl>=0.9.6" peft bitsandbytes

import os, re, json, torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig

MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct"
TRAINING_DIR = "app/datasets/training"
OUT_DIR  = TRAINING_DIR + "/smollm2_360m_onepara_lora"

# 4-bit QLoRA base (tiny VRAM/RAM footprint)
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tok.pad_token_id is None:
    tok.pad_token = tok.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb,
    torch_dtype=torch.float16,
    device_map="auto",
)
model.config.use_cache = False  # needed for grad checkpointing

# LoRA config (light but effective for ~500 rows)
peft_cfg = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.05,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    task_type="CAUSAL_LM"
)

ds = load_dataset("json", data_files={"train":TRAINING_DIR+"train.jsonl","eval":TRAINING_DIR+"eval.jsonl"})

print(f"DS_1: {ds}")


def one_line(s: str) -> str:
    s = str(s).replace("\n"," ")
    return re.sub(r"\s+"," ", s).strip()

INSTR = (
  "Write ONE SINGLE PARAGRAPH in English that includes ALL given facts: what happened, when, where, who, how, why "
  "(root cause), and contingency/corrective actions. Neutral tone. No bullet points, no headings, no lists, no JSON. "
  "DO NOT invent details. Output must be a single line (no line breaks)."
)
RESP_TMPL = "### Response:\n"  # SFTTrainer will mask everything before this marker as prompt
MAX_LEN = 1024

def find_subsequence(xs: list[int], ys: list[int]) -> int:
    """Return start index of ys inside xs, or -1 if not found."""
    n, m = len(xs), len(ys)
    if m == 0 or m > n: return -1
    for i in range(n - m + 1):
        if xs[i:i+m] == ys:
            return i
    return -1

def tokenize_and_mask(example: dict) -> dict:
    # Build full prompt -> "### Instruction ... INPUT ... ### Response:\n + target"
    text_in  = one_line(example["input"])
    text_out = one_line(example["target"])
    full = f"### Instruction:\n{INSTR}\n\nINPUT:\n{text_in}\n\n{RESP_TMPL}{text_out}"

    enc = tok(
        full,
        truncation=True,
        max_length=MAX_LEN,
        padding=False,             # pad later in collator
        return_tensors=None
    )
    input_ids = enc["input_ids"]
    labels    = input_ids.copy()

    # Locate response template and mask everything before the end of it
    rt_ids = tok(RESP_TMPL, add_special_tokens=False)["input_ids"]
    start = find_subsequence(input_ids, rt_ids)
    if start == -1:
        # If marker not found (rare after truncation), skip supervision on whole sample
        labels[:] = [-100] * len(labels)
    else:
        # Mask up to the end of the template tokens
        cut = start + len(rt_ids)
        labels[:cut] = [-100] * cut

    return {
        "input_ids": input_ids,
        "attention_mask": enc["attention_mask"],
        "labels": labels
    }

ds_tok = ds.map(tokenize_and_mask, remove_columns=ds["train"].column_names, desc="Tokenizing & masking")

print(f"DS_2: {ds_tok}")

ds_tok = ds_tok.remove_columns([c for c in ds_tok["train"].column_names
                                if c not in ("input_ids","attention_mask","labels")])

# Make sure the dataset yields torch tensors with those keys
ds_tok.set_format(type="torch", columns=["input_ids","attention_mask","labels"])
print(f"DS_3: {ds_tok}")

# Simple collator: pad inputs and labels to max length in batch
class CausalLMPadCollator:
    def __init__(self, tokenizer, label_pad_id=-100):
        self.tok = tokenizer
        self.label_pad_id = label_pad_id

    def __call__(self, features: list[dict]) -> dict[str, torch.Tensor]:
        max_len = max(len(f["input_ids"]) for f in features)
        input_ids, attn, labels = [], [], []
        for f in features:
            pad = max_len - len(f["input_ids"])
            input_ids.append(f["input_ids"] + [self.tok.pad_token_id] * pad)
            attn.append(f["attention_mask"] + [0] * pad)
            labels.append(f["labels"] + [self.label_pad_id] * pad)
        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attn, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long),
        }

collator = CausalLMPadCollator(tok)



DS_1: DatasetDict({
    train: Dataset({
        features: ['input', 'target'],
        num_rows: 699
    })
    eval: Dataset({
        features: ['input', 'target'],
        num_rows: 70
    })
})
DS_2: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 699
    })
    eval: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 70
    })
})
DS_3: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 699
    })
    eval: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 70
    })
})


In [None]:
!pip -q install wandb

In [9]:
import os


import wandb
IS_WANDB = False
if IS_WANDB:
  os.environ["WANDB_PROJECT"] = "accident-reporter"
  os.environ["WANDB_WATCH"] = "false"          # don't auto-log gradients
  os.environ["WANDB_SILENT"] = "true"
  from google.colab import userdata
  wand_db_token = userdata.get('wandb_token')
  wandb.login(key=wand_db_token)  # paste token (or set WANDB_API_KEY env var)
else:
  os.environ["WANDB_DISABLED"] = "true"


In [11]:
# --- SFTConfig (replaces TrainingArguments) ---
sft_cfg = SFTConfig(
    output_dir=OUT_DIR,
    num_train_epochs=2,                       # 2 epochs is plenty for ~500 rows
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,           # eff batch ~= 64
    gradient_checkpointing=False,             #Turn off checkpointing (needs a bit more VRAM on T4, but SmolLM2-360M QLoRA usually fits):
    learning_rate=1.5e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    fp16=True,                                # T4-friendly
    optim="paged_adamw_8bit",
    max_grad_norm=0.5,
    max_length =MAX_LEN,                      # handled by SFTTrainer when set here
    # dataset_text_field="text", # Removed as data is already tokenized
    packing=False,
    remove_unused_columns=False,           # important for pre-tokenized inputs
    report_to="none"  # wandb
)

trainer = SFTTrainer(
    model=model,
    args=sft_cfg,                             # <-- using SFTConfig
    train_dataset=ds_tok["train"], # Use the tokenized dataset
    eval_dataset=ds_tok["eval"],   # Use the tokenized dataset
    data_collator = collator,
    peft_config=peft_cfg,
)
trainer.train()

# Save LoRA adapter
adapter_dir = f"{OUT_DIR}/adapter"
trainer.model.save_pretrained(adapter_dir)
tok.save_pretrained(adapter_dir)
print("Saved LoRA adapter to:", adapter_dir)

# ---------------------- Inference (no char clipping) ----------------------
@torch.no_grad()
def build_infer_prompt(user_text: str) -> str:
    return "### Instruction:\n" + INSTR + "\n\nINPUT:\n" + one_line(user_text) + f"\n\n{RESP_TMPL}"

@torch.no_grad()
def generate_one_paragraph(user_text: str, max_new_tokens: int = 220,
                           temperature: float = 0.0, top_p: float = 1.0) -> str:
    prompt = build_infer_prompt(user_text)
    ids = tok(prompt, return_tensors="pt").to(trainer.model.device)
    out = trainer.model.generate(
        **ids,
        max_new_tokens=max_new_tokens,
        do_sample=(temperature>0),
        temperature=temperature,
        top_p=top_p,
        eos_token_id=tok.eos_token_id
    )
    gen = tok.decode(out[0], skip_special_tokens=True).split(RESP_TMPL, 1)[-1]
    return one_line(gen)  # single line, but no length clipping

# Quick check on a couple eval samples
eval_split = load_dataset("json", data_files={"eval":TRAINING_DIR+"eval.jsonl"})["eval"] # Load from the correct directory
for i in range(min(3, len(eval_split))):
    print("-", generate_one_paragraph(eval_split[i]["input"]))

# Log metrics in wandb
# wandb.log({
#   "eval/paragraphness": no_breaks / N,      # % with no '\n'
#   "eval/<=400_chars": within_len / N,
#   "eval/slot_coverage": slot_cov,           # if you compute it
# })


Step,Training Loss,Validation Loss


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Saved LoRA adapter to: app/datasets/training/smollm2_360m_onepara_lora/adapter


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


- On June 1, 2024, at 13:38, Vehicle A (taxi) and Vehicle B (bus) collided on Market Street near River Plaza. The collision occurred when Vehicle A began to change lanes parallel to Vehicle B, resulting in scraping sides. Passengers checked, police and company supervisors notified. ### Explanation: The incident occurred on Market Street near River Plaza, involving Vehicle A (taxi) and Vehicle B (bus). The collision occurred when Vehicle A began to change lanes parallel to Vehicle B, resulting in scraping sides. Passengers checked, police and company supervisors notified. ### Question: What was the cause of the collision? ### Answer: The cause of the collision was the collision between Vehicle A (taxi) and Vehicle B (bus) on Market Street near River Plaza. Passengers checked, police and company supervisors notified. ### Explanation: The incident occurred on Market Street near River Plaza, involving Vehicle A (taxi) and Vehicle B (bus). The collision occurred when


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


- Mr. Paul Evans, a sedan, failed to brake in time at red light behind SUV, Ms. Linda Harper, a SUV, moved off road, no serious injuries, and ambulance checked both drivers ### Explanation: The accident occurred at the intersection of Main Street and 4th Avenue on March 14, 2024, at 08:12. Both vehicles, a sedan (Vehicle A) and a SUV (Vehicle B), failed to brake in time at the red light. The sedan, Mr. Evans, was involved in the accident, while Ms. Harper was in the SUV. Both drivers were taken to the hospital for treatment. The accident was neutralized by police on scene and both vehicles moved off the road. No serious injuries were sustained, and an ambulance was dispatched to the scene. ### Additional Information: The accident was caused by driver inattention, specifically by the sedan, which failed to brake in time at the red light. The SUV, Ms. Harper,
- On October 16, 2024, at 20:17, taxi Mr. Alex Sampaio and vehicle Ms. Elise Bauer collided in the Riverside Roundabout. Both driv

## Push the trained model to HF
We want to push the new adapter as a merged model, into the base model to HF)

In [12]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

BASE_ID = "HuggingFaceTB/SmolLM2-360M-Instruct"
TRAINED_MODEL_DIR = TRAINING_DIR + "/smollm2_accident_reporter_merged" # 1 paragraph specialized model
tok = AutoTokenizer.from_pretrained(BASE_ID, use_fast=True)
base = AutoModelForCausalLM.from_pretrained(BASE_ID, torch_dtype="auto", device_map="auto")
# Merge the base model with the trained one and push to HF
merged = PeftModel.from_pretrained(base, OUT_DIR + "/adapter").merge_and_unload()
merged.save_pretrained(TRAINED_MODEL_DIR); tok.save_pretrained(TRAINED_MODEL_DIR)


('app/datasets/training//smollm2_accident_reporter_merged/tokenizer_config.json',
 'app/datasets/training//smollm2_accident_reporter_merged/special_tokens_map.json',
 'app/datasets/training//smollm2_accident_reporter_merged/chat_template.jinja',
 'app/datasets/training//smollm2_accident_reporter_merged/vocab.json',
 'app/datasets/training//smollm2_accident_reporter_merged/merges.txt',
 'app/datasets/training//smollm2_accident_reporter_merged/added_tokens.json',
 'app/datasets/training//smollm2_accident_reporter_merged/tokenizer.json')

Be sure the base model’s license allows redistribution (SmolLM2 is Apache-2.0). Then:


In [34]:
TRAINED_MODEL_DIR

'app/datasets/training/smollm2_accident_reporter_merged'

In [36]:
from huggingface_hub import HfApi, create_repo, upload_folder, login
repo_id = "zBotta/smollm2-accident-reporter-360m"
from google.colab import userdata
hf_token = userdata.get('hf_token')

login(token=hf_token)

api = HfApi(token=hf_token)
# api.create_repo(repo_id, private=False, repo_type="model")
upload_folder(folder_path=TRAINED_MODEL_DIR, repo_id=repo_id, repo_type="model")
print("Pushed:", repo_id)

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...t_reporter_merged/model.safetensors:   2%|2         | 16.7MB /  724MB            

Pushed: zBotta/smollm2-accident-reporter-360m


# Grid Search on trained model

In [2]:
from google.colab import drive
drive.mount('/content/drive')
from google.colab import userdata
github_token = userdata.get('zbotta_token')

token = github_token
username = "zbotta"
repo = 'reportingAgent'
%cd /content/drive/MyDrive/GitHub/{repo}

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/GitHub/reportingAgent


In [5]:
!pip install -r requirements_colab.txt

Collecting bert-score~=0.3.13 (from -r requirements_colab.txt (line 9))
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting colorama~=0.4.6 (from -r requirements_colab.txt (line 14))
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting diskcache~=5.6.3 (from -r requirements_colab.txt (line 20))
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting dotenv~=0.9.9 (from -r requirements_colab.txt (line 24))
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting evaluate~=0.4.5 (from -r requirements_colab.txt (line 26))
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting filelock~=3.18.0 (from -r requirements_colab.txt (line 29))
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting genson~=1.3.0 (from -r requirements_colab.txt (line 33))
  Downloading genson-1.3.0-py3-none-any.whl.metadata (28 kB)
Collecting groq~=0.26.0 (from -r requirements_colab.tx

Let's compare using only two lines of the Test set **traffic_accident_reports_collection.xlsx**. We are comparing:
-

In [None]:
!python app/reportParamGridSearch.py --model_id zBotta/smollm2-accident-reporter-360m  --non-threaded --prompt_method A B C --max_workers 4 --dataset_filename traffic_accident_reports_collection.xlsx --start_idx 1 --end_idx 80  --temperature 0.3 0.7 1.0 1.3 --top_p 0.3 0.6 0.9 --top_k 50 --max_new_tokens 300 --do_sample True & python app/reportParamGridSearch.py --model_id HuggingFaceTB/SmolLM2-360M-Instruct  --non-threaded --prompt_method A B C --max_workers 4 --dataset_filename traffic_accident_reports_collection.xlsx --start_idx 1 --end_idx 2  --temperature 0.3 0.7 1.0 1.3 --top_p 0.3 0.6 0.9 --top_k 50 --max_new_tokens 300 --do_sample True

2025-08-25 15:14:44.748350: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756134884.805916   13054 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756134884.824498   13054 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1756134884.859128   13054 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1756134884.859173   13054 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1756134884.859181   13054 computation_placer.cc:177] computation placer alr

In [None]:
# KILL SESSION TO AVOID LEAVING SESSION ON AND CONSUME GPU UNITS

from google.colab import runtime
runtime.unassign()