<a href="https://colab.research.google.com/github/yvesemmanuel/deep-learning/blob/main/insurance-claim-language-model/bert_large_uncased_l0_005_b64_e5_finetuned_lora_porto_seguro_2025_11_20_19_30_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import logging
import sys

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

logger.handlers.clear()
logger.propagate = False

handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)

logger.addHandler(handler)

logger.info("Logger is working!")

2025-11-20 19:27:32,874 - INFO - Logger is working!


In [2]:
!pip install transformers accelerate evaluate datasets peft wandb fsspec==2023.9.2 -q

In [3]:
import torch
import numpy as np
import random
import os

SEED = 42

def set_universal_seed(seed_value):
    """Sets the seed for full reproducibility across Python, NumPy, and PyTorch."""
    random.seed(seed_value)
    os.environ["PYTHONHASHSEED"] = str(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value) # for multi-GPU

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_universal_seed(SEED)

# HuggingFace and Weights and Biases

In [4]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33myveemmanuel[0m ([33memmanuel-company[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [6]:
model_checkpoint = "google-bert/bert-large-uncased"

# Load dataset

In [7]:
import pandas as pd

In [8]:
dataset_id = "porto-seguro"

In [9]:
train_path = "/content/train.csv"
ott_path = "/content/test.csv"

In [10]:
try:
    df_raw = pd.read_csv(train_path)
except FileNotFoundError:
    logger.error("Error: 'train.csv' file not found. Please load the PortoSeguro dataset.")
    df_raw = pd.DataFrame({"id": [], "target": []})

In [11]:
try:
    df_ott = pd.read_csv(ott_path)
except FileNotFoundError:
    logger.error("Error: 'test.csv' file not found. Please load the PortoSeguro dataset.")
    df_ott = pd.DataFrame({"id": [], "target": []})

In [12]:
df_raw.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9.0,1.0,5.0,8.0,0.0,1.0,1.0,0.0,0.0,1.0
1,9,0,1,1,7,0,0,0,0,1,...,3.0,1.0,1.0,9.0,0.0,1.0,1.0,0.0,1.0,0.0
2,13,0,5,4,9,1,0,0,0,1,...,4.0,2.0,7.0,7.0,0.0,1.0,1.0,0.0,1.0,0.0
3,16,0,0,1,2,0,0,1,0,0,...,2.0,2.0,4.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0
4,17,0,0,2,0,1,0,1,0,0,...,3.0,1.0,1.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0


In [13]:
def create_text_classification_example(row):
    PROMPT = (
        "Assess the likelihood of an insurance claim being filed based on the following customer attributes. "
        "All attributes have been anonymized to protect data confidentiality. "
    )

    INPUT_TEXT = "The client attributes are: "
    features = []

    feature_cols = [col for col in df_raw.columns if col not in ['id', 'target']]

    for col in feature_cols:
        val = row[col] if pd.notna(row[col]) else 'NaN'
        features.append(f"{col}: {val}")

    INPUT_TEXT += ", ".join(features) + "."

    if "target" in row:
      LABEL = row['target']
    else:
      LABEL = None

    formatted_instruction = {
        "text": f"{PROMPT} {INPUT_TEXT}",
        "label": LABEL
    }
    return formatted_instruction

## Pandas to Dataset

In [14]:
from datasets import Dataset

In [15]:
def pandas_to_dataset(df):
  instruction_data = df.apply(create_text_classification_example, axis=1).tolist()
  df_instructions = pd.DataFrame(instruction_data)
  hf_dataset = Dataset.from_pandas(df_instructions)
  return hf_dataset

In [16]:
hf_dataset = pandas_to_dataset(df_raw)
hf_dataset_ott = pandas_to_dataset(df_ott)

# Preprocessing dataset

This includes:

- Labels mapping;
- Prompt engineering;
- Data splitting.

## Label-to-ID mapping

In [17]:
id2label = {0: "NO_CLAIM", 1: "CLAIM_FILED"}
label2id = {"NO_CLAIM": 0, "CLAIM_FILED": 1}

# labels = dataset["train"].features["label"].names
# label2id, id2label = dict(), dict()
# for i, label in enumerate(labels):
#   label2id[label] = i
#   id2label[i] = label

## Text processor

In [18]:
from transformers import AutoTokenizer

In [19]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [20]:
def preprocess(examples):
    tokenized_inputs = tokenizer(examples["text"], truncation=True)
    labels = []

    for label in examples["label"]:
      if label is None:
        labels.append(None)
      else:
        labels.append(int(label))

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

## Data splitting

Three data splits; 1) train, 2) eval, 2) test.

In [21]:
hf_dataset = hf_dataset.train_test_split(test_size=0.3)

In [22]:
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 113434
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 48615
    })
})

In [23]:
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 113434
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 48615
    })
})

In [25]:
train_data = hf_dataset["train"]
val_data = hf_dataset["test"]

val_data_split = val_data.train_test_split(test_size=0.3)
val_data = val_data_split["train"]
test_data = val_data_split["test"]

In [26]:
train_data.set_transform(preprocess)
val_data.set_transform(preprocess)
test_data.set_transform(preprocess)
hf_dataset_ott.set_transform(preprocess)

# Model _tunable_ hyperparameters

In [27]:
def print_trainable_parameters(model):
  trainable_params = 0
  all_param = 0
  for _, param in model.named_parameters():
      all_param += param.numel()
      if param.requires_grad:
          trainable_params += param.numel()
  logger.info(
      f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
  )

In [28]:
from transformers import AutoModelForSequenceClassification

In [29]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
    problem_type="single_label_classification", # Explicitly set for binary classification
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
print_trainable_parameters(model)

2025-11-20 19:30:09,732 - INFO - trainable params: 335143938 || all params: 335143938 || trainable%: 100.00


# Low-Rank Adaptation

In [31]:
from peft import LoraConfig, get_peft_model

In [32]:
lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS",
    modules_to_save=["classifier"],
)

lora_model = get_peft_model(model, lora_config)

In [33]:
print_trainable_parameters(lora_model)

2025-11-20 19:30:10,332 - INFO - trainable params: 3147778 || all params: 338291716 || trainable%: 0.93


# Training

In [34]:
from transformers import TrainingArguments

In [35]:
import datetime

In [36]:
model_name = model_checkpoint.split("/")[-1]
batch_size = 64
learning_rate = 5e-3
num_train_epochs = 5
logging_steps = 10

timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
run_name = f"{model_name}-l{learning_rate}_b{batch_size}_e{num_train_epochs}_finetuned-lora-{dataset_id}-{timestamp}"

training_args = TrainingArguments(
  run_name,
  remove_unused_columns=False,
  eval_strategy="epoch",
  save_strategy="epoch",
  learning_rate=learning_rate,
  per_device_train_batch_size=batch_size,
  gradient_accumulation_steps=4,
  per_device_eval_batch_size=batch_size,
  fp16=True,
  num_train_epochs=num_train_epochs,
  logging_steps=logging_steps,
  load_best_model_at_end=True,
  metric_for_best_model="accuracy",
  push_to_hub=True,
  label_names=["labels"],
  run_name=run_name,
  report_to=["wandb"]
)

## Evaluation metrics

In [37]:
import numpy as np
import evaluate

In [38]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
  """Computes accuracy on a batch of predictions"""
  predictions = np.argmax(eval_pred.predictions, axis=1)
  return metric.compute(predictions=predictions, references=eval_pred.label_ids)

Downloading builder script: 0.00B [00:00, ?B/s]

## Data collaction function

> A collation function is used by Trainer to gather a batch of training and evaluation examples and prepare them in a format that is acceptable by the underlying model. [Read more](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer).

In [39]:
from transformers import DataCollatorWithPadding

In [40]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Training steps

In [41]:
from transformers import Trainer

In [42]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

logger.info("Starting LoRA Fine-tuning for Text Classification...")
trainer.train()

  trainer = Trainer(


Starting LoRA Fine-tuning for Text Classification...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1693,0.206221,0.962151
2,0.1528,0.171469,0.962151
3,0.1811,0.163459,0.962151
4,0.1561,0.167533,0.962151
5,0.1494,0.162579,0.962151


TrainOutput(global_step=2220, training_loss=0.16823271985526558, metrics={'train_runtime': 4668.2101, 'train_samples_per_second': 121.496, 'train_steps_per_second': 0.476, 'total_flos': 5.340480305013965e+17, 'train_loss': 0.16823271985526558, 'epoch': 5.0})

# Evaluate metrics on test - unseen - dataset

Results from training, system usage and evaluation will be in the dashboard of Weights and Biases. [Read more](https://wandb.ai/emmanuel-company/huggingface/runs/ppf670zf?nw=nwuseryveemmanuel).

In [43]:
trainer.evaluate(test_data)

{'eval_loss': 0.20511698722839355,
 'eval_accuracy': 0.9623585875899897,
 'eval_runtime': 47.6552,
 'eval_samples_per_second': 306.053,
 'eval_steps_per_second': 4.784,
 'epoch': 5.0}

In [44]:
wandb.finish()

0,1
eval/accuracy,▁▁▁▁▁█
eval/loss,█▂▁▂▁█
eval/runtime,█████▁
eval/samples_per_second,▃▄▁▄▅█
eval/steps_per_second,▃▄▁▄▅█
train/epoch,▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇█████
train/grad_norm,█▂▂▁▃▂▂▂▁▁▁▂▁▁▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁
train/learning_rate,███▇▇▇▇▇▇▆▆▆▅▅▅▅▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▁▁
train/loss,█▆▄▄▆▃▄▅▆▄▅▅▃▅▆▃▇▃▅▆▄▃▆▃▅▅▃▄▅▂▁▃▂▇▅▄▄▅▃▂

0,1
eval/accuracy,0.96236
eval/loss,0.20512
eval/runtime,47.6552
eval/samples_per_second,306.053
eval/steps_per_second,4.784
total_flos,5.340480305013965e+17
train/epoch,5
train/global_step,2220
train/grad_norm,0.56639
train/learning_rate,0.0
