In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import os

home_dir = os.path.expanduser('~/InsureAI')
# Define the path to save the fine-tuned model
model_dir = os.path.join(home_dir, "models", "fine_tuned_model")  # Points to InsureAI/models/fine_tuned_model

In [2]:
from unsloth import FastLanguageModel

# model_name = "unsloth/DeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit"
model_name = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-bnb-4bit"
max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    # device_map="auto",
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.1.8: Fast Qwen2 patching. Transformers: 4.48.2.
   \\   /|    GPU: NVIDIA GeForce RTX 2060. Max memory: 6.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu118. CUDA: 7.5. CUDA Toolkit: 11.8. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Prepare Company Data Informations for fine-tuning

In [3]:
# Initialize insurance.db data
import subprocess

# Define the path to setup.py relative to the notebook
setup_script_path = os.path.abspath("../sql/setup.py")

# Run the setup.py script
try:
    subprocess.run(["python", setup_script_path], check=True)
    print("setup.py executed successfully.")
except subprocess.CalledProcessError as e:
    print(f"An error occurred while executing setup.py: {e}")


Products already exist. No new records inserted.
(1, 'Death A', 'Death', '{"coverage": "term", "premium": 100, "SA":10000}', 'InsureAI')
(2, 'Death B', 'Death', '{"coverage": "endowment", "premium": 200, "SA":20000}', 'InsureAI')
(3, 'Death C', 'Death', '{"coverage": "full", "premium": 300, "SA":30000}', 'InsureAI')
(4, 'TPD A', 'TPD', '{"coverage": "term", "premium": 100, "SA":10000}', 'InsureAI')
(5, 'TPD B', 'TPD', '{"coverage": "endowment", "premium": 200, "SA":20000}', 'InsureAI')
(6, 'TPD C', 'TPD', '{"coverage": "full", "premium": 300, "SA":30000}', 'InsureAI')
(7, 'Critical Illness A', 'Critical Illness', '{"coverage": "term", "premium": 100, "SA":10000}', 'InsureAI')
(8, 'Critical Illness B', 'Critical Illness', '{"coverage": "endowment", "premium": 200, "SA":20000}', 'InsureAI')
(9, 'Critical Illness C', 'Critical Illness', '{"coverage": "full", "premium": 300, "SA":30000}', 'InsureAI')
(10, 'Accidental A', 'Accidental', '{"coverage": "term", "premium": 100, "SA":10000}', 'In

In [4]:
import json
import sqlite3
import os
import random
import numpy as np
from sklearn.model_selection import train_test_split

# Configuration
NUM_VARIATIONS_PER_ITEM = 200  # 219 + 94 = 313 (70% + 30%)
MAX_RETRIES = 5 # For generating unique variations
TRAIN_TEST_SPLIT = 0.2 # More training data
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Initialize database connection (replace with your actual path)
home_dir = os.path.expanduser("~/InsureAI")
conn = sqlite3.connect(os.path.join(home_dir, 'insurance.db'))
cursor = conn.cursor()

# Fetch products from database
cursor.execute("SELECT * FROM products")
products = cursor.fetchall()

# Base company data
company_data = {
    "company_name": "InsureAI",
    "launch_date": "1995-05-15",
    "description": "InsureAI is a leading provider of life, health, and general insurance products in Singapore. We are committed to helping our customers achieve financial security and peace of mind.",
    "company_type": "Life and General Insurance",
    "headquarters": "Singapore",
    "website": "https://www.insureai.sg",
    "contact_email": "info@insureai.sg",
    "contact_phone": "+65 8888 1314",
    "products": []
}

# Process products
for product in products:
    product_id, name, types, features, company = product
    features_dict = json.loads(features)
    
    product_details = {
        "id": product_id,
        "name": name,
        "types": types,
        "features": features_dict,
        "base_description": f"{name} is a type of {types} insurance. It is a/an {features_dict.get('coverage')} insurance product offered by {company}. Key features include: {', '.join([f'{k}: {v}' for k, v in features_dict.items()])}."
    }
    company_data["products"].append(product_details)

# Variation generation functions
def generate_company_variations(company, num_variations):
    variations = []
    generated = set()
    
    # Prompt templates
    prompt_templates = [
        "Tell me about {name}",
        "Describe {name}'s insurance services",
        "What makes {name} different from other insurers?",
        "How long has {name} been operating?",
        "What types of insurance does {name} offer?",
        "Explain {name}'s main advantages",
        "What contact options does {name} provide?",
        "Where is {name} headquartered?",
        "Describe {name}'s history and mission",
        "What awards has {name} won?",
        "Who should consider {name} insurance?",
        "What's special about {name}'s policies?"
    ]
    
    # Completion components
    components = {
        'type': [
            f"{company['company_name']} operates as a {company['company_type']} provider",
            f"As a leading {company['company_type']} company, {company['company_name']}",
            f"{company['company_name']} specializes in {company['company_type'].lower()}"
        ],
        'hq': [
            f"Headquartered in {company['headquarters']}",
            f"Main offices located in {company['headquarters']}",
            f"Based in the heart of {company['headquarters']}"
        ],
        'launch': [
            f"Established in {company['launch_date'].split('-')[0]}",
            f"Founded in {company['launch_date']}",
            f"Operating since {company['launch_date']}"
        ],
        'desc': [
            company['description'],
            f"{company['description'].split('.')[0]}. Offers personalized solutions through advanced AI-driven risk assessment.",
            company['description'].replace("leading", "top-rated").replace("helping", "empowering")
        ],
        'contact': [
            f"Contact options: Phone {company['contact_phone']}, Email {company['contact_email']}, or visit {company['website']}",
            f"Available at {random.choice(['www.insureai.sg', 'their website'])} or call {company['contact_phone']}",
            f"Reach them via {random.choice(['email', 'phone'])}: {company['contact_email']} | {company['contact_phone']}"
        ],
        'awards': [
            "Recipient of the 2023 Singapore Insurance Innovation Award",
            "Ranked #1 in Customer Satisfaction by SG Insurance Review",
            "Winner of Best Digital Insurance Platform 2022"
        ]
    }

    attempt = 0
    while len(variations) < num_variations and attempt < MAX_RETRIES:
        # Generate prompt
        prompt = random.choice(prompt_templates).format(name=company['company_name'])
        
        # Build completion with logical flow
        completion_structure = [
            ('type', 1), 
            ('launch', 0.8), 
            ('hq', 0.9),
            ('desc', 1),
            ('awards', 0.4),
            ('contact', 1)
        ]
        
        completion_parts = []
        for component, prob in completion_structure:
            if random.random() < prob:
                completion_parts.append(random.choice(components[component]))
        
        # Ensure minimum components
        if len(completion_parts) < 4:
            continue
            
        completion = ". ".join(completion_parts) + "."
        
        # Check uniqueness
        variation_hash = hash((prompt, completion))
        if variation_hash not in generated:
            variations.append((prompt, completion))
            generated.add(variation_hash)
        else:
            attempt += 1
    
    return variations

def generate_product_variations(product, company_name, num_variations):
    variations = []
    generated = set()
    
    # Prompt templates
    prompt_templates = [
        "Describe {product} from {company}",
        "What does {product} by {company} offer?",
        "Explain the {product} insurance policy",
        "What are the features of {product}?",
        "Tell me about {company}'s {product}",
        "What coverage does {product} provide?",
        "Details about the {product} plan",
        "What makes {product} by {company} special?",
        "Information about {product} insurance",
        "Why choose {product} from {company}?"
    ]
    
    # Completion components
    components = {
        'type': [
            f"{product['name']} is a {product['types']} insurance product",
            f"This {product['types']} policy: {product['name']}",
            f"A {product['types']} solution: {product['name']}"
        ],
        'coverage': [
            f"Coverage type: {product['features']['coverage']}",
            f"{product['features']['coverage'].title()} coverage",
            f"Provides {product['features']['coverage']} protection"
        ],
        'premium': [
            f"Premium: ${product['features']['premium']}",
            f"Cost: {product['features']['premium']} SGD",
            f"Priced at {product['features']['premium']} SGD"
        ],
        'sa': [
            f"Sum assured: {product['features']['SA']} SGD",
            f"Coverage amount: {product['features']['SA']}",
            f"SA: {product['features']['SA']}"
        ],
        'desc': [
            f"Offered by {company_name}",
            "24/7 claims support",
            "No-claim bonus rewards",
            "Free annual health checkups",
            "Worldwide coverage",
            "Instant policy issuance"
        ]
    }

    attempt = 0
    while len(variations) < num_variations and attempt < MAX_RETRIES:
        prompt = random.choice(prompt_templates).format(
            product=product['name'], 
            company=company_name
        )
        
        # Structured completion with natural flow
        completion_structure = [
            ('type', 1),
            ('coverage', 1),
            ('premium', 0.9),
            ('sa', 0.8),
            ('desc', 0.7)
        ]
        
        completion_parts = []
        for component, prob in completion_structure:
            if random.random() < prob:
                completion_parts.append(random.choice(components[component]))
        
        if len(completion_parts) < 4:
            continue
            
        completion = ". ".join(completion_parts) + "."
        
        variation_hash = hash((prompt, completion))
        if variation_hash not in generated:
            variations.append((prompt, completion))
            generated.add(variation_hash)
        else:
            attempt += 1
    
    return variations

# Generate and split data
all_train = []
all_eval = []

# Company variations
company_variations = generate_company_variations(company_data, NUM_VARIATIONS_PER_ITEM)
train_company, eval_company = train_test_split(
    company_variations, 
    test_size=TRAIN_TEST_SPLIT,
    random_state=SEED
)
all_train.extend(train_company)
all_eval.extend(eval_company)

# Product variations
for product in company_data["products"]:
    product_variations = generate_product_variations(
        product,
        company_data["company_name"],
        NUM_VARIATIONS_PER_ITEM
    )
    train_product, eval_product = train_test_split(
        product_variations,
        test_size=TRAIN_TEST_SPLIT,
        random_state=SEED
    )
    all_train.extend(train_product)
    all_eval.extend(eval_product)

# Shuffle datasets
random.shuffle(all_train)
random.shuffle(all_eval)

# Add second shuffle
random.shuffle(all_train)
random.shuffle(all_eval)

# Save to JSONL files
def save_jsonl(data, filename):
    with open(filename, "w") as f:
        for prompt, completion in data:
            # Create messages in chat format
            chat_example = {
                "messages": [
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": completion}
                ]
            }
            f.write(json.dumps(chat_example) + "\n")

save_jsonl(all_train, "train_data.jsonl")
save_jsonl(all_eval, "eval_data.jsonl")

print(f"Training samples: {len(all_train)}")
print(f"Evaluation samples: {len(all_eval)}")
print("Files saved successfully!")

Training samples: 2520
Evaluation samples: 631
Files saved successfully!


In [5]:
# Load the fine-tuning data
# dataset = load_dataset("json", data_files="fine_tuning_data.jsonl")
dataset = load_dataset("json", data_files={"train": "train_data.jsonl", "test": "eval_data.jsonl"})

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 2520
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 631
    })
})

In [7]:
from unsloth.chat_templates import get_chat_template

# Apply the Llama-3.1 chat template to the tokenizer
tokenizer = get_chat_template(
    tokenizer,  # Tokenizer being used
    chat_template="llama-3.1",  # The chat template format
)

# Function to format and tokenize the conversation data
def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) + tokenizer.eos_token
             for convo in convos]
    
    tokenized_output = tokenizer(
        texts,
        truncation=True,
        padding="max_length", 
        max_length=512,
        return_tensors="pt",
    )
    
    # Create labels and mask user input
    labels = tokenized_output["input_ids"].clone()
    
    # Find where assistant responses start (after "<|start_header_id|>assistant")
    assistant_token_id = tokenizer.encode("<|start_header_id|>assistant")[0]
    for idx in range(len(labels)):
        # Find all positions of assistant token
        assistant_positions = (labels[idx] == assistant_token_id).nonzero()
        if len(assistant_positions) > 0:
            # Mask everything before the first assistant token
            start_mask = assistant_positions[0] + 1  # Include the start token
            labels[idx, :start_mask] = -100
    
    return {"input_ids": tokenized_output["input_ids"],
            "attention_mask": tokenized_output["attention_mask"],
            "labels": labels}

# def formatting_prompts_func(examples):
#     convos = examples["messages"]
#     # Ensure EOS token is added
#     texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) + tokenizer.eos_token
#              for convo in convos]
#     return tokenizer(
#         texts,
#         truncation=True,
#         padding="max_length",
#         max_length=512,
#         return_tensors="pt",
#     )

tokenized_dataset = dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/2520 [00:00<?, ? examples/s]

Map:   0%|          | 0/631 [00:00<?, ? examples/s]

In [8]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['messages', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2520
    })
    test: Dataset({
        features: ['messages', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 631
    })
})

In [9]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=["q_proj", "v_proj"],  # Fine-tune key attention layers
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing=True,
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.1.8 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [10]:
training_args = TrainingArguments(
    output_dir="./results",  # Where to save model checkpoints
    overwrite_output_dir=True,  # Overwrite old checkpoints
    per_device_train_batch_size=5,  # Small batch to fit in 6GB VRAM
    per_device_eval_batch_size=5,  # Same for evaluation
    # gradient_accumulation_steps=4,  # Helps with small batch size
    evaluation_strategy="epoch",  # "epoch" or "steps"
    save_strategy="epoch",
    load_best_model_at_end=True, #Enable early stopping
    # eval_steps=70,  # Frequency of evaluation
    # save_steps=500,  # Save model every X steps
    # save_total_limit=5,  # Keep only last X checkpoints
    logging_dir="./logs",  # Log directory
    # logging_steps=100,  # Log every 100 steps
    learning_rate=3e-5,  # Suitable for Qwen fine-tuning
    num_train_epochs=15,  # Number of epochs
    fp16=True,  # Use mixed precision to save VRAM
    optim="adamw_bnb_8bit",  # 8-bit optimizer for efficiency
    # lr_scheduler_type="cosine",  # Learning rate decay
    warmup_steps=500,  # Gradual increase in learning rate
    weight_decay=0.01,  # Added regularization
    greater_is_better=False,
    report_to="tensorboard",
    metric_for_best_model="eval_loss",
)




In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
)


  trainer = Trainer(


In [12]:
# Evaluate the model before training. Check evaluation loss before training
#{'eval_loss': 13.443951606750488, 'eval_model_preparation_time': 0.0022, 'eval_runtime': 291.9052, 'eval_samples_per_second': 1.644, 'eval_steps_per_second': 1.644}
#Perplexity: 689658.3484340009
# import math

# eval_results = trainer.evaluate()
# print(eval_results)

# perplexity = math.exp(eval_results["eval_loss"])
# print(f"Perplexity: {perplexity}")

In [13]:
#TrainOutput(global_step=420, training_loss=13.907184382847378, metrics={'train_runtime': 5618.6296, 'train_samples_per_second': 0.598, 'train_steps_per_second': 0.075, 'total_flos': 6.382635102240768e+16, 'train_loss': 13.907184382847378, 'epoch': 3.0})
# from unsloth.chat_templates import train_on_responses_only
# trainer = train_on_responses_only(
#     trainer,
#     instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",  # Mark user input
#     response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",  # Mark assistant response
# )
# Start training the model
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,520 | Num Epochs = 15
O^O/ \_/ \    Batch size per device = 5 | Gradient Accumulation steps = 1
\        /    Total batch size = 5 | Total steps = 7,560
 "-____-"     Number of trainable parameters = 2,179,072


Epoch,Training Loss,Validation Loss
1,2.3761,2.378019
2,0.401,2.613814
3,0.2366,2.624131
4,0.1252,2.574429
5,0.1063,2.535082
6,0.1,2.484767
7,0.096,2.476134
8,0.094,2.505755
9,0.0922,2.497414
10,0.0905,2.518953


Unsloth: Not an error, but Qwen2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


TrainOutput(global_step=7560, training_loss=0.27520421327106537, metrics={'train_runtime': 14287.8018, 'train_samples_per_second': 2.646, 'train_steps_per_second': 0.529, 'total_flos': 1.795116122505216e+17, 'train_loss': 0.27520421327106537, 'epoch': 15.0})

In [14]:
# Save the model and tokenizer
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

print(f"Fine-tuning complete. Model saved to {model_dir}.")

Fine-tuning complete. Model saved to /home/waijianlim/InsureAI/models/fine_tuned_model.


In [15]:
# Evaluate the model - Eval loss : {'eval_loss': 13.454931259155273, 'eval_runtime': 289.4939, 'eval_samples_per_second': 1.658, 'eval_steps_per_second': 1.658, 'epoch': 3.0}
# Perplexity: 697272.2800492591
import math

eval_results = trainer.evaluate()
print(eval_results)

perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity}")

{'eval_loss': 2.378019332885742, 'eval_runtime': 74.6401, 'eval_samples_per_second': 8.454, 'eval_steps_per_second': 1.701, 'epoch': 15.0}
Perplexity: 10.783523128215108
