In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import os

# Define the path to save the fine-tuned model
save_dir = os.path.join("..", "..", "models", "fine_tuned_model")  # Points to InsureAI/models/fine_tuned_model

In [2]:
model_name = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)



Prepare Company Data Informations for fine-tuning

In [3]:
# Initialize insurance.db data
import subprocess

# Define the path to setup.py relative to the notebook
setup_script_path = os.path.abspath("../sql/setup.py")

# Run the setup.py script
try:
    subprocess.run(["python", setup_script_path], check=True)
    print("setup.py executed successfully.")
except subprocess.CalledProcessError as e:
    print(f"An error occurred while executing setup.py: {e}")


(1, 'Death A', 'Death', '{"coverage": "term", "premium": 100, "SA":10000}', 'InsureAI')
(2, 'Death B', 'Death', '{"coverage": "endowment", "premium": 200, "SA":20000}', 'InsureAI')
(3, 'Death C', 'Death', '{"coverage": "full", "premium": 300, "SA":30000}', 'InsureAI')
(4, 'TPD A', 'TPD', '{"coverage": "term", "premium": 100, "SA":10000}', 'InsureAI')
(5, 'TPD B', 'TPD', '{"coverage": "endowment", "premium": 200, "SA":20000}', 'InsureAI')
(6, 'TPD C', 'TPD', '{"coverage": "full", "premium": 300, "SA":30000}', 'InsureAI')
(7, 'Critical Illness A', 'Critical Illness', '{"coverage": "term", "premium": 100, "SA":10000}', 'InsureAI')
(8, 'Critical Illness B', 'Critical Illness', '{"coverage": "endowment", "premium": 200, "SA":20000}', 'InsureAI')
(9, 'Critical Illness C', 'Critical Illness', '{"coverage": "full", "premium": 300, "SA":30000}', 'InsureAI')
(10, 'Accidental A', 'Accidental', '{"coverage": "term", "premium": 100, "SA":10000}', 'InsureAI')
(11, 'Accidental B', 'Accidental', '{"co

In [4]:
import sqlite3
import json

# Connect to the SQLite database
conn = sqlite3.connect('../../insurance.db')
cursor = conn.cursor()

# Fetch all products from the database
cursor.execute("SELECT * FROM products")
products = cursor.fetchall()

# Prepare the company data
company_data = {
    "company_name": "InsureAI",
    "launch_date": "1995-05-15",
    "description": "InsureAI is a leading provider of life, health, and general insurance products in Singapore. We are committed to helping our customers achieve financial security and peace of mind.",
    "company_type": "Life and General Insurance",
    "headquarters": "Singapore",
    "website": "https://www.insureai.sg",
    "contact_email": "info@insureai.sg",
    "contact_phone": "+65 8888 1314",
    "products": []
}

# Add products to the company data
for product in products:
    product_id, name, types, features, company = product
    features_dict = json.loads(features)  # Convert features from JSON string to dict

    product_details = {
        "id": product_id,
        "name": name,
        "types": types,
        "features": features_dict,
        "description": f"{name} is a type of {types} insurance. It is a/an {features_dict.get('coverage')} insurance product offered by {company}. Key features include: {', '.join([f'{k}: {v}' for k, v in features_dict.items()])}."
    }
    company_data["products"].append(product_details)

# Save company data to a JSON file (optional)
with open("company_data.json", "w") as f:
    json.dump(company_data, f, indent=4)

print("Company data generated and saved to company_data.json.")

Company data generated and saved to company_data.json.


In [5]:
import json

# Load company data
with open("company_data.json", "r") as f:
    company_data = json.load(f)

# Prepare fine-tuning data
fine_tuning_data = []

# Add company information as a separate prompt-completion pair
company_prompt = f"Tell me about the company {company_data['company_name']}."
company_completion = (
    f"{company_data['company_name']} is a {company_data['company_type']} company headquartered in {company_data['headquarters']}. "
    f"It was launched on {company_data['launch_date']}. {company_data['description']} "
    f"You can contact them at email: {company_data['contact_email']} or phone: {company_data['contact_phone']}. "
    f"Visit their website at {company_data['website']}."
)
fine_tuning_data.append({"prompt": company_prompt, "completion": company_completion})

# Add product information
for product in company_data["products"]:
    product_prompt = f"Describe the insurance product {product['name']} offered by {company_data['company_name']}."
    product_completion = product["description"]
    fine_tuning_data.append({"prompt": product_prompt, "completion": product_completion})

# Save fine-tuning data to a JSONL file
with open("fine_tuning_data.jsonl", "w") as f:
    for item in fine_tuning_data:
        f.write(json.dumps(item) + "\n")

print("Fine-tuning data prepared and saved to fine_tuning_data.jsonl.")

Fine-tuning data prepared and saved to fine_tuning_data.jsonl.


In [6]:
# Load the fine-tuning data
dataset = load_dataset("json", data_files="fine_tuning_data.jsonl")

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
# Tokenize the dataset

# Set pad_token to eos_token (common workaround for GPT-2 models)
tokenizer.pad_token = tokenizer.eos_token

# Define the tokenize function
def tokenize_function(examples):
    # Tokenize 'text' and 'response' fields
    inputs = tokenizer(examples["prompt"], padding="max_length", truncation=True, max_length=128)
    outputs = tokenizer(examples["completion"], padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = outputs["input_ids"]  # Set the response as labels for training
    return inputs

# Apply tokenization to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

In [8]:
# # Train test split if necessary
# # Split the training data into training and validation sets (e.g., 80% train, 20% validation)
# train_val_split = tokenized_datasets["train"].train_test_split(test_size=0.2)

# # Extract the train and eval datasets
# train_dataset = train_val_split["train"]
# eval_dataset = train_val_split["test"]

In [9]:
# Set up training arguments

os.environ["TOKENIZERS_PARALLELISM"] = "false"
training_args = TrainingArguments(
    output_dir=save_dir,  # Directory to save the fine-tuned model
    per_device_train_batch_size=4,   # Batch size per device
    num_train_epochs=3,              # Number of training epochs
    save_steps=10_000,               # Save checkpoint every 10,000 steps
    save_total_limit=2,              # Keep only the last 2 checkpoints
    logging_dir="./logs",            # Directory for logs
    logging_steps=500,               # Log every 500 steps
    eval_strategy="steps",     # Evaluate every `eval_steps`
    eval_steps=500,                  # Evaluation interval
    warmup_steps=500,                # Warmup steps for learning rate scheduler
    weight_decay=0.01,               # Weight decay for regularization
    fp16=True,                       # Enable mixed precision training (if GPU is available)
)

In [11]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
)

In [12]:
# Fine-tune the model
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=69, training_loss=7.36752584706182, metrics={'train_runtime': 8.543, 'train_samples_per_second': 31.956, 'train_steps_per_second': 8.077, 'total_flos': 17833181184000.0, 'train_loss': 7.36752584706182, 'epoch': 3.0})

In [13]:
# Save the fine-tuned model
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"Fine-tuning complete. Model saved to {save_dir}.")

Fine-tuning complete. Model saved to ../../models/fine_tuned_model.
