In [1]:
%pip install pandas numpy faker

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import random
from faker import Faker

# Initialize Faker with the generic English locale ("en"); no Hong Kong-specific locale is configured here
fake = Faker(locale="en")
# Output CSV path; the same string is stored in each generated record's 'document' field
filename = "motor_insurance_hk_data_non_pii_114423oct2024.csv"

# Function to generate random motor insurance data without PII
def generate_motor_insurance_data_non_pii(num_records):
    """Generate synthetic motor-insurance underwriting records without PII.

    Each record holds randomly drawn driver, vehicle, policy, risk and
    underwriting attributes. All attributes are flattened into one
    human-readable ``content`` string; only the identifiers and the
    underwriting outcome are also kept as separate columns.

    Relies on the module-level ``fake`` (Faker instance) for unique IDs and
    dates, and on the module-level ``filename`` for the ``document`` column.

    Args:
        num_records: Number of records to generate. Zero (or a negative
            value) yields an empty DataFrame that still has all columns.

    Returns:
        pandas.DataFrame with the columns ``document``, ``driver_id``,
        ``vehicle_id``, ``policy_id``, ``underwriting_decision``,
        ``risk_class``, ``reason_for_decline`` and ``content``.
    """
    columns = [
        'document',
        'driver_id',
        'vehicle_id',
        'policy_id',
        'underwriting_decision',
        'risk_class',
        'reason_for_decline',
        'content',
    ]

    # Youngest generated driver; also the earliest age at which driving
    # experience can start accumulating.
    min_driver_age = 18

    # Make and model are kept as pairs so a record never combines a make
    # with another manufacturer's model.
    vehicle_catalogue = [
        ('Toyota', 'Corolla'),
        ('BMW', 'X5'),
        ('Honda', 'Civic'),
        ('Mercedes-Benz', 'C-Class'),
        ('Audi', 'A4'),
        ('Bentley', 'Mulliner Batur'),
        ('Ferrari', 'Daytona SP3'),
        ('McLaren', 'Speedtail'),
    ]
    decline_reasons = ['Poor driving record', 'Low credit score', 'Multiple accidents', 'High vehicle value']

    data = []

    for _ in range(num_records):
        # --- Driver ---
        driver_id = fake.unique.random_number(digits=8, fix_len=True)
        age = random.randint(min_driver_age, 70)
        gender = random.choice(['Male', 'Female', 'Other'])
        # Experience cannot exceed the years elapsed since the minimum age,
        # so an 18-year-old gets 0 years rather than an impossible value.
        driving_experience_years = random.randint(0, age - min_driver_age)
        occupation = random.choice(['Professional', 'Clerical', 'Driver', 'Teacher'])

        # --- Vehicle ---
        vehicle_id = fake.unique.random_number(digits=8, fix_len=True)
        vehicle_make, vehicle_model = random.choice(vehicle_catalogue)
        year_of_manufacture = random.randint(2005, 2023)
        vehicle_value = round(random.uniform(100000, 1000000), 2)  # Vehicle value in HKD
        engine_size = random.choice([1000, 1500, 2000, 2500, 3000])
        registration_city = random.choice(['Hong Kong', 'Kowloon', 'New Territories'])

        # --- Policy ---
        policy_id = fake.unique.random_number(digits=10, fix_len=True)
        coverage_type = random.choice(['Third Party', 'Comprehensive'])
        premium_amount = round(random.uniform(5000, 20000), 2)  # HKD
        policy_term = 1  # Typically 1 year
        excess_amount = round(random.uniform(1000, 5000), 2)
        effective_date = fake.date_this_year()
        # Expiry is derived from the effective date and the policy term so the
        # three fields always agree (anniversary of the effective date).
        try:
            expiry_date = effective_date.replace(year=effective_date.year + policy_term)
        except ValueError:
            # 29 February has no anniversary in a non-leap year; use 28 February.
            expiry_date = effective_date.replace(year=effective_date.year + policy_term, day=28)
        renewal_status = random.choice(['Auto-renew', 'Non-renewal'])

        # --- Risk indicators ---
        driving_record = random.choice(['Clean', '1 Accident', 'Multiple Accidents'])
        claims_history = random.randint(0, 5)
        traffic_violations = random.randint(0, 3)
        vehicle_usage = random.choice(['Private', 'Commercial'])
        credit_score = random.randint(300, 850)

        # --- Underwriting outcome ---
        underwriting_decision = random.choice(['Approved', 'Declined', 'Pending'])
        risk_class = random.choice(['Low Risk', 'Medium Risk', 'High Risk'])
        # A decline reason is only meaningful for declined applications.
        reason_for_decline = random.choice(decline_reasons) if underwriting_decision == 'Declined' else None

        # Concatenate all the fields to form the content
        content = (
            f"Driver ID: {driver_id}, Age: {age}, Gender: {gender}, Driving Experience: {driving_experience_years} years, "
            f"Occupation: {occupation}, Vehicle: {vehicle_make} {vehicle_model}, Year: {year_of_manufacture}, Value: {vehicle_value} HKD, "
            f"Engine Size: {engine_size}cc, Registration City: {registration_city}, Policy ID: {policy_id}, Coverage: {coverage_type}, "
            f"Premium: {premium_amount} HKD, Policy Term: {policy_term} year(s), Excess: {excess_amount} HKD, Effective Date: {effective_date}, "
            f"Expiry Date: {expiry_date}, Renewal Status: {renewal_status}, Driving Record: {driving_record}, Claims History: {claims_history}, "
            f"Traffic Violations: {traffic_violations}, Vehicle Usage: {vehicle_usage}, Credit Score: {credit_score}, "
            f"Underwriting Decision: {underwriting_decision}, Risk Class: {risk_class}, Reason for Decline: {reason_for_decline}"
        )

        data.append({
            'document': filename,
            'driver_id': driver_id,
            'vehicle_id': vehicle_id,
            'policy_id': policy_id,
            'underwriting_decision': underwriting_decision,
            'risk_class': risk_class,
            'reason_for_decline': reason_for_decline,
            'content': content
        })

    # Passing the column list keeps the schema intact when no rows were generated.
    return pd.DataFrame(data, columns=columns)

# Build a synthetic dataset of 100 records
df = generate_motor_insurance_data_non_pii(num_records=100)

# Write every column of the dataset to CSV, leaving out the DataFrame index
df.to_csv(path_or_buf=filename, index=False)

# Preview the first five rows of the generated dataset
print(df.head(n=5))

                                            document  driver_id  vehicle_id  \
0  motor_insurance_hk_data_non_pii_114423oct2024.csv   81043220    44470499   
1  motor_insurance_hk_data_non_pii_114423oct2024.csv   38430726    85347174   
2  motor_insurance_hk_data_non_pii_114423oct2024.csv   95941257    94078724   
3  motor_insurance_hk_data_non_pii_114423oct2024.csv   37982188    16933533   
4  motor_insurance_hk_data_non_pii_114423oct2024.csv   59103327    10383184   

    policy_id underwriting_decision risk_class   reason_for_decline  \
0  7295152581               Pending  High Risk                 None   
1  7495839552              Approved   Low Risk                 None   
2  1125745011              Declined  High Risk   High vehicle value   
3  9377901969              Declined   Low Risk  Poor driving record   
4  2395931819               Pending   Low Risk                 None   

                                             content  
0  Driver ID: 81043220, Age: 20, Gender: Ma