In [7]:
import pandas as pd
import random
import string
from datetime import datetime, timedelta

# Helper functions
def random_string(length):
    """Return a string of `length` characters drawn at random from ASCII letters.

    Characters are sampled with replacement through ``random.choices``, so the
    result depends on the state of the module-level ``random`` generator.
    """
    alphabet = string.ascii_letters
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)

def random_date(start, end):
    """Return `start` moved forward by a random whole number of days.

    The offset is drawn with ``random.randint`` from 0 through the number of
    whole days in ``end - start``, both bounds included.
    """
    span_days = (end - start).days
    offset = timedelta(days=random.randint(0, span_days))
    return start + offset

def generate_customer_data(num_records):
    """Build a DataFrame of `num_records` synthetic customer records.

    Each record gets a zero-padded 15-character sequential ``CID`` (starting
    at 1), randomly generated name parts, a date of birth between 1970-01-01
    and 2000-01-01, an e-mail address derived from the first and last name,
    a 9-digit ``SSN_TIN`` string and a start date between 2000-01-01 and
    2023-01-01. ``CMiddle``, ``CSuffix``, ``CSalutation``, ``SSNType``,
    ``PreferredLanguage`` and ``EndDate`` may be ``None``.

    Args:
        num_records: Number of customers to generate. Zero (or a negative
            number) yields an empty frame that still has every column.

    Returns:
        pandas.DataFrame with one row per customer and a fixed column set.
    """
    # Passing the column list explicitly keeps the schema stable even when no
    # rows are generated, so downstream lookups such as df["CID"] keep working.
    columns = [
        "CID", "CLast", "CFirst", "CMiddle", "CSuffix", "CDOB", "CSalutation",
        "CEmailAddress", "Gender", "SSN_TIN", "SSNType", "PreferredLanguage",
        "StartDate", "EndDate", "CreatedAt", "UpdatedAt",
    ]
    data = []
    dob_start_date = datetime(1970, 1, 1)
    dob_end_date = datetime(2000, 1, 1)
    for i in range(1, num_records + 1):
        cid = f"{i:015}"
        clast = random_string(random.randint(3, 10)).capitalize()
        cfirst = random_string(random.randint(3, 10)).capitalize()
        # Roughly half of the customers have no middle name.
        cmiddle = random_string(random.randint(3, 10)).capitalize() if random.random() > 0.5 else None
        csuffix = random.choice(['Jr.', 'Sr.', 'III', None])
        cdob = random_date(dob_start_date, dob_end_date)
        csalutation = random.choice(['Mr.', 'Ms.', 'Dr.', None])
        cemailaddress = f"{cfirst.lower()}.{clast.lower()}@example.com"
        gender = random.choice(['M', 'F', 'U'])
        ssn_tin = ''.join(random.choices(string.digits, k=9))
        ssn_type = random.choice(['SSN', 'TIN', None])
        preferred_language = random.choice(['English', 'Spanish', 'French', 'German', 'Chinese', None])
        start_date = random_date(datetime(2000, 1, 1), datetime(2023, 1, 1))
        # Roughly half of the customers get an end date 30-1000 days after the start.
        end_date = start_date + timedelta(days=random.randint(30, 1000)) if random.random() > 0.5 else None
        # A freshly generated record has never been modified, so both audit
        # timestamps come from a single clock reading.
        created_at = updated_at = datetime.now()

        data.append({
            "CID": cid,
            "CLast": clast,
            "CFirst": cfirst,
            "CMiddle": cmiddle,
            "CSuffix": csuffix,
            "CDOB": cdob,
            "CSalutation": csalutation,
            "CEmailAddress": cemailaddress,
            "Gender": gender,
            "SSN_TIN": ssn_tin,
            "SSNType": ssn_type,
            "PreferredLanguage": preferred_language,
            "StartDate": start_date,
            "EndDate": end_date,
            "CreatedAt": created_at,
            "UpdatedAt": updated_at
        })
    return pd.DataFrame(data, columns=columns)

# Seed the module-level generator before any data is drawn so that re-running
# the notebook from this point repeats the same random draws (values taken
# from the system clock, such as CreatedAt, still differ between runs).
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Number of synthetic customers to generate.
NUM_CUSTOMERS = 1000
customer_data = generate_customer_data(NUM_CUSTOMERS)

def generate_customer_address_data(customer_df):
    """Generate one synthetic address record for each row of `customer_df`.

    Args:
        customer_df: DataFrame whose rows provide a ``CID`` value; that value
            is copied into the matching address record.

    Returns:
        pandas.DataFrame with the columns ``CID``, ``CAddress``, ``CCity``,
        ``CState``, ``CZip``, ``IsActive`` and ``CreatedAt``. The columns are
        present even when `customer_df` has no rows.
    """
    # City and state are drawn together as a pair so a record never combines
    # a city with a state it does not belong to.
    city_state_pairs = [
        ("New York", "NY"),
        ("Los Angeles", "CA"),
        ("Chicago", "IL"),
        ("Houston", "TX"),
        ("Phoenix", "AZ"),
    ]
    columns = ["CID", "CAddress", "CCity", "CState", "CZip", "IsActive", "CreatedAt"]

    address_data = []
    for _, row in customer_df.iterrows():
        cid = row["CID"]
        caddress = f"{random.randint(1, 9999)} {random_string(random.randint(5, 15))} Street"
        ccity, cstate = random.choice(city_state_pairs)
        czip = ''.join(random.choices(string.digits, k=5))
        is_active = random.choice([True, False])
        created_at = datetime.now()

        address_data.append({
            "CID": cid,
            "CAddress": caddress,
            "CCity": ccity,
            "CState": cstate,
            "CZip": czip,
            "IsActive": is_active,
            "CreatedAt": created_at
        })
    # Explicit columns keep the schema stable for an empty input frame.
    return pd.DataFrame(address_data, columns=columns)

# Build one address record per customer row, carrying over the same CID.
customer_address_data = generate_customer_address_data(customer_data)

# Preview the first ten customer records as this cell's output.
customer_data.head(10)


Unnamed: 0,CID,CLast,CFirst,CMiddle,CSuffix,CDOB,CSalutation,CEmailAddress,Gender,SSN_TIN,SSNType,PreferredLanguage,StartDate,EndDate,CreatedAt,UpdatedAt
0,1,Ukrgvfuw,Tvsmkgcbl,Lkksltrbz,,1997-12-01,Dr.,tvsmkgcbl.ukrgvfuw@example.com,M,439772102,,English,2000-01-05,NaT,2024-12-04 18:14:53.207947,2024-12-04 18:14:53.207957
1,2,Qcnurdfbu,Abalzfaefv,Opcwx,Sr.,1974-03-21,Mr.,abalzfaefv.qcnurdfbu@example.com,M,397583584,,French,2021-02-12,2021-04-11,2024-12-04 18:14:53.207976,2024-12-04 18:14:53.207976
2,3,Pmdshbq,Ccgzl,,Sr.,1981-06-11,Mr.,ccgzl.pmdshbq@example.com,F,937163931,SSN,German,2008-09-15,2011-05-07,2024-12-04 18:14:53.207991,2024-12-04 18:14:53.207991
3,4,Afdyonaq,Mpyi,,,1973-08-09,,mpyi.afdyonaq@example.com,F,465016774,SSN,English,2006-06-10,2007-02-09,2024-12-04 18:14:53.208004,2024-12-04 18:14:53.208005
4,5,Rqpi,Nvx,Fpunkqekd,Jr.,1994-10-30,Mr.,nvx.rqpi@example.com,F,881451967,,,2021-12-22,2022-02-16,2024-12-04 18:14:53.208020,2024-12-04 18:14:53.208020
5,6,Hkkjwaa,Cuibklu,,III,1987-12-25,Dr.,cuibklu.hkkjwaa@example.com,M,369049072,TIN,Spanish,2001-10-02,2003-06-21,2024-12-04 18:14:53.208034,2024-12-04 18:14:53.208035
6,7,Ohldmqefu,Saaqm,,,1976-02-26,,saaqm.ohldmqefu@example.com,U,141489852,,English,2013-07-03,2016-01-26,2024-12-04 18:14:53.208048,2024-12-04 18:14:53.208048
7,8,Amfotgrtjp,Jbpb,Mcvyokocb,Jr.,1997-06-29,Ms.,jbpb.amfotgrtjp@example.com,F,135053380,SSN,Spanish,2009-03-17,NaT,2024-12-04 18:14:53.208062,2024-12-04 18:14:53.208063
8,9,Bcqxtwae,Hcdco,,,1984-10-16,Dr.,hcdco.bcqxtwae@example.com,F,169108597,SSN,Chinese,2013-02-02,NaT,2024-12-04 18:14:53.208074,2024-12-04 18:14:53.208074
9,10,Ujtydj,Lrpuw,,Jr.,1971-05-17,Dr.,lrpuw.ujtydj@example.com,U,291680712,,English,2014-07-01,NaT,2024-12-04 18:14:53.208085,2024-12-04 18:14:53.208086


In [8]:
import random
from decimal import Decimal
from datetime import datetime, timedelta

def random_date(start_date, end_date):
    """Return `start_date` shifted forward by a random whole number of days.

    The shift is drawn with ``random.randint`` from 0 up to and including the
    day count of ``end_date - start_date``.
    """
    # NOTE: this reuses the name of the `random_date(start, end)` helper from
    # the earlier cell; once this cell runs, this definition is the one in effect.
    return start_date + timedelta(days=random.randint(0, (end_date - start_date).days))

def generate_health_data(customer_id, start_date, end_date):
    """Create a health checkup, a medical visit and a lab report for one customer.

    The checkup date comes from ``random_date(start_date, end_date)``; the
    medical visit falls 0-5 days after the checkup and the lab report 0-3 days
    after the visit.

    Returns:
        dict with the keys 'healthCheckup', 'medicalVisit' and 'labReport',
        each mapping to a nested record dict carrying `customer_id`.
    """
    # The three dates are drawn first; the records are then built in
    # checkup -> visit -> lab order so the sequence of random draws is fixed.
    checkup_on = random_date(start_date, end_date)
    visit_on = checkup_on + timedelta(days=random.randint(0, 5))
    lab_on = visit_on + timedelta(days=random.randint(0, 3))

    checkup = _health_checkup_record(customer_id, checkup_on)
    visit = _medical_visit_record(customer_id, visit_on)
    # The lab report names the visit's provider as the ordering physician.
    lab = _lab_report_record(customer_id, lab_on, visit['provider'])

    return {
        'healthCheckup': checkup,
        'medicalVisit': visit,
        'labReport': lab
    }


def _day_string(moment):
    """Format a date/datetime as 'YYYY-MM-DD'."""
    return moment.strftime('%Y-%m-%d')


def _document_id(prefix, moment):
    """Build '<prefix>-<year>-<random 3-digit number>' for a record dated `moment`."""
    return f'{prefix}-{moment.year}-{random.randint(100, 999)}'


def _health_checkup_record(customer_id, checkup_on):
    """Assemble the health checkup record dated `checkup_on`."""
    document_id = _document_id('HC', checkup_on)
    blood_pressure = {
        'systolic': random.randint(110, 130),
        'diastolic': random.randint(70, 90)
    }
    # Temperature and BMI are rounded to one decimal place and stored as Decimal.
    vital_signs = {
        'bloodPressure': blood_pressure,
        'pulseRate': random.randint(60, 100),
        'temperature': Decimal(f'{random.uniform(97.0, 99.0):.1f}'),
        'bmi': Decimal(f'{random.uniform(18.5, 30.0):.1f}')
    }
    exam_findings = {
        'general': 'Patient appears healthy',
        'cardiovascular': 'Regular rate and rhythm',
        'respiratory': 'Clear to auscultation bilaterally',
        'musculoskeletal': 'No abnormalities'
    }
    physician = {
        'name': f'Dr. {random.choice(["Sarah Smith", "Emily Brown", "David Johnson"])}',
        'id': f'PHY-{random.randint(100, 999)}'
    }
    return {
        'documentId': document_id,
        'customerId': customer_id,
        'type': 'healthCheckup',
        'date': _day_string(checkup_on),
        'vitalSigns': vital_signs,
        'examFindings': exam_findings,
        'recommendations': 'Maintain a balanced diet, follow up in 1 year',
        'examiningPhysician': physician
    }


def _medical_visit_record(customer_id, visit_on):
    """Assemble the medical visit record dated `visit_on`."""
    document_id = _document_id('MV', visit_on)
    claim_number = f'CLM-{visit_on.year}-{random.randint(1000, 9999)}'
    facility = {
        'name': random.choice(['City Medical Center', 'Community Clinic']),
        'id': f'FAC-{random.randint(100, 999)}',
        'type': random.choice(['outpatient', 'inpatient'])
    }
    provider = {
        'name': f'Dr. {random.choice(["John Lee", "Anna Miller", "Robert Davis"])}',
        'id': f'PHY-{random.randint(100, 999)}',
        'specialty': random.choice(['Internal Medicine', 'Family Medicine', 'Pediatrics'])
    }
    # Secondary diagnoses may be empty (0-2 entries); procedures and
    # medications always contain at least one entry.
    diagnosis = {
        'primary': random.choice(['Acute bronchitis', 'Migraine', 'Hypertension']),
        'secondary': random.sample(
            ['Seasonal allergies', 'Vitamin D deficiency', 'Obesity'],
            k=random.randint(0, 2),
        )
    }
    treatment = {
        'procedures': random.sample(
            ['Chest X-ray', 'Blood test', 'Physical therapy'],
            k=random.randint(1, 3),
        ),
        'medications': random.sample(
            ['Ibuprofen 200mg', 'Amoxicillin 500mg', 'Albuterol inhaler'],
            k=random.randint(1, 2),
        ),
        'instructions': 'Take medications as prescribed and follow up if symptoms persist'
    }
    # A follow-up date 7-30 days after the visit is recorded whether or not
    # 'required' is True.
    follow_up = {
        'required': random.choice([True, False]),
        'date': _day_string(visit_on + timedelta(days=random.randint(7, 30))),
        'notes': 'Return if symptoms worsen'
    }
    return {
        'documentId': document_id,
        'customerId': customer_id,
        'type': 'medicalVisit',
        'date': _day_string(visit_on),
        'claimNumber': claim_number,
        'facility': facility,
        'provider': provider,
        'diagnosis': diagnosis,
        'treatment': treatment,
        'followUp': follow_up
    }


def _lab_report_record(customer_id, lab_on, ordering_provider):
    """Assemble the lab report dated `lab_on`, ordered by `ordering_provider`."""
    document_id = _document_id('LR', lab_on)
    # The report is issued 1-3 days after the lab date.
    report_date = _day_string(lab_on + timedelta(days=random.randint(1, 3)))
    ordered_by = {
        'name': ordering_provider['name'],
        'id': ordering_provider['id']
    }
    facility = {
        'name': random.choice(['City Lab Services', 'Health Diagnostics Lab']),
        'id': f'LAB-{random.randint(100, 999)}'
    }
    # (test name, lower bound, upper bound, unit, reference range text);
    # each value is drawn between the bounds and kept as a one-decimal string.
    panel = (
        ('WBC', 4.5, 11.0, 'K/uL', '4.5-11.0'),
        ('Hemoglobin', 13.5, 17.5, 'g/dL', '13.5-17.5'),
    )
    results = [
        {
            'testName': test_name,
            'value': f'{random.uniform(low, high):.1f}',
            'unit': unit,
            'referenceRange': reference_range,
            'interpretation': 'Normal',
            'flags': []
        }
        for test_name, low, high, unit, reference_range in panel
    ]
    return {
        'documentId': document_id,
        'customerId': customer_id,
        'type': 'labReport',
        'date': _day_string(lab_on),
        'reportDate': report_date,
        'orderedBy': ordered_by,
        'facility': facility,
        'testCategory': 'Complete Blood Count',
        'results': results,
        'notes': 'All results within normal ranges'
    }



In [9]:
# Build the health records (checkup, visit, lab report) for every customer row
def generate_health_data_for_customers(customer_df, start_date, end_date):
    """Return one health-data entry per row of `customer_df`.

    Each entry is a dict holding the row's ``CID`` under 'customerId' together
    with the 'healthCheckup', 'medicalVisit' and 'labReport' records produced
    by ``generate_health_data`` for that customer and date window.
    """
    def _entry_for(cid):
        records = generate_health_data(cid, start_date, end_date)
        return {
            'customerId': cid,
            'healthCheckup': records['healthCheckup'],
            'medicalVisit': records['medicalVisit'],
            'labReport': records['labReport']
        }

    return [_entry_for(row['CID']) for _, row in customer_df.iterrows()]

# Date window from which each customer's health checkup date is drawn
start_date = datetime(year=2024, month=1, day=1)
end_date = datetime(year=2024, month=12, day=31)

# One entry (checkup, medical visit, lab report) per customer
health_data_all = generate_health_data_for_customers(
    customer_df=customer_data,
    start_date=start_date,
    end_date=end_date,
)

# Flatten the nested records into one wide row per customer (nested keys are
# joined with "_"), convert the two Decimal-valued vital-sign columns to a
# numeric dtype, and preview the first ten rows
health_data = pd.json_normalize(health_data_all, sep="_")
health_data['healthCheckup_vitalSigns_temperature'] = pd.to_numeric(
    health_data['healthCheckup_vitalSigns_temperature']
)
health_data['healthCheckup_vitalSigns_bmi'] = pd.to_numeric(
    health_data['healthCheckup_vitalSigns_bmi']
)
health_data.head(10)


Unnamed: 0,customerId,healthCheckup_documentId,healthCheckup_customerId,healthCheckup_type,healthCheckup_date,healthCheckup_vitalSigns_bloodPressure_systolic,healthCheckup_vitalSigns_bloodPressure_diastolic,healthCheckup_vitalSigns_pulseRate,healthCheckup_vitalSigns_temperature,healthCheckup_vitalSigns_bmi,...,labReport_type,labReport_date,labReport_reportDate,labReport_orderedBy_name,labReport_orderedBy_id,labReport_facility_name,labReport_facility_id,labReport_testCategory,labReport_results,labReport_notes
0,1,HC-2024-162,1,healthCheckup,2024-06-06,120,81,76,97.9,30.0,...,labReport,2024-06-09,2024-06-12,Dr. Robert Davis,PHY-694,City Lab Services,LAB-209,Complete Blood Count,"[{'testName': 'WBC', 'value': '9.8', 'unit': '...",All results within normal ranges
1,2,HC-2024-863,2,healthCheckup,2024-01-23,112,74,95,97.4,27.4,...,labReport,2024-01-30,2024-01-31,Dr. Anna Miller,PHY-421,Health Diagnostics Lab,LAB-370,Complete Blood Count,"[{'testName': 'WBC', 'value': '9.2', 'unit': '...",All results within normal ranges
2,3,HC-2024-935,3,healthCheckup,2024-08-04,112,81,85,97.6,21.5,...,labReport,2024-08-09,2024-08-11,Dr. John Lee,PHY-239,Health Diagnostics Lab,LAB-430,Complete Blood Count,"[{'testName': 'WBC', 'value': '10.4', 'unit': ...",All results within normal ranges
3,4,HC-2024-457,4,healthCheckup,2024-02-25,115,70,85,97.9,19.3,...,labReport,2024-03-01,2024-03-02,Dr. Anna Miller,PHY-657,City Lab Services,LAB-829,Complete Blood Count,"[{'testName': 'WBC', 'value': '10.6', 'unit': ...",All results within normal ranges
4,5,HC-2024-843,5,healthCheckup,2024-09-29,128,79,86,97.2,22.4,...,labReport,2024-10-01,2024-10-04,Dr. Robert Davis,PHY-107,City Lab Services,LAB-562,Complete Blood Count,"[{'testName': 'WBC', 'value': '5.8', 'unit': '...",All results within normal ranges
5,6,HC-2024-879,6,healthCheckup,2024-04-07,115,71,95,97.6,26.7,...,labReport,2024-04-10,2024-04-13,Dr. John Lee,PHY-904,City Lab Services,LAB-599,Complete Blood Count,"[{'testName': 'WBC', 'value': '8.0', 'unit': '...",All results within normal ranges
6,7,HC-2024-542,7,healthCheckup,2024-09-29,125,81,74,98.6,19.4,...,labReport,2024-09-30,2024-10-01,Dr. Anna Miller,PHY-183,Health Diagnostics Lab,LAB-200,Complete Blood Count,"[{'testName': 'WBC', 'value': '6.2', 'unit': '...",All results within normal ranges
7,8,HC-2024-293,8,healthCheckup,2024-05-19,111,84,94,97.9,23.5,...,labReport,2024-05-26,2024-05-28,Dr. John Lee,PHY-670,City Lab Services,LAB-488,Complete Blood Count,"[{'testName': 'WBC', 'value': '5.2', 'unit': '...",All results within normal ranges
8,9,HC-2024-633,9,healthCheckup,2024-11-13,129,82,80,97.3,25.7,...,labReport,2024-11-16,2024-11-18,Dr. John Lee,PHY-864,City Lab Services,LAB-836,Complete Blood Count,"[{'testName': 'WBC', 'value': '7.2', 'unit': '...",All results within normal ranges
9,10,HC-2024-814,10,healthCheckup,2024-07-01,123,74,95,97.7,24.3,...,labReport,2024-07-01,2024-07-04,Dr. Anna Miller,PHY-680,Health Diagnostics Lab,LAB-453,Complete Blood Count,"[{'testName': 'WBC', 'value': '7.6', 'unit': '...",All results within normal ranges


In [10]:
def calculate_insurance_premium(row, customers=None, as_of=None, base_premium=500):
    """Calculate a monthly insurance premium from one flattened health record.

    The base premium is scaled by a multiplier built from:
      * BMI: x1.3 above 30, x1.15 above 25;
      * systolic blood pressure: x1.2 above 140, x1.1 above 130;
      * age in years at `as_of`: x1.4 above 60, x1.2 above 40;
      * x1.25 when the primary diagnosis text contains 'Hypertension';
      * a random factor drawn uniformly between 0.9 and 1.1.

    Args:
        row: Mapping or Series providing 'customerId',
            'healthCheckup_vitalSigns_bmi',
            'healthCheckup_vitalSigns_bloodPressure_systolic' and
            'medicalVisit_diagnosis_primary'.
        customers: DataFrame with 'CID' and 'CDOB' columns used to look up the
            date of birth. Defaults to the notebook-level ``customer_data``.
        as_of: Reference datetime for the age calculation. Defaults to
            ``datetime.now()``.
        base_premium: Monthly premium before any multiplier is applied.

    Returns:
        The premium rounded to two decimal places.

    Raises:
        IndexError: If no row of `customers` has a CID equal to
            ``row['customerId']``.
    """
    if customers is None:
        # Backward-compatible default: fall back to the notebook's customer table.
        customers = customer_data
    if as_of is None:
        as_of = datetime.now()

    multiplier = 1.0

    bmi = float(row['healthCheckup_vitalSigns_bmi'])
    if bmi > 30:
        multiplier *= 1.3
    elif bmi > 25:
        multiplier *= 1.15

    systolic = row['healthCheckup_vitalSigns_bloodPressure_systolic']
    if systolic > 140:
        multiplier *= 1.2
    elif systolic > 130:
        multiplier *= 1.1

    matches = customers[customers['CID'] == row['customerId']]
    if matches.empty:
        # Same exception type that `.iloc[0]` on an empty selection raises,
        # but with a message that names the missing customer.
        raise IndexError(f"customer {row['customerId']!r} not found in customer data")
    customer_info = matches.iloc[0]
    age = (as_of - customer_info['CDOB']).days / 365.25
    if age > 60:
        multiplier *= 1.4
    elif age > 40:
        multiplier *= 1.2

    if 'Hypertension' in str(row['medicalVisit_diagnosis_primary']):
        multiplier *= 1.25

    # Random +/-10% variation applied on top of the rule-based multiplier.
    multiplier *= random.uniform(0.9, 1.1)
    premium = base_premium * multiplier
    return round(premium, 2)

In [11]:
# Price every flattened health record; axis='columns' passes one row at a time.
health_data['monthly_premium'] = health_data.apply(func=calculate_insurance_premium, axis='columns')

In [12]:
# Write the flattened health data to a CSV file without the row index.
health_data.to_csv(path_or_buf='health_data.csv', index=False)