In [3]:
import pandas as pd
import numpy as np

# --- Configuration ----------------------------------------------------------
# Seed NumPy's global RNG so every draw below is repeatable. The draws share
# that single random stream, so their order must not change or the generated
# values will differ.
np.random.seed(42)
n_entries = 10000

# --- Synthetic columns ------------------------------------------------------
# age: uniform integers in [20, 75) (upper bound exclusive).
# NOTE(review): this column was previously labelled as "days", but the range
# reads like years -- confirm the intended unit.
age = np.random.randint(low=20, high=75, size=n_entries)

# gender: string labels 'M' / 'F', each equally likely (not numeric codes).
gender = np.random.choice(['M', 'F'], n_entries)

# height: normal(mean=165, sd=10), truncated toward zero to an integer.
height = np.random.normal(165, 10, n_entries).astype(int)

# weight: normal(mean=70, sd=15), rounded to one decimal place.
weight = np.round(np.random.normal(70, 15, n_entries), 1)

# Blood pressure: ap_hi ~ normal(120, 15) and ap_lo ~ normal(80, 10), both
# truncated to integers. ap_lo is then capped so it is always at least
# 10 below ap_hi.
ap_hi = np.random.normal(120, 15, n_entries).astype(int)
ap_lo = np.random.normal(80, 10, n_entries).astype(int)
ap_lo = np.minimum(ap_hi - 10, ap_lo)

# cholesterol / gluc: ordinal levels 1, 2, 3 drawn with the listed weights.
cholesterol = np.random.choice([1, 2, 3], n_entries, p=[0.75, 0.15, 0.10])
gluc = np.random.choice([1, 2, 3], n_entries, p=[0.80, 0.15, 0.05])

# Binary 0/1 flags; p gives the probabilities of 0 and 1 respectively.
smoke = np.random.choice([0, 1], n_entries, p=[0.90, 0.10])
alco = np.random.choice([0, 1], n_entries, p=[0.95, 0.05])
active = np.random.choice([0, 1], n_entries, p=[0.20, 0.80])

# cardio: binary target, 0 and 1 equally likely and drawn independently of
# every other column.
cardio = np.random.choice([0, 1], n_entries)

# --- Assemble the table -----------------------------------------------------
# Keyword order here fixes the column order of the DataFrame and the CSV.
data = dict(
    age=age,
    gender=gender,
    height=height,
    weight=weight,
    ap_hi=ap_hi,
    ap_lo=ap_lo,
    cholesterol=cholesterol,
    gluc=gluc,
    smoke=smoke,
    alco=alco,
    active=active,
    cardio=cardio,
)
df = pd.DataFrame(data)

# --- Persist ----------------------------------------------------------------
# Written relative to the current working directory, without the index column.
csv_filename = 'medical_examination.csv'
df.to_csv(csv_filename, index=False)

print(f"Dataset successfully saved as: {csv_filename}")

Dataset successfully saved as: medical_examination.csv
