### Step 1: Create a new dataset manually



In [3]:
pip install Faker

Collecting Faker
  Downloading faker-37.5.3-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.5.3-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Faker
Successfully installed Faker-37.5.3


In [4]:
# Faker (a standalone library, not part of NumPy) generates the random patient
# names; NumPy generates the random weights and heights.

import numpy as np
import pandas as pd
from faker import Faker
fake = Faker()

In [6]:
# Generate 20 fake patients and save them to patients_data.csv.
#
# Both random sources are seeded so that Restart Kernel -> Run All produces the
# same dataset every time (without a seed the names, weights and heights
# change on every run).
SEED = 42
Faker.seed(SEED)                     # seeds the generator used by `fake`
rng = np.random.default_rng(SEED)    # dedicated, seeded NumPy generator

n = 20
patients = [fake.name() for _ in range(n)]
# The upper bound is exclusive, exactly as with np.random.randint.
# NOTE: weights and heights are drawn independently of each other, so some of
# the resulting BMI values will be unrealistic.
weights = rng.integers(45, 100, size=n)   # kg, 45..99
heights = rng.integers(140, 220, size=n)  # cm, 140..219

df_data = pd.DataFrame({
    "Patient Name": patients,
    "Weights (kg)": weights,
    "Heights (cm)": heights,
})
# index=False keeps the row index out of the file.
df_data.to_csv("patients_data.csv", index=False)
print(df_data.head())

     Patient Name  Weights (kg)  Heights (cm)
0     Marc Farmer            64           156
1      Eric Brown            46           215
2    Patricia Kim            49           218
3      Erin Moore            89           209
4  David Combs MD            89           160


With `index=True` (the default), `to_csv` also writes the DataFrame's index as the first column. Because the index has no name, the header row begins with an empty field (so the line starts with a comma), and each data row begins with its index value: 0 through 19 for these 20 rows.

Here we pass `index=False`, so the file contains only the data columns:
Patient Name,Weights (kg),Heights (cm)  
Mr. Jeffery Faulkner DDS,46,199  
Christopher Sanders,88,199  
Mrs. Jessica Carson,62,152  
...  
The CSV file contains exactly these lines.

With `index=True`, the contents would instead be:
,Patient Name,Weights (kg),Heights (cm)  
0,Mr. Jeffery Faulkner DDS,46,199  
1,Christopher Sanders,88,199  
2,Mrs. Jessica Carson,62,152  
...  
The CSV then has an extra, unnamed first column holding the index values.



### Step 2: Read the csv and extract columns as Numpy arrays

In [7]:
# Load the patient table from patients_data.csv and preview its first five rows.
df = pd.read_csv("patients_data.csv")
print(df.head(n=5))

     Patient Name  Weights (kg)  Heights (cm)
0     Marc Farmer            64           156
1      Eric Brown            46           215
2    Patricia Kim            49           218
3      Erin Moore            89           209
4  David Combs MD            89           160


In [9]:
# Extract each column as a plain NumPy array. `.to_numpy()` always returns an
# ndarray, whereas `.values` can return a pandas extension array for some
# column dtypes.
patient_names = df["Patient Name"].to_numpy()      # patient names (strings)
patient_weights = df["Weights (kg)"].to_numpy()    # weights in kg (integers)
patient_heights = df["Heights (cm)"].to_numpy()    # heights in cm (integers)
patient_heights_m = patient_heights / 100          # cm -> m; true division yields floats

# Preview only the first five entries; note the heights shown are in metres.
print('Patients names are:',patient_names[:5])
print('Patients weights are:',patient_weights[:5])
print('Patients heights are:',patient_heights_m[:5])

Patients names are: ['Marc Farmer' 'Eric Brown' 'Patricia Kim' 'Erin Moore' 'David Combs MD']
Patients weights are: [64 46 49 89 89]
Patients heights are: [1.56 2.15 2.18 2.09 1.6 ]


### Step 3: Calculate BMI for the whole array

In [10]:
# BMI = weight (kg) divided by height (m) squared, computed element-wise
# for every patient at once.
bmi = patient_weights / (patient_heights_m * patient_heights_m)
print(bmi)

[26.29848784  9.95132504 10.31057992 20.37499142 34.765625   22.28259211
 16.9865806  30.99173554  9.85939643 26.89767019 20.23950076 20.38156971
 46.42857143 25.390625   15.78848148 28.37821519 21.27600011 12.49872462
 10.69962442 26.31506812]


### Step 4: Categorize BMI ranges for the population using np.where()

In [11]:
# Step 4: Assign every BMI value to a category.
# np.select picks the label of the FIRST condition that is true, so the
# conditions are listed in ascending order of their cut-offs; anything that
# matches none of them (BMI of 30 or more) falls through to "Obese".
categories = np.select(
    [bmi < 18.5, bmi < 25, bmi < 30],
    ["Underweight", "Normal", "Overweight"],
    default="Obese",
)


In [12]:
# Step 4 (continued): count the patients in each category and express each
# count as a percentage of all patients. np.unique returns the labels in
# sorted order together with their counts.
labels, counts = np.unique(categories, return_counts=True)
percentages = 100 * (counts / len(categories))

### Step 5: Display Results

In [13]:
# Step 5: Display results - one line per patient with the BMI (two decimals)
# and the category it falls into.
print("=== Patient BMI Categories ===")
for name, b, cat in zip(patient_names, bmi, categories):
    line = f"{name}: BMI={b:.2f} → {cat}"
    print(line)

=== Patient BMI Categories ===
Marc Farmer: BMI=26.30 → Overweight
Eric Brown: BMI=9.95 → Underweight
Patricia Kim: BMI=10.31 → Underweight
Erin Moore: BMI=20.37 → Normal
David Combs MD: BMI=34.77 → Obese
Austin Wright: BMI=22.28 → Normal
Mr. Daniel Wright MD: BMI=16.99 → Underweight
Morgan Smith: BMI=30.99 → Obese
Jeremy Warren: BMI=9.86 → Underweight
Carl Ward: BMI=26.90 → Overweight
Heather Robinson: BMI=20.24 → Normal
Victoria Harris: BMI=20.38 → Normal
Mr. Allen Thomas Jr.: BMI=46.43 → Obese
Pamela Campbell: BMI=25.39 → Overweight
Tracy Johns: BMI=15.79 → Underweight
Nicole Williams DDS: BMI=28.38 → Overweight
Anna Terry: BMI=21.28 → Normal
Nathan Patterson: BMI=12.50 → Underweight
Renee Moon: BMI=10.70 → Underweight
Laura Hill: BMI=26.32 → Overweight


In [14]:
# Summarise the population: number of patients and share (one decimal)
# for each BMI category.
print("\n=== Category Distribution ===")
for label, count, perc in zip(labels, counts, percentages):
    summary = f"{label}: {count} patients ({perc:.1f}%)"
    print(summary)


=== Category Distribution ===
Normal: 5 patients (25.0%)
Obese: 3 patients (15.0%)
Overweight: 5 patients (25.0%)
Underweight: 7 patients (35.0%)
