In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer 
from sklearn.metrics import classification_report, accuracy_score 

In [46]:
df = pd.read_csv('insurance.csv')
df['occupation'].unique()

array(['retired', 'freelancer', 'student', 'government_job',
       'business_owner', 'unemployed', 'private_job'], dtype=object)

In [5]:
# copy our data sets because we are doing different feature engineering 
df_copy = df.copy()

In [6]:
# Feature 1: BMI
df_copy['BMI'] = df_copy['weight']/(df_copy['height']**2)
df_copy

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,BMI
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High,49.227482
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low,30.189017
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low,21.118382
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium,45.535900
4,69,62.2,1.60,3.94000,True,Indore,retired,High,24.296875
...,...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low,21.420747
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low,47.984483
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low,18.765432
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low,30.521676


In [13]:
# Feature 2: Age Group 

def age_group(age):
    if age < 25:
        return 'young'
    elif age < 45:
        return 'adult'
    elif  age < 60:
        return 'middle_aged'
    return 'senior'

In [15]:
df_copy['age_group'] = df_copy['age'].apply(age_group)

In [19]:
# Feature 3: Lifestyle Risk

def life_style(row):
    if row['smoker'] and row['BMI'] > 30:
        return 'heigh'
    elif row['smoker'] and row['BMI'] > 27:
        return 'medium'
    return 'low'


In [20]:
df_copy['life_style'] = df_copy.apply(life_style, axis = 1)
df_copy

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,BMI,age_group,life_style
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High,49.227482,senior,low
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low,30.189017,adult,low
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low,21.118382,adult,low
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium,45.535900,young,heigh
4,69,62.2,1.60,3.94000,True,Indore,retired,High,24.296875,senior,low
...,...,...,...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low,21.420747,adult,low
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low,47.984483,adult,low
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low,18.765432,middle_aged,low
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low,30.521676,adult,low


In [21]:
tier_1_cities =  ['Mumbai', 'Delhi', 'Bangalore', 'Chenni', 'Kolkota', 'Hyderabad', 'Pune']


tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]


In [22]:
# Feature 4: City Tier
def city_tier(city):
    if city in tier_1_cities:
        return 1
    elif city in tier_2_cities:
        return 2
    return 3

In [23]:
df_copy['city_tier'] = df_copy['city'].apply(city_tier)
df_copy

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,BMI,age_group,life_style,city_tier
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High,49.227482,senior,low,2
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low,30.189017,adult,low,3
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low,21.118382,adult,low,2
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium,45.535900,young,heigh,1
4,69,62.2,1.60,3.94000,True,Indore,retired,High,24.296875,senior,low,2
...,...,...,...,...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low,21.420747,adult,low,2
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low,47.984483,adult,low,1
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low,18.765432,middle_aged,low,1
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low,30.521676,adult,low,3


In [25]:
df_copy.drop(columns=['age', 'weight', 'height', 'smoker', 'city'])

Unnamed: 0,income_lpa,occupation,insurance_premium_category,BMI,age_group,life_style,city_tier
0,2.92000,retired,High,49.227482,senior,low,2
1,34.28000,freelancer,Low,30.189017,adult,low,3
2,36.64000,freelancer,Low,21.118382,adult,low,2
3,3.34000,student,Medium,45.535900,young,heigh,1
4,3.94000,retired,High,24.296875,senior,low,2
...,...,...,...,...,...,...,...
95,19.64000,business_owner,Low,21.420747,adult,low,2
96,34.01000,private_job,Low,47.984483,adult,low,1
97,44.86000,freelancer,Low,18.765432,middle_aged,low,1
98,28.30000,business_owner,Low,30.521676,adult,low,3


In [29]:
df_copy.drop(columns=['age', 'weight', 'height', 'smoker', 'city'])[['income_lpa', 'occupation', 'BMI', 'age_group', 'life_style', 'city_tier', 'insurance_premium_category']].sample(5)
     

Unnamed: 0,income_lpa,occupation,BMI,age_group,life_style,city_tier,insurance_premium_category
83,2.16,retired,24.338934,senior,low,3,High
62,35.67,business_owner,21.738481,adult,low,3,Low
27,34.33,private_job,35.159702,middle_aged,low,2,Medium
58,3.31,retired,29.930402,senior,medium,2,High
46,25.57,unemployed,33.672766,adult,heigh,3,High


In [33]:
# select feature and target

X = df_copy[['BMI', 'age_group', 'life_style', 'city_tier', 'income_lpa', 'occupation']]
y = df_copy['insurance_premium_category']


In [34]:
X

Unnamed: 0,BMI,age_group,life_style,city_tier,income_lpa,occupation
0,49.227482,senior,low,2,2.92000,retired
1,30.189017,adult,low,3,34.28000,freelancer
2,21.118382,adult,low,2,36.64000,freelancer
3,45.535900,young,heigh,1,3.34000,student
4,24.296875,senior,low,2,3.94000,retired
...,...,...,...,...,...,...
95,21.420747,adult,low,2,19.64000,business_owner
96,47.984483,adult,low,1,34.01000,private_job
97,18.765432,middle_aged,low,1,44.86000,freelancer
98,30.521676,adult,low,3,28.30000,business_owner


In [35]:
y

0       High
1        Low
2        Low
3     Medium
4       High
       ...  
95       Low
96       Low
97       Low
98       Low
99       Low
Name: insurance_premium_category, Length: 100, dtype: object

In [37]:
# Define categorical and numeric features
categorical_features = ["age_group", "life_style", "occupation", "city_tier"]
numeric_features = ["BMI", "income_lpa"]

In [38]:
# Create column transformer for OHE
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

In [39]:
# Create a pipeline with preprocessing and random forest classifier
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

In [40]:
# Split data and train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
pipeline.fit(X_train, y_train)

In [41]:
# Predict and evaluate
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.8

In [42]:
X_test.sample(5)

Unnamed: 0,BMI,age_group,life_style,city_tier,income_lpa,occupation
51,38.827923,middle_aged,heigh,2,28.95,private_job
92,18.319942,adult,low,2,30.0,government_job
36,21.713266,senior,low,1,0.53,retired
33,21.791064,senior,low,1,1.46,retired
10,22.949982,adult,low,3,32.78,business_owner


In [44]:
import pickle 

# save the trained pipeline using pickle

pickle_model_path = 'model.pkl'
with open(pickle_model_path, 'wb') as f:
    pickle.dump(pipeline, f)

In [45]:
ls

[34m__pycache__[m[m/            main.py                 patients.json
Fastapi_ml_model.ipynb  model.pkl               pydantic_demo.py
insurance.csv           [34mmyenv[m[m/
