In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [2]:
df = pd.read_csv('insurance.csv')

In [3]:
df.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92,False,Jaipur,retired,High
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium
4,69,62.2,1.6,3.94,True,Indore,retired,High


In [5]:
df['occupation'].unique()

array(['retired', 'freelancer', 'student', 'government_job',
       'business_owner', 'unemployed', 'private_job'], dtype=object)

In [6]:
df_feat = df.copy()

In [7]:
df_feat['bmi'] = df_feat["weight"] / (df_feat["height"]/100)**2

In [8]:
df_feat.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi
0,67,119.8,1.56,2.92,False,Jaipur,retired,High,492274.819198
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low,301890.172893
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low,211183.819155
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium,455359.001041
4,69,62.2,1.6,3.94,True,Indore,retired,High,242968.75


In [9]:
def age_group(age):
  if age < 25:
    return "young"
  elif age < 45:
    return "adult"
  elif age < 60:
    return "middle_aged"
  else:
    return "senior"

In [10]:
df_feat['age_group'] = df_feat['age'].apply(age_group)

In [12]:
def lifestyle_risk(row):
  if row["smoker"] and row['bmi'] > 30:
    return "high"
  elif row["smoker"] and row['bmi'] > 27:
    return "medium"
  else:
    return "low"

In [13]:
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk, axis=1)

In [14]:

tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [15]:
def city_tier(city):
  if city in tier_1_cities:
    return 1
  elif city in tier_2_cities:
    return 2
  else:
    return 3


In [16]:
df_feat["city_tier"] = df["city"].apply(city_tier)

In [17]:
df_feat.drop(columns=['age', 'weight', 'height', 'smoker', 'city'])[['income_lpa', 'occupation', 'bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'insurance_premium_category']].sample(5)


Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
56,2.86,student,424141.519251,young,high,1,Medium
36,0.53,retired,217132.659131,senior,low,1,Medium
25,30.0,government_job,250572.320499,middle_aged,low,2,Low
46,25.57,unemployed,336727.656294,adult,high,1,High
99,28.16664,government_job,276887.781338,adult,high,1,Low


In [18]:


# Select features and target
X = df_feat[["bmi", "age_group", "lifestyle_risk", "city_tier", "income_lpa", "occupation"]]
y = df_feat["insurance_premium_category"]

In [19]:
X


Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,492274.819198,senior,low,2,2.92000,retired
1,301890.172893,adult,low,1,34.28000,freelancer
2,211183.819155,adult,low,2,36.64000,freelancer
3,455359.001041,young,high,1,3.34000,student
4,242968.750000,senior,high,2,3.94000,retired
...,...,...,...,...,...,...
95,214207.472920,adult,low,2,19.64000,business_owner
96,479844.830494,adult,low,1,34.01000,private_job
97,187654.320988,middle_aged,low,1,44.86000,freelancer
98,305216.761261,adult,low,1,28.30000,business_owner


In [20]:
y

Unnamed: 0,insurance_premium_category
0,High
1,Low
2,Low
3,Medium
4,High
...,...
95,Low
96,Low
97,Low
98,Low


In [21]:

categorical_features = ["age_group" , "lifestyle_risk", "occupation"]
numerical_features = ["bmi", "city_tier", "income_lpa"]

In [24]:
pipeline = preprocessor = ColumnTransformer(
    transformers=[
        ("cat" , OneHotEncoder(),categorical_features),
        ("num" , "passthrough", numerical_features)
    ]
)

In [27]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', RandomForestClassifier(random_state=42))])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
pipeline.fit(X_train, y_train)

In [28]:

# Predict and evaluate
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.9

In [30]:
import pickle

pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as f:
  pickle.dump(pipeline,f)
