In [None]:
import pandas as pd

In [2]:
df = pd.read_csv('insurance.csv')

In [8]:
# df.head()
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
26,33,79.0,1.61,23.61,False,Jaipur,freelancer,Medium
95,36,52.8,1.57,19.64,False,Indore,business_owner,Low
92,37,62.7,1.85,30.0,True,Lucknow,government_job,Low
53,41,101.3,1.85,30.0,True,Delhi,government_job,Medium
90,59,54.0,1.6,21.07,False,Mumbai,business_owner,Low


Get a copy of original data

In [9]:
df_feat = df.copy()

- Feature 1 : BMI

In [10]:
df_feat['bmi'] = df_feat["weight"]/(df_feat["height"]**2)

- Feature 2 : Age group

In [None]:
def age_group(age):
    if age < 25:
        return "young"
    elif age < 45:
        return "adult"
    elif age < 60:
        return "midle_aged"
    return "senior"

In [19]:
df_feat["age_group"] = df_feat["age"].apply(age_group)

- Feature 3 : Lifestyle risk

In [20]:
def lifestyle_risk(row):
    if row["smoker"] and row["bmi"] > 30:
        return "high"
    elif row["smoker"] and row["bmi"] > 27:
        return "medium"
    return "low"

In [21]:
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk,axis=1)

*Cities breakdown*


In [22]:
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

- Feature 4 : City Tier

In [23]:
def city_tier(city):
    if city in tier_1_cities:
        return 1
    elif city in tier_2_cities:
        return 2
    return 3

In [24]:
df_feat["city_tier"] = df_feat["city"].apply(city_tier)

In [29]:
# df_feat.drop(columns=['age', 'weight', 'height', 'smoker', 'city'])
df_feat.drop(columns=['age', 'weight', 'height', 'smoker', 'city'],inplace=True)

In [30]:
df_feat.sample(10)

Unnamed: 0,income_lpa,occupation,insurance_premium_category,bmi,lifestyle_risk,city_tier,age_group
31,11.77,private_job,Medium,15.258742,low,2,adult
15,2.99,retired,Medium,21.860828,low,1,senior
10,32.78,business_owner,Medium,22.949982,low,1,adult
98,28.3,business_owner,Low,30.521676,low,1,adult
37,8.09,freelancer,Medium,17.852127,low,2,adult
44,50.0,private_job,Medium,30.078125,high,2,midle_aged
57,1.36,retired,High,26.889815,low,2,senior
12,17.58,freelancer,High,30.046711,high,2,adult
43,1.56,retired,Medium,29.308163,low,1,senior
94,10.542289,government_job,Low,33.266002,low,1,midle_aged


*select features and target*

In [34]:

x = df_feat[["bmi", "age_group", "lifestyle_risk", "city_tier", "income_lpa", "occupation"]]
y = df_feat["insurance_premium_category"]

In [35]:
x

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,49.227482,senior,low,2,2.92000,retired
1,30.189017,adult,low,1,34.28000,freelancer
2,21.118382,adult,low,2,36.64000,freelancer
3,45.535900,young,high,1,3.34000,student
4,24.296875,senior,low,2,3.94000,retired
...,...,...,...,...,...,...
95,21.420747,adult,low,2,19.64000,business_owner
96,47.984483,adult,low,1,34.01000,private_job
97,18.765432,midle_aged,low,1,44.86000,freelancer
98,30.521676,adult,low,1,28.30000,business_owner


In [36]:
y

0       High
1        Low
2        Low
3     Medium
4       High
       ...  
95       Low
96       Low
97       Low
98       Low
99       Low
Name: insurance_premium_category, Length: 100, dtype: object

**Define categorical and numerical features**

In [37]:
categorical_features = ["age_group","lifestyle_risk","occupation","city_tier"]
numerical_features = ["bmi","income_lpa"]

*Create column transformation for OneHotEncoder*

In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
preprocessor = ColumnTransformer(
    transformers = [
        ("cat",OneHotEncoder(),categorical_features),
        ("num","passthrough",numerical_features)
    ]
)

*Create a pipeline with preprocessing and random forest classifier*

In [39]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline(steps=[
    ("preprocessor",preprocessor),
    ("classifier",RandomForestClassifier(random_state=42))
])

*Split and train the model*

In [40]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=1)
pipeline.fit(x_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


*Predict and evaluate*

In [41]:

from sklearn.metrics import accuracy_score
y_pred = pipeline.predict(x_test)
accuracy_score(y_test,y_pred)

0.75

In [42]:
x_test.sample(5)


Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
78,27.932798,midle_aged,medium,2,14.74,freelancer
69,21.942857,midle_aged,low,2,6.034487,government_job
52,47.34472,young,low,2,2.96,student
44,30.078125,midle_aged,high,2,50.0,private_job
56,42.414152,young,high,1,2.86,student


**Save the Trained pipeline using pickle**

In [43]:
import pickle
pickle_model_path = "model.pkl"
with open(pickle_model_path,"wb") as f:
    pickle.dump(pipeline,f)