<a href="https://colab.research.google.com/github/vidorc/InsurPredict-FastAPI-Insurance-Premium-Prediction-API/blob/main/Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the insurance dataset from the Colab working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/insurance.csv')

In [None]:
# Draw five random rows; this is not the cell's last expression, so nothing is shown.
df.sample(n=5)
# Work on a separate copy so the derived feature columns are not added to `df`.
df_feat = df.copy(deep=True)

In [None]:
# Body-mass index = weight (kg) / height (m) squared.
# NOTE: the dataset's `height` column is already in metres. The previous
# `height/100` conversion (which assumes centimetres) produced BMI values in
# the hundreds of thousands, e.g. 300781.25 instead of 30.08, which in turn
# made every `bmi > 27` / `bmi > 30` check in lifestyle_risk evaluate to True.
# NOTE(review): any serving code that feeds this model must compute BMI with
# the same formula -- confirm against the API implementation.
df_feat['bmi'] = df_feat['weight'] / df_feat['height'] ** 2

In [None]:
def age_group(age):
    """Map an age to a coarse age-bracket label.

    Returns "young" for ages below 25, "adult" below 45, "middle age"
    below 60, and "old" for everything else.
    """
    # (exclusive upper bound, label) pairs, checked in ascending order.
    brackets = (
        (25, "young"),
        (45, "adult"),
        (60, "middle age"),
    )
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return "old"

In [None]:
# Label every row with its age bracket, computed element-wise from `age`.
df_feat["age_group"] = df_feat["age"].map(age_group)

In [None]:
def lifestyle_risk(row):
    """Classify a row's lifestyle risk from its "smoker" and "bmi" entries.

    Returns "high" when the row is a smoker with bmi above 30, "medium"
    when the row is a smoker or has bmi above 27, and "low" otherwise.
    """
    is_smoker = row["smoker"]
    bmi = row["bmi"]

    if is_smoker and bmi > 30:
        return "high"
    if is_smoker or bmi > 27:
        return "medium"
    return "low"



In [None]:
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk, axis=1)

In [None]:
# Cities that city_tier() maps to tier 1.
tier_1_cities = [
    "Mumbai",
    "Delhi",
    "Bangalore",
    "Chennai",
    "Kolkata",
    "Hyderabad",
    "Pune",
]

# Cities that city_tier() maps to tier 2; any city in neither list is tier 3.
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi",
    "Visakhapatnam", "Coimbatore", "Bhopal", "Nagpur", "Vadodara", "Surat",
    "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi", "Agra",
    "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana",
    "Nashik", "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum",
    "Salem", "Vijayawada", "Tiruchirappalli", "Bhavnagar", "Gwalior", "Dhanbad",
    "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal", "Kolhapur",
    "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri",
]

In [None]:
def city_tier(city_name: str) -> int:
    """Return the tier number for ``city_name``.

    The name is looked up by exact match: 1 if it appears in
    ``tier_1_cities``, 2 if it appears in ``tier_2_cities``, and 3 for
    any other value.
    """
    # Checked in order, so the tier-1 list takes precedence.
    for tier, cities in ((1, tier_1_cities), (2, tier_2_cities)):
        if city_name in cities:
            return tier
    return 3

In [None]:
# Replace each city name with its tier number (1, 2 or 3) in a new column.
df_feat["city_tier"] = df_feat["city"].map(city_tier)

In [None]:
# Show five random rows of the engineered features next to the target column.
# The explicit column list already leaves out 'smoker', so a separate
# drop(columns=['smoker']) would only copy the frame without changing the result.
df_feat[['income_lpa', 'occupation', 'bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'insurance_premium_category']].sample(5)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
49,2.29,student,427014.901458,young,medium,3,Medium
38,1.84,retired,332049.329631,old,high,2,High
61,24.05,unemployed,362811.791383,adult,high,2,High
81,22.19,freelancer,318660.545504,adult,high,2,High
59,1.13,retired,358350.440768,old,medium,2,High


In [None]:
# Model inputs: the engineered features plus income and occupation.
X = df_feat[
    [
        "bmi",
        "age_group",
        "lifestyle_risk",
        "city_tier",
        "income_lpa",
        "occupation",
    ]
]
# Prediction target: the premium category label.
y = df_feat["insurance_premium_category"]

In [None]:
# Columns handed to the one-hot encoder. city_tier holds the integers 1-3
# but is grouped with the label-valued columns here.
categorical_features = [
    "age_group",
    "lifestyle_risk",
    "occupation",
    "city_tier",
]
# Columns forwarded to the classifier without transformation.
numeric_features = [
    "bmi",
    "income_lpa",
]

In [None]:
# One-hot encode the categorical columns and forward the numeric columns unchanged.
# handle_unknown="ignore" makes a category that was absent when the encoder was
# fitted encode as an all-zero group instead of raising at transform/predict
# time (the default, handle_unknown="error", would fail on such rows).
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features),
    ]
)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Chain preprocessing and a seeded random forest so both are fitted
# (and later serialized) as a single estimator.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)
pipeline.fit(X_train, y_train)


In [None]:
# Predict the held-out rows and report the fraction classified correctly.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.85

In [None]:
# Display five random rows of the held-out feature set.
X_test.sample(n=5)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
44,300781.25,middle age,high,2,50.0,private_job
33,217910.640496,old,medium,1,1.46,retired
93,231994.156318,young,medium,2,1.28,student
36,217132.659131,old,medium,1,0.53,retired
39,356434.240363,middle age,high,1,11.99,unemployed


In [None]:
import pickle

# Serialize the fitted pipeline (preprocessing + classifier) to a file in the
# current working directory.
# NOTE: a pickle file should only ever be loaded from a trusted source.
pickle_model_path = "model.pkl"
with open(pickle_model_path, mode="wb") as model_file:
    pickle.dump(pipeline, model_file)


In [None]:
# List the distinct occupation labels present in the raw dataset.
df["occupation"].unique()

array(['retired', 'freelancer', 'student', 'government_job',
       'business_owner', 'unemployed', 'private_job'], dtype=object)