<a href="https://colab.research.google.com/github/tharangini2005/medical-Insurance-Premium-/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# --------------------------------------------
# MEDICAL INSURANCE PREMIUM PREDICTION (YOUR DATASET)
# --------------------------------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# --------------------------------------------
# 1. LOAD YOUR DATA
# --------------------------------------------
df = pd.read_csv("insurance_sample.csv")

print("Dataset Loaded Successfully!")
print(df.head())
print("\nColumns:", df.columns.tolist())

# --------------------------------------------
# 2. ENCODE CATEGORICAL FEATURES
# --------------------------------------------
le = LabelEncoder()

cat_cols = ["sex", "smoker", "region"]
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

print("\nAfter Encoding:\n", df.head())

# --------------------------------------------
# 3. DEFINE FEATURES & TARGET
# --------------------------------------------
X = df.drop("charges", axis=1)
y = df["charges"]

# --------------------------------------------
# 4. TRAIN-TEST SPLIT
# --------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --------------------------------------------
# 5. TRAIN LINEAR REGRESSION
# --------------------------------------------
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

print("\n----- LINEAR REGRESSION -----")
print("MAE:", mean_absolute_error(y_test, lr_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, lr_pred)))
print("R² Score:", r2_score(y_test, lr_pred))

# --------------------------------------------
# 6. TRAIN RANDOM FOREST (Better Model)
# --------------------------------------------
rf = RandomForestRegressor(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

print("\n----- RANDOM FOREST -----")
print("MAE:", mean_absolute_error(y_test, rf_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, rf_pred)))
print("R² Score:", r2_score(y_test, rf_pred))

# --------------------------------------------
# 7. PREDICT PREMIUM FOR NEW CUSTOMER
# --------------------------------------------
new_customer = pd.DataFrame({
    "age": [30],
    "sex": [le.fit_transform(["female"])[0]],
    "bmi": [27.5],
    "smoker": [le.fit_transform(["no"])[0]],
    "region": [le.fit_transform(["southwest"])[0]]
})

prediction = rf.predict(new_customer)[0]
print("\nPredicted Premium for new customer:", prediction)


Dataset Loaded Successfully!
   age     sex     bmi smoker     region      charges
0   19  female  27.900    yes  southwest  16884.92400
1   18    male  33.770     no  southeast   1725.55230
2   28    male  33.000     no  southeast   4449.46200
3   33    male  22.705     no  northwest  21984.47061
4   32    male  28.880     no  northwest   3866.85520

Columns: ['age', 'sex', 'bmi', 'smoker', 'region', 'charges']

After Encoding:
    age  sex     bmi  smoker  region      charges
0   19    0  27.900       1       3  16884.92400
1   18    1  33.770       0       2   1725.55230
2   28    1  33.000       0       2   4449.46200
3   33    1  22.705       0       1  21984.47061
4   32    1  28.880       0       1   3866.85520

----- LINEAR REGRESSION -----
MAE: 4227.246575990808
RMSE: 5848.151158679762
R² Score: 0.7797027283499594

----- RANDOM FOREST -----
MAE: 2691.841474790599
RMSE: 4821.999670114754
R² Score: 0.8502294956283176

Predicted Premium for new customer: 4666.0253822666555
