In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Load cleaned data
df = pd.read_csv("D:/Project/Guvi_Project/Medical Insurance Cost Prediction/medical_insurance.csv")
print(df.head())

   age  sex     bmi  children  smoker      charges  log_charges  \
0   19    1  27.900         0       1  16884.92400     9.734236   
1   18    0  33.770         1       0   1725.55230     7.453882   
2   28    0  33.000         3       0   4449.46200     8.400763   
3   33    0  22.705         0       0  21984.47061     9.998137   
4   32    0  28.880         0       0   3866.85520     8.260455   

   region_northwest  region_southeast  region_southwest  
0             False             False              True  
1             False              True             False  
2             False              True             False  
3              True             False             False  
4              True             False             False  


In [3]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target (y)
X = df.drop(['charges','log_charges'], axis=1)  # Features
y = df['charges']               # Target variable

# Split with stratification on 'smoker' (to maintain class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42,
    stratify=df['smoker']  # Critical for this dataset
)

print(f"Train size: {len(X_train)} samples")
print(f"Test size: {len(X_test)} samples")

Train size: 2195 samples
Test size: 549 samples


In [5]:
X_train.to_csv("D:/Project/Guvi_Project/Medical Insurance Cost Prediction/X_train.csv", index=False)
X_test.to_csv("D:/Project/Guvi_Project/Medical Insurance Cost Prediction/X_test.csv", index=False)
y_train.to_csv("D:/Project/Guvi_Project/Medical Insurance Cost Prediction/y_train.csv", index=False)
y_test.to_csv("D:/Project/Guvi_Project/Medical Insurance Cost Prediction/y_test.csv", index=False)

In [7]:
import time
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import mlflow

In [9]:
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(max_depth=5, random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1)
}

results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    results.append({
        'Model': name,
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': mean_squared_error(y_test, y_pred, squared=False),
        'R2': r2_score(y_test, y_pred)
    })

results_df = pd.DataFrame(results).sort_values('R2', ascending=False)
print(results_df.to_markdown(index=False))
# Create leaderboard dataframe
leaderboard = pd.DataFrame(results).sort_values("R2", ascending=False)
leaderboard["Rank"] = range(1, len(leaderboard)+1)
leaderboard.set_index("Rank", inplace=True)

# Display leaderboard
print(" Model Leaderboard ")
display(leaderboard.style
        .background_gradient(cmap="viridis", subset=["R2"])
        .format({
            "R2": "{:.3f}",
            "MAE": "${:,.0f}",
            "RMSE": "${:,.0f}",
            "Training Time (s)": "{:.3f}"
        }))




| Model             |     MAE |    RMSE |       R2 |
|:------------------|--------:|--------:|---------:|
| XGBoost           | 1137.53 | 2537.65 | 0.949752 |
| Random Forest     | 1353.27 | 2838.01 | 0.937153 |
| Gradient Boosting | 2163.71 | 4081.12 | 0.870039 |
| Decision Tree     | 2376.29 | 4282.97 | 0.856866 |
| Linear Regression | 4187.3  | 5956.15 | 0.723187 |
 Model Leaderboard 




Unnamed: 0_level_0,Model,MAE,RMSE,R2
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,XGBoost,"$1,138","$2,538",0.95
2,Random Forest,"$1,353","$2,838",0.937
3,Gradient Boosting,"$2,164","$4,081",0.87
4,Decision Tree,"$2,376","$4,283",0.857
5,Linear Regression,"$4,187","$5,956",0.723


In [11]:
from sklearn.model_selection import GridSearchCV

In [13]:
# 5. Initialize MLflow
mlflow.set_experiment("Medic_Insurance_Cost_Prediction")

# 6. Define models to compare (now with Decision Tree instead of Ridge)
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42)
}

# 7. Train and track models
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        # Train model
        model.fit(X_train, y_train)
        
        # Evaluate
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        
        # Log parameters
        mlflow.log_params(model.get_params())
        
        # Log metrics
        mlflow.log_metrics({
            "r2": r2,
            "mae": mae,
            "rmse": rmse
        })
        
        # Log model
        mlflow.sklearn.log_model(model, "model")
        
        # Print results
        print(f"{model_name:>20} | R2: {r2:.4f} | MAE: ${mae:,.2f} | RMSE: ${rmse:,.2f}")

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



   Linear Regression | R2: 0.7232 | MAE: $4,187.30 | RMSE: $5,956.15




       Decision Tree | R2: 0.9377 | MAE: $565.93 | RMSE: $2,826.60




       Random Forest | R2: 0.9372 | MAE: $1,353.27 | RMSE: $2,838.01




   Gradient Boosting | R2: 0.8700 | MAE: $2,163.71 | RMSE: $4,081.12




             XGBoost | R2: 0.9498 | MAE: $1,137.53 | RMSE: $2,537.65


In [None]:
!mlflow ui

In [17]:
import joblib
import os
from datetime import datetime

# Create directory for saved models
model_dir = "saved_insurance_models"
os.makedirs(model_dir, exist_ok=True)

# Save each trained model with timestamp
for model_name, model in models.items():
    # Create filename with model name and timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{model_dir}/{model_name}_{timestamp}.joblib"
    
    # Save the model
    joblib.dump(model, filename)
    
    print(f"Saved {model_name} as: {filename}")

print(f"\n✅ All models saved to: {os.path.abspath(model_dir)}")

Saved Linear Regression as: saved_insurance_models/Linear Regression_20250703_170025.joblib
Saved Decision Tree as: saved_insurance_models/Decision Tree_20250703_170025.joblib
Saved Random Forest as: saved_insurance_models/Random Forest_20250703_170025.joblib
Saved Gradient Boosting as: saved_insurance_models/Gradient Boosting_20250703_170025.joblib
Saved XGBoost as: saved_insurance_models/XGBoost_20250703_170025.joblib

✅ All models saved to: C:\Users\hp\saved_insurance_models
