In [3]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor

# Read the CSV and separate the regression target ('charges')
# from the remaining columns, which serve as predictors.
df = pd.read_csv('insurance.csv')
y = df['charges']
X = df.drop(columns='charges')

# Build the shared preprocessing step.
# Columns are routed by dtype: object columns are one-hot encoded,
# int64/float64 columns are standardized.
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

# handle_unknown='ignore' keeps transform() from raising on a category
# that was not present when the encoder was fitted.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Define one preprocessing + regressor pipeline per candidate model.
# The regressors are listed once, keyed by the display name used in the report.
estimators = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42),
}

# Each pipeline receives its own unfitted copy of the preprocessor.
# Pipeline.fit fits its steps in place, so passing the same ColumnTransformer
# object to every pipeline would make them all share (and overwrite) a single
# fitted state; clone() keeps the fitted pipelines independent of each other.
model_pipelines = {
    name: Pipeline(steps=[('preprocessor', clone(preprocessor)), ('model', estimator)])
    for name, estimator in estimators.items()
}

# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit each pipeline on the training rows, predict the held-out rows,
# and print MAE, RMSE and R2 for that model on one line
for model_name, pipeline in model_pipelines.items():
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE is the square root of MSE
    r2 = r2_score(y_test, y_pred)
    print(f"{model_name}: MAE = {mae:.2f}, RMSE = {rmse:.2f}, R2 = {r2:.2f}")

Linear Regression: MAE = 4145.45, RMSE = 5812.10, R2 = 0.77
Random Forest: MAE = 2566.62, RMSE = 4610.87, R2 = 0.86
Gradient Boosting: MAE = 2473.62, RMSE = 4408.21, R2 = 0.87
XGBoost: MAE = 2903.06, RMSE = 4968.03, R2 = 0.83
