In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import warnings
warnings.filterwarnings('ignore')

In [None]:
def load_and_explore_data(filepath='insurance.csv'):
    """Load the insurance dataset, print an exploratory summary, and drop duplicates.

    Parameters
    ----------
    filepath : str or path-like, default 'insurance.csv'
        Location of the CSV file, passed straight to ``pd.read_csv``. The
        default keeps the original zero-argument call working.

    Returns
    -------
    pandas.DataFrame
        The loaded rows with exact duplicate rows removed (first occurrence
        kept). The index is not reset, so it may contain gaps.

    Raises
    ------
    KeyError
        If any of the 'sex', 'smoker' or 'region' columns is absent.
    """
    print("=== STEP 1: DATA COLLECTION & UNDERSTANDING ===")

    # Load data
    df = pd.read_csv(filepath)

    print(f"Dataset shape: {df.shape}")
    print("\nFirst 5 rows:")
    print(df.head())

    print("\nDataset info:")
    # DataFrame.info() writes its report to stdout and returns None, so it
    # must not be wrapped in print() (that appended a stray "None" line).
    df.info()

    print("\nMissing values:")
    print(df.isnull().sum())

    print("\nDuplicate rows:")
    # Count once and reuse for both the report and the decision.
    n_duplicates = df.duplicated().sum()
    print(n_duplicates)
    if n_duplicates > 0:
        df = df.drop_duplicates(keep='first')
        print("removing duplicates")
        # Re-check so the output shows the removal worked.
        print(df.duplicated().sum())

    print("\nStatistical summary:")
    print(df.describe())

    print("\nCategorical variables unique values:")
    categorical_cols = ['sex', 'smoker', 'region']
    for col in categorical_cols:
        print(f"{col}: {df[col].unique()}")

    return df

In [None]:
def visualize_data(df):
    """Draw a 2x3 grid of exploratory plots for the insurance data.

    Panels, left to right and top to bottom: histogram of charges, charges
    against age, charges against BMI, box plot of charges by smoker status,
    box plot of charges by sex, and a heatmap of each feature's correlation
    with charges.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the 'charges', 'age', 'bmi', 'smoker', 'sex' and
        'region' columns. The frame itself is not modified.
    """
    print("\n=== DATA VISUALIZATION ===")

    fig, axes = plt.subplots(2, 3, figsize=(15, 12))
    (ax_hist, ax_age, ax_bmi), (ax_smoker, ax_sex, ax_corr) = axes

    # 1. Charges distribution
    ax_hist.hist(df['charges'], bins=30, edgecolor='black', alpha=0.7)
    ax_hist.set_title('Distribution of Insurance Charges')
    ax_hist.set_xlabel('Charges')
    ax_hist.set_ylabel('Frequency')

    # 2-3. Charges against the two continuous predictors
    scatter_panels = (
        (ax_age, 'age', 'Charges vs Age', 'Age'),
        (ax_bmi, 'bmi', 'Charges vs BMI', 'BMI'),
    )
    for panel, column, title, x_label in scatter_panels:
        panel.scatter(df[column], df['charges'], alpha=0.6)
        panel.set_title(title)
        panel.set_xlabel(x_label)
        panel.set_ylabel('Charges')

    # 4-5. Charges split by the two binary categories
    box_panels = (
        (ax_smoker, 'smoker', 'Charges by Smoker Status'),
        (ax_sex, 'sex', 'Charges by Sex'),
    )
    for panel, column, title in box_panels:
        df.boxplot(column='charges', by=column, ax=panel)
        panel.set_title(title)
        # pandas adds a figure-level "grouped by" title; blank it out.
        fig.suptitle('')

    # 6. Correlation heatmap: encode the categoricals numerically first
    encoded = df.assign(
        sex=df['sex'].map({'male': 1, 'female': 0}),
        smoker=df['smoker'].map({'yes': 1, 'no': 0}),
    )
    encoded = pd.get_dummies(encoded, columns=['region'])
    charges_corr = encoded.corr()[['charges']].sort_values('charges', ascending=False)
    sns.heatmap(charges_corr, annot=True, cmap='coolwarm', center=0, ax=ax_corr)
    ax_corr.set_title('Features Correlation with Charges')

    fig.tight_layout()
    plt.show()


In [None]:
def preprocess_data(df):
    """Split the data into train/test sets and build the feature transformer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the target column 'charges'; every other column is
        treated as a feature.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, preprocessor)``. The split holds
        out 20% of the rows with ``random_state=42``. ``preprocessor`` is a
        ``ColumnTransformer`` that has not been fitted here.
    """
    print("\n=== STEP 2: DATA PREPROCESSING ===")

    # Features are everything except the target.
    features = df.drop('charges', axis=1)
    target = df['charges']

    # Numeric columns are standardised; categorical columns are one-hot
    # encoded with the first level of each dropped.
    scaled_columns = ['age', 'bmi', 'children']
    encoded_columns = ['sex', 'smoker', 'region']
    column_steps = [
        ('num', StandardScaler(), scaled_columns),
        ('cat', OneHotEncoder(drop='first'), encoded_columns),
    ]
    preprocessor = ColumnTransformer(transformers=column_steps)

    partitions = train_test_split(features, target, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = partitions

    print(f"Training set size: {X_train.shape}")
    print(f"Test set size: {X_test.shape}")

    return X_train, X_test, y_train, y_test, preprocessor

In [None]:
def build_models(X_train, X_test, y_train, y_test, preprocessor):
    """Fit three baseline regressors and compare them on the held-out split.

    Linear regression, a decision tree and a random forest are each wrapped
    in a pipeline behind their own copy of ``preprocessor``, fitted on the
    training data and scored on the test data.

    Parameters
    ----------
    X_train, X_test : feature tables for fitting and scoring.
    y_train, y_test : matching target values.
    preprocessor : scikit-learn transformer
        Used as a template only: it is cloned for every pipeline, so the
        object passed in is left unfitted and the stored pipelines do not
        share state.

    Returns
    -------
    results : dict
        Maps model name to ``{'MSE', 'RMSE', 'R2', 'pipeline'}``.
    best_pipeline : Pipeline or None
        Fitted pipeline with the highest test R²; None if no model scored
        above negative infinity.
    best_model : str or None
        Name of that pipeline's model.

    Notes
    -----
    The winner is picked using the test split, so its reported score is not
    an unbiased estimate of performance on new data.
    """
    # Imported locally because `clone` is not among the notebook-level imports.
    from sklearn.base import clone

    print("\n=== STEP 3: MODEL BUILDING ===")

    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(random_state=42),
        'Random Forest': RandomForestRegressor(random_state=42)
    }

    results = {}
    best_model = None
    best_score = float('-inf')
    best_pipeline = None

    for name, model in models.items():
        print(f"\nTraining {name}...")

        # A Pipeline fits its steps in place, so reusing one transformer
        # instance would mutate the caller's object and alias it across
        # every stored pipeline. Give each pipeline a fresh, unfitted copy.
        pipeline = Pipeline([
            ('preprocessor', clone(preprocessor)),
            ('regressor', model)
        ])

        # Train model
        pipeline.fit(X_train, y_train)

        # Make predictions
        y_pred = pipeline.predict(X_test)

        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)

        results[name] = {
            'MSE': mse,
            'RMSE': rmse,
            'R2': r2,
            'pipeline': pipeline
        }

        print(f"MSE: {mse:.2f}")
        print(f"RMSE: {rmse:.2f}")
        print(f"R² Score: {r2:.4f}")

        # Strict comparison: on a tie the earlier model is kept.
        if r2 > best_score:
            best_score = r2
            best_model = name
            best_pipeline = pipeline

    print(f"\nBest model: {best_model} with R² Score: {best_score:.4f}")

    return results, best_pipeline, best_model

In [None]:
def hyperparameter_tuning(X_train, y_train, preprocessor):
    """Grid-search random forest hyperparameters with 5-fold cross-validation.

    Parameters
    ----------
    X_train, y_train : training features and target used for the search.
    preprocessor : scikit-learn transformer placed ahead of the regressor.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search, i.e. the pipeline
    configuration with the highest mean cross-validated R².
    """
    print("\n=== HYPERPARAMETER TUNING ===")

    forest = RandomForestRegressor(random_state=42)
    rf_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', forest),
    ])

    # Keys use the "<step>__<param>" form to reach into the pipeline step.
    search_space = {
        'regressor__n_estimators': [50, 100, 200],
        'regressor__max_depth': [None, 10, 20],
        'regressor__min_samples_split': [2, 5, 10]
    }

    search = GridSearchCV(
        estimator=rf_pipeline,
        param_grid=search_space,
        scoring='r2',
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    print(f"Best parameters: {search.best_params_}")
    print(f"Best CV score: {search.best_score_:.4f}")

    return search.best_estimator_

In [None]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on the test split and plot diagnostics.

    Prints MSE, RMSE and R², then shows two panels: actual against predicted
    charges with a dashed identity line, and residuals against predictions.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_test : features to predict on.
    y_test : true target values; must support ``.min()`` and ``.max()``.

    Returns
    -------
    tuple
        ``(mse, rmse, r2)``.
    """
    print("\n=== STEP 4: MODEL EVALUATION ===")

    predicted = model.predict(X_test)

    mse = mean_squared_error(y_test, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, predicted)

    print(f"Final Model Performance:")
    print(f"MSE: {mse:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R² Score: {r2:.4f}")

    fig, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: points on the dashed diagonal are perfect predictions.
    ax_fit.scatter(y_test, predicted, alpha=0.6)
    ax_fit.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    ax_fit.set_xlabel('Actual Charges')
    ax_fit.set_ylabel('Predicted Charges')
    ax_fit.set_title('Actual vs Predicted Charges')

    # Right panel: residuals (actual minus predicted) around the zero line.
    errors = y_test - predicted
    ax_resid.scatter(predicted, errors, alpha=0.6)
    ax_resid.axhline(y=0, color='r', linestyle='--')
    ax_resid.set_xlabel('Predicted Charges')
    ax_resid.set_ylabel('Residuals')
    ax_resid.set_title('Residual Plot')

    fig.tight_layout()
    plt.show()

    return mse, rmse, r2

In [None]:
def save_model(model, filename='insurance_cost_model.pkl'):
    """Persist ``model`` to ``filename`` with joblib and report the path.

    Parameters
    ----------
    model : object to serialise.
    filename : str, default 'insurance_cost_model.pkl'
        Destination passed to ``joblib.dump``.
    """
    banner = f"\n=== STEP 5: MODEL SAVING ==="
    print(banner)

    joblib.dump(model, filename)

    confirmation = f"Model saved as {filename}"
    print(confirmation)

In [None]:
def predict_insurance_cost(model, age, sex, bmi, children, smoker, region):
    """Return the model's charge estimate for one person, rounded to 2 decimals.

    Parameters
    ----------
    model : object exposing ``predict(frame)`` that returns an indexable
        result; only its first element is used.
    age, sex, bmi, children, smoker, region : values for the single input
        row, placed in columns of the same names.

    Returns
    -------
    The first prediction passed through ``round(..., 2)``.
    """
    field_values = (
        ('age', age),
        ('sex', sex),
        ('bmi', bmi),
        ('children', children),
        ('smoker', smoker),
        ('region', region),
    )
    # One-row frame: each column holds a single-element list, in this order.
    single_row = pd.DataFrame({field: [value] for field, value in field_values})

    estimates = model.predict(single_row)
    return round(estimates[0], 2)

In [None]:
"""Main function to run the complete pipeline"""
# Driver cell: runs every pipeline stage in order. Each stage's outputs are
# kept as notebook-level variables so they can be inspected afterwards.
print("MEDICAL INSURANCE COST PREDICTOR")
print("=" * 50)

# Step 1: Data Collection & Understanding
df = load_and_explore_data()
visualize_data(df)

# Step 2: Data Preprocessing
X_train, X_test, y_train, y_test, preprocessor = preprocess_data(df)

# Step 3: Model Building
# NOTE(review): best_pipeline and best_model_name are not used again in this
# cell; the tuned random forest below is evaluated and saved regardless of
# which baseline scored best. Confirm that is intended.
results, best_pipeline, best_model_name = build_models(
    X_train, X_test, y_train, y_test, preprocessor
)

# Hyperparameter tuning for Random Forest
tuned_model = hyperparameter_tuning(X_train, y_train, preprocessor)

# Step 4: Model Evaluation
mse, rmse, r2 = evaluate_model(tuned_model, X_test, y_test)

# Step 5: Model Saving
save_model(tuned_model)

print("\n=== MODEL TRAINING COMPLETE ===")
print("You can now use the model for predictions!")

# Example prediction
print("\n=== EXAMPLE PREDICTION ===")
sample_prediction = predict_insurance_cost(
    tuned_model,
    age=25,
    sex='male',
    bmi=26.2,
    children=0,
    smoker='no',
    region='southwest'
)
# Format as money: the rounded number alone drops trailing zeros
# (e.g. 3000.1), so force two decimals and add a thousands separator.
print(f"Predicted insurance cost for a 25-year-old non-smoking male: ${sample_prediction:,.2f}")