In [1]:
import pandas as pd

# Load the insurance dataset from the Kaggle input directory.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/insurance/insurance.csv")

# Preview the first ten rows, then report the (rows, columns) shape.
print(df.head(n=10))
print(f"Shape of data: {df.shape}")

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
5   31  female  25.740         0     no  southeast   3756.62160
6   46  female  33.440         1     no  southeast   8240.58960
7   37  female  27.740         3     no  northwest   7281.50560
8   37    male  29.830         2     no  northeast   6406.41070
9   60  female  25.840         0     no  northwest  28923.13692
Shape of data: (1338, 7)


In [2]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called bare; wrapping it in print() emits a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
None


In [3]:
# Count the missing values in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


In [4]:
# Count rows that are exact duplicates of an earlier row
duplicate_mask = df.duplicated()
print(duplicate_mask.sum())

1


In [5]:
# Drop duplicate rows, then reset the indices so they are contiguous again
df = df.drop_duplicates().reset_index(drop=True)

In [6]:
# Encoding: replace every text-valued (object dtype) column with integer codes
from sklearn.preprocessing import LabelEncoder

# Collect the object-typed columns first, then encode them one by one.
categorical_cols = [col for col in df.columns if df[col].dtype == 'object']
for col in categorical_cols:
    lbl = LabelEncoder()
    # Learn the categories and encode the column in a single call.
    df[col] = lbl.fit_transform(df[col].values)


display(df.head())

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [7]:
# Split the data into features/target and then into train/test partitions
from sklearn.model_selection import train_test_split

# 'charges' is the regression target; every other column is a feature.
Y = df['charges']
X = df.drop(columns=['charges'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Scaling the features
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only,
# then apply the same transformation to both splits.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [10]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Search space: lists are sampled from directly, randint(...) entries are
# sampled as integer distributions.
rf_hyperparameters = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10, 15],
    min_samples_split=randint(2, 11),
    min_samples_leaf=randint(1, 5),
)

# Randomized search: 10 sampled candidates, 5-fold CV, scored by negative MSE.
rf_search_settings = dict(
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
)
rf_random_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    rf_hyperparameters,
    **rf_search_settings,
)

# Run the search on the training split
rf_best_model_search = rf_random_search.fit(x_train, y_train)

# Keep the best estimator found by the search
best_rf_model = rf_best_model_search.best_estimator_

# Predict on the held-out test split with that estimator
rf_best_predictions = best_rf_model.predict(x_test)


In [11]:
# Decision Tree
from sklearn.tree import DecisionTreeRegressor

# Search space: a list of depth limits plus integer distributions for the split/leaf sizes.
dt_hyperparameters = dict(
    max_depth=[None, 5, 10, 15],
    min_samples_split=randint(2, 11),
    min_samples_leaf=randint(1, 5),
)

# Randomized search: 10 sampled candidates, 5-fold CV, scored by negative MSE.
dt_search_settings = dict(
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
)
dt_random_search = RandomizedSearchCV(
    DecisionTreeRegressor(random_state=42),
    dt_hyperparameters,
    **dt_search_settings,
)

# Run the search on the training split
dt_best_model_search = dt_random_search.fit(x_train, y_train)

# Keep the best estimator found by the search
best_dt_model = dt_best_model_search.best_estimator_

# Predict on the held-out test split with that estimator
dt_best_predictions = best_dt_model.predict(x_test)


In [12]:
# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsRegressor

# Search space: neighbour count, Minkowski power (1 or 2) and weighting scheme.
knn_hyperparameters = dict(
    n_neighbors=[3, 5, 7, 10],
    p=[1, 2],
    weights=['uniform', 'distance'],
)

# Randomized search: 10 sampled candidates, 5-fold CV, scored by negative MSE.
knn_search_settings = dict(
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
)
knn_random_search = RandomizedSearchCV(
    KNeighborsRegressor(),
    knn_hyperparameters,
    **knn_search_settings,
)

# Run the search on the training split
knn_best_model_search = knn_random_search.fit(x_train, y_train)

# Keep the best estimator found by the search
best_knn_model = knn_best_model_search.best_estimator_

# Predict on the held-out test split with that estimator
knn_best_predictions = best_knn_model.predict(x_test)


In [13]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate_model(predictions, actual_values, model_name):
    """Print MAE, MSE and R-squared for one model's predictions.

    Args:
        predictions: Values predicted by the model.
        actual_values: Ground-truth values the predictions are compared against.
        model_name: Label used in the heading of the printed report.

    Returns:
        None. The report is written to standard output.
    """
    # Each metric is called as metric(actual, predicted).
    r2 = r2_score(actual_values, predictions)
    mse = mean_squared_error(actual_values, predictions)
    mae = mean_absolute_error(actual_values, predictions)

    # One print call with a newline separator; the final "\n" item plus
    # print's own line ending leaves the blank lines after each report.
    print(
        f"{model_name} Metrics:",
        f"  Mean Absolute Error (MAE): {mae}",
        f"  Mean Squared Error (MSE): {mse}",
        f"  R-squared (R2): {r2}",
        "\n",
        sep="\n",
    )

In [14]:
# Report test-set metrics for each tuned model, in a fixed order
for model_label, model_predictions in (
    ("Best Decision Tree", dt_best_predictions),
    ("Best Random Forest", rf_best_predictions),
    ("Best K-Nearest Neighbors", knn_best_predictions),
):
    evaluate_model(model_predictions, y_test, model_label)

Best Decision Tree Metrics:
  Mean Absolute Error (MAE): 2589.5079388937484
  Mean Squared Error (MSE): 19612809.6306479
  R-squared (R2): 0.8932671434110536


Best Random Forest Metrics:
  Mean Absolute Error (MAE): 2408.521561212618
  Mean Squared Error (MSE): 17953711.721556585
  R-squared (R2): 0.9022959497133057


Best K-Nearest Neighbors Metrics:
  Mean Absolute Error (MAE): 3072.954286534451
  Mean Squared Error (MSE): 25713879.431017954
  R-squared (R2): 0.8600651381754307




Comparison and Analysis:

Random Forest outperforms both Decision Tree and K-Nearest Neighbors on the test set in terms of MAE (about 2,409 vs. 2,590 and 3,073) and MSE (about 1.80e7 vs. 1.96e7 and 2.57e7), indicating more accurate predictions.

Decision Tree shows a high R-squared value (0.8933), suggesting a good fit to the data. However, Random Forest has a slightly higher R-squared value (0.9023), indicating an even better fit.

K-Nearest Neighbors lags behind in both MAE and MSE compared to the other two models and also has the lowest R-squared value (0.8601), suggesting it may not be the most suitable model for this particular dataset.

Overall, Random Forest appears to be the most promising model among the three, with the lowest errors and the highest R-squared on the held-out test set. Note that this comparison rests on a single 80/20 train/test split.