In [4]:
import kagglehub
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Lasso, Ridge
import matplotlib.pyplot as plt
import os


In [5]:
path = kagglehub.dataset_download("mirichoi0218/insurance")
data = pd.read_csv(os.path.join(path, "insurance.csv"))
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [18]:
# Now, let's find which feature is important and which is not.
numeric_data = data.select_dtypes(include=[np.number])
correlation_matrix = numeric_data.corr()
print(correlation_matrix["charges"].sort_values(ascending=False))
# So, age, bmi and children are important features. But, what about smoker?
# print(data.groupby("smoker")["charges"].mean())
# Smoker is a categorical feature but it is very important but it is not in numeric format.
# So, we need to convert it into numeric format.
# 'smoker_yes' is already boolean, convert it to integer (0/1) if needed
data["smoker_yes"] = data["smoker_yes"].astype(int)
# Now, let's check the correlation again.
correlation_matrix = data.corr()
print(correlation_matrix["charges"].sort_values(ascending=False))
# So, smoker_yes, age, bmi and children are important features. But, why the region_southeast is still higher than children? Why don't we choose it?
# Common sense tells us that region is not important. So, we will drop it.
X = data[["age", "bmi", "children", "smoker_yes"]]
y = data["charges"]


charges       1.000000
smoker_yes    0.787251
age           0.299008
bmi           0.198341
children      0.067998
Name: charges, dtype: float64
charges             1.000000
smoker_yes          0.787251
age                 0.299008
bmi                 0.198341
region_southeast    0.073982
children            0.067998
sex_male            0.057292
region_northwest   -0.039905
region_southwest   -0.043210
Name: charges, dtype: float64


In [19]:
# DATA PREPROCESSING
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [22]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Linear Regression - MSE: {mse}, R2: {r2}")
accuracy = model.score(X_test, y_test)
print(f"Linear Regression - Accuracy: {accuracy}")

# 78% accuracy is good but we can do better.


Linear Regression - MSE: 33981653.95019775, R2: 0.7811147722517887
Linear Regression - Accuracy: 0.7811147722517887


In [None]:
model = Lasso(alpha=0.1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Lasso Regression - MSE: {mse}, R2: {r2}")
accuracy = model.score(X_test, y_test)
print(f"Lasso Regression - Accuracy: {accuracy}")
#MSE is quite large. Let's try Ridge Regression.

Lasso Regression - MSE: 33981749.95140859, R2: 0.7811141538814489
Lasso Regression - Accuracy: 0.7811141538814489


In [32]:
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Ridge Regression - MSE: {mse}, R2: {r2}")
accuracy = model.score(X_test, y_test)
print(f"Ridge Regression - Accuracy: {accuracy}")


Ridge Regression - MSE: 33987477.223127544, R2: 0.7810772629409309
Ridge Regression - Accuracy: 0.7810772629409309
