In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from matplotlib import pyplot as plt

# Load the dataset
df = pd.read_csv("insurance_data.csv")

# Display the first few rows to understand the structure
print("Dataset preview:")
print(df.head())

# One-hot encode categorical variables; drop_first removes one redundant dummy per category.
# NOTE(review): with the default dummy_na=False a missing category becomes all-zero
# dummies, i.e. indistinguishable from the dropped first level — confirm that the
# categorical columns contain no NaNs.
df = pd.get_dummies(df, drop_first=True)

# Separate features and target variable.
# 'index' and 'PatientID' are row identifiers, not patient attributes. The preview
# suggests the rows are ordered by claim, so keeping them as features would let the
# models read the target off the row position (leakage) and inflate the scores.
# errors='ignore' keeps this working if the file is exported without those columns.
id_columns = ['index', 'PatientID']
X = df.drop(columns='claim').drop(columns=id_columns, errors='ignore')
y = df['claim']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(df.describe())

# Impute missing values in 'age' with the mean (can also use median if preferred).
# The mean is computed on the training split only, so no test-set information leaks
# into the imputation. `.assign` returns new frames instead of writing into the
# objects returned by train_test_split, which avoids SettingWithCopyWarning.
age_mean = X_train['age'].mean()
X_train = X_train.assign(age=X_train['age'].fillna(age_mean))
X_test = X_test.assign(age=X_test['age'].fillna(age_mean))

# MULTIPLE LINEAR REGRESSION
# Fit on the training split, then predict the held-out split.
lin_reg = LinearRegression().fit(X_train, y_train)
y_pred_linear = lin_reg.predict(X_test)

# Evaluate linear regression: R^2 and root-mean-squared error on the test split
linear_r2 = r2_score(y_test, y_pred_linear)
linear_rmse = np.sqrt(mean_squared_error(y_test, y_pred_linear))

print("\nMultiple Linear Regression:")
print("R^2 Score:", linear_r2)
print("RMSE:", linear_rmse)

# POLYNOMIAL REGRESSION (degree=2)
# The feature expansion is fitted on the training split and then applied to both
# splits, so train and test share exactly the same expanded columns.
poly = PolynomialFeatures(degree=2)
poly.fit(X_train)
X_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

# A plain linear model on the expanded features
poly_reg = LinearRegression().fit(X_poly, y_train)
y_pred_poly = poly_reg.predict(X_test_poly)

# Evaluate polynomial regression: R^2 and root-mean-squared error on the test split
poly_r2 = r2_score(y_test, y_pred_poly)
poly_rmse = np.sqrt(mean_squared_error(y_test, y_pred_poly))

print("\nPolynomial Regression (Degree 2):")
print("R^2 Score:", poly_r2)
print("RMSE:", poly_rmse)





Dataset preview:
   index  PatientID   age gender   bmi  bloodpressure diabetic  children  \
0      0          1  39.0   male  23.2             91      Yes         0   
1      1          2  24.0   male  30.1             87       No         0   
2      2          3   NaN   male  33.3             82      Yes         0   
3      3          4   NaN   male  33.7             80       No         0   
4      4          5   NaN   male  34.1            100       No         0   

  smoker     region    claim  
0     No  southeast  1121.87  
1     No  southeast  1131.51  
2     No  southeast  1135.94  
3     No  northwest  1136.40  
4     No  northwest  1137.01  
             index    PatientID          age          bmi  bloodpressure  \
count  1340.000000  1340.000000  1335.000000  1340.000000    1340.000000   
mean    669.500000   670.500000    38.078652    30.668955      94.157463   
std     386.968991   386.968991    11.102924     6.106735      11.434712   
min       0.000000     1.000000    1