# Import Libraries

In [37]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Loading dataset

In [2]:
df = pd.read_csv("data_insurance.csv")

# Exploring Data 

In [3]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [6]:
df.shape

(1338, 7)

In [7]:
df.index

RangeIndex(start=0, stop=1338, step=1)

# Handling Missing Values

In [8]:
# check any missing value
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [9]:
#checking for duplicates

duplicates = df.duplicated()
print("Number of duplicates row: ", duplicates.sum())

Number of duplicates row:  1


In [10]:
print(df[duplicates])

     age   sex    bmi  children smoker     region    charges
581   19  male  30.59         0     no  northwest  1639.5631


In [11]:
#removing duplicates

df = df.drop_duplicates()

In [12]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [13]:
#checking for duplicates

duplicates = df.duplicated()
print("Number of duplicates row: ", duplicates.sum())

Number of duplicates row:  0


# Converting/Encoding Categorical Values

In [14]:
#Identifing categorical columns

categorical_columns = df.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_columns)

Categorical columns: Index(['sex', 'smoker', 'region'], dtype='object')


In [15]:
# Applying Label Encoding to 'region' column
label_encoder = LabelEncoder()

categorical_cols = ['region', 'sex']
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])

In [16]:
# if 'region' in categorical_columns:
#     df['region_encoded'] = label_encoder.fit_transform(df['region'])
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,3,16884.924
1,18,1,33.77,1,no,2,1725.5523
2,28,1,33.0,3,no,2,4449.462
3,33,1,22.705,0,no,1,21984.47061
4,32,1,28.88,0,no,1,3866.8552


In [17]:
# Applying One-Hot Encoding to 'sex' and 'smoker' columns
if 'smoker' in categorical_columns:
    df = pd.get_dummies(df, columns = ['smoker'], drop_first = True)

In [18]:
df.head()

Unnamed: 0,age,sex,bmi,children,region,charges,smoker_yes
0,19,0,27.9,0,3,16884.924,1
1,18,1,33.77,1,2,1725.5523,0
2,28,1,33.0,3,2,4449.462,0
3,33,1,22.705,0,1,21984.47061,0
4,32,1,28.88,0,1,3866.8552,0


In [19]:
# df.drop(columns = ['region'], inplace = True)
# df.head()

# Featuring Scaling

In [20]:
# Select the columns to be scaled
columns_to_scale = ['age', 'bmi']

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the selected columns and transform them
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

In [21]:
df

Unnamed: 0,age,sex,bmi,children,region,charges,smoker_yes
0,-1.440418,0,-0.453160,0,3,16884.92400,1
1,-1.511647,1,0.509422,1,2,1725.55230,0
2,-0.799350,1,0.383155,3,2,4449.46200,0
3,-0.443201,1,-1.305052,0,1,21984.47061,0
4,-0.514431,1,-0.292456,0,1,3866.85520,0
...,...,...,...,...,...,...,...
1333,0.767704,1,0.050269,3,1,10600.54830,0
1334,-1.511647,0,0.206053,0,0,2205.98080,0
1335,-1.511647,0,1.014490,0,2,1629.83350,0
1336,-1.297958,0,-0.797524,0,3,2007.94500,0


# Defining input features and target variable

In [22]:
# Input Feature, x = 'age', 'bmi', 'children', 'region_encoded', 'sex_male', 'smoker_yes' 
# and Target Variable, y = 'charges'

X = df.drop(columns = ['charges'])
y = df['charges']

In [23]:
X.head()

Unnamed: 0,age,sex,bmi,children,region,smoker_yes
0,-1.440418,0,-0.45316,0,3,1
1,-1.511647,1,0.509422,1,2,0
2,-0.79935,1,0.383155,3,2,0
3,-0.443201,1,-1.305052,0,1,0
4,-0.514431,1,-0.292456,0,1,0


In [24]:
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

# Splitting dataset into train and test

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

In [26]:
X_train

Unnamed: 0,age,sex,bmi,children,region,smoker_yes
536,-0.443201,0,1.350655,3,3,0
1095,-1.511647,0,0.112582,4,0,0
629,0.340326,0,1.358854,0,1,1
411,0.340326,0,-1.710091,1,0,1
773,-1.440418,0,-0.292456,0,1,1
...,...,...,...,...,...,...
1096,0.838934,0,0.704562,2,0,1
1131,-0.870580,1,2.498538,2,3,0
1295,-1.369188,1,-1.420660,1,3,0
861,-0.087053,0,-0.436761,3,3,0


In [27]:
X_test

Unnamed: 0,age,sex,bmi,children,region,smoker_yes
900,0.696474,1,-1.336209,0,0,0
1064,-0.728120,0,-0.830321,4,3,0
1256,0.838934,0,0.938238,3,1,0
298,-0.585661,1,0.611091,3,1,1
237,-0.585661,1,1.267024,2,2,0
...,...,...,...,...,...,...
175,1.693691,0,1.153876,0,3,1
1337,1.551231,0,-0.261299,0,1,1
81,0.411556,0,1.249806,0,0,0
192,-1.013039,1,-0.807363,0,2,0


In [28]:
y_train

536      5972.37800
1095     4561.18850
629     42983.45850
411     19594.80965
773     17748.50620
           ...     
1096    44641.19740
1131     3693.42800
1295     1964.78000
861      7151.09200
1127     5836.52040
Name: charges, Length: 1203, dtype: float64

In [29]:
y_test

900      8688.85885
1064     5708.86700
1256    11436.73815
298     38746.35510
237      4463.20510
           ...     
175     48824.45000
1337    29141.36030
81       7935.29115
192      2137.65360
1135    11085.58680
Name: charges, Length: 134, dtype: float64

# Training Linear Regression Model

In [30]:
model = LinearRegression()
model.fit(X_train, y_train)

In [31]:
dt_regressor = DecisionTreeRegressor()


In [32]:
# Train the model on the training data
dt_regressor.fit(X_train, y_train)

In [33]:
# Make predictions on the test data
y_pred_1 = dt_regressor.predict(X_test)

In [38]:
# Print the model's R^2 score
r2 = dt_regressor.score(X_test, y_test)
print("Test R^2 Score:", r2)

Test R^2 Score: 0.7394107994785758


# Making predictions on test data

In [39]:
y_pred = model.predict(X_test)

# Evaluating the model using Mean Squared Error and R2 Score

In [40]:
mse = mean_squared_error(y_test, y_pred)
r2_value = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

In [41]:
print("Mean Squared Error: ", mse)
print("R2 Score: ", r2_value)
print("Root Mean Squared Error: ", rmse)

Mean Squared Error:  38521692.08100186
R2 Score:  0.7853814689062371
Root Mean Squared Error:  6206.584574546766
