In [3]:
# Project banner: the title line, then the objective on its own paragraph
# (the leading "\n" in the second message produces the blank separator line).
print("Predicting Insurance Charges")
print(
    "\nObjective: To build a regression model to predict insurance charges based on features."
)

# Remote location of the insurance CSV that the rest of the script loads.
data_source = (
    "https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv"
)

# Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Import Data
# Load the insurance dataset from the CSV referenced by `data_source`.
data = pd.read_csv(data_source)

# One-hot encode categorical variables
# drop_first=True keeps k-1 indicator columns per categorical feature
# (e.g. only `sex_male`, `smoker_yes`), dropping the first level of each.
data = pd.get_dummies(data, columns=['sex', 'smoker', 'region'], drop_first=True)

# Data Description
print("\nData Description:")
print(data.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
data.info()
print(data.describe())

# Define Target Variable (y) and Feature Variables (X)
# Everything except the `charges` column is used as a predictor.
X = data.drop('charges', axis=1)
y = data['charges']

# Train Test Split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Modeling
model = LinearRegression()
model.fit(X_train, y_train)

# Model Evaluation
# Score the fitted model on both partitions so a large gap between the two
# numbers (overfitting) is visible at a glance.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"\nTraining Score: {train_score}")
print(f"Testing Score: {test_score}")

# Prediction
y_pred = model.predict(X_test)

# Model Evaluation Metrics
# Error and goodness-of-fit computed from the held-out predictions.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"\nMean Squared Error: {mse}")
print(f"R-squared: {r2}")


Predicting Insurance Charges

Objective: To build a regression model to predict insurance charges based on features.

Data Description:
   age     bmi  children      charges  sex_male  smoker_yes  region_northwest  \
0   19  27.900         0  16884.92400     False        True             False   
1   18  33.770         1   1725.55230      True       False             False   
2   28  33.000         3   4449.46200      True       False             False   
3   33  22.705         0  21984.47061      True       False              True   
4   32  28.880         0   3866.85520      True       False              True   

   region_southeast  region_southwest  
0             False              True  
1              True             False  
2              True             False  
3             False             False  
4             False             False  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 9 columns):
 #   Column            Non-Null