   # Regression - insurance dataset
 


In [1]:
# Import libraries

import numpy as np # creating and manipulating arrays
import matplotlib.pyplot as plt # visualizing data
import sklearn # regression models
import statsmodels.api as sm 
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

#### Topic 1:

Import the data: the medical insurance cost dataset from Kaggle
(https://www.kaggle.com/code/mariapushkareva/medical-insurance-cost-with-linear-regression/data)

In [2]:
# Load the insurance dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="insurance.csv")

In [3]:
# Encoding --> replace each categorical text column with its integer category codes
for categorical_column in ('sex', 'smoker', 'region'):
    as_category = df[categorical_column].astype('category')
    df[categorical_column] = as_category.cat.codes

In [None]:
# Preview the first five rows of the frame
df.head(n=5)

In [None]:
# Find missing values: count the NaN entries in every column explicitly,
# since describe() on its own only reports summary statistics.
missing_per_column = df.isna().sum()
print("Missing values per column:")
print(missing_per_column.to_string())

# Descriptive statistics for the numeric columns (shown as the cell output)
df.describe()

In [None]:
# Split the frame into the feature matrix X (every column except
# 'charges') and the target vector y (the 'charges' column).
X = df.drop(columns=['charges'])
y = df['charges']
X

#### Topic 2:

Feature selection using statsmodels

In [None]:
# Fit an ordinary least squares model and inspect per-feature significance.
# sm.OLS does not add an intercept on its own, so a constant column is
# prepended here; without it the fit is forced through the origin, which
# distorts the coefficients and p-values used for feature selection.
X_with_const = sm.add_constant(X)
model = sm.OLS(endog=y, exog=X_with_const)
results = model.fit()
print(results.summary())

In [None]:
# Rebuild the feature matrix without the 'children' column.
# X is derived from df rather than from the previous X so this cell can be
# re-run safely; dropping from X itself raises KeyError on a second run.
X = df.drop(columns=['charges', 'children'])

X

In [9]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Standardize the features, then fit a linear model by stochastic gradient
# descent. SGD shuffles the training data, so random_state is fixed to make
# the fitted coefficients and every downstream metric reproducible across
# runs (matching the seeded train/test split).
reg = make_pipeline(
    StandardScaler(),
    SGDRegressor(max_iter=1000, tol=1e-3, random_state=42),
)
reg.fit(X_train, y_train)

# Predictions on the held-out test set
y_pred = reg.predict(X_test)

In [None]:
# Compare actual and predicted charges against age on the test set
fig, ax = plt.subplots()
test_age = X_test['age']
ax.scatter(test_age, y_test, color="red", label="Test data")
ax.scatter(test_age, y_pred, color="green", label="predictions")
ax.set_xlabel("Age")
ax.set_ylabel("Charges")
ax.legend(loc="upper left")
plt.show()

In [None]:
# MSE on the held-out test set (variable name kept for compatibility)
val_error = mean_squared_error(y_test, y_pred)

# MSE on the data the model was trained on
y_train_pred = reg.predict(X_train)
train_error = mean_squared_error(y_train, y_train_pred)

# Get validation scores.
# cross_validate falls back to the estimator's own score (R-squared for a
# regressor) unless `scoring` is given, so MSE is requested explicitly.
# scikit-learn reports it negated ("higher is better"), hence the sign flip.
scores = cross_validate(
    reg, X_train, y_train, cv=3, scoring="neg_mean_squared_error"
)
cv_mse = -np.mean(scores['test_score'])

print("Training MSE: ", train_error)
print("Validation MSE: ", cv_mse)
print("Test MSE: ", val_error)

# Coefficient of determination on the test set
score = reg.score(X_test, y_test)
print("R-squared:", score)