In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the insurance records; the relative path is resolved against the working directory.
df = pd.read_csv('insurance_data.csv')
# Print each column's dtype and non-null count to reveal where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250 entries, 0 to 1249
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1250 non-null   int64  
 1   gender    1250 non-null   object 
 2   bmi       1250 non-null   float64
 3   weight    1248 non-null   float64
 4   children  1250 non-null   int64  
 5   smoker    1249 non-null   object 
 6   expenses  1248 non-null   float64
dtypes: float64(3), int64(2), object(2)
memory usage: 68.5+ KB


In [3]:
# comments:
# (1) there is a small number of rows with missing values - they can be dropped
# (2) you may want to make use of https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
# (3) perform all your computations (solve the task) before the questions part, in a complete, clear and effective manner
# (4) the questions part should only print answers based on your solution

#### Questions (answer the questions; all computations should precede this part)

#### Question 1

In [4]:
# did you remove any numerical predictor from the data?
# if no - why, if yes - how did you decide on the predictor to remove?
# print a short (one-sentence) answer using the print() command

In [5]:
# Discard the few incomplete rows and renumber the index from 0.
complete_rows = df.dropna().reset_index(drop=True)

# One-hot encode both categorical columns in a single call. The empty prefix
# keeps the bare level names ('female', 'male', 'no', 'yes') as column names.
encoded = pd.get_dummies(
    complete_rows, columns=['gender', 'smoker'], prefix='', prefix_sep=''
)

# Keep a single indicator per categorical variable, stored as 0/1 integers.
df = (
    encoded
    .drop(columns=['female', 'no'])
    .astype({'male': int, 'yes': int})
)

# Regress expenses on the raw (unscaled) predictors plus an intercept.
y = df['expenses']
x = sm.add_constant(df[['age', 'bmi', 'children', 'yes']])

model = sm.OLS(y, x)
result = model.fit()

result.summary()

0,1,2,3
Dep. Variable:,expenses,R-squared:,0.749
Model:,OLS,Adj. R-squared:,0.749
Method:,Least Squares,F-statistic:,926.9
Date:,"Sat, 17 Feb 2024",Prob (F-statistic):,0.0
Time:,15:16:18,Log-Likelihood:,-12605.0
No. Observations:,1245,AIC:,25220.0
Df Residuals:,1240,BIC:,25250.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.207e+04,977.698,-12.341,0.000,-1.4e+04,-1.01e+04
age,258.6890,12.310,21.015,0.000,234.539,282.839
bmi,320.9033,28.282,11.346,0.000,265.417,376.390
children,430.2347,142.626,3.017,0.003,150.420,710.049
yes,2.39e+04,428.191,55.809,0.000,2.31e+04,2.47e+04

0,1,2,3
Omnibus:,271.332,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,618.863
Skew:,1.195,Prob(JB):,4.13e-135
Kurtosis:,5.494,Cond. No.,294.0


In [6]:
# Predictors entering the regression and, in the same order, the column names
# used for their standardised versions.
PREDICTORS = ['age', 'bmi', 'children', 'yes']
SCALED_PREDICTORS = ['age_scaled', 'bmi_scaled', 'children_scaled', 'yes_smokers_scaled']

# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=['male', 'weight'], errors='ignore')

# Standardise the predictors so that coefficient magnitudes can be compared
# across variables measured on different scales.
scaler = StandardScaler()
predictors_scaled = scaler.fit_transform(df[PREDICTORS])

# Reuse df's index so the scaled predictors stay row-aligned with the response
# even if df does not carry a default 0..n-1 index.
df_scaled = pd.DataFrame(predictors_scaled, columns=SCALED_PREDICTORS, index=df.index)

x = sm.add_constant(df_scaled[SCALED_PREDICTORS])
y = df[['expenses']]

model = sm.OLS(y, x)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,expenses,R-squared:,0.749
Model:,OLS,Adj. R-squared:,0.749
Method:,Least Squares,F-statistic:,926.9
Date:,"Sat, 17 Feb 2024",Prob (F-statistic):,0.0
Time:,15:16:18,Log-Likelihood:,-12605.0
No. Observations:,1245,AIC:,25220.0
Df Residuals:,1240,BIC:,25250.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.326e+04,171.437,77.330,0.000,1.29e+04,1.36e+04
age_scaled,3627.8978,172.631,21.015,0.000,3289.217,3966.578
bmi_scaled,1957.0785,172.484,11.346,0.000,1618.686,2295.471
children_scaled,517.4392,171.535,3.017,0.003,180.909,853.969
yes_smokers_scaled,9573.0601,171.534,55.809,0.000,9236.532,9909.589

0,1,2,3
Omnibus:,271.332,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,618.863
Skew:,1.195,Prob(JB):,4.13e-135
Kurtosis:,5.494,Cond. No.,1.13


#### Question 2

In [7]:
# what is the amount of money a person is likely to spend on medical expenses with each additional year of age?
# write here the value itself (hardcoded) based on your solution above
# display your answer as a dataframe (as in assignment 2)

#### Question 3

In [8]:
# what predictors have a significant contribution to the medical expenses amount?
# report only significant (P<0.05) predictors sorted by their contribution to the prediction from highest to lowest,
# where for each predictor you specify if it has a positive or a negative effect on the medical expenses

# for categorical variables - specify the effect of individual values that appear significant (e.g., "smoker-yes", "smoker-no")

# display your answer as a dataframe with two columns: (1) predictor, (2) effect (positive or negative)
# no need to include the constant (b_0) value