In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale,StandardScaler
from matplotlib import pyplot as plt
from sklearn.metrics import r2_score

In [9]:
# Read the insurance data from the CSV file in the working directory.
dataset = pd.read_csv(filepath_or_buffer='insurance.csv')

In [10]:
# Preview the first rows of the loaded frame to check the columns and sample values.
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [11]:
# Summarize column dtypes and non-null counts.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
age         1338 non-null int64
sex         1338 non-null object
bmi         1338 non-null float64
children    1338 non-null int64
smoker      1338 non-null object
region      1338 non-null object
charges     1338 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [35]:
# Report, per column, whether it contains at least one missing value.
print(dataset.isnull().any())

age         False
sex         False
bmi         False
children    False
smoker      False
region      False
charges     False
dtype: bool


In [36]:
# Check how the numeric features are correlated. The object columns
# (sex, smoker, region) are excluded explicitly: DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0 instead of silently dropping those columns.
dataset.select_dtypes(include='number').corr()
# From the results below, among the numeric features age and bmi are the most correlated with charges.

Unnamed: 0,age,bmi,children,charges
age,1.0,0.109272,0.042469,0.299008
bmi,0.109272,1.0,0.012759,0.198341
children,0.042469,0.012759,1.0,0.067998
charges,0.299008,0.198341,0.067998,1.0


In [37]:
# One-hot encode the three categorical columns to preview the resulting indicator columns.
categorical_columns = dataset[['sex', 'smoker', 'region']]
dummies = pd.get_dummies(categorical_columns)
dummies

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,1,0,0,1,0,0,0,1
1,0,1,1,0,0,0,1,0
2,0,1,1,0,0,0,1,0
3,0,1,1,0,0,1,0,0
4,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...
1333,0,1,1,0,0,1,0,0
1334,1,0,1,0,1,0,0,0
1335,1,0,1,0,0,0,1,0
1336,1,0,1,0,0,0,0,1


In [14]:
# Features: every column except the target.
x = dataset.drop('charges', axis=1)
# Labels: the target column.
y = dataset.loc[:, 'charges']

In [38]:
# Turn the categorical columns into numbers with one-hot encoding.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categorical_features = ["sex", "smoker", "region"]
one_hot = OneHotEncoder()

# Encode only the categorical columns; remainder="passthrough" keeps the other
# feature columns unchanged in the transformed output.
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)
transformed_X = transformer.fit_transform(x)
transformed_X

array([[ 1.  ,  0.  ,  0.  , ..., 19.  , 27.9 ,  0.  ],
       [ 0.  ,  1.  ,  1.  , ..., 18.  , 33.77,  1.  ],
       [ 0.  ,  1.  ,  1.  , ..., 28.  , 33.  ,  3.  ],
       ...,
       [ 1.  ,  0.  ,  1.  , ..., 18.  , 36.85,  0.  ],
       [ 1.  ,  0.  ,  1.  , ..., 21.  , 25.8 ,  0.  ],
       [ 1.  ,  0.  ,  0.  , ..., 61.  , 29.07,  0.  ]])

In [44]:
from sklearn.model_selection import train_test_split

# Hold out half of the rows for testing. random_state is fixed so the split, and
# every score computed from it, is reproducible across "Restart & Run All";
# 7 matches the seed used for the second experiment further down.
x_train, x_test, y_train, y_test = train_test_split(
    transformed_X, y, test_size=0.50, random_state=7
)

In [45]:
# Standardize the features: fit the scaler on the training rows and apply the
# same fitted scaler to the test rows.
# NOTE(review): the scaled arrays are bound to X_train / X_test (capital X), while
# the later model cells fit and predict on x_train / x_test (lowercase). In the
# visible cells the scaled arrays are therefore never used -- confirm which was intended.
import numpy as np
ss = StandardScaler()
X_train = ss.fit_transform(x_train)
X_test = ss.transform(x_test)
y_train = np.array(y_train)  # convert the training labels to a NumPy array

In [46]:
# Fit a linear regression model on the training split.
# NOTE(review): this fits on x_train (the unscaled split), not on the
# StandardScaler output X_train produced in the preceding cell -- confirm intent.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

regr = LinearRegression()
regr.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [47]:
# `predictions` holds the model output on the training rows and `regressions`
# holds the model output on the held-out test rows.
predictions, regressions = regr.predict(x_train), regr.predict(x_test)

In [48]:
# R^2 on the training rows (printed as 'Prediction:') and on the test rows
# (printed as 'Regression:').
r2_pred = r2_score(y_true=y_train, y_pred=predictions)
r2_reg = r2_score(y_true=y_test, y_pred=regressions)
for label, score in (('Prediction:', r2_pred), ('Regression:', r2_reg)):
    print(label, score)
# The training-set score ('Prediction') comes out higher than the test-set score ('Regression').

Prediction: 0.7681704441614442
Regression: 0.7270747381166061


As the test-split ratio increases to 50%, the accuracy increases more for the regression (test) score than for the prediction (train) score.

In [49]:
# Dummy-encode the object (categorical) columns and save the result to a new dataframe.
dataset2 = pd.get_dummies(
    data=dataset,
    columns=['sex', 'smoker', 'region'],
    dtype='int8',
)

In [50]:
# Set up the feature matrix and label vector as NumPy arrays.
feature_columns = [
    'age', 'bmi', 'children',
    'sex_female', 'sex_male',
    'smoker_no', 'smoker_yes',
    'region_northeast', 'region_northwest', 'region_southeast', 'region_southwest',
]
x2 = dataset2[feature_columns].values
y2 = dataset2['charges'].values

In [62]:
# Split first, then standardize. The scaler is fitted on the training rows only,
# so no statistics from the held-out test rows leak into preprocessing.
# x2 is left untouched, which keeps this cell safe to re-run.
x1_train, x1_test, y2_train, y2_test = train_test_split(x2, y2, test_size = 0.1, random_state = 7)

# Use StandardScaler to preprocess the values: learn mean/std from the training
# rows and apply the same fitted transform to the test rows.
scaler = StandardScaler()
x1_train = scaler.fit_transform(x1_train)
x1_test = scaler.transform(x1_test)

In [63]:
# Refit the existing `regr` estimator on the second training split; this
# replaces the parameters learned from the first experiment.
regr.fit(X=x1_train, y=y2_train)

# Model output on the training rows, then on the held-out test rows.
regression_line, y_predict = regr.predict(x1_train), regr.predict(x1_test)

In [64]:
# R^2 on the training rows (printed as 'Prediction:') and on the test rows
# (printed as 'Regression:') for the second experiment.
r2_pred1 = r2_score(y_true=y2_train, y_pred=regression_line)
r2_reg1 = r2_score(y_true=y2_test, y_pred=y_predict)
for label, score in (('Prediction:', r2_pred1), ('Regression:', r2_reg1)):
    print(label, score)

Prediction: 0.7517177053267693
Regression: 0.742863804741549


#The regression line is more accurate than the prediction line.
#When the test size is increased, both accuracies drop, but the regression drops more than the prediction.
#At 10%, the prediction becomes more accurate.