# Multiple Linear Regression

Objective: Predicting insurance charges  
Dataset: insurance.csv  
Ref: https://www.kaggle.com/macostrans/multiple-linear-regression-model-applied/code

In [1]:
import numpy as np
import pandas as pd

# Load the insurance records from the CSV next to the notebook, report the
# frame's (rows, columns), and preview the first five rows.
dataset = pd.read_csv(filepath_or_buffer='insurance.csv')
print(f'Dataset Shape {dataset.shape}')
dataset.head(n=5)

Dataset Shape (1338, 7)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [2]:
# Summarise each column's dtype and non-null count to spot missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
age         1338 non-null int64
sex         1338 non-null object
bmi         1338 non-null float64
children    1338 non-null int64
smoker      1338 non-null object
region      1338 non-null object
charges     1338 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.2+ KB


## Modelling

In [3]:
# Features are every column except the last; the target is the last column
# (`charges`). Using -1 for both keeps the two slices consistent.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Encoding categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Integer-encode the three text columns in place: sex (1), smoker (4), region (5).
labelencoder = LabelEncoder()
for categorical_col in (1, 4, 5):
    X[:, categorical_col] = labelencoder.fit_transform(X[:, categorical_col])

# One-hot encode region (column 5). `OneHotEncoder(categorical_features=...)`
# is not available in current scikit-learn; ColumnTransformer is the
# replacement. It emits the dummy columns first and passes the remaining
# columns through in their original order, matching the previous layout.
# sparse_threshold=0 forces a dense result.
onehotencoder = OneHotEncoder()
column_transformer = ColumnTransformer(
    transformers=[('region', onehotencoder, [5])],
    remainder='passthrough',
    sparse_threshold=0,
)
# The passthrough columns come from an object array, so cast the stacked
# result back to float.
X = np.asarray(column_transformer.fit_transform(X)).astype(float)

print('X', X.shape)
print('Y', y.shape)

X (1338, 9)
Y (1338,)


In [4]:
# Inspect the encoded feature matrix (dummy columns first, then the remaining features).
X

array([[ 0.  ,  0.  ,  0.  , ..., 27.9 ,  0.  ,  1.  ],
       [ 0.  ,  0.  ,  1.  , ..., 33.77,  1.  ,  0.  ],
       [ 0.  ,  0.  ,  1.  , ..., 33.  ,  3.  ,  0.  ],
       ...,
       [ 0.  ,  0.  ,  1.  , ..., 36.85,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  , ..., 25.8 ,  0.  ,  0.  ],
       [ 0.  ,  1.  ,  0.  , ..., 29.07,  0.  ,  1.  ]])

In [5]:
# Drop column 0, the first of the one-hot dummy columns, so the remaining
# dummies are not perfectly collinear with an intercept (dummy variable trap).
# Note: this reassigns X from itself, so re-running the cell removes one more
# column each time.
X = np.delete(X, 0, axis=1)
X

array([[ 0.  ,  0.  ,  1.  , ..., 27.9 ,  0.  ,  1.  ],
       [ 0.  ,  1.  ,  0.  , ..., 33.77,  1.  ,  0.  ],
       [ 0.  ,  1.  ,  0.  , ..., 33.  ,  3.  ,  0.  ],
       ...,
       [ 0.  ,  1.  ,  0.  , ..., 36.85,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  1.  , ..., 25.8 ,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  , ..., 29.07,  0.  ,  1.  ]])

In [6]:
# Building the optimal model using Backward Elimination
# `OLS` lives in statsmodels.api; statsmodels.formula.api no longer exposes it.
import statsmodels.api as sm

# statsmodels' OLS does not add an intercept, so prepend a column of ones.
# add_constant sizes the column from X itself and, with its default
# has_constant='skip', leaves X unchanged when a constant column is already
# present, so re-running this cell does not stack extra intercept columns.
X = sm.add_constant(X, prepend=True)

# Column subsets tried in order; each step removes one more predictor from
# the previous subset. Column 0 is the intercept.
elimination_steps = [
    [0, 1, 2, 3, 4, 5, 6, 7, 8],
    [0, 1, 2, 3, 4, 6, 7, 8],
    [0, 2, 3, 4, 6, 7, 8],
    [0, 2, 4, 6, 7, 8],
    [0, 4, 6, 7, 8],
]
for kept_columns in elimination_steps:
    X_opt = X[:, kept_columns]
    regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
    # Only the last expression of a cell is rendered, so print the p-values
    # of every step explicitly to make each elimination decision visible.
    print('columns', kept_columns, 'p-values', np.round(regressor_OLS.pvalues, 3))

# Full summary of the final model (columns [0, 4, 6, 7, 8]).
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.75
Model:,OLS,Adj. R-squared:,0.749
Method:,Least Squares,F-statistic:,998.1
Date:,"Wed, 12 Dec 2018",Prob (F-statistic):,0.0
Time:,15:56:17,Log-Likelihood:,-13551.0
No. Observations:,1338,AIC:,27110.0
Df Residuals:,1333,BIC:,27140.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.21e+04,941.984,-12.848,0.000,-1.4e+04,-1.03e+04
x1,257.8495,11.896,21.675,0.000,234.512,281.187
x2,321.8514,27.378,11.756,0.000,268.143,375.559
x3,473.5023,137.792,3.436,0.001,203.190,743.814
x4,2.381e+04,411.220,57.904,0.000,2.3e+04,2.46e+04

0,1,2,3
Omnibus:,301.48,Durbin-Watson:,2.087
Prob(Omnibus):,0.0,Jarque-Bera (JB):,722.157
Skew:,1.215,Prob(JB):,1.5300000000000003e-157
Kurtosis:,5.654,Cond. No.,292.0


In [7]:
# Choose the seven columns of X (intercept included) that feed the scikit-learn model.
# NOTE(review): this is the third subset tried during backward elimination, not
# the final one ([0, 4, 6, 7, 8]) — confirm that keeping columns 2 and 3 is intentional.
X_opt = X[:, [0, 2, 3, 4, 6, 7, 8]]

In [8]:
# `sklearn.cross_validation` was removed from scikit-learn; train_test_split
# now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_opt, y, test_size = 0.2, random_state = 0)



In [9]:
# Report the dimensions of the training and test feature matrices.
print(f'X Train {X_train.shape}')
print(f'X Test {X_test.shape}')

X Train (1070, 7)
X Test (268, 7)


In [10]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# For a regressor, `score` returns the coefficient of determination (R^2),
# not a classification accuracy. It is computed on the training data here,
# so it measures fit, not generalisation.
print('Training R^2 score (%):', round(regressor.score(X_train, y_train) * 100, 2))

Model accuracy score: 73.7


In [11]:
# Predict charges for the held-out test rows.
y_pred = regressor.predict(X_test)

## Prediction

In [13]:
# Predict charges for the test split. The predictions are stored in `y_pred`
# rather than `y_test`, so the true targets in `y_test` are not overwritten
# and stay available for evaluating the model.
y_pred = regressor.predict(X_test)

# Tabulate the predictions, rounded to two decimals, and preview them.
charges_pred = pd.DataFrame({'Charges Prediction': y_pred}).round(2)
print(charges_pred.shape)
charges_pred.head()

(268, 1)


Unnamed: 0,Charges Prediction
0,11179.78
1,9478.44
2,38323.8
3,16408.18
4,7036.29
