# 1. Business Problem

# 2. Data Understanding
- Load data
- Understand every variable
- Data Exploration
- EDA

In [157]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings (e.g. from pandas);
# consider narrowing the filter to specific categories.
warnings.simplefilter('ignore')

In [158]:
import os

# Location of the insurance CSV. The default is the original author's local path;
# set the INSURANCE_CSV environment variable to load the file from elsewhere
# without editing the notebook.
DATA_PATH = os.environ.get(
    'INSURANCE_CSV',
    r'C:\Users\yadav\Downloads\Jupyter\Datasets\insurance.csv',
)

# Load the raw data and preview the first 10 rows
df = pd.read_csv(DATA_PATH)
df.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


***Dataset Understanding***

In [159]:
# (rows, columns) of the freshly loaded frame
df.shape

(1338, 7)

In [160]:
# Column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [161]:
# Distinct values present in the age column
df['age'].unique()

array([19, 18, 28, 33, 32, 31, 46, 37, 60, 25, 62, 23, 56, 27, 52, 30, 34,
       59, 63, 55, 22, 26, 35, 24, 41, 38, 36, 21, 48, 40, 58, 53, 43, 64,
       20, 61, 44, 57, 29, 45, 54, 49, 47, 51, 42, 50, 39], dtype=int64)

In [162]:
# Summary statistics (count, mean, std, min, quartiles, max) for age
df['age'].describe()

count    1338.000000
mean       39.207025
std        14.049960
min        18.000000
25%        27.000000
50%        39.000000
75%        51.000000
max        64.000000
Name: age, dtype: float64

In [163]:
# Distinct labels in the sex column
df['sex'].unique()

array(['female', 'male'], dtype=object)

In [164]:
# Number of rows per sex label
df['sex'].value_counts()

sex
male      676
female    662
Name: count, dtype: int64

In [165]:
# Distinct labels in the smoker column
df['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [166]:
# Number of rows per smoker label
df['smoker'].value_counts()

smoker
no     1064
yes     274
Name: count, dtype: int64

In [167]:
# Summary statistics for the children count
df['children'].describe()

count    1338.000000
mean        1.094918
std         1.205493
min         0.000000
25%         0.000000
50%         1.000000
75%         2.000000
max         5.000000
Name: children, dtype: float64

In [168]:
# Distinct values of the children count
df['children'].unique()

array([0, 1, 3, 2, 5, 4], dtype=int64)

In [169]:
# Number of rows for each children count
df['children'].value_counts()

children
0    574
1    324
2    240
3    157
4     25
5     18
Name: count, dtype: int64

In [170]:
# Number of rows per region
df['region'].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [171]:
# Missing values per column
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

***Exploratory Data Analysis***

In [172]:
# Column groups used for the exploratory analysis
continuous_features = ["age", "bmi", "expenses"]      # numeric columns treated as continuous
discrete_categorical = ["sex", "region", "smoker"]    # string-valued categories
discrete_count = ["children"]                         # small integer counts

In [173]:
# Summary statistics for the continuous columns
df[continuous_features].describe()

Unnamed: 0,age,bmi,expenses
count,1338.0,1338.0,1338.0
mean,39.207025,30.663397,13270.422265
std,14.04996,6.098187,12110.011237
min,18.0,15.96,1121.8739
25%,27.0,26.29625,4740.28715
50%,39.0,30.4,9382.033
75%,51.0,34.69375,16639.912515
max,64.0,53.13,63770.42801


In [174]:
# Count, number of unique labels, most frequent label and its frequency per categorical column
df[discrete_categorical].describe()

Unnamed: 0,sex,region,smoker
count,1338,1338,1338
unique,2,4,2
top,male,southeast,no
freq,676,364,1064


In [175]:
# Pairwise correlation between the continuous columns (including the target, expenses)
df[continuous_features].corr()

Unnamed: 0,age,bmi,expenses
age,1.0,0.109272,0.299008
bmi,0.109272,1.0,0.198341
expenses,0.299008,0.198341,1.0


# 3. Data Preprocessing
- remove unwanted columns using common sense
- add new column if required
- data cleaning (treat wrong data, treat wrong data type, remove duplicates, treat missing value, outlier(retrain, drop, leave as it is))
- data wrangling (encoding, scaling, transformation)
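- Note: Feature scaling is not required for plain (unregularized) linear regression. The output variable should never be scaled: RMSE is expressed in the units of the target, so scaling the target makes RMSE look artificially low.
- Scaling is useful in clustering and classification, and also for regularized regression such as Lasso, whose penalty depends on the scale of each feature.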

***Data cleaning***

In [176]:
# Missing values per column (re-checked before cleaning)
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [177]:
# Number of rows that exactly repeat an earlier row
df.duplicated().sum()

1

In [178]:
# Keep the first occurrence of each row and discard exact repeats
df = df.drop_duplicates()

In [179]:
# (rows, columns) after removing duplicates
df.shape

(1337, 7)

***Encoding***

In [180]:
# Binary-encode the two two-level categorical columns.
# `.replace` leaves already-encoded values untouched, so re-running this cell is safe.
# The explicit `.astype('int64')` guarantees an integer dtype instead of relying on
# `replace` silently downcasting the object column, which newer pandas deprecates.

# encoding sex column: female -> 0, male -> 1
df['sex'] = df['sex'].replace({'female': 0, 'male': 1}).astype('int64')

# encoding smoker column: no -> 0, yes -> 1
df['smoker'] = df['smoker'].replace({'no': 0, 'yes': 1}).astype('int64')

In [181]:
# Inspect the frame after encoding (sex and smoker now hold 0/1)
df

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,0,27.900,0,1,southwest,16884.92400
1,18,1,33.770,1,0,southeast,1725.55230
2,28,1,33.000,3,0,southeast,4449.46200
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.880,0,0,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,northwest,10600.54830
1334,18,0,31.920,0,0,northeast,2205.98080
1335,18,0,36.850,0,0,southeast,1629.83350
1336,21,0,25.800,0,0,southwest,2007.94500


In [182]:
# Remove the region column from the modelling frame.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# re-runnable: a second execution no longer raises KeyError.
df = df.drop(columns='region', errors='ignore')

In [183]:
# Target is the insurance expense; every other remaining column is a feature
y = df['expenses']
X = df.drop(columns='expenses')

***Train & Test split***

In [184]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)

# Modelling and Evaluation

***finding best alpha value***

In [1]:
# Tune Lasso's regularisation strength with a cross-validated grid search.
# NOTE(review): the saved output of this cell is a NameError because it was executed
# on a fresh kernel (In [1]) before X_train existed; run the notebook top to bottom.
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Base model whose `alpha` is being tuned
estimator = Lasso()

# Candidate values: the integers 1 through 99
param_grid = {'alpha': list(range(1, 100))}

# Score each candidate by R^2 using 5-fold cross-validation on the training split
model_hp = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
)
model_hp.fit(X_train, y_train)

# Best alpha found within the grid
model_hp.best_params_

NameError: name 'X_train' is not defined

***Build Lasso model using best parameters***

In [186]:
# Everything this cell needs from scikit-learn, imported once
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Fit Lasso on the training split.
# NOTE(review): alpha=60 is hard-coded; presumably the value reported by the
# grid search -- confirm against model_hp.best_params_.
lasso_best = Lasso(alpha=60)
lasso_best.fit(X_train, y_train)

# Learned parameters
print('Coefficient: ', lasso_best.coef_)
print('Intercept: ', lasso_best.intercept_)

# Goodness of fit on the training split
ypred_train = lasso_best.predict(X_train)
print('Train R2: ', r2_score(y_train, ypred_train))

# Mean score over 5-fold cross-validation of the training split
cv_scores = cross_val_score(lasso_best, X_train, y_train, cv=5)
print('CV score: ', cv_scores.mean())

# Goodness of fit on the held-out test split
ypred_test = lasso_best.predict(X_test)
print('Test R2: ', r2_score(y_test, ypred_test))


Coefficient:  [  264.38161528    -0.           316.89561192   373.0709729
 23622.96074097]
Intercept:  -12040.593571887146
Train R2:  0.7591801991925938
CV score:  0.7537058055900724
Test R2:  0.7009201105342885


***Lasso shrank the coefficient of the `sex` variable to zero, so we can conclude that it is not important for predicting expenses and we will drop it.***

# Final Model

In [192]:
# Final feature set (sex is left out) and the target
y = df['expenses']
X = df.loc[:, ['age', 'bmi', 'children', 'smoker']]

In [195]:
from sklearn.model_selection import train_test_split

# Same 80/20 split and seed as before, now on the reduced feature set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)

In [196]:
# Everything this cell needs from scikit-learn, imported once
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Refit Lasso on the current training split.
# NOTE(review): alpha=60 is hard-coded; presumably the grid-search result -- confirm.
lasso_best = Lasso(alpha=60)
lasso_best.fit(X_train, y_train)

# Learned parameters
print('Coefficient: ', lasso_best.coef_)
print('Intercept: ', lasso_best.intercept_)

# Goodness of fit on the training split
ypred_train = lasso_best.predict(X_train)
print('Train R2: ', r2_score(y_train, ypred_train))

# Mean score over 5-fold cross-validation of the training split
cv_scores = cross_val_score(lasso_best, X_train, y_train, cv=5)
print('CV score: ', cv_scores.mean())

# Goodness of fit on the held-out test split
ypred_test = lasso_best.predict(X_test)
print('Test R2: ', r2_score(y_test, ypred_test))

Coefficient:  [  264.38141961   316.89570704   373.07104773 23622.96061177]
Intercept:  -12040.588905108545
Train R2:  0.7591801990707372
CV score:  0.753814164737532
Test R2:  0.7009201316340361


# Prediction on new data

In [237]:
# One new customer, described with the raw (un-encoded) column values
input_data = dict(
    age=26,
    sex='male',
    bmi=25,
    children=2,
    smoker='yes',
    region='southwest',
)
input_data

{'age': 26,
 'sex': 'male',
 'bmi': 25,
 'children': 2,
 'smoker': 'yes',
 'region': 'southwest'}

In [238]:
# One-row frame (row label 0) built from the input record
df_test = pd.DataFrame(data=input_data, index=[0])
df_test

Unnamed: 0,age,sex,bmi,children,smoker,region
0,26,male,25,2,yes,southwest


In [239]:
# Apply the same preprocessing that was used while developing the model.

# Encode smoker as in training (no -> 0, yes -> 1) and assign the result back.
# Calling `.replace(..., inplace=True)` on `df_test['smoker']` acts on a temporary
# Series and does not update `df_test` under pandas Copy-on-Write.
# `.astype('int64')` fails loudly if an unexpected label is passed in.
df_test['smoker'] = df_test['smoker'].replace({'no': 0, 'yes': 1}).astype('int64')

# Keep only the features the final model was trained on, in training order.
# This removes `region` and `sex`, so `sex` does not need to be encoded first,
# and unlike an in-place drop it is safe to re-run.
df_test = df_test[['age', 'bmi', 'children', 'smoker']]

In [240]:
# Inspect the preprocessed input row that will be passed to the model
df_test

Unnamed: 0,age,bmi,children,smoker
0,26,25,2,1


In [241]:
# Predict with the final Lasso model; the result is an array with one value per input row
predicted_expenses = lasso_best.predict(df_test)
print('Expenses for given data: ', predicted_expenses)

Expenses for given data:  [27124.82338801]
