# ELASTICNET REGRESSION

# step-1: BUSINESS PROBLEM UNDERSTANDING

In [2]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# step-2: DATA UNDERSTANDING 

**LOAD DATA AND UNDERSTAND EVERY VARIABLE**

In [3]:
# Load the insurance workbook into a DataFrame.
# The location can be overridden with the INSURANCE_DATA_PATH environment
# variable so the notebook also runs on machines other than the author's;
# when the variable is unset the original absolute path is used unchanged.
df = pd.read_excel(
    os.environ.get(
        "INSURANCE_DATA_PATH",
        "C:\\Users\\saisu\\OneDrive\\Documents\\insurance.xlsx",
    )
)
df

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95


**DATASET UNDERSTANDING**

In [4]:
# Preview the first five rows of the dataset.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [5]:
# Column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [6]:
# (number of rows, number of columns)
df.shape

(1338, 7)

In [7]:
# Frequency of each value in the sex column.
df["sex"].value_counts()

male      676
female    662
Name: sex, dtype: int64

In [10]:
# Frequency of each distinct bmi value.
df["bmi"].value_counts()

27.6    17
33.3    17
28.9    16
32.3    15
29.8    14
        ..
39.0     1
40.8     1
20.3     1
18.5     1
53.1     1
Name: bmi, Length: 275, dtype: int64

In [11]:
# Frequency of each value in the children column.
df["children"].value_counts()

0    574
1    324
2    240
3    157
4     25
5     18
Name: children, dtype: int64

In [12]:
# Number of missing values in each column.
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

**exploratory data analysis**

In [17]:
# Bucket every column by its pandas dtype so that each group can be
# explored separately.
categorical = []  # columns with dtype object
continous = []    # columns with dtype float64
check = []        # every other dtype; to be reviewed manually

d_types = dict(df.dtypes)
# The loop variable is called `dtype` rather than `type`: in a notebook the
# loop variable stays bound after the loop, so `type` would keep shadowing
# the builtin `type()` in every later cell.
for name, dtype in d_types.items():
    if str(dtype) == 'object':
        categorical.append(name)
    elif str(dtype) == "float64":
        continous.append(name)
    else:
        check.append(name)

print("categorical features:",categorical)
print("continous features:",continous)
print("features to be checked:",check)

categorical features: ['sex', 'smoker', 'region']
continous features: ['bmi', 'expenses']
features to be checked: ['age', 'children']


In [19]:
# Print the value counts of every object-dtype column under a banner
# carrying the column name.
d_types = dict(df.dtypes)
# `dtype` instead of `type` keeps the builtin `type()` usable in later cells.
for name, dtype in d_types.items():
    if str(dtype) == 'object':
        print(f"<========{name}========>")
        print(df[name].value_counts())

male      676
female    662
Name: sex, dtype: int64
no     1064
yes     274
Name: smoker, dtype: int64
southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64


In [20]:
# Summary statistics (count, mean, std, quartiles, min/max).
df.describe()

Unnamed: 0,age,bmi,children,expenses
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.665471,1.094918,13270.422414
std,14.04996,6.098382,1.205493,12110.01124
min,18.0,16.0,0.0,1121.87
25%,27.0,26.3,0.0,4740.2875
50%,39.0,30.4,1.0,9382.03
75%,51.0,34.7,2.0,16639.915
max,64.0,53.1,5.0,63770.43


In [21]:
# Pairwise correlation between the numeric columns.
# sex, smoker and region are still strings at this point, so the numeric
# restriction is requested explicitly: pandas 1.5 warns when it is left
# implicit and pandas 2.x raises on the string columns.
df.corr(numeric_only=True)

  df.corr()


Unnamed: 0,age,bmi,children,expenses
age,1.0,0.109341,0.042469,0.299008
bmi,0.109341,1.0,0.012645,0.198576
children,0.042469,0.012645,1.0,0.067998
expenses,0.299008,0.198576,0.067998,1.0


# step-3: DATA PREPROCESSING

In [22]:
# Number of missing values in each column, checked before preprocessing.
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [23]:
#drop the region column
# Rebinding df (instead of inplace=True) together with errors="ignore"
# makes this cell safe to re-run: a second execution no longer raises
# KeyError because "region" is already gone.
df = df.drop(columns="region", errors="ignore")

In [24]:
# Encode the two binary categorical columns as 0/1 integers.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df["col"]: that form mutates an intermediate
# Series and leaves df unchanged when pandas Copy-on-Write is active.
# astype("int64") fixes the resulting dtype explicitly rather than relying
# on replace() to downcast the object column.

#encoding sex column
df["sex"] = df["sex"].replace({"female":0,"male":1}).astype("int64")

#encoding smoker column
df["smoker"] = df["smoker"].replace({"yes":1,"no":0}).astype("int64")


**X&y**

In [26]:
# Target: the expenses column. Features: every remaining column.
y = df["expenses"]
X = df.drop(columns=["expenses"])

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)

# step-4: MODELLING & EVALUATION

**ELASTICNET REGRESSION WITH DEFAULT PARAMETERS**

In [28]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score

# Baseline: ElasticNet with scikit-learn's default hyperparameters,
# fitted on the training split (fit() returns the estimator itself).
enr_base = ElasticNet().fit(X_train, y_train)

# Predictions on both splits.
train_predictions = enr_base.predict(X_train)
test_predictions = enr_base.predict(X_test)

# R2 on each split, then the mean 5-fold cross-validation score on the
# full data.
print("Train R2:", enr_base.score(X_train, y_train))
print("Test R2:", enr_base.score(X_test, y_test))
print(
    "cross validation score:",
    cross_val_score(enr_base, X, y, cv=5).mean(),
)

Train R2: 0.39155822533558715
Test R2: 0.39702776413473506
cross validation score: 0.3889250431216654


**APPLYING HYPERPARAMETER TUNING FOR ELASTICNET REGRESSION**

In [32]:
from sklearn.model_selection import GridSearchCV

# Candidate values tried for each ElasticNet hyperparameter.
param_grid = {
    "alpha": [0.1, 0.2, 1, 2, 3, 5, 10],
    "l1_ratio": [0.1, 0.5, 0.75, 0.9, 0.95, 1],
}

# Estimator whose hyperparameters are being tuned.
estimator = ElasticNet()

# 5-fold grid search over every combination in param_grid, scored by
# negative mean squared error, fitted on the training split only.
model_hp = GridSearchCV(
    estimator,
    param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
)
model_hp.fit(X_train, y_train)

model_hp.best_params_

{'alpha': 10, 'l1_ratio': 1}

**REBUILD ELASTICNET MODEL USING BEST HYPERPARAMETERS**

In [35]:
#Modelling

# Take the hyperparameters directly from the grid search instead of
# re-typing them by hand, so this cell cannot drift out of sync with
# model_hp if the grid or the data changes.
enr_best = ElasticNet(**model_hp.best_params_)
enr_best.fit(X_train,y_train)

print("Intercept:", enr_best.intercept_)
print("coefficients:",enr_best.coef_)

#predictions

train_predictions = enr_best.predict(X_train)
test_predictions = enr_best.predict(X_test)

#Evaluation
# R2 on each split, then the mean 5-fold cross-validation score on the
# full data.

print("Train R2:", enr_best.score(X_train,y_train))
print("Test R2:", enr_best.score(X_test,y_test))
print("Cross Validation Score:", cross_val_score(enr_best,X,y,cv=5).mean())

Intercept: -11449.28756082979
coefficients: [ 2.56838444e+02 -6.43158858e-01  3.04860929e+02  4.34656692e+02
  2.35631810e+04]
Train R2: 0.7433083585849637
Test R2: 0.7755411716841649
Cross Validation Score: 0.7467299170217538


# Prediction on New Data

**data**

In [49]:
# One new record, keyed by the raw (un-encoded) column names of the dataset.
input_data = dict(
    age=31,
    sex="female",
    bmi=25.74,
    children=0,
    smoker="no",
    region="northeast",
)


**preprocessing the data**

In [50]:
# Build a one-row frame from the new record and apply the same
# preprocessing that was applied to the training data.
df_test = pd.DataFrame(input_data, index=[0])

# region was dropped from the training data, so drop it here as well.
df_test = df_test.drop(columns='region')

# Assign the encoded columns back instead of calling
# replace(..., inplace=True) on df_test[col]: that form mutates an
# intermediate Series and leaves df_test unchanged when pandas
# Copy-on-Write is active. astype("int64") fixes the dtype explicitly.
df_test['sex'] = df_test['sex'].replace({'female': 0,'male':1}).astype("int64")

df_test['smoker'] = df_test['smoker'].replace({'no':0,'yes':1}).astype("int64")

transormed_data = df_test




**predict**

In [51]:
# Inspect the preprocessed one-row frame before predicting.
transormed_data

Unnamed: 0,age,sex,bmi,children,smoker
0,31,0,25.74,0,0


In [52]:
# Predict expenses for the preprocessed record with enr_best.
enr_best.predict(transormed_data)

array([4359.82451623])