# RIDGE REGRESSION

# step-1: BUSINESS PROBLEM UNDERSTANDING

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter("ignore")

In [3]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV

# step-2: DATA UNDERSTANDING

In [4]:
df = pd.read_excel("C:\\Users\\saisu\\OneDrive\\Documents\\insurance.xlsx")
df

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95


**DATASET UNDERSTANDING**

In [5]:
df.shape


(1338, 7)

In [6]:
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'expenses'], dtype='object')

# step-3: DATA PREPROCESSING

In [11]:
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1333    False
1334    False
1335    False
1336    False
1337    False
Length: 1338, dtype: bool

In [12]:
df.drop_duplicates(inplace=True)


In [13]:
#drop the region column
df.drop("region",axis=1,inplace=True)

In [15]:
#encoding the se column 
df["sex"].replace({"female":0,"male":1},inplace=True)

#encoding smoker column

df["smoker"].replace({"no":0,"yes":1},inplace=True)

In [16]:
X = df.drop("expenses",axis=1)
y = df["expenses"]

In [21]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=9)

# step-4:MODELLING

**applying hyperparameter tuning for ridge regression**

In [22]:
#model

estimator = Ridge()

#parameters & values
param_grid = {"alpha":list(range(1,100))}
 
#identifying the best value of the parameter within given values for the given data

model_hp = GridSearchCV(estimator,param_grid,cv=5,scoring="r2")

model_hp.fit(X_train,y_train)
model_hp.best_params_


{'alpha': 1}

**modelling Ridge regression using best hyperparameters**


In [23]:
#modelling 

ridge_best = Ridge(alpha=1)
ridge_best.fit(X_train,y_train)

print("Intercept:",ridge_best.intercept_)
print("coefficients:",ridge_best.coef_)

Intercept: -12131.383174500303
coefficients: [  264.4786592   -112.37962155   318.56350557   413.12069122
 23853.85951773]


# step-5:EVALUATION

**evaluation on train data**

In [24]:
ypred_train = ridge_best.predict(X_train)
print("Train R2:",r2_score(y_train,ypred_train))
print("CV score:",cross_val_score(ridge_best,X_train,y_train,cv=5).mean())

Train R2: 0.7593639632162803
CV score: 0.753470595394454


**evaluation on test data**

In [25]:
ypred_test = ridge_best.predict(X_test)
print("Test R2:",r2_score(y_test,ypred_test))

Test R2: 0.7008629672692219


# PREDICTION ON UNKNOWN DATA

**Data**

In [26]:
input_data = {"age":31,
              "sex" :"female",
              "bmi":25.74,
              "children":0,
              "smoker":"no",
              "region":"northeast"}

**step-1: prepeocessing the data**

In [28]:
df_test = pd.DataFrame(input_data,index=[0])

df_test.drop("region",axis=1,inplace=True)
df_test["sex"].replace({"female":0,"male":1},inplace=True)
df_test["smoker"].replace({"no":0,"yes":1},inplace=True)

df_test

Unnamed: 0,age,sex,bmi,children,smoker
0,31,0,25.74,0,0


**step-2: predict**

In [29]:
ridge_best.predict(df_test)

array([4267.27989412])