In [1]:
import time
import pickle

import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error, r2_score

In [2]:
# Read the insurance dataset from the working directory and preview the first rows.
csv_path = 'Prediction Insurance.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28,0,> 2 Years,Yes,40454,26,217,1
1,2,Male,76,1,3,0,1-2 Year,No,33536,26,183,0
2,3,Male,47,1,28,0,> 2 Years,Yes,38294,26,27,1
3,4,Male,21,1,11,1,< 1 Year,No,28619,152,203,0
4,5,Female,29,1,41,1,< 1 Year,No,27496,152,39,0


In [3]:
# Size of the loaded table as a (rows, columns) tuple.
data.shape

(381109, 12)

In [4]:
# Count ids per (Previously_Insured, Response) pair, then spread the Response
# values across the columns to get a small contingency table.
(
    data
    .groupby(['Previously_Insured', 'Response'])[['id']]
    .count()
    .unstack('Response')
)

Unnamed: 0_level_0,id,id
Response,0,1
Previously_Insured,Unnamed: 1_level_2,Unnamed: 2_level_2
0,159929,46552
1,174470,158


In [5]:
# Count ids per (Gender, Response) pair, then spread the Response values
# across the columns to get a small contingency table.
(
    data
    .groupby(['Gender', 'Response'])[['id']]
    .count()
    .unstack('Response')
)

Unnamed: 0_level_0,id,id
Response,0,1
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2
Female,156835,18185
Male,177564,28525


In [6]:
# Count ids per (Driving_License, Response) pair, then spread the Response
# values across the columns to get a small contingency table.
(
    data
    .groupby(['Driving_License', 'Response'])[['id']]
    .count()
    .unstack('Response')
)

Unnamed: 0_level_0,id,id
Response,0,1
Driving_License,Unnamed: 1_level_2,Unnamed: 2_level_2
0,771,41
1,333628,46669


In [7]:
# Keep only the three predictors used for modelling, plus the target column.
model_columns = ['Gender','Driving_License','Previously_Insured','Response']
data = data[model_columns]
data.head()

Unnamed: 0,Gender,Driving_License,Previously_Insured,Response
0,Male,1,0,1
1,Male,1,0,0
2,Male,1,0,1
3,Male,1,1,0
4,Female,1,1,0


In [8]:
# Encode Gender as 0/1 so it can be used as a numeric feature.
# - `assign` builds a new frame instead of writing a column into `data`, which
#   the previous cell created as a column selection of the loaded table;
#   assigning a column on such a selection can raise SettingWithCopyWarning.
# - The dtype guard makes the cell safe to re-run: mapping the already-encoded
#   integers a second time would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(data['Gender']):
    data = data.assign(Gender=data['Gender'].map({'Male':0, 'Female':1}))
data.head()

Unnamed: 0,Gender,Driving_License,Previously_Insured,Response
0,0,1,0,1
1,0,1,0,0
2,0,1,0,1
3,0,1,1,0
4,1,1,1,0


In [9]:
# Separate the predictors from the target column.
x = data.drop('Response', axis=1)
y = data['Response']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=0)

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring short intervals; time.time() is wall-clock time and can jump if the
# system clock is adjusted, which matters for a fit that takes milliseconds.
start = time.perf_counter()
model = LinearRegression()

model.fit(x_train, y_train)
stop = time.perf_counter()
print(f"Training Time {stop-start} Seconds...")

Training Time 0.012679815292358398 Seconds...


In [10]:
# Persist the fitted model to disk in pickle format.
with open('lr_model.pkl', mode='wb') as model_file:
    model_file.write(pickle.dumps(model))

In [11]:
# Score the held-out rows. The model is a LinearRegression, so the predictions
# are continuous values rather than 0/1 class labels.
y_predict = model.predict(x_test)

In [12]:
# Regression error metrics on the held-out split.
mae = mean_absolute_error(y_test, y_predict)
mse = mean_squared_error(y_test, y_predict)
rmse = root_mean_squared_error(y_test, y_predict)
r2 = r2_score(y_test, y_predict)

# Use a dedicated name for the report contents: binding this dict to `data`
# would replace the feature DataFrame of the same name, so re-running any
# earlier cell that reads `data` would fail until the notebook is re-executed
# from the top.
report_rows = {
    "Metric": ["Mean Absolute Error (MAE)", "Mean Squared Error (MSE)", "Root Mean Squared Error (RMSE)", "R² Score"],
    "Value": [mae, mse, rmse, r2]
}
report_df = pd.DataFrame(report_rows)
report_df.index = report_df.index + 1  # number the report rows from 1 instead of 0

report_df

Unnamed: 0,Metric,Value
1,Mean Absolute Error (MAE),0.193403
2,Mean Squared Error (MSE),0.095141
3,Root Mean Squared Error (RMSE),0.308449
4,R² Score,0.118067
