# Linear Regression


In [60]:
import seaborn as sns
import pandas as pd
import numpy as np
import random as r
import joblib as j

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

from typing import List

In [None]:
# load the data
data_df = sns.load_dataset('tips')
data_df.info()

# prepare the data
x = data_df[['total_bill']]
y = data_df['tip']

# check the shape of the data. 
# Anything that's being plotted on X-axis should be 2D
print(f"X: {x.shape} \nY: {y.shape}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB
X: (244, 1) 
Y: (244,)


In [62]:
# generate a radom SEED value
SEED = r.randint(0,5000)

# split the training and testing data in 90-10
xtrain, xtest, ytrain, ytest = train_test_split(x,y,train_size=0.8,random_state=SEED)

# build the model
model = LinearRegression()

# fit the model
model.fit(xtrain, ytrain)

# check the incercept(m) and coefficient(c) of the line(y = mx + c)
print(f"m: {model.intercept_} \nc: {model.coef_}")


m: 0.955768398178066 
c: [0.10255273]


In [63]:
# predit the Y values from the test set
y_pred = model.predict(xtest)

# calculate the accuracy using R2-score
r2_result = r2_score(ytest, y_pred)

print(f"R2-score: {r2_result}")

R2-score: 0.4795654998797383


## Store the fine model


In [64]:
# # load the data

# data = pd.read_csv('./discounts.csv')
# data.info()

# x = data[["Sales"]]
# y = data['Discount Percentage']

In [69]:
accuracy_list = []
models = []


for i in range(1000):
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=i)
    fine_model = LinearRegression()
    fine_model.fit(x_train, y_train)
    
    y_pred = model.predict(x_test)
    r2_result = r2_score(y_test, y_pred)
    
    accuracy_list.append(r2_result)
    models.append(fine_model)
    

In [70]:
# max accuracy in the fitted models
max_accuracy = np.max(accuracy_list)

# to find the index of the max value
modelIdx = np.argmax(accuracy_list)
    
print(f"accuracy list: {accuracy_list}")
print(f"Max accuracy of the fitted model: {max_accuracy}")
print(f"Model Idx with max accuracy: {modelIdx}")

accuracy list: [0.618117283521732, 0.5594223496301813, 0.5965624730787178, 0.5093777281254309, 0.42401782961353374, 0.060230846869993804, 0.5025429536779024, 0.4818139689313783, 0.3516905344477679, 0.5284529496886918, 0.40586637549319604, 0.6220293734407426, 0.4481568100029678, 0.5125741090374887, 0.5170946976212507, 0.4957628784883631, 0.5642122500118025, 0.32993758328451006, 0.5221991644551507, 0.5626707906522497, 0.42847003451834476, 0.3923541290115504, 0.21264115056284483, 0.5784389140496988, 0.3112382377267313, 0.5111339631317372, 0.1673453372046838, 0.4310471737295879, 0.33889009439486817, 0.2590253990035086, 0.5799453501737346, 0.35662666360278505, 0.2512236294202578, 0.5979564951022212, 0.6725802018343601, 0.3597239946037333, 0.0515879167618718, 0.1399906915375968, 0.6081202013777547, 0.5397581055095839, 0.5346538274757792, 0.41742410645916994, 0.5666552408980206, 0.27354307728928995, 0.44207822738791414, 0.3808382362085505, 0.5488653010181832, 0.32013224096285975, 0.3120031752

In [66]:
# save the model

j.dump(models[modelIdx], "fine-model.pkl")

['fine-model.pkl']

In [67]:
# prdict from saved model

trained_model = j.load("./fine-model.pkl")

trained_model.predict([[45]])



array([5.45508602])