In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
df = pd.read_csv("dataset/WineData.csv")
df.head()

Unnamed: 0,ID,Brand,FA,VA,CA,RS,chloride,FSD,TSD,density,pH,sulphate,alcohol
0,1,Seagram,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,2,Seagram,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,3,Seagram,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,4,Sula Vineyards,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,5,Seagram,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [3]:
count = 1
for i in df["Brand"].unique():
    df.loc[(df["Brand"] == i), "Brand"] = count
    count+=1

In [4]:
df["Brand"] = df["Brand"].astype("int8")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        1599 non-null   int64  
 1   Brand     1599 non-null   int8   
 2   FA        1599 non-null   float64
 3   VA        1599 non-null   float64
 4   CA        1599 non-null   float64
 5   RS        1599 non-null   float64
 6   chloride  1599 non-null   float64
 7   FSD       1599 non-null   float64
 8   TSD       1599 non-null   float64
 9   density   1599 non-null   float64
 10  pH        1599 non-null   float64
 11  sulphate  1599 non-null   float64
 12  alcohol   1599 non-null   float64
dtypes: float64(11), int64(1), int8(1)
memory usage: 151.6 KB


In [6]:
df.pop("ID")

0          1
1          2
2          3
3          4
4          5
        ... 
1594    1595
1595    1596
1596    1597
1597    1598
1598    1599
Name: ID, Length: 1599, dtype: int64

In [7]:
df.head()

Unnamed: 0,Brand,FA,VA,CA,RS,chloride,FSD,TSD,density,pH,sulphate,alcohol
0,1,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,1,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,2,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,1,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [8]:
y = np.array(df.pop("alcohol"))
X = np.array(df.copy())

![image.png](attachment:image.png)

In [9]:
from IPython.display import clear_output

In [88]:
class MultiLinearRegressionMBGD:
    def __init__(self, learning_rate=0.0001,epochs=100,batch_size=20):
        self.coef_ = None
        self.intercept_ = None
        self.lr = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
    
    def fit(self,X,y):
        # init your coefs
        self.intercept_ = 0
        self.coef_ = np.ones(X.shape[1])

        
        for i in range(self.epochs):    
            
            for j in range(int(X.shape[0]/self.batch_size)):
                
                idx = np.random.choice(X.shape[0], self.batch_size, replace=False)               
                    
                # update all the coef and the intercept
                y_hat = np.dot(X[idx],self.coef_) + self.intercept_
                
                #print("Shape of y_hat",y_hat.shape)
                intercept_der = -2 * np.mean((y[idx] - y_hat))
                self.intercept_ = self.intercept_ - (self.lr * intercept_der)
                
                coef_der = -2 * np.dot((y[idx] - y_hat),X[idx])/self.batch_size
                self.coef_ = self.coef_ - (self.lr * coef_der)
        
    def predict(self, X):
        y_hat = self.intercept_ + np.dot(X,self.coef_)
        return y_hat.ravel()

In [179]:
from sklearn.linear_model import LinearRegression
mlrMBGD = MultiLinearRegressionMBGD(0.00001,5000,10)
lr = LinearRegression()

In [180]:
%%time
mlrMBGD.fit(X,y)

CPU times: total: 1min 6s
Wall time: 3min 30s


In [181]:
%%time
lr.fit(X,y)

CPU times: total: 15.6 ms
Wall time: 18.3 ms


In [182]:
mlrMBGD.intercept_

0.28370520417182116

In [183]:
y_hat = mlrMBGD.predict(X)
y_hat_lr = lr.predict(X)

In [184]:
y_hat

array([10.20331983,  9.6692037 ,  9.74752802, ..., 10.6884533 ,
       10.60624752, 10.76324571])

In [185]:
y_hat_lr

array([ 9.55277105,  9.51034599,  9.529343  , ..., 10.3180465 ,
       10.60368088, 10.68632127])

In [186]:
from sklearn.metrics import r2_score

r2_score(y,y_hat)

0.1864588412009235

In [187]:
r2_score(y,y_hat_lr)

0.6766614705735856