# Linear Regression from scratch in numpy

We will start of by importing the necessary libraries.

In [1]:
import numpy as np
import pandas as pd
import math
import warnings
warnings.filterwarnings("ignore")

We will use wine-quality dataset for our prediction task, i can be found here :- http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/

In [2]:
df = pd.read_csv('winequality-white.csv',delimiter=';')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


Seperate the data and target columns 

In [3]:
df_data = df.iloc[:,:11]
df_target = df.iloc[:,11:]

In [4]:
df_data.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [5]:
df_target.head()

Unnamed: 0,quality
0,6
1,6
2,6
3,6
4,6


## Concepts revisited :-

For linear regression we have our target variable prediction as :

y = W<sup>T</sup> * x  (bias term is already included in W(parameters) and x<sub>0</sub> is 1) 

Also we will use SGD update:

where W<sup>(i)</sup> = W<sup>(i)</sup> - lr * ((W<sup>T</sup> * x<sup>(i)</sup> - y<sup>(i)</sup>) * (x<sub>j</sub>)<sup>(i)</sup>)   

lr = learning rate

In [6]:
def predict(x,weights):
    new_weights = np.array(weights)
    new_array = x.reshape((x.shape[0],))
    new_weights = new_weights.reshape((new_weights.shape[0],))
    y = np.dot(new_weights,new_array)
    return y

In [7]:
def error(x,weights,y):
    return (predict(x,weights) - y)

In [8]:
def normalize(x):
    for j in range(x.shape[1]):
        for i in range(x.shape[0]):
            x[i,j] = (x[i,j] - np.amin(x[:,j]))/(np.amax(x[:,j])-np.amin(x[:,j]))
            

In [9]:
bias_col = np.full(df_data.shape[0],0.5)
bias_col

array([0.5, 0.5, 0.5, ..., 0.5, 0.5, 0.5])

In [10]:
def data_split(data,ratio=0.8):
    data_train = data[:int(ratio*data.shape[0]),:]
    data_test  = data[int(ratio*data.shape[0]):,:]
    return data_train,data_test

We will use Root Mean Squared error as evaluation metric

In [11]:
def eval_metric(correct_output,predicted_output):
    error_sum = 0.0
    for i in range(len(correct_output)):
        error_i = (correct_output[i] - predicted_output[i])**2
        error_sum += error_i
    mean_error = error_sum/len(correct_output)
    return math.sqrt(mean_error)

for initializing and updating the weights by SGD we define the following function

In [12]:
def get_parameters(train_data,train_target,lr,epochs):
    weights = [0.0 for i in range(train_data.shape[1])]
    for epoch in range(epochs):
        for i in range(train_data.shape[0]):
            y_pred = predict(train_data[i,:],weights)
            y_pred = np.array(y_pred)
            error = y_pred - train_target[i]
            for j in range(train_data.shape[1]):
                weights[j] = weights[j] - lr*error*train_data[i,j]
    weights = np.array(weights)
    return weights            

Now function to return the predictions of our linear model 

In [13]:
def get_predictions(train_data,train_target,test_data,lr,epochs):
    predictions = []
    weights = get_parameters(train_data,train_target,lr,epochs)
    for i in range(test_data.shape[0]):
        y_pred = predict(test_data[i,:],weights)
        predictions.append(y_pred)
    predictions = np.array(predictions)
    return predictions

Function to evaluate our performance on test set.

In [14]:
def evaluation(train_data,train_targets,test_data,test_targets,out_function,lr,epochs):
    error =  eval_metric(test_targets , out_function(train_data,train_targets,test_data,lr,epochs))
    return error

### Now implementation and testing part : 

Converting our pandas dataframe into numpy array

In [15]:
df_data = df_data.as_matrix()
df_target = df_target.as_matrix()

Normalizing data before using it to train our model

In [16]:
normalize(df_data)
normalize(df_target)

In [17]:
df_data

array([[0.30769231, 0.18627451, 0.21686747, ..., 0.25454545, 0.26744186,
        0.12903226],
       [0.43133998, 0.21568627, 0.20481928, ..., 0.85415604, 0.31395349,
        0.66597891],
       [0.56090808, 0.19607843, 0.24096386, ..., 0.84293728, 0.25581395,
        0.7086199 ],
       ...,
       [1.        , 0.21233966, 0.19      , ..., 0.88656453, 0.39588851,
        0.73167006],
       [0.91216216, 0.26415942, 0.3       , ..., 1.        , 0.30639052,
        1.        ],
       [1.        , 0.1812478 , 0.38      , ..., 1.        , 0.23926702,
        1.        ]])

In [18]:
train_data , test_data = data_split(df_data)
train_targets , test_targets = data_split(df_target)

In [19]:
train_data.shape , test_data.shape , train_targets.shape , test_targets.shape

((3918, 11), (980, 11), (3918, 1), (980, 1))

Adding the bias term(Here I am using 0.5) to our dataset

In [20]:
train_data = np.insert(train_data,0,0.5,axis=1)
test_data = np.insert(test_data,0,0.5,axis=1)

In [21]:
train_data.shape , test_data.shape

((3918, 12), (980, 12))

Now we have to choose our hyperparameters such as learning rate and number of epochs

In [22]:
lr = 0.01
epochs = 25

In [23]:
final_error = evaluation(train_data,train_targets,test_data,test_targets,get_predictions,lr,epochs)

In [24]:
print(final_error)

0.18874237856962267
