In [2]:
# Importing all libraries
import pandas as pd
import numpy as np
import math
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
import matplotlib.pyplot as plt

Creating the Class

In [12]:
class LinearRegression:
    """Least-squares linear regression on standardized features.

    The constructor stores the data and immediately splits it into train and
    test partitions. ``fit`` standardizes both partitions with the training
    statistics, prepends a bias column of ones and, when the design matrix
    permits it, solves the normal equations for the weight vector ``w``.

    Attributes set by the methods:
        X, y: the data passed to the constructor.
        X_train, X_test, y_train, y_test: the splits. ``X_train`` and
            ``X_test`` are overwritten by ``normalizetrain`` /
            ``normalizetest`` with their standardized versions, including
            the leading bias column.
        X_mean, X_std: per-feature mean and std of the training split
            (zero std entries are stored as 1.0, see ``normalizetrain``).
        fullrank, lowrank: flags set by ``checkfullrank`` / ``checklowrank``.
        w: weights from ``closedform``; ``None`` until a solution exists.
    """

    # Largest number of training rows for which fit() uses the closed form.
    CLOSED_FORM_MAX_SAMPLES = 10000

    # Class-level defaults so the attributes exist before fit() is called.
    w = None
    _train_normalized = False
    _test_normalized = False

    def __init__(self, X, y, test_size = 0.3, random_state = 0):
        """Store the data and create the train/test splits.

        Args:
            X: feature matrix, one row per sample.
            y: target values, one entry per sample.
            test_size: forwarded to ``train_test_split``.
            random_state: forwarded to ``train_test_split``.
        """
        self.X = X # original X matrix
        self.y = y # original y vector
        # The splits are stored as attributes right away so every other
        # method can rely on them being present.
        self.splitdata(test_size, random_state)

    def splitdata(self, test_size, random_state):
        """Shuffle and split ``self.X`` / ``self.y`` and store the four parts.

        Also marks both partitions as not yet normalized, so a later
        ``fit`` standardizes the fresh splits.
        """
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size = test_size, random_state=random_state, shuffle=True)
        self.X_train = X_train # X train split
        self.X_test = X_test # X test split
        self.y_train = y_train # y train split
        self.y_test = y_test # y test split
        self._train_normalized = False
        self._test_normalized = False
        print("Splits Created and stored as attributes")
        return

    def normalizetrain(self):
        """Standardize ``X_train`` in place and prepend the bias column.

        Stores the training mean and std in ``X_mean`` / ``X_std`` for later
        use by ``normalizetest``. The call is a no-op when the training split
        has already been normalized; normalizing twice would standardize the
        bias column itself and turn it into NaN. Call ``splitdata`` to start
        over from fresh splits.
        """
        if self._train_normalized:
            return
        mean = np.mean(self.X_train, axis=0)
        std = np.std(self.X_train, axis = 0)
        # A constant feature has std 0; dividing by it would yield NaN/inf.
        # Using 1.0 instead maps such a column to all zeros.
        std = np.where(std == 0, 1.0, std)
        self.X_train = (self.X_train-mean)/std # updated X_train
        self.X_mean = mean # mean of X train
        self.X_std = std # std of X train (zeros replaced by 1.0)
        self.X_train = self.add_X0(self.X_train)
        self._train_normalized = True
        return

    def normalizetest(self):
        """Standardize ``X_test`` with the training statistics and add the bias column.

        Requires ``normalizetrain`` to have run first, since it reads
        ``X_mean`` and ``X_std``. The call is a no-op when the test split has
        already been normalized.
        """
        if self._test_normalized:
            return
        self.X_test = (self.X_test - self.X_mean)/self.X_std # updated X_test
        self.X_test = self.add_X0(self.X_test)
        self._test_normalized = True
        return

    def rank(self, X):
        """Return the matrix rank of ``X`` as computed by ``np.linalg.matrix_rank``."""
        return np.linalg.matrix_rank(X)

    # X.T.dot(X) is invertible if X is full rank and n>d

    def checkfullrank(self, X):
        """Set ``self.fullrank`` to whether rank(X) equals the smaller dimension of X."""
        self.fullrank = bool(self.rank(X) == min(X.shape))
        return

    def checklowrank(self, X):
        """Set ``self.lowrank`` to True when X does not have more rows than columns.

        With n rows and d columns, ``lowrank`` is False only for n > d.
        """
        self.lowrank = not (X.shape[0] > X.shape[1])
        return

    def closedform(self):
        """Solve the normal equations on the training split.

        Computes ``w`` from ``(X^T X) w = X^T y`` and stores it in ``self.w``.

        Returns:
            The weight vector ``w``.

        Raises:
            numpy.linalg.LinAlgError: if ``X^T X`` is singular.
        """
        X = self.X_train
        y = self.y_train
        print("Solving for theta using Closed form Equation")
        # Solving the linear system avoids forming the explicit inverse,
        # which is both slower and less accurate numerically.
        self.w = np.linalg.solve(X.T.dot(X), X.T.dot(y))
        return self.w

    def add_X0(self, X):
        """Return ``X`` with a column of ones prepended (the bias term X0)."""
        return np.column_stack([np.ones(X.shape[0]), X])

    def fit(self):
        """Normalize the splits and solve for ``w`` when the closed form applies.

        The closed form is used only when the training design matrix is full
        rank, has more rows than columns, and has at most
        ``CLOSED_FORM_MAX_SAMPLES`` rows. Otherwise a message is printed and
        ``w`` is left unchanged. Calling ``fit`` repeatedly is safe because
        the normalization steps run only once per split.
        """
        self.normalizetrain()
        self.normalizetest()

        # Closed form solution
        self.checkfullrank(self.X_train)
        self.checklowrank(self.X_train)

        closed_form_ok = (
            self.fullrank
            and not self.lowrank
            and self.X_train.shape[0] <= self.CLOSED_FORM_MAX_SAMPLES
        )
        if closed_form_ok:
            self.closedform()
            print(self.w)
        else:
            # Make the skipped solve visible instead of returning silently.
            print("Closed form solution skipped: design matrix is not full rank, "
                  "has too few rows, or exceeds the closed form sample limit")
        return
    



# Done: add_X0 prepends the bias (X0) column of ones.
# Done: fit() applies add_X0 through normalizetrain() and normalizetest().

#### Loading Dataset

In [7]:
# Read the comma-separated housing data into a NumPy array
data = np.genfromtxt('housing.csv', delimiter=',')

# Report the array dimensions as (rows, columns)
print(data.shape)

# Features are every column except the last; the target is the last column
X, y = data[:, :-1], data[:, -1]

(506, 14)


#### Testing

In [13]:
# Build the model with the split settings spelled out (these are the
# constructor defaults), then normalize the splits and solve for the weights.
reg = LinearRegression(X, y, test_size=0.3, random_state=0)
reg.fit()

Splits Created and stored as attributes
Solving for theta using Closed form Equation
[22.74548023 -1.01189299  1.05028731  0.07922238  0.61896311 -1.87367112
  2.70526765 -0.27955798 -3.09763515  2.09690865 -1.88608488 -2.26109811
  0.58264435 -3.44050512]
