# Linear Regression model

#### Model Setup emulating scikit-learn object but just using `numpy` and `pandas` packages

In [52]:
# Import packages & setup
import os
import numpy as np
import pandas as pd
from abc import ABC, abstractmethod

# Seed NumPy's global random generator so the random data below is reproducible.
# The seeding function must be CALLED; assigning to it would only replace the
# function with an integer and seed nothing.
np.random.seed(0)

In [53]:
# Build a synthetic data set: design matrix X, binary labels y and a random
# parameter vector theta with one entry per column of X.
has_intercept = True
observations = 1000
features = 10

# Uniform draws scaled by 10 and rounded to 2 decimals, so 0 <= value <= 10
# (rounding can produce exactly 10.0).
rand_data = np.around(np.random.rand(observations, features) * 10, 2)
X = pd.DataFrame(
    rand_data,
    columns=[f'feature{i + 1}' for i in range(features)],
)
if has_intercept:
    # Prepend a constant column of ones as the dummy (intercept) feature.
    X.insert(0, 'feature0', np.ones(observations))
y = pd.Series(np.random.randint(2, size=observations))  # labels drawn from {0, 1}
theta = pd.Series(np.random.rand(X.shape[1]))

print(f'<Random Data Generated>\n(Sample Size, Features) = {observations, features}')
X.head()

<Random Data Generated>
(Sample Size, Features) = (1000, 10)


Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10
0,1.0,9.44,0.86,2.32,1.54,4.88,1.44,8.14,3.68,7.18,8.53
1,1.0,8.74,3.23,8.84,6.34,1.66,1.13,9.13,9.84,6.34,4.89
2,1.0,8.36,4.18,1.77,8.73,0.56,0.59,1.43,7.39,2.06,2.84
3,1.0,7.41,3.16,8.47,8.58,0.28,3.54,3.95,10.0,3.62,2.52
4,1.0,3.36,6.22,2.38,5.09,0.28,6.35,5.94,8.62,3.92,3.25


In [54]:
# Set up abstract base classes (interfaces) for estimators and predictors
class Estimator(ABC):
    """
    Abstract interface for objects that learn parameters from data.

    Both `__init__` and `fit` are abstract, so a concrete subclass has to
    define each of them; the `__init__` body here is still reachable through
    `super().__init__(normalize)`.

    Attributes:
        _normalize (Boolean): flag stored from the constructor argument
        _coefs (Numpy.Array): parameter array; starts out empty
    """

    @abstractmethod
    def __init__(self, normalize=False):
        """
        Initialises the state shared by every estimator.

        Parameters:
            normalize (Boolean): value kept on the instance as `_normalize` (default=False)
        """
        self._coefs = np.array([])  # empty until a subclass stores fitted values
        self._normalize = normalize

    @abstractmethod
    def fit(self, design_matrix, labels):
        """Abstract hook; the base implementation does nothing and returns None."""

    def score(self, design_matrix, labels):
        """Optional hook; does nothing and returns None unless overridden."""
        return None
    
class Predictor(ABC):
    """Abstract interface for objects that turn a design matrix into predictions."""

    @abstractmethod
    def predict(self, design_matrix):
        """Abstract hook; the base implementation does nothing and returns None."""

# Setup helper Hypothesis class
class _Hypothesis:
    """
    Helper that evaluates a named hypothesis function h(X, theta).

    Attributes:
        models (dict): class-level registry mapping a model name to a callable
            that takes (design_matrix, parameters)
        model (str): name of the hypothesis function selected for this instance
    """

    models = {
        'linear': lambda X, theta: np.dot(X, theta),
        'logistic': lambda X, theta: 1 / (1 + np.exp(- np.dot(X, theta)))
    }

    def __init__(self, model):
        """
        Constructor for the _Hypothesis helper.

        Parameters:
            model (str): key into `models`; it is validated lazily in apply_model
        """
        self.model = model

    def apply_model(self, design_matrix, parameters):
        """
        Evaluates the selected hypothesis function.

        Parameters:
            design_matrix: data (X) handed to the hypothesis function
            parameters: parameter vector (theta) handed to the hypothesis function
        Returns:
            whatever the selected hypothesis function returns for (X, theta)
        Raises:
            KeyError: if `self.model` is not a key of `models`
        """
        # The registry is a class attribute, so it must be reached through
        # self (a bare `models` is not visible from inside a method).
        try:
            hypothesis_func = self.models[self.model]
        except (KeyError, TypeError):
            # TypeError covers unhashable model names; both mean "unknown model".
            raise KeyError(
                f'Please use one of the following: {tuple(self.models.keys())}'
            ) from None
        # Kept outside the try block so that errors raised by the computation
        # itself (e.g. shape mismatches) are not misreported as a bad model name.
        return hypothesis_func(design_matrix, parameters)


In [55]:
# Smoke-test the Hypothesis helper: one valid model name, one invalid one.
# NOTE: this rebinds the notebook-level names X and theta to small arrays.
X, theta = np.array([[1, 2], [-2, 3]]), np.array([1, 2])

# Valid name: prints the linear hypothesis X . theta
h = _Hypothesis('linear')
print(h.apply_model(X, theta))

# Invalid name: apply_model is expected to raise KeyError
e = _Hypothesis('mispelled')
try:
    e.apply_model(X, theta)
except KeyError:
    print('Whoops, mispelled the hypothesis function!')

[5 4]
Whoops, mispelled the hypothesis function!


In [56]:
class LinearRegression(Estimator, Predictor):
    """
    This is a class to mimic the functionality from sklearn.linear_model.LinearRegression.

    Attributes:
        _coefs (Numpy.Array): model coefficients (Thetas), excluding the intercept
        _intercept (Float): model intercept term (Theta0); is None if fit_intercept=False
        _fit_intercept (Boolean): flag to determine if model should have intercept term fitted (default=True)
        _normalize (Boolean): flag to determine if data should be normalized (default=False)
    """

    @staticmethod
    def get_cost_function():
        """
        Builds the registry of available cost functions.

        Returns:
            dict: maps a cost name to a callable (X, y, theta, model='linear');
                the only entry is 'mse', the mean squared difference between y
                and the hypothesis output for (X, theta)
        """
        cost_functions = {
            'mse': lambda X, y, theta, model='linear': np.mean((y - _Hypothesis(model).apply_model(X, theta)) ** 2)
        }
        return cost_functions

    @staticmethod
    def _standardize(values):
        """Centers values on their mean and scales them by their standard deviation."""
        spread = np.std(values)
        # A constant column has zero spread; dividing by it would turn the
        # whole column into NaN, so such columns are only centered.
        return (values - np.mean(values)) / (spread if spread else 1.0)

    def __init__(self, normalize=False, fit_intercept=True):
        """
        Constructor for LinearRegression class.

        Parameters:
            normalize (Boolean): flag to determine if data should be normalized (default=False)
            fit_intercept (Boolean): flag to determine if model should have intercept term fitted (default=True)
        """
        super().__init__(normalize)
        self._fit_intercept = fit_intercept
        # 0.0 until fit() stores a fitted value; None when no intercept is modelled
        self._intercept = float() if self._fit_intercept else None

    def fit(self, design_matrix, labels):
        """
        Estimates the parameters (theta) of the linear regression model using Gradient Descent.

        Parameters:
            design_matrix (pandas.DataFrame): design matrix (X)
            labels (pandas.Series): target array (y)
        Returns:
            self
        """
        # TODO fix logic to see whether features should handle dummy column here or in utility function
        if self._normalize:
            # Work on a copy so the caller's DataFrame is not modified in place.
            design_matrix = design_matrix.copy()
            for col in design_matrix.columns:
                design_matrix[col] = self._standardize(design_matrix[col])
            labels = self._standardize(labels)
        num_of_features = len(design_matrix.columns)
        degrees_freedom = num_of_features + 1 if self._fit_intercept else num_of_features
        initial_params = np.random.rand(degrees_freedom)
        # NOTE(review): gradient_descent is not defined in this class; it is
        # presumably a module-level helper returning a parameter vector of the
        # same length as initial_params with Theta0 first -- confirm.
        fitted_params = gradient_descent(design_matrix, labels, initial_params)
        if self._fit_intercept:
            self._intercept = fitted_params[0]
            self._coefs = fitted_params[1:]
        else:
            # No intercept slot was requested, so every entry is a coefficient.
            self._coefs = fitted_params

        return self

    def predict(self, design_matrix):
        """
        Constructs prediction array (y hat) using the linear hypothesis helper.

        Parameters:
            design_matrix (pandas.DataFrame): design matrix (X), without a dummy column
        Returns:
            array of predicted labels: X . coefficients, plus the intercept when one is modelled
        """
        hyp = _Hypothesis('linear')
        # _coefs already excludes the intercept, so it is used unsliced.
        predictions = hyp.apply_model(design_matrix, self._coefs)
        if self._fit_intercept:
            predictions = predictions + self._intercept
        return predictions

    def score(self, design_matrix, labels):
        """
        Calculates R^2 value from data (X) & labels (y)

        Parameters:
            design_matrix (pandas.DataFrame): design matrix (X)
            labels (pandas.Series): target array (y)
        Returns:
            float: R-squared value of model, 1 - MSE / Var(y); at most 1 and
                negative when the model fits worse than predicting the mean of y
        """
        # Residuals come from predict() so the intercept is part of the score.
        residuals = labels - self.predict(design_matrix)
        mse = np.mean(residuals ** 2)
        r_squared = 1 - mse / np.var(labels)
        return r_squared

    def get_parameters(self):
        """
        Obtains parameters array of Linear Regression model.

        Returns:
            numpy.Array: coefficients of model (the stored `_coefs`; intercept not included)
        """
        return self._coefs

    def set_parameters(self, parameters):
        """
        Sets parameters array of Linear Regression model.

        Parameters:
            parameters (numpy.Array): coefficients of model, stored as `_coefs`;
                the intercept is left untouched
        """
        self._coefs = parameters


In [1]:
class ModifiedMetrics:
    """
    Mixin providing goodness-of-fit statistics.

    The methods read `self.data`, `self.target`, `self._dfe`, `self._dft` and
    call `self.predict`, none of which are defined here, so the host class has
    to supply them. Every metric also stores its latest value on the instance.
    """

    def sse(self):
        """Return the sum of squared errors (model vs actual); stored as `sq_error_`."""
        residuals = self.target - self.predict(self.data)
        self.sq_error_ = np.sum(residuals ** 2)
        return self.sq_error_

    def sst(self):
        """Return the total sum of squares (actual vs avg(actual)); stored as `sst_`."""
        deviations = self.target - np.mean(self.target)
        self.sst_ = np.sum(deviations ** 2)
        return self.sst_

    def r_squared(self):
        """Return r^2 = 1 - sse / sst; stored as `r_sq_`."""
        unexplained_share = self.sse() / self.sst()
        self.r_sq_ = 1 - unexplained_share
        return self.r_sq_

    def adj_r_squared(self):
        """Return adjusted r^2 = 1 - (sse / _dfe) / (sst / _dft); stored as `adj_r_sq_`."""
        error_variance = self.sse() / self._dfe
        total_variance = self.sst() / self._dft
        self.adj_r_sq_ = 1 - error_variance / total_variance
        return self.adj_r_sq_

    def mse(self):
        """Return the mean squared error of the predictions; stored as `mse_`."""
        errors = self.predict(self.data) - self.target
        self.mse_ = np.mean(errors ** 2)
        return self.mse_

    def pretty_print_stats(self):
        """Print one line per statistic (label padded to 8 chars, value with 4 decimals)."""
        report = (
            ('sse:', self.sse()),
            ('sst:', self.sst()),
            ('mse:', self.mse()),
            ('r^2:', self.r_squared()),
            ('adj_r^2:', self.adj_r_squared()),
        )
        for label, value in report:
            print('{0:8} {1:.4f}'.format(label, value))


In [2]:
class MyLinearRegressionWithInheritance(ModifiedMetrics):
    """
    Least-squares linear regression solved through the normal equations.

    Attributes:
        coef_: fitted feature coefficients (None before fit)
        intercept_: fitted intercept; 0 when fit_intercept=False (None before fit)
        _fit_intercept (bool): whether a bias column is added during fit
        data, target: the X and y passed to the last fit call
        _dft, _dfe: degrees of freedom read by the inherited adj_r_squared
    """

    def __init__(self, fit_intercept=True):
        """
        Create an unfitted model.

        Arguments:
        fit_intercept: if True, fit() adds a column of ones to estimate a bias
        """
        self.coef_ = None
        self.intercept_ = None
        self._fit_intercept = fit_intercept

    def fit(self, X, y):
        """
        Fit model coefficients.

        Arguments:
        X: 1D or 2D numpy array
        y: 1D numpy array

        Returns:
        self, so calls can be chained

        Raises:
        numpy.linalg.LinAlgError: if X^T X is singular
        """

        # training data & ground truth data (kept as passed; predict() handles 1D input)
        self.data = X
        self.target = y

        # Promote 1D input to a single-feature column BEFORE reading shape[1];
        # a 1D array has no second dimension.
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)

        # degrees of freedom population dep. variable variance
        self._dft = X.shape[0] - 1
        # degrees of freedom population error variance
        # NOTE(review): the "- 1" is applied even when fit_intercept=False -- confirm intent
        self._dfe = X.shape[0] - X.shape[1] - 1

        # add bias if fit_intercept
        if self._fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]

        # Closed form solution of the normal equations (X^T X) coef = X^T y.
        # Solving the system directly avoids forming the explicit inverse.
        xTx = np.dot(X.T, X)
        xTy = np.dot(X.T, y)
        coef = np.linalg.solve(xTx, xTy)

        # set attributes
        if self._fit_intercept:
            self.intercept_ = coef[0]
            self.coef_ = coef[1:]
        else:
            self.intercept_ = 0
            self.coef_ = coef

        return self

    def predict(self, X):
        """Output model prediction.

        Arguments:
        X: 1D or 2D numpy array

        Returns:
        intercept_ + X . coef_
        """
        # check if X is 1D or 2D array
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)
        return self.intercept_ + np.dot(X, self.coef_)
