In [83]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import copy
import math
import scipy
import os

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.datasets import make_regression

np.set_printoptions(precision=4)

In [84]:
# Project directory, built one path component at a time so the separator is
# chosen by the operating system.
# NOTE(review): there is no leading separator, so this path is resolved relative
# to the current working directory, and it embeds a user-specific location —
# confirm whether an absolute or configurable path is intended.
directory = os.path.join('Users',
                         'tylerwhitlock',
                         'Development',
                         'machine_learning_algorithms')

# Machine Learning Algorithms


## Linear Regression

### What is Linear Regression

Linear Regression is a machine learning algorithm, originating in statistics, that seeks to fit a vector of scalar values, called the <b>Dependent Variable</b>, given a matrix of features, known as the <b>Independent Variables</b>.  Another way to think of this is fitting the <b>y values</b> given the <b>x values</b>.

* $$y_{i} = \beta_{0} + \beta_{1} x_{i1} + \cdots + \beta_{p} x_{ip} + \varepsilon_{i} = x_{i}^{T} \beta + \varepsilon_{i}, \qquad i = 1, \ldots, n$$

In this notebook, Linear Regression is trained using a method known as gradient descent.

## Gradient Descent
Gradient Descent is an optimization method for finding the set of weights that, when multiplied by our independent variables, most closely reproduces our dependent variable.

Gradient Descent is an iterative process: at each step we evaluate the Error / Cost function on a given set of examples and adjust the weights in the direction that reduces the error, until we reach the weights that best fit these examples.

## Cost / Error Functions
For Linear Regression we seek a set of weights (coefficients) and a bias (intercept) that, when applied to the independent variables, give us the dependent variable y.

In the case of Linear Regression, we are trying to find the line of best fit through the given training sample; Ordinary Least Squares does this by minimizing the sum of squared differences between the predicted and observed values.  
We can do this using a few methods:

### Error Functions for Linear Regression
* Ordinary Least Squares

* Regularized Regressions:  These prevent collinearity (highly correlated features) from overpowering the model
    * Ridge Regression, a technique that adds regularization, or bias, to the Ordinary Least Squares calculation
    * Lasso Regression, a technique that adds regularization and variable selection to the Least Squares calculation

## Data Loading

In [85]:
# Synthetic regression problem: 1000 samples with 25 features, 15 of which are
# informative. coef=True also returns the generating coefficients, and the
# fixed random_state makes the dataset identical on every run.
n_features = 25
X, y, coefficients = make_regression(
    n_samples=1000,
    n_features=n_features,
    n_informative=15,
    n_targets=1,
    coef=True,
    random_state=1,
)



In [86]:
# Print the generating coefficients as the reference the fitted weights
# should approach.
print("Our expected results of regression should be approximately these given the randomly generated dataset",
      f"\nCoefficients: \n {coefficients}",
      sep="\n")

Our expected results of regression should be approximately these given the randomly generated dataset

Coefficients: 
 [72.6895 26.8971 38.974  21.7287 82.0443  0.     32.2727  0.     76.0003
  0.     10.7963 75.4174  0.     58.2359  0.      0.      0.     86.4786
 28.8514  0.     72.0621 68.1451  0.     80.165   0.    ]


In [93]:
class LinearRegressor(BaseEstimator, RegressorMixin):
    """Linear regression fitted by batch gradient descent.

    The model is ``y_hat = X @ weights + beta`` and the cost being minimised
    is half the mean squared error, ``sum((y_hat - y) ** 2) / (2 * n)``.

    Parameters
    ----------
    num_features : int
        Number of feature columns; sets the length of ``weights``.
    learning_rate : float, default=0.003
        Step size applied to every gradient update.
    num_iterations : int, default=1500
        Number of gradient updates performed by each ``gradient_descent`` /
        ``fit`` call.

    Attributes
    ----------
    weights : ndarray of shape (num_features,)
        Current coefficient estimates, randomly initialised in ``__init__``.
    beta : float
        Current intercept estimate, randomly initialised in ``__init__``.
    record : list
        Cost after every update performed so far. It is never cleared, so it
        keeps growing across repeated ``fit`` calls.

    Notes
    -----
    ``weights`` and ``beta`` are drawn from NumPy's global random state at
    construction and are not re-drawn by ``fit``; call ``np.random.seed``
    before constructing the estimator for reproducible starting values.
    """

    def __init__(self,
                 num_features,
                 learning_rate=0.003,
                 num_iterations=1500):
        # beta is drawn before weights; keep this order so a seeded global RNG
        # yields the same starting point as before.
        self.beta = np.random.random()
        self.num_features = num_features
        self.num_iterations = num_iterations
        self.learning_rate = learning_rate
        self.weights = np.random.random((num_features,))
        self.record = []

    def _residuals(self, X, y):
        """Validate ``X``/``y`` and return ``(X, predictions - y)`` as float arrays."""
        X = np.asarray(X, dtype=float)
        # Flatten so a column-shaped y cannot broadcast against the
        # predictions into an (n, n) matrix.
        y = np.asarray(y, dtype=float).reshape(-1)
        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError(
                "X must be a non-empty 2-D array of shape (n_samples, n_features)")
        if y.shape[0] != X.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} samples but y has {y.shape[0]} values")
        return X, self.predict(X) - y

    def compute_error(self, X, y):
        """Return half the mean squared error of the current parameters.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        float
            ``sum((predict(X) - y) ** 2) / (2 * n_samples)``.

        Raises
        ------
        ValueError
            If ``X`` is empty or not 2-D, or ``y`` has a different length.
        """
        _, residuals = self._residuals(X, y)
        return np.dot(residuals, residuals) / (2 * residuals.shape[0])

    def compute_gradient(self, X, y):
        """Return the gradient of the cost with respect to weights and beta.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        weights_derivative : ndarray of shape (n_features,)
            ``X.T @ (predict(X) - y) / n_samples``.
        beta_derivative : float
            Mean residual, ``sum(predict(X) - y) / n_samples``.

        Raises
        ------
        ValueError
            If ``X`` is empty or not 2-D, or ``y`` has a different length.
        """
        X, residuals = self._residuals(X, y)
        num_examples = X.shape[0]
        # One matrix product replaces the per-sample / per-feature Python loops.
        weights_derivative = X.T @ residuals / num_examples
        beta_derivative = residuals.sum() / num_examples

        return weights_derivative, beta_derivative

    def gradient_descent(self, X, y):
        """Run ``num_iterations`` updates, logging the cost every 100 steps.

        Each update moves ``weights`` and ``beta`` against their gradients by
        ``learning_rate`` and appends the resulting cost to ``record``.

        Returns
        -------
        list
            ``self.record``, including entries from any earlier calls.
        """
        for iter_ct in range(self.num_iterations):
            weight_derivatives, beta_derivative = self.compute_gradient(X, y)
            self.weights = self.weights - (self.learning_rate * weight_derivatives)
            self.beta = self.beta - (self.learning_rate * beta_derivative)
            cost = self.compute_error(X, y)
            self.record.append(cost)

            # Log the cost just computed; indexing record by iter_ct would
            # show stale values once record holds entries from a previous call.
            if iter_ct % 100 == 0:
                print(f"Iteration {iter_ct} : cost {cost}")

        return self.record

    def fit(self, X, y):
        """Fit the model to ``X`` and ``y`` by gradient descent.

        Training continues from the current ``weights`` and ``beta``; they are
        not re-initialised here.

        Returns
        -------
        self : LinearRegressor
            The fitted estimator, so calls can be chained.
        """
        self.gradient_descent(X, y)
        return self

    def predict(self, X):
        """Return ``X @ weights + beta`` for one sample (1-D) or many (2-D)."""
        return np.dot(X, self.weights) + self.beta

In [94]:
# Build the regressor with one weight per feature column.
# NOTE(review): `_` is also the name IPython uses for the last cell output; a
# descriptive name such as `regressor` would be clearer, but it has to be
# renamed in every cell that uses it.
_ = LinearRegressor(n_features)

In [None]:
# Train on the full dataset; the cost is printed every 100 iterations.
_.fit(X, y)

Iteration 0 : cost 24952.903649230284
Iteration 100 : cost 14202.844405788557
Iteration 200 : cost 8152.143970133232
Iteration 300 : cost 4716.218238416051
Iteration 400 : cost 2748.624077563715
Iteration 500 : cost 1612.903878697869
Iteration 600 : cost 952.4681843193124


In [None]:
# `score` is inherited from RegressorMixin (R^2 per scikit-learn's documentation).
# It is evaluated here on the same data used for training, so it measures fit
# quality rather than generalisation.
_.score(X, y)