In [1]:
import numpy as np
import pandas as pd

In [2]:
# Sample data, one (x, y) pair per row:
# x = number of correct answers on the test, y = attitude score.
dataset = pd.DataFrame(
    [
        (17, 94),
        (13, 73),
        (12, 59),
        (15, 80),
        (16, 93),
        (14, 85),
        (16, 66),
        (16, 79),
        (18, 77),
        (19, 91),
    ],
    columns=["# correct (x)", "attitude (y)"],
)

dataset.head()

Unnamed: 0,# correct (x),attitude (y)
0,17,94
1,13,73
2,12,59
3,15,80
4,16,93


---

## Given the number of correct answers on a test, predict the "testing attitude" score using Linear Regression

Dataset and algorithm from: https://www.youtube.com/watch?v=GhrxgbQnEEU

Linear Regression Function: 
$y = a + bx$  

Slope ($b$) of the Regression Line: 
$b = r\frac{S_{y}}{S_{x}}$

Y-Intercept ($a$) of the Regression Line:
$a = \bar{y} - b\bar{x}$

---

## First, the slope!

The slope is needed to calculate both the y-intercept and y. 
It would be best to calculate this first.

Slope ($b$) of the Regression Line: 
$b = r\frac{S_{y}}{S_{x}}$

But to calculate the slope, we first need two other quantities: 
the _Pearson Correlation Coefficient ($r$)_ and the standard deviations of $X$ and $Y$ ($S_{x}$ and $S_{y}$).

---

## So, Pearson's Correlation Coefficient Formula

$r = \frac {\sum \limits _{i} (x _{i} - \bar{x})(y _{i} - \bar{y})}{\sqrt{\sum \limits _{i} (x _{i} - \bar{x}) ^2}\sqrt{\sum \limits _{i} (y _{i} - \bar{y}) ^2}}$

In [3]:
def pearsons_correlation_coef(x: pd.Series, y: pd.Series):
    """Return Pearson's correlation coefficient r for two paired Series.

    r is the sum of the products of the paired deviations from the mean,
    divided by the product of the square roots of each variable's summed
    squared deviations.

    Parameters
    ----------
    x, y : pd.Series
        Paired observations; they are combined element-wise.

    Returns
    -------
    The correlation coefficient r.
    """
    # Deviations of every observation from its own mean.
    x_dev = x - x.mean()
    y_dev = y - y.mean()

    co_deviation = np.sum(x_dev * y_dev)
    scale = np.sqrt(np.sum(x_dev**2)) * np.sqrt(np.sum(y_dev**2))

    return co_deviation / scale

In [4]:
# Pearson's r between the number of correct answers (x) and the attitude score (y).
pearsons_correlation_coef(dataset["# correct (x)"], dataset["attitude (y)"])

0.5960947613894623

---

## Standard Deviation Function

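Sample standard deviation (divides by $n-1$; this is the $S_{x}$ and $S_{y}$ used in the slope formula):

$S = \sqrt{\frac{\sum (x_{i}-\bar{x})^2 }{n-1}}$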

In [5]:
def std(s: pd.Series, ddof: int = 1):
    """Return the standard deviation of ``s``.

    Parameters
    ----------
    s : pd.Series
        Numeric observations.
    ddof : int, default 1
        Delta degrees of freedom: the sum of squared deviations is divided
        by ``len(s) - ddof``. The default of 1 gives the sample standard
        deviation; pass 0 for the population standard deviation.

    Returns
    -------
    The standard deviation, or NaN when ``len(s) - ddof`` is not positive.
    """
    dof = len(s) - ddof
    if dof <= 0:
        # Too few observations for the requested ddof: report NaN rather
        # than dividing by zero or by a negative number.
        return np.nan

    squared_deviations = (s - s.mean()) ** 2
    return np.sqrt(np.sum(squared_deviations) / dof)

In [6]:
# Sample standard deviation of the number of correct answers (x).
std(dataset["# correct (x)"])

2.1705094128132942

---

## Slope Function

$b = r\frac{S_{y}}{S_{x}}$

In [7]:
def slope(x, y):
    """Return the slope b of the regression line, b = r * (S_y / S_x).

    ``x`` and ``y`` are the paired observations; r comes from
    ``pearsons_correlation_coef`` and the standard deviations from ``std``.
    """
    correlation = pearsons_correlation_coef(x, y)
    spread_ratio = std(y) / std(x)

    return correlation * spread_ratio

In [8]:
# Slope of the regression line of attitude (y) on number of correct answers (x).
slope(dataset["# correct (x)"], dataset["attitude (y)"])

3.1792452830188673

---

## Y-Intercept

$a = \bar{y} - b\bar{x}$

In [9]:
def y_intercept(x, y):
    """Return the y-intercept a of the regression line, a = mean(y) - b * mean(x).

    The slope b is obtained from ``slope(x, y)``.
    """
    y_bar = y.mean()
    b = slope(x, y)

    return y_bar - (b * x.mean())

In [10]:
# Y-intercept of the regression line of attitude (y) on number of correct answers (x).
y_intercept(dataset["# correct (x)"], dataset["attitude (y)"])

30.103773584905674

---

## Linear Regression Function!

$y = a + bx$

In [11]:
class LinearRegression:
    """Straight-line model y = a + b*x with a pre-computed intercept and slope."""

    def __init__(self, y_intercept, slope):
        # a (intercept) and b (slope) of y = a + b*x, stored exactly as given.
        self._y_intercept, self._slope = y_intercept, slope

    def predict(self, x):
        """Return the predicted y for the input ``x``."""
        scaled_x = self._slope * x
        return self._y_intercept + scaled_x

In [12]:
# Build the model from the intercept and slope computed on the sample data.
clf = LinearRegression(
    y_intercept=y_intercept(dataset["# correct (x)"], dataset["attitude (y)"]),
    slope=slope(dataset["# correct (x)"], dataset["attitude (y)"]),
)

In [13]:
# Predicted attitude score for 14 correct answers.
clf.predict(14)

74.61320754716982

---

## Cleaned Up

In [14]:
class Variable:
    """A named Series whose size, mean and standard deviation are computed lazily.

    Each statistic is calculated on first access, announced with a printed
    message, and cached on the instance for later accesses.
    """

    def __init__(self, s: pd.Series, name):
        self.data = s
        self.name = name

        # Cache slots; None means "not computed yet".
        self._mu = None
        self._std = None
        self._size = None

    @property
    def size(self):
        """Number of observations in ``data``."""
        if self._size is not None:
            return self._size

        print(f"Calculating size for {self.name}..")
        self._size = len(self.data)
        return self._size

    @property
    def mu(self):
        """Mean of ``data``."""
        if self._mu is not None:
            return self._mu

        print(f"Calculating mu for {self.name}..")
        self._mu = self.data.mean()
        return self._mu

    @property
    def std(self):
        """Standard deviation of ``data`` (divides by ``size - 1``)."""
        if self._std is not None:
            return self._std

        print(f"Calculating std for {self.name}..")
        self._std = self._calc_std()
        return self._std

    def _calc_std(self):
        # Reads `mu` before `size`, so on a fresh instance the mean is computed first.
        squared_deviations = (self.data - self.mu) ** 2
        return np.sqrt(np.sum(squared_deviations) / (self.size - 1))
    
    
def pearsons_correlation_coef(
    x: Variable, 
    y: Variable
):
    """Return Pearson's correlation coefficient r between two ``Variable`` objects.

    Parameters
    ----------
    x, y : Variable
        Paired observations. Only ``.data`` (the wrapped Series) and ``.mu``
        (its mean) are read.

    Returns
    -------
    The correlation coefficient r.
    """
    # NOTE(review): this name is defined again further down in the same cell;
    # the later definition is the one bound when the cell finishes running.

    # The arithmetic has to run on the wrapped Series (`.data`): `Variable`
    # defines no subtraction, so `x - x.mu` would raise a TypeError.
    x_dev = x.data - x.mu
    y_dev = y.data - y.mu

    numerator = np.sum(x_dev * y_dev)
    denominator = np.sqrt(np.sum(x_dev**2)) * np.sqrt(np.sum(y_dev**2))

    return numerator / denominator
    
    
class SimpleLinearRegression:
    """Regression line y = a + b*x whose parameters are computed lazily from x and y.

    The slope and y-intercept are calculated the first time they are needed,
    announced with a printed message, and cached on the instance.
    """

    def __init__(self, x, y):
        self._x = Variable(x, name="X")
        self._y = Variable(y, name="Y")

        # Fitted parameters; None until first requested.
        self._y_intercept = None
        self._slope = None

    @property
    def y_intercept(self):
        """Intercept a = mean(y) - b * mean(x)."""
        if self._y_intercept is not None:
            return self._y_intercept

        print("Calculating y_intercept..")
        self._y_intercept = self._calc_y_intercept()
        return self._y_intercept

    @property
    def slope(self):
        """Slope b = r * (S_y / S_x)."""
        if self._slope is not None:
            return self._slope

        print("Calculating slope..")
        self._slope = self._calc_slope()
        return self._slope

    def predict(self, x):
        """Return the predicted y for the input ``x``."""
        # The intercept is read before the slope.
        intercept = self.y_intercept
        return intercept + (self.slope * x)

    def _calc_y_intercept(self):
        # Reads, in order: mean of y, the slope, mean of x.
        y_mean = self._y.mu
        scaled_x_mean = self.slope * self._x.mu
        return y_mean - scaled_x_mean

    def _calc_slope(self):
        correlation = pearsons_correlation_coef(self._x, self._y)
        std_ratio = self._y.std / self._x.std
        return correlation * std_ratio
    
    
def pearsons_correlation_coef(x: Variable, y: Variable):
    """Return Pearson's correlation coefficient r between two ``Variable`` objects.

    Reads each variable's wrapped Series (``.data``) and its mean (``.mu``).
    """
    # NOTE(review): this rebinds the name of the Series-based function defined
    # earlier in the notebook, so the earlier module-level `slope` helper (and
    # `y_intercept`, which calls it) will fail on plain Series once this cell
    # has run.
    x_dev = x.data - x.mu
    y_dev = y.data - y.mu

    co_deviation = np.sum(x_dev * y_dev)
    scale = np.sqrt(np.sum(x_dev**2)) * np.sqrt(np.sum(y_dev**2))

    return co_deviation / scale

In [15]:
# Construction only wraps the two columns; the slope and intercept are
# computed later, on first use.
clf = SimpleLinearRegression(x=dataset["# correct (x)"], y=dataset["attitude (y)"])

In [16]:
# First prediction: triggers the lazy calculation of the intercept and slope.
clf.predict(14)

Calculating y_intercept..
Calculating mu for Y..
Calculating slope..
Calculating mu for X..
Calculating std for Y..
Calculating size for Y..
Calculating std for X..
Calculating size for X..


74.61320754716982

In [17]:
# Second prediction: reuses the cached intercept and slope, so nothing is recalculated.
clf.predict(14.5)

76.20283018867926