In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import t, f, norm
# Show 5 digits of floating-point precision when NumPy arrays are displayed
# (this changes the printed representation only, not the stored values).
np.set_printoptions(precision= 5)

In [5]:
class SLR():
    """
    Author: Lucius Vo <https://github.com/vohuynhquangnguyen>
    Construct a simple linear regression (SLR) model and conduct required estimations and hypothesis test.
    Methods of estimating the model's parameters and hypothesis test are based on Mendenhall and Sincich (2013, p.96-165).

    Typical usage:
        slr = SLR()
        slr.load_data("data.csv", "x_column", "y_column")
        slr.run_regression_analysis()

    The individual stages can also be called by hand, in this order:
    fit() -> conduct_statistical_inference() -> compute_intervals() -> visualize().
    Every stage stores its results as attributes on the instance (e.g. B0hat, B1hat,
    SE_B1hat, Rsquare, r, F, pval_F, B1hat_CI_lwr, ...).

    References:
    1. Mendenhall, William, Sincich, Terry T. A Second Course in Statistics - Regression Analysis (7th Edition). Pearson, 2013
    2. Montgomery, Douglas C., Runger, George C. Applied Statistics and Probability for Engineer (7th Edition). Wiley, 2018
    """
    def __init__(self) -> None:
        super().__init__()
        # Data are attached later by load_data() (or by assigning X / Y directly).
        self.X = None
        self.Y = None
        # Axis/report labels; load_data() replaces them with the CSV column names.
        self.X_header = "X"
        self.Y_header = "Y"
        # Confidence level last used for inference / intervals (shown in the report).
        self.level = 0.95

    @staticmethod
    def _validate_level(level: float) -> None:
        """Raise ValueError unless `level` is a confidence level strictly between 0 and 1."""
        if not 0.0 < level < 1.0:
            raise ValueError(f"level must be strictly between 0 and 1, got {level!r}")

    @staticmethod
    def _scott_bins(values) -> int:
        """Number of histogram bins from Scott's rule (bin width 3.49 * s * n^(-1/3)), never below 1."""
        values = np.asarray(values, dtype=float)
        spread = np.max(values) - np.min(values)
        sd = np.std(values, ddof=1)
        if not np.isfinite(sd) or sd == 0 or spread == 0:
            return 1
        return max(1, int(spread * np.power(values.size, 1/3) / (3.49 * sd)))

    def load_data(self, src: str, X_header: str, Y_header: str):
        """
        Load the dataset from a .csv file using Pandas file handler.

        Parameters:
            src: path (or any source accepted by pandas.read_csv) of the .csv file.
            X_header: name of the column holding the independent variable.
            Y_header: name of the column holding the dependent variable.

        Raises:
            FileNotFoundError: propagated from pandas when `src` does not exist.
            KeyError: propagated from pandas when a column name is missing.
        """
        df = pd.read_csv(src)
        # Remember the column names: the plots and the printed report label things with them.
        self.X_header = X_header
        self.Y_header = Y_header
        self.X = np.asarray(df[X_header], dtype=float)
        self.Y = np.asarray(df[Y_header], dtype=float)
        self.n = len(self.Y)

    def fit(self):
        """
        Compute the parameters of the fitted model using the ordinary least square (OLS) method.

        Sets: n, Xbar, Ybar, SS_xx, SS_yy, SS_xy, B1hat (slope), B0hat (intercept).

        Raises:
            ValueError: if no data are loaded, X and Y are not 1-D arrays of equal length,
                fewer than 3 observations are available (the residual variance needs
                n - 2 > 0 degrees of freedom), or X has no variation (slope undefined).
        """
        if self.X is None or self.Y is None:
            raise ValueError("No data available: call load_data() or assign X and Y first.")

        self.X = np.asarray(self.X, dtype=float)
        self.Y = np.asarray(self.Y, dtype=float)
        if self.X.ndim != 1 or self.X.shape != self.Y.shape:
            raise ValueError("X and Y must be one-dimensional arrays of the same length.")

        self.n = self.Y.size
        if self.n < 3:
            raise ValueError("At least 3 observations are required for simple linear regression.")

        self.Xbar = np.mean(self.X)
        self.Ybar = np.mean(self.Y)

        # Vectorised sums of squares (np.sum over a generator is deprecated and ignores `axis`).
        dx = self.X - self.Xbar
        dy = self.Y - self.Ybar
        self.SS_xx = np.sum(dx ** 2)
        self.SS_yy = np.sum(dy ** 2)
        self.SS_xy = np.sum(dx * dy)

        if self.SS_xx == 0:
            raise ValueError("X has zero variance; the slope cannot be estimated.")

        self.B1hat = self.SS_xy / self.SS_xx
        self.B0hat = self.Ybar - self.B1hat * self.Xbar

    def estimate_variance_residuals(self):
        """
        Estimate the variance of residuals

        Requires fit(). Sets: e (residuals), SS_E, s2 (residual variance on n - 2 DF),
        e_stde (residual standard error, sqrt(s2)), SE_B1hat, SE_B0hat, and the
        five-number summary of the residuals (e_min, e_q1, e_q2, e_q3, e_max).
        """
        self.e = self.Y - (self.B0hat + self.B1hat * self.X)
        self.SS_E = np.sum(self.e ** 2)
        self.s2 = self.SS_E / (self.n - 2)
        self.e_stde = np.sqrt(self.s2)
        self.SE_B1hat = np.sqrt(self.s2 / self.SS_xx)
        self.SE_B0hat = np.sqrt(self.s2 * (1 / self.n + self.Xbar ** 2 / self.SS_xx))

        # Five-number summary shown at the top of the printed report.
        self.e_min, self.e_q1, self.e_q2, self.e_q3, self.e_max = \
            np.percentile(self.e, [0, 25, 50, 75, 100])

    def ANOVA(self):
        """
        Conduct the analysis of variance (ANOVA) on the fitted model.

        Requires fit() and estimate_variance_residuals().
        Sets: SS_R (regression sum of squares), SS_T (total), Rsquare.
        """
        fitted = self.B0hat + self.B1hat * self.X
        self.SS_R = np.sum((fitted - self.Ybar) ** 2)
        self.SS_T = self.SS_R + self.SS_E
        self.Rsquare = self.SS_R / self.SS_T

    def conduct_statistical_inference(self, level: float = 0.95):
        """
        Conduct the hypothesis tests on the fitted model; `level` is the confidence level (default is 95%).

        Requires fit(). Recomputes the residual variance and the ANOVA table, then sets:
            T_B1hat, T_B0hat, pval_B1, pval_B0 : two-tailed t-tests of H0: parameter = 0
            r, t_r, pval_r                     : Pearson correlation and its two-tailed t-test
            F, pval_F                          : F-test of the significance of regression

        The p-values themselves do not depend on `level`; it is validated and stored
        in `self.level` so the report can state it.

        Raises:
            ValueError: if `level` is not strictly between 0 and 1.
        """
        self._validate_level(level)
        self.level = level

        self.estimate_variance_residuals()
        self.ANOVA()
        dof = self.n - 2

        # Test on the estimated slope and intercept (two-tailed T-test):
        self.T_B1hat = (self.B1hat - 0) / self.SE_B1hat
        self.T_B0hat = (self.B0hat - 0) / self.SE_B0hat

        self.pval_B1 = t.sf(abs(self.T_B1hat), dof) * 2
        self.pval_B0 = t.sf(abs(self.T_B0hat), dof) * 2

        # Test on the Pearson correlation coefficient (two-tailed T-test):
        # r = SS_xy / sqrt(SS_xx * SS_yy) keeps the sign of the slope;
        # t = r * sqrt((n - 2) / (1 - r^2)) equals the slope's t-statistic.
        self.r = self.SS_xy / np.sqrt(self.SS_xx * self.SS_yy)
        self.t_r = self.r * np.sqrt(dof / (1 - self.r ** 2))
        self.pval_r = t.sf(abs(self.t_r), df = dof) * 2

        # Test on the significance of regression (F-test, 1 and n - 2 DF):
        self.F = (self.SS_R / 1) / (self.SS_E / dof)
        self.pval_F = f.sf(self.F, 1, dof)

    def compute_intervals(self, level: float = 0.95):
        """
        Compute the confidence intervals and the prediction intervals:

        Requires fit(). Sets the two-sided `level` confidence bounds of both parameters
        (B0hat_CI_lwr/upr, B1hat_CI_lwr/upr) and five callables of x (scalar or array):
            func                      : fitted line B0hat + B1hat * x
            Yhat_CI_lwr / Yhat_CI_upr : confidence band of the mean response at x
            yhat_PI_lwr / yhat_PI_upr : prediction band of an individual response at x

        Raises:
            ValueError: if `level` is not strictly between 0 and 1.
        """
        self._validate_level(level)
        self.level = level
        # Refresh s2 and the standard errors so the intervals always match the current fit.
        self.estimate_variance_residuals()

        # Upper-tail critical value, so t_c > 0 and "lwr" really is below "upr".
        t_c = t.ppf(1 - (1 - level) / 2, df = self.n - 2)

        # Confidence interval for the parameters:
        self.B0hat_CI_lwr = self.B0hat - t_c * self.SE_B0hat
        self.B0hat_CI_upr = self.B0hat + t_c * self.SE_B0hat

        self.B1hat_CI_lwr = self.B1hat - t_c * self.SE_B1hat
        self.B1hat_CI_upr = self.B1hat + t_c * self.SE_B1hat

        def half_width(x, extra):
            # extra = 0 -> mean response; extra = 1 adds the variance of a new observation.
            x = np.asarray(x, dtype=float)
            return t_c * np.sqrt(self.s2 * (extra + 1 / self.n + np.power(x - self.Xbar, 2) / self.SS_xx))

        self.func = lambda x: self.B0hat + self.B1hat * np.asarray(x, dtype=float)

        # Confidence interval of mean response at x:
        self.Yhat_CI_lwr = lambda x: self.func(x) - half_width(x, 0)
        self.Yhat_CI_upr = lambda x: self.func(x) + half_width(x, 0)

        # Prediction interval of an individual response at x:
        self.yhat_PI_lwr = lambda x: self.func(x) - half_width(x, 1)
        self.yhat_PI_upr = lambda x: self.func(x) + half_width(x, 1)

    def visualize(self):
        """
        Visualize the entire analysis process:

        Draws two figures: histograms of X and Y, and a three-panel figure with the
        fitted line plus confidence/prediction bands, the residual plot and the
        residual normal-probability plot.
        Requires fit(), estimate_variance_residuals() and compute_intervals().
        """

        # Visualize the distribution of the dataset:
        fig, axs = plt.subplots(1, 2, figsize = (8,4), dpi = 100)
        for ax, values, header in ((axs[0], self.X, self.X_header),
                                   (axs[1], self.Y, self.Y_header)):
            sns.histplot(values, bins = self._scott_bins(values), ax = ax)
            ax.set_title(f"Histogram of {header}")
            ax.grid(True)
        fig.tight_layout()

        # Visualize the regression analysis:
        fig, axs = plt.subplots(1, 3, figsize = (15,4), dpi = 100)
        sns.scatterplot(x = self.X, y = self.Y, ax = axs[0])

        # The curves are deterministic functions of x, so draw them on an evenly spaced,
        # sorted grid with matplotlib (no seaborn aggregation/bootstrapping involved).
        grid = np.linspace(np.min(self.X), np.max(self.X), 200)
        pct = f"{100 * self.level:g}%"
        axs[0].plot(grid, self.func(grid),
                    color = 'red', linestyle = 'solid', linewidth = 1.0, label = "Fitted line")
        axs[0].plot(grid, self.Yhat_CI_lwr(grid),
                    color = 'red', linestyle = 'dashdot', linewidth = 0.75, label = f"{pct} confidence band")
        axs[0].plot(grid, self.Yhat_CI_upr(grid),
                    color = 'red', linestyle = 'dashdot', linewidth = 0.75)
        axs[0].plot(grid, self.yhat_PI_lwr(grid),
                    color = 'red', linestyle = 'dotted', linewidth = 0.55, label = f"{pct} prediction band")
        axs[0].plot(grid, self.yhat_PI_upr(grid),
                    color = 'red', linestyle = 'dotted', linewidth = 0.55)
        axs[0].set_xlabel(f"{self.X_header}")
        axs[0].set_ylabel(f"{self.Y_header}")
        axs[0].set_title(f"{self.Y_header} vs. {self.X_header}")
        axs[0].legend()
        axs[0].grid(True)

        # Visualize the residual analysis:
        # NOTE(review): residuals are plotted against the observed response, as in the
        # original; residuals vs. fitted values or X is the more common diagnostic - confirm intent.
        sns.scatterplot(x = self.Y, y = self.e, ax = axs[1])
        axs[1].axhline(y = 0, color = 'red', linestyle = 'dashed', linewidth = 1.0)
        axs[1].set_xlabel(f"{self.Y_header}")
        axs[1].set_ylabel("Residual")
        axs[1].set_title("Residual Plot")
        axs[1].grid(True)

        # Normal probability plot: sorted residuals against the normal quantiles of
        # the plotting positions (i - 0.375) / (n + 0.25).
        rank = (np.arange(1, self.n + 1) - 0.375) / (self.n + 0.25)
        z_e = norm.ppf(rank)
        sns.scatterplot(x = np.sort(self.e), y = z_e, ax = axs[2])
        axs[2].set_xlabel("Residual")
        axs[2].set_ylabel("Z-score")
        axs[2].set_title("Residual Normality Plot")
        axs[2].grid(True)
        fig.tight_layout()

    def run_regression_analysis(self, level: float = 0.95):
        """
        Generate a full analysis report.

        Fits the model, runs the hypothesis tests, computes the intervals at the
        given confidence `level`, prints a text report and draws the figures once.

        Raises:
            ValueError: propagated from fit() or from the level validation.
        """
        self.fit()
        # conduct_statistical_inference() also estimates the residual variance and runs ANOVA.
        self.conduct_statistical_inference(level)
        self.compute_intervals(level)

        ##
        # Generate a report:
        #
        print(f"\nModel: {self.Y_header} ~ {self.X_header}")
        print(f"\n\t  Min \t Q1 \t Median \t Q3 \tMax")
        print(f"Residuals: {self.e_min} \t {self.e_q1} \t {self.e_q2} \t {self.e_q3} \t {self.e_max}")

        print(f"\n\t  Coefficient \t Std. Error \t Lower-bound CI \t Upper-bound CI \t t-Statistic \t Pr(>|t|) at {100 * self.level}%")
        print(f"Intercept: {self.B0hat} \t {self.SE_B0hat} \t {self.B0hat_CI_lwr} \t {self.B0hat_CI_upr} \t {self.T_B0hat} \t {self.pval_B0}")
        print(f"{self.X_header}: {self.B1hat} \t {self.SE_B1hat} \t {self.B1hat_CI_lwr} \t {self.B1hat_CI_upr} \t {self.T_B1hat} \t {self.pval_B1}")

        print(f"\nResidual Std. Error: {self.e_stde} on {len(self.Y) - 2} DF")
        print(f"R-square: {self.Rsquare}")
        print(f"Pearson correlation coef.: {self.r}, p-value: {self.pval_r}")
        print(f"F-statistic: {self.F} on 1 predictor and {len(self.Y) - 2} DF, p-value: {self.pval_F}")

        self.visualize()


In [7]:
# Dataset location (relative to the notebook's working directory) and the
# columns to regress: Sale_Price (response) on Market_Val (predictor).
DATA_PATH = Path("./dataset/TAMPALMS.csv")
X_HEADER = "Market_Val"
Y_HEADER = "Sale_Price"

# Fail early with the resolved location so a missing dataset is easy to diagnose.
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"Dataset not found at '{DATA_PATH.resolve()}'. "
        "Place TAMPALMS.csv there or update DATA_PATH."
    )

slr = SLR()
slr.load_data(str(DATA_PATH), X_HEADER, Y_HEADER)
slr.run_regression_analysis()

FileNotFoundError: ignored