In [1]:
%%javascript
// Configure MathJax's TeX input so that equation numbers are assigned
// automatically using the "AMS" numbering scheme.
MathJax.Hub.Config({
    TeX: { equationNumbers: { autoNumber: "AMS" } }
});

<IPython.core.display.Javascript object>

In [2]:
# Run for interactive plots with jupyter lab
%matplotlib widget          

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from scipy import stats
import statsmodels.api as sm

import matplotlib
# matplotlib.use('nbagg')    # Doesn't work with jupyter lab
import matplotlib.pyplot as plt

## Ordinary Least Squares

In [3]:
# Creating artificial data for regression
#
# Model: y = 0.2*x1 + 0.3*x2 + e, with 30 observations.
# NOTE: `e` comes from np.random.random, i.e. it is uniform on [0, 1) and
# therefore does not have mean zero; its mean ends up in the fitted intercept.

np.random.seed(1)                  # For reproducibility
X = np.random.random((30, 2))      # Two predictors, 30 rows
b = [0.2, 0.3]                     # True slope coefficients
e = np.random.random(30)           # Noise term

y = np.dot(X, b) + e

# Print explicitly: a bare expression in the middle of a cell is only shown
# when IPython's `ast_node_interactivity` is changed from its default.
print(stats.describe(e))

# Running the regression

X = sm.add_constant(X)             # Adding an intercept (column of ones) to the regression

m1 = sm.OLS(y, X).fit()
print(m1.summary())

DescribeResult(nobs=30, minmax=(0.04995345894608716, 0.9648400471483856), mean=0.5558004906644833, variance=0.0770657116798012, skewness=-0.23634961375889468, kurtosis=-1.0681272762337057)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.182
Model:                            OLS   Adj. R-squared:                  0.121
Method:                 Least Squares   F-statistic:                     3.002
Date:                Fri, 15 May 2020   Prob (F-statistic):             0.0665
Time:                        19:47:14   Log-Likelihood:                -3.4473
No. Observations:                  30   AIC:                             12.89
Df Residuals:                      27   BIC:                             17.10
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.5371      0.120      4.490      0.0

In [11]:
# Sums of squares, R^2, F stat for overall model fit
# SSTotal = SSReg + SSError ;   R^2 = SSReg/SSTotal ; F = (SSReg/df_reg)/(SSError/df_resid)
#
# For the sums of squares statsmodels uses the terms
#   centered_tss = ess + ssr
# where `ess` is the explained (regression) sum of squares and `ssr` is the
# sum of squared residuals (SSError) -- not the regression sum of squares.
#
# Each check is printed explicitly so that all three are shown even with
# IPython's default setting, which only displays a cell's last expression.

print((m1.centered_tss, m1.ess, m1.ssr, m1.ess + m1.ssr))
assert np.isclose(m1.centered_tss, m1.ess + m1.ssr), "TSS should equal ESS + SSR"

# Confirm rsquared calculation

r2 = m1.ess / m1.centered_tss
print((r2, m1.rsquared))

# m1.df_model and m1.df_resid give us the degrees of freedom in case we need them

# Confirm F statistic calculation

print((m1.mse_model / m1.mse_resid, m1.fvalue))

(2.7018948326204675, 0.4915620365070974, 2.21033279611337, 2.7018948326204675)

(0.18193233525316366, 0.18193233525316366)

(3.0023024155071365, 3.0023024155071365)

In [12]:
# Residual plot: residuals e_i against the fitted values y-hat

fig = plt.figure()
# `\hat{y}` (with braces) is the valid mathtext spelling; `\haty` is not a
# known command and cannot be rendered.
ax = fig.add_subplot(111, title="Residual plot", ylabel=r"$e_i$", xlabel=r"$\hat{y}$")

# Use dedicated names here. Assigning to `y` would overwrite the response
# vector that later cells still read.
residuals = m1.resid
fitted = m1.predict()
_ = ax.scatter(fitted, residuals)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [None]:
# Show the built-in documentation for the OLS fit method and for the
# regression results class.
for documented in (sm.OLS.fit, sm.regression.linear_model.RegressionResults):
    help(documented)

In [None]:
# ANOVA
# Need to use a different api (the formula interface) to make the anova call

import statsmodels.formula.api as smf

# Take the response from the fitted model rather than from the global `y`,
# so this cell does not depend on whether `y` has been reassigned elsewhere.
y2 = np.reshape(m1.model.endog, (-1, 1))            # Response as a single-column 2-D array
X2 = X[:, 1:3]                                      # Drop column of ones (column 0)
y2X = np.concatenate((y2, X2), axis=1)              # Stack the columns horizontally
df = pd.DataFrame(y2X, columns=['y', 'x1', 'x2'])   # Create a data frame

Reg1 = smf.ols(formula='y ~ x1 + x2', data=df)
fit1 = Reg1.fit()
# Print explicitly; by default only a cell's last expression is displayed.
print(fit1.summary())
sm.stats.anova_lm(fit1, typ=2)         # ANOVA table with Type II sums of squares

In [None]:
# F ratios built from hard-coded mean squares.
# NOTE(review): the three constants look like values copied by hand from an
# ANOVA table (two model terms and the residual mean square) -- confirm
# against the table before relying on them.
ms1, ms2, mse = 0.335897, 0.155665, 0.081864

# Ratio for each term on its own, then for the average of the two terms.
(
    ms1 / mse,
    ms2 / mse,
    ((ms1 + ms2) / 2) / mse,
)

**The F statistic in multiple regression**

The regression has $n$ observations, $k$ predictors, and includes an intercept. The F statistic is
$$
F = \frac{\frac{SSR}{k}}{\frac{SSE}{[n-k-1]}} = \frac{MSR}{MSE}
$$

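where $SSR$ is the regression sum of squares, $SSE$ is the sum of squared errors, and $MSR$ and $MSE$ are the corresponding mean squares (each sum of squares divided by its degrees of freedom). Note that statsmodels uses different names: `ess` is the regression (explained) sum of squares and `ssr` is the sum of squared residuals.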

Since $R^2 = \frac{SSR}{SST}$ and $SST = SSR + SSE$, we have $\frac{SSR}{SSE} = \frac{R^2}{1-R^2}$, and therefore
$$
\begin{split}
F &= \frac{SSR}{SSE} \frac{[n-k-1]}{k}\\
  & = \frac{R^2}{1-R^2}\frac{[n-k-1]}{k}
\end{split}
$$

**The Omnibus Test**

This test is for the normality of the distribution of residuals. [See this](http://work.thaslwanter.at/Stats/html/statsModels.html).



In [13]:
# Omnibus normality test (combines skew and kurtosis) applied to the
# residuals of the fitted OLS model.
stats.normaltest(a=m1.resid)

NormaltestResult(statistic=4.062075177125114, pvalue=0.13119931958670275)

In [14]:
# Show the documentation of the normality test used above.
help(stats.normaltest)

Help on function normaltest in module scipy.stats.stats:

normaltest(a, axis=0, nan_policy='propagate')
    Test whether a sample differs from a normal distribution.
    
    This function tests the null hypothesis that a sample comes
    from a normal distribution.  It is based on D'Agostino and
    Pearson's [1]_, [2]_ test that combines skew and kurtosis to
    produce an omnibus test of normality.
    
    Parameters
    ----------
    a : array_like
        The array containing the sample to be tested.
    axis : int or None, optional
        Axis along which to compute test. Default is 0. If None,
        compute over the whole array `a`.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan.
        The following options are available (default is 'propagate'):
    
          * 'propagate': returns nan
          * 'raise': throws an error
          * 'omit': performs the calculations ignoring nan values
    
    Returns
  