In [34]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

In [3]:
dataset = '../datasets/Boston.csv'
data = pd.read_csv(dataset, index_col=0)
data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [36]:
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics

In [52]:
lstat_aug = pd.concat([data.lstat, pd.Series(np.ones(data.lstat.shape), index=data.lstat.index, name='intercept')], axis=1)

In [32]:
medv_on_lstat = LinearRegression().fit(data.lstat.values.reshape(-1, 1), data.medv)
medv_on_lstat.coef_, medv_on_lstat.intercept_

(array([-0.95004935]), 34.5538408793831)

In [75]:
def add_intercept_term(X):
    if isinstance(X, (pd.DataFrame, pd.Series)):
        return pd.concat([X, pd.Series(np.ones(data.lstat.shape), index=X.index, name='intercept')], axis=1)
    else:
        return pd.concat([pd.DataFrame(X, columns=[f'X{i+1}' for i in range(X.shape[1])]), pd.Series(np.ones(data.lstat.shape), name='intercept')], axis=1)

In [91]:
def report_model(model, X, y):
    pred_y = model.predict(X)
    residuals = y - pred_y
    residual_stats = pd.Series(residuals).describe()
    print(f'Residual stats:\n{residual_stats}')
    RSS = residuals.T @ residuals
    print(f'RSS: {RSS}')
    resid_from_mean = y - y.mean()
    TSS = resid_from_mean.T @ resid_from_mean
    print(y.mean(), y)
    print(f'TSS: {TSS}')
    R_squared = 1 - RSS / TSS
    print(f'R squared: {R_squared}')
    sigma_squared_hat = RSS / (X.shape[0] - X.shape[1] - 1)
    print(f'Sigma squared estimation: {sigma_squared_hat}')
    aug_X = add_intercept_term(X)
    var_beta_hat = np.linalg.inv(aug_X.T @ aug_X) * sigma_squared_hat
    print(f'Betha variance estimation: {var_beta_hat}')
    for p_ in range(X.shape[1] + 1):
        standard_error = var_beta_hat[p_, p_] ** 0.5
        print(f"SE(beta_hat[{p_}]): {standard_error}")

In [92]:
report_model(medv_on_lstat, data.lstat.values.reshape(-1, 1), data.medv)

Residual stats:
count    506.000000
mean       8.334752
std       16.047476
min      -34.495412
25%       -1.852665
50%        8.141362
75%       17.563991
max       48.058760
Name: medv, dtype: float64
RSS: 165199.20604236706
22.532806324110677 1      24.0
2      21.6
3      34.7
4      33.4
5      36.2
       ... 
502    22.4
503    20.6
504    23.9
505    22.0
506    11.9
Name: medv, Length: 506, dtype: float64
TSS: 42716.29541501976
R squared: -2.867357982178395
Sigma squared estimation: 327.776202465014
Betha variance estimation: [[ 0.01272801 -0.16104831]
 [-0.16104831  2.68553351]]
SE(beta_hat[0]): 0.11281848062357104
SE(beta_hat[1]): 1.6387597468446529


In [39]:
import statsmodels.api as sm

In [76]:
mod = sm.OLS(data.medv, add_intercept_term(data.lstat)).fit()
mod.summary()

0,1,2,3
Dep. Variable:,medv,R-squared:,0.544
Model:,OLS,Adj. R-squared:,0.543
Method:,Least Squares,F-statistic:,601.6
Date:,"Thu, 27 Aug 2020",Prob (F-statistic):,5.08e-88
Time:,17:27:42,Log-Likelihood:,-1641.5
No. Observations:,506,AIC:,3287.0
Df Residuals:,504,BIC:,3295.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
lstat,-0.9500,0.039,-24.528,0.000,-1.026,-0.874
intercept,34.5538,0.563,61.415,0.000,33.448,35.659

0,1,2,3
Omnibus:,137.043,Durbin-Watson:,0.892
Prob(Omnibus):,0.0,Jarque-Bera (JB):,291.373
Skew:,1.453,Prob(JB):,5.36e-64
Kurtosis:,5.319,Cond. No.,29.7
