In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Relative path to the Boston CSV file.
dataset = '../datasets/Boston.csv'
# index_col=0: the first CSV column becomes the row index instead of a data column.
data = pd.read_csv(dataset, index_col=0)
# Last expression of the cell: display the first rows of the frame.
data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [3]:
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics

In [4]:
# Two-column frame: lstat alongside a constant 'intercept' column of ones,
# sharing the original row index.
lstat_aug = data[['lstat']].assign(intercept=1.0)

In [5]:
# Simple linear regression of medv on lstat; scikit-learn expects a 2-D
# feature matrix, hence the reshape of the single column to (n_samples, 1).
medv_on_lstat = LinearRegression()
medv_on_lstat.fit(data['lstat'].to_numpy().reshape(-1, 1), data['medv'])
# Displayed as the cell output: (slope array, intercept).
(medv_on_lstat.coef_, medv_on_lstat.intercept_)

(array([-0.95004935]), 34.5538408793831)

In [6]:
def add_intercept_term(X):
    """Return ``X`` as a DataFrame with a trailing column of ones named 'intercept'.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series or array-like
        Feature data. pandas inputs keep their index and column names.
        Anything else is converted with ``np.asarray``; its columns are
        labelled ``X1``, ``X2``, ... and a 1-D array is treated as a single
        feature column.

    Returns
    -------
    pandas.DataFrame
        The features followed by an ``intercept`` column filled with 1.0.
    """
    if isinstance(X, (pd.DataFrame, pd.Series)):
        features = X
    else:
        values = np.asarray(X)
        if values.ndim == 1:
            # A flat vector is one feature observed n times, not n features.
            values = values.reshape(-1, 1)
        features = pd.DataFrame(
            values, columns=[f'X{i+1}' for i in range(values.shape[1])]
        )
    # Size the ones column from X itself (not from a notebook-global frame) and
    # reuse X's index so the concat aligns row-for-row.
    intercept = pd.Series(np.ones(len(features)), index=features.index, name='intercept')
    return pd.concat([features, intercept], axis=1)

In [57]:
def report_model(model, X, y):
    """Print fit diagnostics for a regression model evaluated on ``(X, y)``.

    Printed, in order: summary statistics of the residuals, RSS, TSS,
    R squared, the F statistic, the residual variance estimate and its square
    root (RSE), the estimated covariance matrix of the coefficients, and the
    standard error of each coefficient.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X : 2-D array-like with a ``shape`` attribute (rows are observations)
    y : observed responses; must support ``.mean()`` and ``@``

    Notes
    -----
    The degrees of freedom use ``n - p - 1``, i.e. the formulas presume the
    model includes an intercept in addition to the ``p`` columns of ``X``.
    Standard errors are indexed by the column order of
    ``add_intercept_term(X)``.
    """
    errors = y - model.predict(X)
    print(f'Residual stats:\n{pd.Series(errors).describe()}')

    # Residual and total sums of squares, written as inner products.
    rss = errors.T @ errors
    print(f'RSS: {rss}')
    centered_y = y - y.mean()
    tss = centered_y.T @ centered_y
    print(f'TSS: {tss}')
    print(f'R squared: {1 - rss / tss}')

    n_obs, n_features = X.shape[0], X.shape[1]
    # Residual variance estimate; it is also the denominator of the F statistic.
    noise_variance = rss / (n_obs - n_features - 1)
    f_statistic = ((tss - rss) / n_features) / noise_variance
    print(f'F score: {f_statistic}')
    print(f'Sigma squared estimation: {noise_variance}')
    print(f'RSE: {noise_variance ** 0.5}')

    # Coefficient covariance estimate: sigma^2 * (X'X)^-1 on the augmented design matrix.
    design = add_intercept_term(X)
    coef_covariance = np.linalg.inv(design.T @ design) * noise_variance
    print(f'Betha variance estimation: {coef_covariance}')
    for coef_idx in range(n_features + 1):
        # Diagonal entries are the per-coefficient variances.
        coef_se = coef_covariance[coef_idx, coef_idx] ** 0.5
        print(f"SE(beta_hat[{coef_idx}]): {coef_se}")

In [58]:
# Diagnostics for the medv ~ lstat fit, evaluated on the data it was trained on.
report_model(medv_on_lstat, data['lstat'].to_numpy().reshape(-1, 1), data['medv'])

Residual stats:
count    5.060000e+02
mean    -5.673108e-15
std      6.209603e+00
min     -1.516745e+01
25%     -3.989612e+00
50%     -1.318186e+00
75%      2.033701e+00
max      2.450013e+01
Name: medv, dtype: float64
RSS: 19472.381418326433
TSS: 42716.29541501976
R squared: 0.5441462975864799
F score: 601.6178711098956
Sigma squared estimation: 38.63567741731435
RSE: 6.2157604053980675
Betha variance estimation: [[ 0.00150028 -0.01898311]
 [-0.01898311  0.31654954]]
SE(beta_hat[0]): 0.038733416212639364
SE(beta_hat[1]): 0.5626273549884322


In [15]:
# Feature matrix (single lstat column), response, and in-sample predictions.
X = data['lstat'].to_numpy().reshape(-1, 1)
y = data['medv']
y_pred = medv_on_lstat.predict(X)

In [60]:
# Predicted medv at lstat = 5, 10 and 15; each inner list is one sample.
predictions = medv_on_lstat.predict(np.array([[5], [10], [15]]))
predictions

array([29.80359411, 25.05334734, 20.30310057])

3.032672762989739