In [3]:
import numpy as np
import pandas as pd
from numpy.linalg import inv
from sklearn.datasets import load_boston
from statsmodels.regression.linear_model import OLS

In [9]:
# load the boston data set
boston = load_boston()

# obtain the feature matrix as a numpy array
X = boston.data

# obtain the target variable as a numpy array
y = boston.target

In [5]:
print(X.shape)

(506, 13)


In [7]:
feature_names = boston.feature_names
print(feature_names)

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [8]:
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [32]:
# create vector of ones...
int = np.ones(shape=y.shape)[..., None]

#...and add to feature matrix
X = np.concatenate((int, X), 1)

In [31]:
# calculate coefficients using closed-form solution
coeffs = inv(X.transpose().dot(X)).dot(X.transpose()).dot(y)

In [30]:
# extract the feature names of the boston data set and prepend the intercept
feature_names = np.insert(boston.feature_names, 0, 'INT')

# collect results into a DataFrame for pretty printing
results = pd.DataFrame({'coeffs':coeffs}, index=feature_names)

print(results.round(2))

ValueError: Shape of passed values is (17, 1), indices imply (14, 1)

In [28]:
# create a linear model and extract the parameters
coeffs_lm = OLS(y, X).fit().params

In [29]:
results['coeffs_lm'] = coeffs_lm

print(results.round(2))

ValueError: Length of values does not match length of index