In [1]:
from sklearn import datasets
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

In [3]:
bean = datasets.load_boston()
print (bean.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [4]:
def load_boston():
    scaler = StandardScaler()
    boston = datasets.load_boston()
    X=boston.data
    y=boston.target
    X = scaler.fit_transform(X)
    return train_test_split(X,y)

In [5]:
X_train, X_test, y_train, y_test = load_boston()

In [6]:
X_train.shape

(379, 13)

In [10]:
clf = LinearRegression()
clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [11]:
list(zip (y_test, clf.predict(X_test)))

[(24.399999999999999, 23.031176094761186),
 (17.5, 16.77336892249506),
 (29.899999999999999, 31.491943193290314),
 (22.0, 27.628250812947883),
 (23.100000000000001, 20.932577387323967),
 (27.899999999999999, 32.180281223111479),
 (19.399999999999999, 17.251572890363981),
 (23.800000000000001, 23.775279759600785),
 (22.800000000000001, 25.267593957185191),
 (16.800000000000001, 20.251336554349233),
 (15.199999999999999, 19.293641888798636),
 (28.699999999999999, 30.909740113561742),
 (25.199999999999999, 27.574983400162285),
 (17.399999999999999, 16.111444251813687),
 (43.799999999999997, 35.126770291351384),
 (15.0, 18.214710628962344),
 (22.0, 26.571184654274813),
 (18.800000000000001, 20.501944904619581),
 (17.399999999999999, 23.134273862932751),
 (18.399999999999999, 16.218095202621569),
 (50.0, 40.668524770774582),
 (13.800000000000001, 0.29719870351832967),
 (20.300000000000001, 23.418671144899484),
 (24.800000000000001, 26.320825955779693),
 (27.5, 19.878377718778193),
 (16.3000

In [12]:
#R2 score calculation
r2_score(y_test, linReg.predict(X_test))

0.74324892514593721

In [15]:
from math import sqrt
mse = mean_squared_error(y_test, linReg.predict(X_test))
print ("MSE: ", mse)

MSE:  18.4652393073


In [17]:
rmse = sqrt(mse)
print ("RMSE: ", rmse)

RMSE:  4.297119885148736


In [19]:
#Ridge Model
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [None]:
print ("R2: ", r2_score(y_test, ridge.predict(X_test)))
mse = mean_squared_error(y_test, ridge.predict(X_test))
print ("MSE: ", mse)
print ("RMSE: ", sqrt(mse))

In [23]:
#Optimization of Ridge
optRidge = Ridge(alpha=10)
optRidge.fit(X_train, y_train)
print ("R2: ", r2_score(y_test, optRidge.predict(X_test)))
mse = mean_squared_error(y_test, optRidge.predict(X_test))
print ("MSE: ", mse)
print ("RMSE: ", sqrt(mse))


R2:  0.74579864925
MSE:  18.2818661091
RMSE:  4.275729891966965


In [24]:
optRidge = Ridge(alpha=7.5)
optRidge.fit(X_train, y_train)
print ("R2: ", r2_score(y_test, optRidge.predict(X_test)))
mse = mean_squared_error(y_test, optRidge.predict(X_test))
print ("MSE: ", mse)
print ("RMSE: ", sqrt(mse))

R2:  0.745561262771
MSE:  18.2989386691
RMSE:  4.277725875876651
