In [1]:
from sklearn import datasets
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

In [2]:
bean = datasets.load_boston()
print bean.DESCR

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [3]:
#loading data
def load_boston():
    scaler = StandardScaler()
    boston = datasets.load_boston()
#spliting into train and test datasets
    X=boston.data
    y=boston.target
    X = scaler.fit_transform(X)
    return train_test_split(X,y)

In [4]:
X_train, X_test, y_train, y_test = load_boston()

In [5]:
X_train.shape

(379L, 13L)

linear regression

In [6]:
clf = LinearRegression()
clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [7]:
zip (y_test, clf.predict(X_test))

[(42.799999999999997, 28.906624380905754),
 (19.300000000000001, 21.031756009429071),
 (23.899999999999999, 27.789222815859041),
 (24.699999999999999, 24.287203851663413),
 (25.0, 24.545045405259909),
 (12.6, 18.071874339125579),
 (18.0, 19.093921187435019),
 (19.899999999999999, 20.073544171710015),
 (18.5, 19.410108755297582),
 (24.800000000000001, 26.598893934395818),
 (32.0, 33.638655962561643),
 (10.4, 16.214032221131582),
 (29.0, 31.431566133950735),
 (21.0, 23.298841283462757),
 (27.899999999999999, 20.353454004647883),
 (11.800000000000001, 8.7225047986870656),
 (20.100000000000001, 15.710396333391149),
 (19.800000000000001, 23.230078246226881),
 (19.100000000000001, 24.475405735315039),
 (20.100000000000001, 22.012986324302631),
 (18.699999999999999, 21.146855271528363),
 (22.800000000000001, 24.280524987650882),
 (20.100000000000001, 23.912564079484586),
 (22.600000000000001, 23.95898785982201),
 (8.3000000000000007, 13.574942419606048),
 (12.699999999999999, 11.9797807781836

In [8]:
#mean squared error
y_pred=clf.predict(X_test)
mean_squared_error(y_test, y_pred)

22.064365144527997

In [9]:
y_pred

array([ 28.90662438,  21.03175601,  27.78922282,  24.28720385,
        24.54504541,  18.07187434,  19.09392119,  20.07354417,
        19.41010876,  26.59889393,  33.63865596,  16.21403222,
        31.43156613,  23.29884128,  20.353454  ,   8.7225048 ,
        15.71039633,  23.23007825,  24.47540574,  22.01298632,
        21.14685527,  24.28052499,  23.91256408,  23.95898786,
        13.57494242,  11.97978078,  43.87233557,  19.25735783,
        25.55342892,   8.45342976,  35.44722659,  19.37985477,
        14.51921443,  39.5217915 ,  28.50818918,  29.09299861,
        25.12148053,  25.87472905,  20.8258396 ,  27.19360141,
        20.03977652,  12.3614357 ,  20.61171336,  10.77029682,
        23.13186689,  22.06480215,  23.50730657,  22.40192824,
        21.09143181,  16.79699927,  24.09299268,  18.53420673,
        22.19965097,  28.56015509,  15.45604594,  32.4652097 ,
        15.80796285,  16.28509589,  25.29823234,  13.95811323,
        19.254775  ,  20.25099197,  15.43281462,  34.54

R2 score

In [10]:
r2_score(y_test, y_pred)

0.72766684759445432

implementing sklearn.linear_model.Ridge, adjusting the Linear parameter, source: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html

In [11]:
#Ridge regression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV

In [12]:
clearFigRidge = Ridge(alpha=1.5)
clearFigRidge.fit(X_train, y_train)

Ridge(alpha=1.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [13]:
zip (y_test, clf.predict(X_test))

[(42.799999999999997, 28.906624380905754),
 (19.300000000000001, 21.031756009429071),
 (23.899999999999999, 27.789222815859041),
 (24.699999999999999, 24.287203851663413),
 (25.0, 24.545045405259909),
 (12.6, 18.071874339125579),
 (18.0, 19.093921187435019),
 (19.899999999999999, 20.073544171710015),
 (18.5, 19.410108755297582),
 (24.800000000000001, 26.598893934395818),
 (32.0, 33.638655962561643),
 (10.4, 16.214032221131582),
 (29.0, 31.431566133950735),
 (21.0, 23.298841283462757),
 (27.899999999999999, 20.353454004647883),
 (11.800000000000001, 8.7225047986870656),
 (20.100000000000001, 15.710396333391149),
 (19.800000000000001, 23.230078246226881),
 (19.100000000000001, 24.475405735315039),
 (20.100000000000001, 22.012986324302631),
 (18.699999999999999, 21.146855271528363),
 (22.800000000000001, 24.280524987650882),
 (20.100000000000001, 23.912564079484586),
 (22.600000000000001, 23.95898785982201),
 (8.3000000000000007, 13.574942419606048),
 (12.699999999999999, 11.9797807781836

 R2 and MSE after applying Ridge

In [14]:
clf2 = LinearRegression()
clf2.fit(X_test, y_test)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [15]:
#Mean squared error
y_pred2=clf2.predict(X_test)
mean_squared_error(y_test, y_pred)

22.064365144527997

In [16]:
y_pred2

array([ 32.41046308,  21.63890562,  27.5066223 ,  25.10961684,
        24.22957702,  18.64461207,  18.64827239,  20.13504782,
        19.03882656,  26.95419984,  32.70676579,  10.89079413,
        32.65026666,  22.68231167,  19.63605539,   8.49068729,
        16.81470849,  21.51437419,  23.91301589,  20.28418914,
        19.58926998,  24.9785934 ,  22.78635105,  25.22318949,
        11.71288315,  15.33868801,  47.56998107,  19.35319072,
        24.54627668,   8.3977061 ,  34.68227845,  19.04463485,
        15.45520637,  43.88729447,  27.89231008,  28.04818566,
        25.23727937,  25.36937549,  20.32765568,  26.95410894,
        21.13542106,  10.46813877,  22.12222892,  10.59228558,
        22.26873188,  24.33190635,  25.36772958,  21.26758416,
        27.18146225,  18.40826281,  26.89877049,  17.21058736,
        22.15766938,  29.10585604,  17.78105042,  32.5685376 ,
        14.782437  ,  18.16924182,  23.31132514,  14.98331159,
        19.32799094,  21.20922502,  14.54154774,  34.53

In [17]:
clearFigRidgeCV = RidgeCV(alphas=[-.2, .1,], store_cv_values = True)
clearFigRidgeCV.fit(X_train, y_train)

RidgeCV(alphas=[-0.2, 0.1], cv=None, fit_intercept=True, gcv_mode=None,
    normalize=False, scoring=None, store_cv_values=True)

In [18]:
zip (y_test, clf.predict(X_test))

[(42.799999999999997, 28.906624380905754),
 (19.300000000000001, 21.031756009429071),
 (23.899999999999999, 27.789222815859041),
 (24.699999999999999, 24.287203851663413),
 (25.0, 24.545045405259909),
 (12.6, 18.071874339125579),
 (18.0, 19.093921187435019),
 (19.899999999999999, 20.073544171710015),
 (18.5, 19.410108755297582),
 (24.800000000000001, 26.598893934395818),
 (32.0, 33.638655962561643),
 (10.4, 16.214032221131582),
 (29.0, 31.431566133950735),
 (21.0, 23.298841283462757),
 (27.899999999999999, 20.353454004647883),
 (11.800000000000001, 8.7225047986870656),
 (20.100000000000001, 15.710396333391149),
 (19.800000000000001, 23.230078246226881),
 (19.100000000000001, 24.475405735315039),
 (20.100000000000001, 22.012986324302631),
 (18.699999999999999, 21.146855271528363),
 (22.800000000000001, 24.280524987650882),
 (20.100000000000001, 23.912564079484586),
 (22.600000000000001, 23.95898785982201),
 (8.3000000000000007, 13.574942419606048),
 (12.699999999999999, 11.9797807781836

In [19]:
clearFigRidge0 = Ridge(alpha=2.0)
clearFigRidge0.fit(X_train, y_train)

Ridge(alpha=2.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [20]:
print 'Ridge r2:-', r2_score(y_test, clearFigRidge0.predict(X_test))

Ridge r2:- 0.728283881878


In [21]:
print 'Ridge MSE:-', mean_squared_error(y_test, clearFigRidge0.predict(X_test))

Ridge MSE:- 22.0143731784
