# Predicting House Price With Regression
In this simple machine-learning problem of house price prediction, I use the well-known Boston housing dataset to evaluate different regression algorithms.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston

# Load the Boston dataset through scikit-learn's loader and take a first look.
boston = load_boston()

print(boston.data.shape)
print(boston.feature_names)

# Maximum, minimum and mean of the target values, printed on one line.
print(*(stat(boston.target) for stat in (np.max, np.min, np.mean)))

(506, 13)
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
50.0 5.0 22.5328063241


In [2]:
# train_test_split is imported from sklearn.model_selection; the older
# sklearn.cross_validation module was deprecated and later removed.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 25% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, test_size=0.25, random_state=33)

# Both scalers are fitted on the training split only and then reused for the
# test split. The 1-D targets are reshaped into a single column for the scaler.
scalerX = StandardScaler().fit(X_train)
scalery = StandardScaler().fit(np.reshape(y_train, (-1, 1)))

X_train = scalerX.transform(X_train)
y_train = scalery.transform(np.reshape(y_train, (-1, 1)))
X_test = scalerX.transform(X_test)
y_test = scalery.transform(np.reshape(y_test, (-1, 1)))

# Sanity check of the scaled training data: max, min and mean of features, then of targets.
print(np.max(X_train), np.min(X_train), np.mean(X_train), np.max(y_train), np.min(y_train), np.mean(y_train))

# Flatten the column-shaped targets back into 1-D arrays.
y_train = y_train.flatten(order='C')
y_test = y_test.flatten(order='C')

10.2028980046 -4.66702040845 2.47038706385e-15 2.91774920367 -1.93147098641 3.58552238032e-16




In [3]:
# Explicit imports from sklearn.model_selection replace the wildcard import of
# the removed sklearn.cross_validation module.
from sklearn.model_selection import KFold, cross_val_score


def train_and_evaluate(clf, X_train, y_train):
    """Fit ``clf`` on the training data and print two scores for it.

    First prints ``clf.score`` on the training data itself, then the mean of
    the ``cross_val_score`` results over a shuffled 5-fold split of the same
    data.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit`` and ``score``; it is fitted in place.
    X_train : array-like
        Training features.
    y_train : array-like
        Training targets.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    clf.fit(X_train, y_train)

    print ("Coefficient of determination on training set:",clf.score(X_train, y_train))

    # The fixed seed makes every call build its folds with the same settings,
    # so scores of different models are comparable.
    cv = KFold(n_splits=5, shuffle=True, random_state=33)
    scores = cross_val_score(clf, X_train, y_train, cv=cv)
    print ("Average coefficient of determination using 5-fold crossvalidation:",np.mean(scores))

In [4]:
from sklearn.linear_model import SGDRegressor
# Baseline SGD regressor: squared loss with no regularisation penalty.
clf1 = SGDRegressor(
    loss='squared_loss',
    penalty=None,
    random_state=33,
)
train_and_evaluate(clf1, X_train, y_train)

# Fitted coefficients of the model.
print(clf1.coef_)

Coefficient of determination on training set: 0.740281703689
Average coefficient of determination using 5-fold crossvalidation: 0.713630596255
[-0.07634694  0.06117706 -0.03404977  0.1076101  -0.06620428  0.35855438
 -0.0098127  -0.21344242  0.0921319  -0.03985987 -0.18753121  0.05267773
 -0.37137355]




In [5]:
# SGD regressor with an L2 penalty.
# NOTE(review): seeded with 42, whereas clf1 uses 33 - confirm this is intended.
clf2 = SGDRegressor(
    loss='squared_loss',
    penalty='l2',
    random_state=42,
)
train_and_evaluate(clf2, X_train, y_train)

Coefficient of determination on training set: 0.743616743208
Average coefficient of determination using 5-fold crossvalidation: 0.71081206667




In [6]:
# SGD regressor with an L1 penalty.
clf3 = SGDRegressor(
    loss='squared_loss',
    penalty='l1',
    random_state=42,
)
train_and_evaluate(clf3, X_train, y_train)

Coefficient of determination on training set: 0.74358692291
Average coefficient of determination using 5-fold crossvalidation: 0.710763609874




# Use of SVM for regression
Now we will import the support vector regressor (`SVR`) from scikit-learn's `svm` module and evaluate how it performs with different kernels.

In [7]:
from sklearn import svm
# Support vector regression with a linear kernel.
svr1 = svm.SVR(kernel='linear')
train_and_evaluate(svr1, X_train, y_train)

Coefficient of determination on training set: 0.71886923342
Average coefficient of determination using 5-fold crossvalidation: 0.707838419194


In [8]:
# Support vector regression with a polynomial kernel.
svr2 = svm.SVR(kernel='poly')
train_and_evaluate(svr2, X_train, y_train)

Coefficient of determination on training set: 0.904109273301
Average coefficient of determination using 5-fold crossvalidation: 0.779288545488


In [9]:
# Support vector regression with an RBF kernel.
svr3 = svm.SVR(kernel='rbf')
train_and_evaluate(svr3, X_train, y_train)

Coefficient of determination on training set: 0.900132065979
Average coefficient of determination using 5-fold crossvalidation: 0.833662221567


# Use of an ensemble (Extra-Trees) regressor
Now we will use an ensemble regressor (`ExtraTreesRegressor`) and evaluate the results as before.

In [10]:
from sklearn import ensemble
# Extra-trees ensemble with 10 estimators and a fixed seed.
et1 = ensemble.ExtraTreesRegressor(
    n_estimators=10,
    random_state=42,
)
train_and_evaluate(et1, X_train, y_train)

Coefficient of determination on training set: 1.0
Average coefficient of determination using 5-fold crossvalidation: 0.861758978344


In [11]:
# Pair every importance score with its feature name. The tuples sort by score
# first, so the printed list runs from lowest to highest score.
important = zip(et1.feature_importances_, boston.feature_names)
ranking = sorted(important)
print(ranking)

[(0.0050438532027558842, 'ZN'), (0.015142513715149682, 'B'), (0.017052578400506287, 'AGE'), (0.018941821085751577, 'RAD'), (0.023602561777571307, 'CHAS'), (0.025733049004581798, 'CRIM'), (0.031874162235100457, 'NOX'), (0.034405644939308928, 'INDUS'), (0.039713133345196064, 'DIS'), (0.046618521397262996, 'TAX'), (0.099511801492762245, 'PTRATIO'), (0.28421522796368465, 'LSTAT'), (0.35814513144036819, 'RM')]


In [12]:
from sklearn import metrics
def measure_performance(X, y, clf,
                        show_accuracy=True,
                        show_classification_report=True,
                        show_confusion_matrix=True,
                        show_r2_score=False):
    """Predict with ``clf`` on ``X`` and print the selected metrics against ``y``.

    Parameters
    ----------
    X : input passed unchanged to ``clf.predict``.
    y : reference values the predictions are compared with.
    clf : fitted model exposing ``predict``.
    show_accuracy : print ``metrics.accuracy_score``.
    show_classification_report : print ``metrics.classification_report``.
    show_confusion_matrix : print ``metrics.confusion_matrix``.
    show_r2_score : print ``metrics.r2_score``.

    Returns
    -------
    None; every selected metric is printed followed by a blank line.
    """
    predicted = clf.predict(X)

    if show_accuracy:
        accuracy = metrics.accuracy_score(y, predicted)
        print("Accuracy:{0:.3f}".format(accuracy), "\n")

    if show_classification_report:
        print("Classification report")
        report = metrics.classification_report(y, predicted)
        print(report, "\n")

    if show_confusion_matrix:
        print("Confusion matrix")
        matrix = metrics.confusion_matrix(y, predicted)
        print(matrix, "\n")

    if show_r2_score:
        r2 = metrics.r2_score(y, predicted)
        print("Coefficient of determination:{0:.3f}".format(r2), "\n")

        
# Score the extra-trees model on the test split; only the coefficient of
# determination is requested, the classification metrics are switched off.
measure_performance(
    X_test, y_test, et1,
    show_accuracy=False,
    show_classification_report=False,
    show_confusion_matrix=False,
    show_r2_score=True,
)

Coefficient of determination:0.802 

