In [26]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures

from sklearn.datasets import load_boston

In [27]:
data = pd.read_csv(load_boston()['filename'], skiprows=1)

In [28]:
data.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null int64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null int64
TAX        506 non-null int64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
MEDV       506 non-null float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [30]:
X = data.drop(['MEDV'], axis=1)
y = data['MEDV']

In [31]:
ss_scaler = preprocessing.StandardScaler()
Xss = ss_scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(Xss, y, test_size=0.3, random_state=0)

In [49]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso 
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix


knn = KNeighborsRegressor()
svr = SVR()
las = Lasso()
ri = Ridge()
rf = RandomForestRegressor()

param_knn = {'n_neighbors': range(1,11)}
param_svr = {'kernel': ['poly', 'rbf']}
param_la  = {'alpha': [.0001, .001, .01, .5, 10.]}
param_ri  = {'alpha': [0.1, 0.2, 0.5, 1.0, 5., 10.]}
param_rf  = {
                'max_depth': range(2, 19, 2),
                'min_samples_leaf': range(2, 19, 2),
                'n_estimators': [5, 10, 15, 20, 50, 70, 90]
            }

In [46]:
best_estimators = []
for reg, param in zip([knn, svr, las, ri, rf], [param_knn, param_svr, param_la, param_ri, param_rf]):
    grid = GridSearchCV(reg, param, cv=10, scoring='r2')
    grid.fit(X_train, y_train)
    print(reg.__class__.__name__, ':', grid.best_params_,"\t", 'acc:', grid.best_score_)
    best_estimators.append(grid.best_estimator_)

KNeighborsRegressor : {'n_neighbors': 6} 	 acc: 0.7499747205202721
SVR : {'kernel': 'rbf'} 	 acc: 0.6771878081071613
Lasso : {'alpha': 0.01} 	 acc: 0.7143705613266764
Ridge : {'alpha': 10.0} 	 acc: 0.716864156252387
RandomForestRegressor : {'max_depth': 18, 'min_samples_leaf': 2, 'n_estimators': 50} 	 acc: 0.8292257402327795


In [47]:
from sklearn.metrics import r2_score
r2_score(y_test, best_estimators[4].predict(X_test))

0.796750242537227