In [48]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor

In [2]:
load_boston = load_boston()

In [3]:
df = pd.DataFrame(load_boston.data, columns=load_boston.feature_names)

In [4]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [5]:
y = pd.DataFrame(load_boston.target, columns=['target'])

In [6]:
y.head()

Unnamed: 0,target
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [7]:
df.isnull().count()

CRIM       506
ZN         506
INDUS      506
CHAS       506
NOX        506
RM         506
AGE        506
DIS        506
RAD        506
TAX        506
PTRATIO    506
B          506
LSTAT      506
dtype: int64

In [8]:
df.shape

(506, 13)

In [9]:
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [10]:
df['CRIM'] = np.log(df['CRIM'])
df['TAX'] = np.log(df['TAX'])

In [11]:
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,-0.780436,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,5.931405,18.455534,356.674032,12.653063
std,2.16205,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,0.396367,2.164946,91.294864,7.141062
min,-5.064036,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,5.231109,12.6,0.32,1.73
25%,-2.500488,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,5.631212,17.4,375.3775,6.95
50%,-1.360641,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,5.799093,19.05,391.44,11.36
75%,1.302119,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,6.50129,20.2,396.225,16.955
max,4.488369,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,6.566672,22.0,396.9,37.97


In [17]:
x_train, x_test, y_train, y_test = train_test_split(df, y, test_size=0.3, random_state=70)

In [27]:
lr = LinearRegression()
dt = DecisionTreeRegressor()

In [28]:
lr.fit(x_train, y_train)
dt.fit(x_train, y_train)

DecisionTreeRegressor()

In [29]:
lr_pred = lr.predict(x_test)
dt_pred = dt.predict(x_test)

In [30]:
r2_score(y_test, lr_pred)
r2_score(y_test, dt_pred)

0.7941739688051356

# Cross validation

In [31]:
lr_score = cross_val_score(lr, df, y, cv=10)
dt_score = cross_val_score(dt, df, y, cv=10)

In [32]:
lr_score
dt_score

array([ 0.51697051,  0.61186199, -1.484546  ,  0.06056769,  0.76886507,
        0.2275603 ,  0.20139733,  0.33963421, -1.99091805,  0.10835556])

In [33]:
print(np.mean(lr_score))
print(np.mean(dt_score))

0.22396687464126316
-0.06402514063666186


# LASSO and Ridge regression

In [35]:
ri = Ridge()

In [36]:
ri.fit(x_train, y_train)

Ridge()

In [37]:
ri_pred = ri.predict(x_test)

In [38]:
r2_score(y_test, ri_pred)

0.7418063249928931

In [39]:
ri_score = cross_val_score(ri, df, y, cv=10)

In [40]:
ri_score

array([ 0.73700682,  0.56385641, -0.60431388,  0.64991085,  0.5745772 ,
        0.76284241,  0.41961646, -0.08663226, -0.65844701,  0.39104708])

In [41]:
np.mean(ri_score)

0.27494640757298383

In [60]:
ls = Lasso(alpha=1e-15)

In [61]:
ls.fit(x_train, y_train)

  model = cd_fast.enet_coordinate_descent(


Lasso(alpha=1e-15)

In [62]:
ls_pred = ls.predict(x_test)

In [63]:
r2_score(y_test, ls_pred)

0.7435523313554935

In [64]:
ls_score = cross_val_score(ls, df, y, cv=10)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [65]:
np.mean(ls_score)

0.22396687464126835

# Grid search CV

In [49]:
params = {'alpha':['1e-15', '1e-13', '1e-11', '1e-9', '1e-7', '1e-5', '0.0001','0.001','0.01', '0.1', '0', '1']}

In [56]:
a = [1e-15, 1e-13, 1e-11, 1e-9, 1e-7, 1e-5, 0.0001, 0.001,0.01, 0.1, 0, 1]
for i in range(len(a)):
    rid = Ridge(alpha= a[i])
    rid.fit(x_train, y_train)
    rid_pred = rid.predict(x_test)
    print(f'the accuracy for {a[i]} is ', r2_score(y_test, rid_pred))

the accuracy for 1e-15 is  0.7435523313554937
the accuracy for 1e-13 is  0.7435523313554941
the accuracy for 1e-11 is  0.7435523313555362
the accuracy for 1e-09 is  0.7435523313597512
the accuracy for 1e-07 is  0.7435523317812437
the accuracy for 1e-05 is  0.7435523739285743
the accuracy for 0.0001 is  0.7435527569106606
the accuracy for 0.001 is  0.7435565693745214
the accuracy for 0.01 is  0.7435929892291684
the accuracy for 0.1 is  0.7438135639361431
the accuracy for 0 is  0.7435523313554936
the accuracy for 1 is  0.7418063249928931


In [51]:
grid_rid = GridSearchCV(ri, params, cv=10)

In [52]:
grid_rid.fit(df, y)

GridSearchCV(cv=10, estimator=Ridge(),
             param_grid={'alpha': ['1e-15', '1e-13', '1e-11', '1e-9', '1e-7',
                                   '1e-5', '0.0001', '0.001', '0.01', '0.1',
                                   '0', '1']})

In [53]:
grid_rid.best_params_

{'alpha': '1'}

In [57]:
grid_las = GridSearchCV(ls, params, cv=10)

In [58]:
grid_las.fit(df,y)

Traceback (most recent call last):
  File "C:\Users\Sunil\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Sunil\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py", line 844, in fit
    self.path(X, y[:, k],
  File "C:\Users\Sunil\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\Sunil\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py", line 510, in enet_path
    l1_reg = alpha * l1_ratio * n_samples
TypeError: can't multiply sequence by non-int of type 'float'

Traceback (most recent call last):
  File "C:\Users\Sunil\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Sunil\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py

Traceback (most recent call last):
  File "C:\Users\Sunil\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Sunil\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py", line 844, in fit
    self.path(X, y[:, k],
  File "C:\Users\Sunil\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\Sunil\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py", line 510, in enet_path
    l1_reg = alpha * l1_ratio * n_samples
TypeError: can't multiply sequence by non-int of type 'float'

Traceback (most recent call last):
  File "C:\Users\Sunil\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Sunil\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py

Traceback (most recent call last):
  File "C:\Users\Sunil\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Sunil\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py", line 844, in fit
    self.path(X, y[:, k],
  File "C:\Users\Sunil\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\Sunil\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py", line 510, in enet_path
    l1_reg = alpha * l1_ratio * n_samples
TypeError: can't multiply sequence by non-int of type 'float'

Traceback (most recent call last):
  File "C:\Users\Sunil\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Sunil\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py

TypeError: can't multiply sequence by non-int of type 'float'

In [59]:
grid_las.best_params_

{'alpha': '1e-15'}