In [2]:
import pandas as pd
import os
# Raw string: the backslashes in this Windows path are literal separators.
# Written as a normal string, "\C", "\P" and "\D" are invalid escape sequences
# and trigger a SyntaxWarning on recent Python versions.
# NOTE(review): absolute, machine-specific path -- consider a configurable
# data directory so the notebook runs on other machines.
os.chdir(r"D:\CDAC\PML\Datasets")
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
import numpy as np

In [3]:
# Load the Boston data set; the relative file name is resolved against the
# working directory selected by the os.chdir call in the import cell.
boston = pd.read_csv("Boston.csv")
# Display (rows, columns) as the cell output.
boston.shape

(506, 14)

In [4]:
# Target is the 'medv' column; every remaining column is used as a predictor.
y = boston['medv']
X = boston.drop(columns='medv')

In [5]:
# Baseline: plain linear regression scored with shuffled 5-fold
# cross-validation. The fixed random_state keeps the fold assignment
# reproducible between runs.
kfold = KFold(n_splits=5, random_state=23, shuffle=True)
lr = LinearRegression()
results = cross_val_score(estimator=lr, X=X, y=y, cv=kfold)
# One score per fold is displayed as the cell output.
results

array([0.74514306, 0.69363489, 0.73264725, 0.68377809, 0.73767041])

In [6]:
# Average of the per-fold scores produced by the preceding cross-validation cell.
results.mean()

0.718574739923582

Polynomial regression with degree = 2

In [7]:
# Same shuffled 5-fold evaluation as the baseline, but the predictors are
# first expanded to degree-2 polynomial terms. Both steps live in one
# Pipeline so cross_val_score treats them as a single estimator.
kfold = KFold(n_splits=5, random_state=23, shuffle=True)
poly = PolynomialFeatures(degree=2)
lr = LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)])
results = cross_val_score(estimator=pipe, X=X, y=y, cv=kfold)
# One score per fold is displayed as the cell output.
results

array([0.72242314, 0.83168531, 0.74203917, 0.85068021, 0.85340561])

In [8]:
# Average of the per-fold scores of the degree-2 pipeline evaluated above.
results.mean()

0.8000466875131209

In [9]:
# Manual search over the polynomial degree: build one pipeline per candidate
# degree, score it with cross-validation and record the mean score.
dgs = [2,3,4]
# Loop-invariant, so it is built once: every degree is scored with the same
# splitter configuration (the fixed random_state gives identical folds).
kfold = KFold(n_splits=5, shuffle=True, random_state=23)
scores = []
for d in dgs:
    poly = PolynomialFeatures(degree=d)
    lr = LinearRegression()
    pipe = Pipeline([('POLY',poly), ('LR', lr)])
    results = cross_val_score(pipe, X, y, cv=kfold)
    scores.append(results.mean())
print(dict(zip(dgs, scores)))
# The highest mean score wins; np.argmax returns the first index on ties.
i_max = np.argmax(scores)
print("Best Param:", dgs[i_max])
print("Best Score:", scores[i_max])

{2: 0.8000466875131209, 3: -9071.292438992035, 4: -1978.9579616109731}
Best Param: 2
Best Score: 0.8000466875131209


### Grid Search CV

In [10]:
from sklearn.model_selection import GridSearchCV

# Pipeline whose polynomial degree is left at its default here; the grid
# search supplies the candidate degrees.
poly = PolynomialFeatures()
lr = LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)])
# Grid keys follow the '<step name>__<parameter>' convention;
# pipe.get_params() lists every key that can be tuned.
params = {'POLY__degree': [2, 3, 4]}

In [11]:
# Try every degree listed in `params`, scoring each candidate with the
# `kfold` splitter, then report the best mean score and its setting.
gcv = GridSearchCV(estimator=pipe, param_grid=params, cv=kfold)
gcv.fit(X, y)
print(gcv.best_score_)
print(gcv.best_params_)

0.8000466875131209
{'POLY__degree': 2}


#### Housing

In [12]:
# Load the Housing data and dummy-encode it; drop_first=True omits the first
# level of each encoded column.
hous = pd.read_csv("Housing.csv")
dum_hous = pd.get_dummies(data=hous, drop_first=True)
# Target is 'price'; every remaining column is used as a predictor.
y = dum_hous['price']
X = dum_hous.drop(columns='price')

In [13]:
# Tune the polynomial degree for the Housing data with a polynomial-features
# + linear-regression pipeline and a grid over degrees 2, 3 and 4.
poly = PolynomialFeatures()
lr = LinearRegression()
pipe = Pipeline([('POLY',poly), ('LR', lr)])
params = {'POLY__degree': [2,3,4]}
# Define the splitter in this cell instead of relying on a `kfold` variable
# left behind by an earlier cell, so the Housing section does not depend on
# hidden state. Shuffled 5-fold with a fixed seed for reproducible folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=23)
gcv = GridSearchCV(pipe, cv=kfold, param_grid=params)
gcv.fit(X, y)
print(gcv.best_score_)
print(gcv.best_params_)

0.5427161924430148
{'POLY__degree': 2}
