## Grid Search and Cross Validation

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Automobile dataset (CSV) fetched over HTTP; loaded into df_1 for the
# horsepower -> price examples later in the notebook.
path = (
    'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/automobileEDA.csv'
)
df_1 = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Oil dataset with Mendacium, Depth and Price columns.
# NOTE(review): the absolute path looks like a Colab Google Drive mount, so
# this cell presumably only works after the drive is mounted - confirm.
df_2 = pd.read_csv(
    filepath_or_buffer='/content/drive/My Drive/courses/data analytics basics/datasets/oil_dataset.csv'
)
# Preview the first rows.
df_2.head()

Unnamed: 0,Mendacium,Depth,Price
0,3.359,1722.533,74.048691
1,1.348,2062.571,83.321907
2,0.418,2013.507,82.748964
3,8.193,1420.607,84.760494
4,5.21,951.957,54.588671


# Cross-validation Score view
Learning the parameters of a prediction function and testing it on the same data is a methodological mistake: a model that would just repeat the labels of the samples that it has just seen would have a perfect score but would fail to predict anything useful on yet-unseen data.

This situation is called overfitting. To avoid it, it is common practice when performing a (supervised) machine learning experiment to hold out part of the available data as a test set X_test, y_test. Note that the word “experiment” is not intended to denote academic use only, because even in commercial settings machine learning usually starts out experimentally.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score # this is for validation of score
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Oil dataset: features are Mendacium and Depth, target is Price.
X = df_2[['Mendacium', 'Depth']]
y = df_2['Price']

# Hold out 20% of the rows for testing; the seed is fixed so the split is
# the same on every run.
trainX, testX, trainY, testY = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Fit a regression tree (depth capped at 10) on the training split and show
# its test-set score as a percentage.
# random_state is pinned (same seed as the train/test split) because the tree
# permutes features at every split; without it, repeated runs can produce
# different trees and therefore different scores.
decision_tree_model = DecisionTreeRegressor(max_depth=10, random_state=0)
decision_tree_model.fit(trainX, trainY)
decision_tree_model.score(testX, testY) * 100

92.23868646185383

In [None]:
# 4-fold cross-validation of the decision tree over the whole oil dataset
# (X, y), rather than a single train/test split.
scores = cross_val_score(
    estimator=decision_tree_model,
    X=X,
    y=y,
    cv=4,
)
# Per-fold scores as percentages.
100 * scores

array([94.51001445, 93.23075053, 92.08982053, 92.93140963])

In [None]:
# Average of the per-fold scores. This is on the raw scale returned by
# cross_val_score (not multiplied by 100 like the per-fold display).
scores.mean()

0.9314159291939627

In [None]:
# (rows, columns) of the automobile dataset loaded into df_1.
df_1.shape

(201, 29)

In [None]:
# Switch to the automobile dataset: one feature (horsepower), target price.
# This rebinds X, y and the train/test variables used by the oil example.
X = df_1[['horsepower']]
y = df_1['price']

# Same 80/20 split and seed as before.
trainX, testX, trainY, testY = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Support-vector regressor with a polynomial kernel, trained on the
# automobile training split (fit returns the estimator itself).
svr_model = SVR(kernel='poly').fit(trainX, trainY)

# Test-set score as a percentage.
100 * svr_model.score(testX, testY)

69.59953142552831

In [None]:
# 5-fold cross-validation of the polynomial SVR over the whole automobile
# dataset.
# NOTE(review): with an integer cv and a regressor, scikit-learn uses
# unshuffled KFold, so contiguous row blocks form the folds. If the CSV rows
# are ordered, that could explain very uneven fold scores - confirm.
scores = cross_val_score(
    estimator=svr_model,
    X=X,
    y=y,
    cv=5,
)
# Per-fold scores as percentages.
100 * scores

array([56.33229179, 34.84226144, 21.22501213, 92.78464941, 15.35120766])

In [None]:
# Average of the five SVR fold scores, on the raw scale returned by
# cross_val_score (not multiplied by 100).
scores.mean()

0.44107084486117937

# Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid for DecisionTreeRegressor.
# A single dict makes GridSearchCV evaluate every combination of the three
# settings (3 * 5 * 2 = 30 candidates). The previous list of one-key dicts
# was treated as three independent grids, so each parameter was only ever
# varied alone while the others stayed at their defaults.
# NOTE: 'mse' and 'mae' are the criterion names of the scikit-learn release
# this notebook was run on; newer releases rename them to 'squared_error'
# and 'absolute_error'.
params = {
    'criterion': ['mse', 'mae', 'friedman_mse'],
    'max_depth': [1, 5, 9, 10, 11],
    'splitter': ['best', 'random'],
}

In [None]:
# Base estimator handed to the grid search; the searched hyperparameters are
# supplied through the params grid.
# random_state is pinned so that candidates (in particular splitter='random')
# give the same result on every run; 0 matches the train/test split seed.
model = DecisionTreeRegressor(random_state=0)

In [None]:
# Exhaustive search over the params grid, scoring each candidate with
# 4-fold cross-validation.
grid_search_1 = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=4,
)

In [None]:
# Run the search on the training split and keep the best candidate, refit
# on the full training data (fit returns the search object itself).
# NOTE(review): when the notebook is run top to bottom, trainX/trainY are the
# automobile (horsepower -> price) split at this point, not the oil split the
# first decision tree used - confirm which dataset is intended.
bestDecisionModel = grid_search_1.fit(trainX, trainY).best_estimator_
bestDecisionModel

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=9,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [None]:
# Held-out test score of the tuned tree, as a percentage.
100 * bestDecisionModel.score(testX, testY)

92.89788941455824

Testing grid search on SVR

In [None]:
# Hyperparameter grid for SVR.
# A single dict makes GridSearchCV try every kernel with every C value
# (4 * 3 = 12 candidates). The previous list of one-key dicts searched the
# kernel and C separately, so C was only ever tuned for the default kernel.
params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [.5, 1, 2],
}

In [None]:
# Fresh SVR with default settings as the base estimator for the grid search.
# This rebinds svr_model, replacing the polynomial-kernel model fitted earlier.
svr_model = SVR()

In [None]:
# Grid search for the SVR with 4-fold cross-validation on the training
# split. The name grid_search_1 is reused, so the decision-tree search
# object is no longer reachable after this cell.
grid_search_1 = GridSearchCV(
    estimator=svr_model,
    param_grid=params,
    cv=4,
).fit(trainX, trainY)

# Best candidate, refit on the whole training split.
bestSVRModel = grid_search_1.best_estimator_
bestSVRModel

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [None]:
# Held-out test score of the tuned SVR, as a percentage.
100 * bestSVRModel.score(testX, testY)

80.95985358598419