In [3]:
from IPython.display import display, HTML
display(HTML('<h1>Finding best model and hyper parameter tunning using GridSearchCV </h1>'))


In [11]:
from sklearn import datasets, svm
import pandas as pd
iris = datasets.load_iris()  
# iris.feature_names
df = pd.DataFrame(iris.data,columns=iris.feature_names)
df['flower'] = iris.target
df['flower'] =  df['flower'].apply(lambda x : iris.target_names[x])
df


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [13]:
from IPython.display import display, HTML
display(HTML("""<h1>Approach 1: Use train_test_split and manually tune parameters by trial and error </h1>"""))


In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3)
model = svm.SVC(kernel='rbf',C=30,gamma='auto')
model.fit(X_train,y_train)
model.score(X_test, y_test)

0.9333333333333333

In [27]:
from IPython.display import display, HTML
display(HTML("""<h1>Approach 2: Use K Fold Cross validation </h1>"""))


In [24]:
from sklearn.model_selection import cross_val_score
cross_val_score(svm.SVC(kernel='linear',C=10,gamma='auto'),iris.data, iris.target, cv=5)
cross_val_score(svm.SVC(kernel='rbf',C=10,gamma='auto'),iris.data, iris.target, cv=5)
cross_val_score(svm.SVC(kernel='rbf',C=20,gamma='auto'),iris.data, iris.target, cv=5)

# Creating a loop instead of doing above 3 lines manually 
kernels = ['rbf', 'linear']
C = [1,10,20]
avg_scores = {}
for i in kernels:
    for cval in C:
        print(i,f'Kernel with cval = {cval} ->\t',cross_val_score\
        (svm.SVC(kernel=i,C=cval,gamma='auto'),iris.data, iris.target, cv=5))
    print()
# We can see linear kernel with 10 cval is best 
        

rbf Kernel with cval = 1 ->	 [0.96666667 1.         0.96666667 0.96666667 1.        ]
rbf Kernel with cval = 10 ->	 [0.96666667 1.         0.96666667 0.96666667 1.        ]
rbf Kernel with cval = 20 ->	 [0.96666667 1.         0.9        0.96666667 1.        ]

linear Kernel with cval = 1 ->	 [0.96666667 1.         0.96666667 0.96666667 1.        ]
linear Kernel with cval = 10 ->	 [1.         1.         0.9        0.96666667 1.        ]
linear Kernel with cval = 20 ->	 [1.         1.         0.9        0.93333333 1.        ]



In [26]:
from IPython.display import display, HTML
display(HTML("""<h1>Approach 3: Now Using GridSearchCV </h1>"""))


In [33]:
# GridSearchCV  helps to remove the loop created above
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(
    svm.SVC(gamma='auto'),          # model name   svc or logistics or decision tree with params ()
    
    {'C':[1,10,20],
    'kernel': ['rbf','linear']},    # all the different variants need to choose
    
    cv=5, return_train_score=False   # finally choosing number of stack to split and train test data
)


clf.fit(iris.data, iris.target)
# clf.cv_results_
clf.best_params_                     # will give the best results automatically
clf.best_score_                      # returns best score of the kernel


{'C': 1, 'kernel': 'rbf'}

In [34]:
from IPython.display import display, HTML
display(HTML("""<h1>Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters. This is useful when you have too many parameters to try and your training time is longer. It helps reduce the cost of computation</h1>"""))


In [36]:
from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(
    
    svm.SVC(gamma='auto'),             # model name
    {
        'C': [1,10,20],
        'kernel': ['rbf','linear']     # params 
    }, 
    cv=5,                              # stacks
    return_train_score=False, 
    n_iter=2                           # number of iterations need to be done
)
rs.fit(iris.data, iris.target)         # input , output

pd.DataFrame(rs.cv_results_)[['param_C','param_kernel','mean_test_score']]    # storing the results 

RandomizedSearchCV(cv=5, estimator=SVC(gamma='auto'), n_iter=2,
                   param_distributions={'C': [1, 10, 20],
                                        'kernel': ['rbf', 'linear']})

In [37]:
from IPython.display import display, HTML
display(HTML("""<h1>
Now We  combine all the models , all the params and run the search for Best results 
"""))


In [2]:
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

models = {
    
    "LogisticRegression":{                            #   model name LogisticRegression
        "model_name":LogisticRegression(),
        "params":{
            'C': [1,5,10]}     },
    
    'svm':{
        "model_name":svm.SVC(gamma='auto'),           #  model name SVM
        "params": { 
            "C":[1,10,20],
            'kernel': ['rbf','linear']  }    },
    
    
    "RandomForestClassifier":{                        #   model name RandomForestClassifier
        "model_name":RandomForestClassifier(),
        "params":{
            'n_estimators': [1,5,10] }   }
    
    
}

In [12]:

scores = []

for model_name, mp in models.items():
    clf =  GridSearchCV(
        mp['model_name'],      # model name with params 
        mp['params'],          # models params 
        cv=5,                  # number of stacks
        return_train_score=False)
    
    clf.fit(iris.data, iris.target)  # input , output
    
    # Storing results
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
# Displaying results
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

model


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [2]:
from IPython.display import display, HTML
display(HTML('<h1>Finding best model and hyper parameters for sklearn digits dataset classification</h1>'))


In [25]:
import pandas as pd

from sklearn import datasets
iris = datasets.load_iris() 
digits = datasets.load_digits()


from sklearn.linear_model import LinearRegression, LogisticRegression

from sklearn import svm

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.naive_bayes import GaussianNB, MultinomialNB

from sklearn.model_selection import GridSearchCV


models = {
    'svm': {
        'model_name': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model_name': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model_name': LogisticRegression(solver='liblinear',multi_class='auto', max_iter=1000),
        'params': {
            'C': [1,5,10]}
    },
    
    'linear_regression' : {
        'model_name': LinearRegression(),
        'params': {}
    },
    
    'naive_bayes_gaussian': {
        'model_name': GaussianNB(),
        'params': {}
    },
    
    'naive_bayes_multinomial': {
        'model_name': MultinomialNB(),
        'params': {}
    },
    
    
    'decision_tree': {
        'model_name': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini','entropy'],
        }
    }   
    
    
}

scores = []

for model_name, mp in models.items():
    clf =  GridSearchCV(
        mp['model_name'],      # model name with params 
        mp['params'],          # models params 
        cv=5,                  # number of stacks
        return_train_score=False)
    
    clf.fit(iris.data, iris.target)  # input , output
    
#     # Storing results
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
# # Displaying results
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.953333,{'n_estimators': 1}
2,logistic_regression,0.966667,{'C': 5}
3,linear_regression,0.322561,{}
4,naive_bayes_gaussian,0.953333,{}
5,naive_bayes_multinomial,0.953333,{}
6,decision_tree,0.966667,{'criterion': 'gini'}
