In [1]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
import seaborn as sns

import warnings
warnings. filterwarnings('ignore')


In [34]:
from sklearn.datasets import load_iris
iris = load_iris()

In [35]:
features = iris.data 
target = iris.target

# Evaluate Model

In [20]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

def model_evaluation(pred , actual):
    """Print classification metrics for `pred` vs `actual` and plot the confusion matrix.

    Parameters
    ----------
    pred : array-like
        Labels predicted by the model.
    actual : array-like
        Ground-truth labels, in the same order as `pred`.

    Prints the confusion matrix and the weighted F1, precision and recall
    scores, then draws the confusion matrix as a seaborn heatmap. Returns None.
    """
    cm = confusion_matrix(actual , pred)
    print("Confusion Matrix:")
    print(cm)

    # All metrics are computed from the function arguments. The previous
    # version read the notebook globals `y_test` / `y_pred`, which either
    # raised NameError or silently scored unrelated data.

    # Calculate the F1 score
    f1 = f1_score(actual, pred, average='weighted')
    print("F1 Score:", f1)

    # Calculate the precision score
    precision = precision_score(actual, pred, average='weighted')
    print("Precision Score:", precision)

    # Calculate the recall score
    recall = recall_score(actual, pred, average='weighted')
    print("Recall Score:", recall)

    plt.figure(figsize=(10,7))
    # fmt='d' renders the integer counts as-is instead of the default '.2g'
    # format, which would show e.g. 150 as 1.5e+02.
    sns.heatmap(cm , annot=True , fmt='d')
    plt.xlabel('Predicted')
    plt.ylabel('Truth')



In [21]:
def get_score(model , x_train , x_test , y_train , y_test):
    """Fit `model` on the training split and return `model.score` on the test split.

    `model` is fitted in place; any object exposing `fit(X, y)` and
    `score(X, y)` is accepted.
    """
    train_split = (x_train, y_train)
    test_split = (x_test, y_test)

    model.fit(*train_split)
    return model.score(*test_split)

# Using CV for BEST Score


In [26]:
from sklearn.model_selection import  cross_val_score

from sklearn.ensemble import  RandomForestClassifier
from sklearn.linear_model import  LogisticRegression
from sklearn.svm import  SVC
from sklearn.tree import  DecisionTreeClassifier
from sklearn.naive_bayes import  GaussianNB

In [36]:
# Manual grid search: mean 5-fold CV score of an SVC for every (kernel, C) pair.
kernels = ['rbf', 'linear']
C = [1, 10, 20]
avg_scores = {}
for kval in kernels:
    for cval in C:
        cv_scores = cross_val_score(
            SVC(kernel=kval, C=cval, gamma='auto'),
            features,
            target,
            cv=5,
        )
        # Keys look like "<kernel>_<C>", e.g. "rbf_10".
        avg_scores[f"{kval}_{cval}"] = np.mean(cv_scores)

avg_scores

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

From the results above, the best mean cross-validation score (0.98) is reached by the rbf kernel with C=1 or C=10, and by the linear kernel with C=1.

## Hyperparameter tuning can be done by:

1. Grid Search
2. Randomized Search



# Grid-Search: 

To implement grid search, scikit-learn provides the `GridSearchCV` class. The computation can take a long time, but it removes the manual effort of writing nested loops like the ones above. The class performs the search itself and returns the best-performing model and its score: one model is built for every combination of the given hyperparameters, and each one is evaluated and ranked across the given cross-validation folds.

In [37]:
from sklearn.model_selection import  GridSearchCV

In [62]:
# Search space shared by the grid search and the randomized search below:
# three C values x two kernels, with gamma fixed to 'auto'.
params_grid = dict(
    C=[1, 10, 20],
    kernel=['rbf', 'linear'],
    gamma=['auto'],
)

In [63]:
# Exhaustive 5-fold search over every combination in params_grid,
# then list the parameter combinations that were evaluated.
clf = GridSearchCV(estimator=SVC(), param_grid=params_grid, cv=5)
clf.fit(features, target)
clf.cv_results_['params']

[{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'},
 {'C': 1, 'gamma': 'auto', 'kernel': 'linear'},
 {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'},
 {'C': 10, 'gamma': 'auto', 'kernel': 'linear'},
 {'C': 20, 'gamma': 'auto', 'kernel': 'rbf'},
 {'C': 20, 'gamma': 'auto', 'kernel': 'linear'}]

In [64]:
# Tabulate the raw search results: one row per parameter combination.
cv_df = pd.DataFrame(data=clf.cv_results_)
cv_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000603,0.001205,0.002609,0.003329,1,auto,rbf,"{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.0,0.0,0.003202,0.003922,1,auto,linear,"{'C': 1, 'gamma': 'auto', 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.001597,0.003194,0.0,0.0,10,auto,rbf,"{'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.0,0.0,0.003201,0.003921,10,auto,linear,"{'C': 10, 'gamma': 'auto', 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.003201,0.00392,0.001604,0.003207,20,auto,rbf,"{'C': 20, 'gamma': 'auto', 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.0,0.0,0.0,0.0,20,auto,linear,"{'C': 20, 'gamma': 'auto', 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6


In [67]:
# Keep only the searched parameters and the mean CV score of each combination.
cv_df.loc[:, ['param_C', 'param_kernel', 'param_gamma', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,param_gamma,mean_test_score
0,1,rbf,auto,0.98
1,1,linear,auto,0.98
2,10,rbf,auto,0.98
3,10,linear,auto,0.973333
4,20,rbf,auto,0.966667
5,20,linear,auto,0.966667


In [68]:
# Exploration aid: list every attribute of the fitted search object. The
# search results are the names ending in "_" (best_estimator_, best_index_,
# best_params_, best_score_, ...), which the following cells inspect.
dir(clf)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_default_requests',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_select_best_index',
 '_validate_data',
 '_validate_params',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 '

In [54]:
# Parameter combination picked by the grid search.
# NOTE(review): the recorded output has no 'gamma' key, so it appears to come
# from a run made before 'gamma' was added to params_grid; re-run to refresh.
clf.best_params_

{'C': 1, 'kernel': 'linear'}

In [55]:
# Mean cross-validated score of the selected parameter combination.
clf.best_score_

0.9800000000000001

In [56]:
# Estimator configured with the selected parameters (refit=True in the search settings shown by get_params()).
clf.best_estimator_

In [57]:
# Row position in clf.cv_results_ of the selected parameter combination.
clf.best_index_

1

In [59]:
# Configuration of the search object itself, including the wrapped estimator's
# settings under the "estimator__" prefix.
# NOTE(review): the recorded 'param_grid' has no 'gamma' entry, so this output
# appears to predate the current params_grid; re-run to refresh.
clf.get_params()

{'cv': 5,
 'error_score': nan,
 'estimator__C': 1.0,
 'estimator__break_ties': False,
 'estimator__cache_size': 200,
 'estimator__class_weight': None,
 'estimator__coef0': 0.0,
 'estimator__decision_function_shape': 'ovr',
 'estimator__degree': 3,
 'estimator__gamma': 'scale',
 'estimator__kernel': 'rbf',
 'estimator__max_iter': -1,
 'estimator__probability': False,
 'estimator__random_state': None,
 'estimator__shrinking': True,
 'estimator__tol': 0.001,
 'estimator__verbose': False,
 'estimator': SVC(),
 'n_jobs': None,
 'param_grid': {'C': [1, 10, 20], 'kernel': ['rbf', 'linear']},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': None,
 'verbose': 0}

# Random Search:
Grid search, discussed above, becomes computationally expensive because it tries every combination of the given hyperparameters, so it is sometimes considered inefficient. Randomized search instead trains models on randomly sampled hyperparameter combinations, so the number of models trained is smaller than with grid search.

In simple terms, random search trains and tests the model on only a random subset of the combinations in the given hyperparameter grid.

In [79]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: evaluate only n_iter=3 of the combinations in
# params_grid with 5-fold CV. random_state pins which combinations are
# sampled, so Restart & Run All reproduces the same results table; without
# it a different subset is drawn on every run.
rs = RandomizedSearchCV(
    SVC(),
    params_grid,
    cv=5,
    return_train_score=False,
    n_iter=3,
    random_state=42,
)


In [80]:
# Run the randomized search, then show the sampled settings with their mean CV score.
rs.fit(features, target)
pd.DataFrame(data=rs.cv_results_).loc[:, ['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,10,linear,0.973333
1,1,rbf,0.98
2,10,rbf,0.98


# How about different models with different hyperparameters?

In [81]:
from sklearn.ensemble import  RandomForestClassifier
from sklearn.linear_model import  LogisticRegression
from sklearn.svm import  SVC
from sklearn.tree import  DecisionTreeClassifier
from sklearn.naive_bayes import  GaussianNB

In [108]:
# Candidate models and the hyperparameter grid to search for each one.
# Each entry maps a display name to {'model': estimator, 'params': grid}.
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }
    },
    'random_forest': {
        # random_state makes the forest (and therefore its CV score) reproducible.
        'model': RandomForestClassifier(random_state=42),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    },
    'Gaussian Naive Bayes' : {
        'model': GaussianNB(),
        'params': {
            'priors': [None],
            "var_smoothing" : [1e-6 , 1e-7 ]
        }
    },
    'Decisiong Tree' : {
        # random_state is required for reproducible results because the grid
        # includes splitter='random' and feature sub-sampling via max_features.
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            "criterion" : ['gini', 'entropy', 'log_loss'] ,
            "splitter" : ['best', 'random'] ,
            "min_samples_split" : [ 2 ,  4 , 6 , 8] ,
            # 'auto' was dropped from this list: it was an alias of 'sqrt' for
            # classification trees and was removed in scikit-learn 1.3, where
            # every candidate using it fails to fit and is scored as NaN.
            "max_features" : ['sqrt' , 'log2']
        }
    }
}

# Run a 5-fold grid search per model and record the winning setting and score.
model_score = []

for model_name,item in model_params.items():
    gscv = GridSearchCV( item['model'] , item['params'] , cv=5  )
    gscv.fit(features , target)
    model_score.append({
         'model' : model_name ,
          'best params' : gscv.best_params_ ,
          'best score' : gscv.best_score_
          })

# One row per model: name, best parameter combination, best mean CV score.
model_df = pd.DataFrame(model_score )


In [110]:
# Best parameter combination found for the model in row 4 of model_df.
model_df.loc[4, 'best params']

{'criterion': 'log_loss',
 'max_features': 'log2',
 'min_samples_split': 2,
 'splitter': 'random'}

In [None]:
from sklearn.ensemble import  RandomForestClassifier
from sklearn.linear_model import  LogisticRegression
from sklearn.svm import  SVC
from sklearn.tree import  DecisionTreeClassifier
from sklearn.naive_bayes import  GaussianNB
from sklearn.model_selection import  GridSearchCV

# Candidate models and the hyperparameter grid to search for each one.
# Each entry maps a display name to {'model': estimator, 'params': grid}.
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }
    },
    'random_forest': {
        # random_state makes the forest (and therefore its CV score) reproducible.
        'model': RandomForestClassifier(random_state=42),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    },
    'Gaussian Naive Bayes' : {
        'model': GaussianNB(),
        'params': {
            'priors': [None],
            "var_smoothing" : [1e-6 , 1e-7 ]
        }
    },
    'Decisiong Tree' : {
        # random_state is required for reproducible results because the grid
        # includes splitter='random' and feature sub-sampling via max_features.
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            "criterion" : ['gini', 'entropy', 'log_loss'] ,
            "splitter" : ['best', 'random'] ,
            "min_samples_split" : [ 2 ,  4 , 6 , 8] ,
            # 'auto' was dropped from this list: it was an alias of 'sqrt' for
            # classification trees and was removed in scikit-learn 1.3, where
            # every candidate using it fails to fit and is scored as NaN.
            "max_features" : ['sqrt' , 'log2']
        }
    }
}

# Run a 5-fold grid search per model and record the winning setting and score.
model_score = []

for model_name,item in model_params.items():
    gscv = GridSearchCV( item['model'] , item['params'] , cv=5  )
    # Fit on the iris data loaded at the top of the notebook. The previous
    # call used `x_test_count` / `y_train`, which are never defined in this
    # notebook and would have paired test features with training labels.
    gscv.fit(features , target)
    model_score.append({
         'model' : model_name ,
          'best params' : gscv.best_params_ ,
          'best score' : gscv.best_score_
          })

# One row per model: name, best parameter combination, best mean CV score.
model_df = pd.DataFrame(model_score )
