#### Which model to use for classification?
- Suppose you want to perform classification on the iris dataset.
- There are many models to choose from (Random Forest, SVM, Decision Tree, Logistic Regression, Naive Bayes, etc.).
- Suppose you choose SVM:
- `model = svm.SVC(kernel="rbf", C=30, gamma="auto")`
- How do you choose the hyperparameters (`kernel`, `C`, `gamma`)?
- **Hyperparameter tuning** - the process of choosing the optimal parameters.

In [13]:
from sklearn import svm, datasets
# Load the bundled iris dataset; its .data, .target, .feature_names and
# .target_names attributes are used by the cells that follow.
iris = datasets.load_iris()

In [14]:
import numpy as np
import pandas as pd
# Tabular view of the iris measurements plus a human-readable species column.
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Vectorised lookup: index the array of class names with the integer targets
# instead of calling a Python lambda once per row.
iris_df["flower"] = iris.target_names[iris.target]
# Rows 47-51 straddle the point where the label switches from setosa to versicolor.
iris_df.iloc[47:52]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
47,4.6,3.2,1.4,0.2,setosa
48,5.3,3.7,1.5,0.2,setosa
49,5.0,3.3,1.4,0.2,setosa
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor


In [15]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing. random_state pins the shuffle so the
# split (and the score computed from it) is identical on every Restart & Run All;
# pick a different seed to see how much the score depends on the split.
X_train, X_test, Y_train, Y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=42
)

In [16]:
# Train an RBF-kernel SVM with hand-picked hyperparameters on the training
# split, then evaluate it on the held-out test split.
model = svm.SVC(C=30, gamma="auto", kernel="rbf").fit(X_train, Y_train)
model.score(X_test, Y_test)

0.9111111111111111

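- The score changes whenever the train/test split changes, so a single split gives an unreliable estimate.
- We use K-fold cross-validation to overcome this issue: the model is scored on several different splits of the same data.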

In [17]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a linear-kernel SVM with C=10.
cross_val_score(
    estimator=svm.SVC(kernel="linear", C=10, gamma="auto"),
    X=iris.data,
    y=iris.target,
    cv=5,
)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [18]:
# Same 5-fold evaluation, this time with the RBF kernel and C=10.
cross_val_score(
    estimator=svm.SVC(kernel="rbf", C=10, gamma="auto"),
    X=iris.data,
    y=iris.target,
    cv=5,
)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [19]:
# Linear kernel again, with the regularisation parameter raised to C=20.
cross_val_score(
    estimator=svm.SVC(kernel="linear", C=20, gamma="auto"),
    X=iris.data,
    y=iris.target,
    cv=5,
)

array([1.        , 1.        , 0.9       , 0.93333333, 1.        ])

- This method is very manual and repetitive, because there are so many combinations of values you can supply.
- Use a for loop instead.

In [21]:
# Brute-force search: 5-fold cross-validate every (kernel, C) pair and store
# the mean fold score under a "<kernel>_<C>" key.
# Drawback: every value added to either list multiplies the number of runs.
kernels = ["rbf", "linear"]
C = [1, 10, 20]
avg_scores = {
    "{}_{}".format(kval, cval): np.average(
        cross_val_score(
            svm.SVC(kernel=kval, C=cval, gamma="auto"),
            iris.data,
            iris.target,
            cv=5,
        )
    )
    for kval in kernels
    for cval in C
}

avg_scores

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

Sklearn provides an API that does exactly the same thing as above with a single line of code.

In [22]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over every C x kernel combination (3 x 2 = 6 candidates),
# each evaluated with 5-fold cross-validation.
classifier = GridSearchCV(
    estimator=svm.SVC(gamma="auto"),
    param_grid={"C": [1, 10, 20], "kernel": ["rbf", "linear"]},
    cv=5,
    return_train_score=False,
)

classifier.fit(iris.data, iris.target)
# Raw per-candidate timings, parameters and fold scores, as a dict of arrays.
classifier.cv_results_

{'mean_fit_time': array([0.00260277, 0.0020062 , 0.00140162, 0.00159731, 0.00200624,
        0.00199518]),
 'std_fit_time': array([0.00049608, 0.00061782, 0.00048624, 0.00048751, 0.00089324,
        0.00062955]),
 'mean_score_time': array([0.00260425, 0.00159841, 0.0013968 , 0.00060096, 0.00139632,
        0.00140414]),
 'std_score_time': array([0.00049165, 0.00079466, 0.00049545, 0.0004907 , 0.00048719,
        0.00049306]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'}],
 'split0_test_score': ar

In [23]:
# Load the cv_results_ dict into a DataFrame: one row per parameter
# combination, which is far easier to scan than the nested dict.
df = pd.DataFrame(data=classifier.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002603,0.000496,0.002604,0.000492,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.002006,0.000618,0.001598,0.000795,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.001402,0.000486,0.001397,0.000495,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.001597,0.000488,0.000601,0.000491,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.002006,0.000893,0.001396,0.000487,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.001995,0.00063,0.001404,0.000493,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6


In [24]:
# Keep only the columns needed to compare candidates: the two parameters
# searched over and the mean cross-validated score.
df.loc[:, ["param_C", "param_kernel", "mean_test_score"]]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


In [25]:
# Best mean cross-validated score, followed by the parameter combination that
# achieved it — now you know the best parameters.
print(classifier.best_score_, classifier.best_params_, sep="\n")

0.9800000000000001
{'C': 1, 'kernel': 'rbf'}


- One issue with GridSearchCV is its computation cost. Our dataset is very small, but imagine having millions of data points and many candidate values for each parameter: the number of possible combinations becomes very high.
- To tackle this issue, sklearn comes with another class called RandomizedSearchCV.
- RandomizedSearchCV does not try every single combination of parameters; instead it tries random combinations of the parameter values, and you choose how many iterations it runs.

In [31]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search samples n_iter parameter combinations instead of trying
# them all. random_state pins which combinations are drawn, so the table below
# is the same on every run instead of changing with each execution.
rs_classifier = RandomizedSearchCV(
    svm.SVC(gamma="auto"),
    {
        "C": [1, 10, 20],
        "kernel": ["rbf", "linear"],
    },
    cv=5,
    return_train_score=False,
    n_iter=2,  # evaluate only 2 of the 6 possible combinations
    random_state=42,
)

rs_classifier.fit(iris.data, iris.target)
pd.DataFrame(rs_classifier.cv_results_)[["param_C", "param_kernel", "mean_test_score"]]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,10,rbf,0.98
1,1,rbf,0.98


#### Now, Which Model to Choose?

In [33]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [36]:
# Candidate estimators and the hyperparameter grid to search for each one.
# Each entry maps a display name to {"model": estimator, "params": grid}.
model_params = {
    "svm": {
        "model": svm.SVC(gamma="auto"),
        "params": {
            "C": [1, 10, 20],
            "kernel": ["rbf", "linear"],
        },
    },
    "random_forest": {
        # Seeded: an unseeded forest gives a different best score (and possibly
        # a different best n_estimators) on every run.
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [1, 5, 10],
        },
    },
    # Key spelling is kept as-is; it is the label shown in the results table.
    "logictic_regression": {
        # multi_class is intentionally not passed: with the liblinear solver
        # "auto" resolves to one-vs-rest anyway, and the parameter is
        # deprecated in recent scikit-learn releases.
        # NOTE(review): confirm liblinear multiclass behaviour against the
        # installed scikit-learn version.
        "model": LogisticRegression(solver="liblinear"),
        "params": {
            "C": [1, 5, 10],
        },
    },
}

In [39]:
# suppress warning for clean notebook
import warnings
warnings.filterwarnings("ignore")

In [40]:
# Run a 5-fold grid search for every candidate in model_params and record the
# winning score and parameter set of each.
scores = []

for model_name, entry in model_params.items():
    clf = GridSearchCV(
        estimator=entry["model"],
        param_grid=entry["params"],
        cv=5,
        return_train_score=False,
    ).fit(iris.data, iris.target)
    scores.append(
        dict(
            model=model_name,
            best_score=clf.best_score_,
            best_params=clf.best_params_,
        )
    )

In [41]:
# Summary table: one row per model with its best cross-validated score and the
# parameters that produced it.
df = pd.DataFrame(
    data=scores,
    columns=["model", "best_score", "best_params"],
)
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.953333,{'n_estimators': 5}
2,logictic_regression,0.966667,{'C': 5}
