# Exercise: Finding Optimal Model and Hyperparameters
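For the digits dataset in `sklearn.datasets`, try the following classifiers and find the one that gives the best performance: SVM, random forest, logistic regression, Gaussian naive Bayes, multinomial naive Bayes, and decision tree. Also find the optimal parameters for that classifier.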

In [1]:
# Import the datasets module from scikit-learn
from sklearn import datasets
# Load the digits dataset, which consists of 8x8 pixel images of digits.
# The grid-search loop later in the notebook reads `digits.data` as the
# feature matrix and `digits.target` as the labels.
digits = datasets.load_digits()

In [2]:
# Import various machine learning models from scikit-learn
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Candidate estimators and the hyperparameter grids to explore with grid search.
# Each entry maps a short model name to:
#   'model'  - an unfitted estimator instance
#   'params' - the grid of values to try (an empty dict means "defaults only")

# Fixed seed for the estimators that use randomness, so that repeated runs of
# the notebook report the same scores.
RANDOM_STATE = 42

model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),  # Support Vector Machine model
        'params': {  # Parameters to try in grid search
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear'],
        },
    },
    'random_forest': {
        # Seeded: tree construction is randomized, so unseeded scores drift between runs.
        'model': RandomForestClassifier(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [1, 5, 10],  # Number of trees in the forest
        },
    },
    'logistic_regression': {
        # `multi_class` is intentionally left at its default: passing it
        # explicitly is deprecated in scikit-learn 1.5+ and emits a
        # FutureWarning, while the default resolves to the same behaviour
        # that 'auto' selected.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10],  # Inverse of regularization strength
        },
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),  # Gaussian Naive Bayes model
        'params': {},  # No grid: evaluated with default parameters only
    },
    'naive_bayes_multinomial': {
        'model': MultinomialNB(),  # Multinomial Naive Bayes model
        'params': {},  # No grid: evaluated with default parameters only
    },
    'decision_tree': {
        # Seeded for the same reproducibility reason as the random forest.
        'model': DecisionTreeClassifier(random_state=RANDOM_STATE),
        'params': {
            'criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
        },
    },
}

In [4]:
# GridSearchCV runs the cross-validated hyperparameter search; pandas is used
# to tabulate the results afterwards.
from sklearn.model_selection import GridSearchCV
import pandas as pd
scores = []  # Accumulator: the grid-search loop appends one result dict per model

In [5]:
# Run a 5-fold cross-validated grid search for every configured model and
# record the best mean score together with the parameters that achieved it.
#
# `scores` is (re)initialised here so this cell is idempotent: re-running it
# starts from an empty list instead of appending a second copy of every row.
scores = []
for model_name, mp in model_params.items():
    # Search this model's own parameter grid; an empty grid evaluates the defaults once.
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(digits.data, digits.target)  # Fit on the full digits dataset; CV handles the splits
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,    # Mean cross-validated score of the best candidate
        'best_params': clf.best_params_,  # Parameter setting that produced that score
    })

In [6]:
# Tabulate the per-model results: one row per model, columns in a fixed order.
result_columns = ['model', 'best_score', 'best_params']
df = pd.DataFrame(scores, columns=result_columns)
# Leave the frame as the last expression so the notebook renders it.
df

Unnamed: 0,model,best_score,best_params
0,svm,0.947697,"{'C': 1, 'kernel': 'linear'}"
1,random_forest,0.903747,{'n_estimators': 10}
2,logistic_regression,0.922114,{'C': 1}
3,naive_bayes_gaussian,0.806928,{}
4,naive_bayes_multinomial,0.87035,{}
5,decision_tree,0.80582,{'criterion': 'entropy'}


# Best model: svm (C=1, kernel=linear) with a 94.77% score.