In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression

# Fetch the MNIST digits dataset from https://www.openml.org/d/554.
# as_frame=True makes the returned bunch expose the features as a DataFrame
# and the labels as a Series, so they can be joined directly in pandas.
data = fetch_openml('mnist_784', version=1, parser="auto", as_frame=True)

# Combine features and labels into a single frame whose label column is named "target".
# Joining in pandas (rather than through np.c_) avoids funnelling the numeric pixel
# columns and the non-numeric label column through one NumPy array, which forces a
# common object dtype onto every column.
dfData = pd.concat([data["data"], data["target"].rename("target")], axis=1)

In [None]:
# Single stratified shuffle split holding out 20% for testing; seeded for reproducibility.
stratSplit = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

feature_frame = dfData[data["feature_names"]]
label_series = dfData["target"]

# n_splits=1, so the generator yields exactly one (train, test) pair of positional indices.
train_index, test_index = next(stratSplit.split(feature_frame, label_series))

X_train, X_test = feature_frame.iloc[train_index], feature_frame.iloc[test_index]
y_train, y_test = label_series.iloc[train_index], label_series.iloc[test_index]

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: L1 penalty at three values of C.
# Add further values or dicts here to search more settings.
params = [{"penalty": ["l1"], "C": [0.1, 1, 10]}]

# Metrics computed for every candidate during cross-validation.
cv_metrics = ["accuracy", "roc_auc_ovr_weighted", "f1_macro"]

# liblinear is chosen explicitly: the default lbfgs solver raises an error for this grid.
l1_capable_logreg = LogisticRegression(solver="liblinear")

# Grid search evaluates every combination listed in `params`.
logreg_clf_gscv = GridSearchCV(
    estimator=l1_capable_logreg,
    param_grid=params,
    scoring=cv_metrics,
    refit="roc_auc_ovr_weighted",  # the best model is selected (and refit) on this score
    cv=3,  # integer cv with a classifier -> stratified folds
    n_jobs=-1,  # number of CPUs to use; -1 means all
    verbose=0,  # higher number -> more status messages
    return_train_score=True,  # if False, cv_results_ will not contain training scores
)
logreg_clf_gscv.fit(X_train, y_train)

In [None]:
# List the statistics recorded by the grid search (the keys of cv_results_).
cv_result_keys = logreg_clf_gscv.cv_results_.keys()
print(cv_result_keys)

In [None]:
# Tabulate the grid-search results, one row per candidate parameter setting.
resultsCVDF = pd.DataFrame(logreg_clf_gscv.cv_results_)

# Show the candidates ordered from fastest to slowest mean fit time.
fit_time_ranking = resultsCVDF.sort_values(by="mean_fit_time")
print(fit_time_ranking)

In [None]:
# best_index_ is only available with refit set (required here because several metrics are scored).
# Display the results row belonging to the winning candidate.
best_row_position = logreg_clf_gscv.best_index_
resultsCVDF.iloc[best_row_position]

In [None]:
# best_estimator_ (only available with refit) is the model refit with the winning parameters.
best_logreg = logreg_clf_gscv.best_estimator_
print(best_logreg)

# The refit model can be used directly, here to predict labels for the training set.
best_logreg.predict(X_train)

In [None]:
# Best cross-validated score and the parameters that achieved it.
# Both attributes require refit when several metrics are scored.
for best_attribute in (logreg_clf_gscv.best_score_, logreg_clf_gscv.best_params_):
    print(best_attribute)

In [None]:
# Configuration of the search object itself (estimator, grid, scoring, cv, ...),
# including the nested estimator's parameters; see best_params_ for the winning setting.
logreg_clf_gscv.get_params(deep=True)

In [None]:
# Predictions can also be requested from the search object itself (only when refit is set).
train_predictions = logreg_clf_gscv.predict(X_train)
train_predictions

In [None]:
# Randomised search: samples parameter settings from `params` instead of
# exhaustively trying every combination, which is cheaper on large grids.
from sklearn.model_selection import RandomizedSearchCV

logreg_clf_rscv = RandomizedSearchCV(
    # The grid defined for the first search only contains the "l1" penalty, which the
    # default lbfgs solver rejects (every CV fit would fail). Use liblinear, matching
    # the estimator of the first grid search.
    estimator=LogisticRegression(solver="liblinear"),
    param_distributions=params,
    # Upper bound on the number of sampled settings; when the grid holds fewer
    # combinations than this, scikit-learn warns and evaluates each one once.
    n_iter=10,
    random_state=42,  # fixed seed so the sampled settings are reproducible
    scoring=["accuracy", "roc_auc_ovr_weighted", "f1_macro"],
    refit="roc_auc_ovr_weighted",  # the best model is selected (and refit) on this score
    cv=3,  # integer cv with a classifier -> stratified folds
    n_jobs=-1,  # number of CPUs to use; -1 means all
    verbose=0,  # higher number -> more status messages
    return_train_score=True,  # if False, cv_results_ will not contain training scores
)
logreg_clf_rscv.fit(X_train, y_train)

In [None]:
# Tabulate the randomised-search results, one row per sampled parameter setting.
resultsRSDF = pd.DataFrame(logreg_clf_rscv.cv_results_)

# Show the sampled settings ordered from fastest to slowest mean fit time.
rs_fit_time_ranking = resultsRSDF.sort_values(by="mean_fit_time")
print(rs_fit_time_ranking)

In [None]:
# NOTE: this cell rebinds `params` and `logreg_clf_gscv`, replacing the earlier grid search.
# Single candidate: L1 penalty with C=1, with the solver supplied through the grid ("saga").
# Further dicts (other penalty/solver pairings or C values) can be appended to widen the search.
params = [{"penalty": ["l1"], "C": [1], "solver": ["saga"]}]

saga_cv_metrics = ["accuracy", "roc_auc_ovr_weighted", "f1_macro"]

logreg_clf_gscv = GridSearchCV(
    estimator=LogisticRegression(),  # the grid sets the solver on this estimator
    param_grid=params,
    scoring=saga_cv_metrics,
    refit="roc_auc_ovr_weighted",  # the best model is selected (and refit) on this score
    cv=3,  # integer cv with a classifier -> stratified folds
    n_jobs=-1,  # number of CPUs to use; -1 means all
    verbose=0,  # higher number -> more status messages
    return_train_score=True,  # if False, cv_results_ will not contain training scores
)
logreg_clf_gscv.fit(X_train, y_train)

In [None]:
# Show every recorded statistic for the first candidate of the latest grid search.
latest_results = pd.DataFrame(logreg_clf_gscv.cv_results_)
print(latest_results.iloc[0])