In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.metrics import precision_recall_fscore_support

# Download MNIST (70,000 flattened 28x28 digit images) from https://www.openml.org/d/554
data = fetch_openml('mnist_784', version=1, parser="auto")

# Build one table holding the pixel features plus a "target" column.
# The features and the labels are deliberately NOT stacked into a single NumPy
# array first (np.c_[...]): an array has exactly one dtype, so mixing numeric
# pixels with string labels would strip the numeric dtype from every pixel
# column and make the whole table far larger and slower than necessary.
# The labels are attached as an object column so their values (the digit
# strings) are exactly what the stacked version produced.
dfData = pd.DataFrame(data["data"], columns=data["feature_names"]).assign(
    target=np.asarray(data["target"], dtype=object)
)

In [2]:
# Single stratified shuffle split: 20% of the rows are held out as the test
# set, and random_state pins the shuffle so the split is repeatable.
stratSplit = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

# n_splits=1, so the split generator yields exactly one pair of positional
# index arrays; take it directly instead of looping.
train_index, test_index = next(
    stratSplit.split(dfData[data["feature_names"]], dfData["target"])
)

# Slice features and labels by position with the same index arrays.
X_train, X_test = (
    dfData[data["feature_names"]].iloc[idx] for idx in (train_index, test_index)
)
y_train, y_test = (
    dfData["target"].iloc[idx] for idx in (train_index, test_index)
)

In [3]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier: logistic regression with every hyperparameter left at
# its library default (no arguments are passed to the constructor).
logReg_clf = LogisticRegression()

In [4]:
from sklearn.model_selection import cross_validate

# Cross-validate the baseline classifier on the training split only; the
# returned dict holds per-fold timings, scores and the fitted estimators.
results = cross_validate(
    logReg_clf,
    X_train,
    y_train,
    # Metrics computed on every fold.
    scoring=["accuracy", "roc_auc_ovr_weighted", "f1_macro"],
    # An integer cv with a classifier makes scikit-learn use stratified folds.
    cv=5,
    # Number of CPUs used for the computation; -1 would use all of them.
    n_jobs=1,
    # Emit progress messages while the folds are running.
    verbose=True,
    # Also score each fold's training part, to compare against the test scores.
    return_train_score=True,
    # Keep the estimator fitted on each fold in the result.
    return_estimator=True,
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/pre

In [5]:
# Summarise the macro-F1 test scores across the CV folds, one value per line.
print(
    np.mean(results["test_f1_macro"]),  # average performance over the folds
    np.std(results["test_f1_macro"]),   # variation of the score between folds
    sep="\n",
)

0.9169489149878114
0.0014939252830863203


#### Now that we know our performance, how can we improve it?

In [9]:
# LogisticRegression with its main hyperparameters spelled out explicitly so
# each one can be discussed and tuned.
logReg_clf = LogisticRegression(
    # Stopping tolerance of the optimizer: training stops once the convergence
    # criterion drops below this value (smaller = stricter, usually slower).
    tol=0.0001,
    # Add a bias (intercept) term to the decision function.
    fit_intercept=True,
    # Example of per-label weights, keyed by the string labels of the target.
    # Alternatives: "balanced", or None (all labels carry the same weight).
    # Every weight is kept strictly positive: a weight of 0 would remove that
    # digit's samples from the training loss, so the model could never learn it.
    class_weight={"0": 1, "1": 1, "2": 2, "3": 3, "4": 4,
                  "5": 5, "6": 6, "7": 7, "8": 8, "9": 9},
    # Maximum number of solver iterations before stopping, converged or not.
    max_iter=100,
    # Optimization algorithm.
    solver="lbfgs",
    # Multiclass strategy; "auto" lets scikit-learn choose between
    # one-vs-rest and multinomial based on the data and the solver.
    multi_class="auto",
    # No solver status output.
    verbose=False,
    # False: every call to fit starts from scratch. True would reuse the
    # solution of the previous fit call as the starting point.
    warm_start=False,
    # Number of CPUs used when fitting is parallelized over classes
    # (only relevant for the one-vs-rest strategy).
    n_jobs=1,
)
#Other parameters we'll look at after regularization

In [7]:
import sklearn
# Record the installed scikit-learn version next to the results above.
print(sklearn.__version__)

1.3.0
