In [1]:
# Grid search report generator
from time import time
import numpy as np

from sklearn import svm
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn import datasets

from libitmal import dataloaders as itmaldataloaders # Needed for load of iris, moon and mnist

# Module-level state shared between the helpers below: LoadAndSetupData() assigns
# the dataset mode name to it, and SearchReport() reads it to label its summary
# string. It holds "N/A" until LoadAndSetupData() has been called.
currmode="N/A" # GLOBAL var!

def SearchReport(model):
    """Print a summary of a fitted parameter search and return its best result.

    Parameters
    ----------
    model : fitted search object
        Must expose ``best_params_``, ``best_score_``, ``best_index_``,
        ``best_estimator_``, ``estimator`` and ``scoring``. ``cv_results_`` is
        used for the per-candidate listing when it is available.

    Returns
    -------
    tuple
        ``(summary, best_estimator)``. ``summary`` is a one-line string holding
        the module-level ``currmode``, the best score, and a constructor-like
        rendering of the best parameters; ``best_estimator`` is
        ``model.best_estimator_``.

    Raises
    ------
    AssertionError
        If ``model.scoring`` is not ``'f1_micro'``. The check runs after the
        report has been printed, exactly as before.
    """

    def GetBestModelCTOR(model, best_params):
        """Render ``best_params`` as a ``ClassName(key=value,...)`` string."""
        def GetParams(best_params):
            parts = []
            for key in sorted(best_params):
                value = best_params[key]
                # isinstance also quotes str subclasses, which the previous
                # comparison against "<class 'str'>" left unquoted.
                quote = "'" if isinstance(value, str) else ""
                parts.append(f'{key}={quote}{value}{quote}')
            return ','.join(parts)
        try:
            return type(model).__name__ + '(' + GetParams(best_params) + ')'
        except Exception:
            # Best effort only: the summary string must still be produced.
            return "N/A(1)"

    print("\nBest model set found on train set:")
    print()
    print(f"\tbest parameters={model.best_params_}")
    print(f"\tbest '{model.scoring}' score={model.best_score_}")
    print(f"\tbest index={model.best_index_}")
    print()
    print(f"Best estimator CTOR:")
    print(f"\t{model.best_estimator_}")
    print()
    try:
        print(f"Grid scores ('{model.scoring}') on development set:")
        results = model.cv_results_
        rows = zip(results['mean_test_score'], results['std_test_score'], results['params'])
        for i, (mean, std, params) in enumerate(rows):
            print("\t[%2d]: %0.3f (+/-%0.03f) for %r" % (i, mean, std * 2, params))
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        print("WARNING: the random search do not provide means/stds")

    # currmode is the module-level dataset name; it is only read here.
    assert "f1_micro"==str(model.scoring), f"come on, we need to fix the scoring to be able to compare model-fits! Your scoreing={str(model.scoring)}...remember to add scoring='f1_micro' to the search"
    return f"best: dat={currmode}, score={model.best_score_:0.5f}, model={GetBestModelCTOR(model.estimator,model.best_params_)}", model.best_estimator_

def ClassificationReport(model, X_test, y_test, target_names=None):
    """Print a per-class classification report for ``model`` on the test data.

    Parameters
    ----------
    model : object with a ``predict(X)`` method
        Used to produce predictions for ``X_test``.
    X_test, y_test : array-like with a ``shape`` attribute
        Test samples and their true labels; both must have the same number
        of rows.
    target_names : optional
        Forwarded to ``classification_report`` as ``target_names``.

    Raises
    ------
    AssertionError
        If ``X_test`` and ``y_test`` differ in their first dimension.
    """
    assert X_test.shape[0]==y_test.shape[0]
    print("\nDetailed classification report:")
    print("\tThe model is trained on the full development set.")
    print("\tThe scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, model.predict(X_test)
    # Pass target_names by keyword: the third positional slot of
    # classification_report is `labels`, so the earlier positional call bound
    # the display names to the wrong parameter.
    print(classification_report(y_true, y_pred, target_names=target_names))
    print()

def FullReport(model, X_test, y_test, t):
    """Print the full report for a finished search and return its best result.

    Prints the search time ``t`` (seconds), then delegates to SearchReport()
    and ClassificationReport(), and finally prints the best estimator and the
    one-line summary.

    Returns
    -------
    tuple
        The ``(summary, best_estimator)`` pair obtained from SearchReport().
    """
    print(f"SEARCH TIME: {t:0.2f} sec")

    search_result = SearchReport(model)
    summary, estimator = search_result

    ClassificationReport(model, X_test, y_test)

    print(f"CTOR for best model: {estimator}\n")
    print(f"{summary}\n")

    return summary, estimator

def LoadAndSetupData(mode, test_size=0.3):
    """Load one of the supported datasets and split it into train and test parts.

    Parameters
    ----------
    mode : str
        ``'moon'``, ``'mnist'`` or ``'iris'``. For ``'moon'`` the data is also
        plotted; for ``'mnist'`` a 3-dimensional ``X`` is flattened to
        ``(n_samples, -1)``.
    test_size : float, default 0.3
        Forwarded to ``train_test_split``; must lie in ``[0.0, 1.0]``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split`` (``random_state=0``, ``shuffle=True``).

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported names.
    AssertionError
        If ``test_size`` is out of range or the loaded arrays do not have the
        expected dimensions.

    Side effects
    ------------
    Prints the shapes of the loaded and split data, and stores ``mode`` in the
    module-level ``currmode`` once the dataset has been loaded.
    """
    assert 0.0 <= test_size <= 1.0

    def ShapeToString(Z):
        """Format ``Z.shape`` as ``(d0;d1;...)``, each dimension 5 characters wide."""
        return "(" + ";".join(f"{dim:5d}" for dim in Z.shape) + ")"

    global currmode
    print(f"DATA: {mode}..")

    if mode=='moon':
        X, y = itmaldataloaders.MOON_GetDataSet(n_samples=5000, noise=0.2)
        itmaldataloaders.MOON_Plot(X, y)
    elif mode=='mnist':
        X, y = itmaldataloaders.MNIST_GetDataSet(load_mode=2)
        if X.ndim==3:
            # Flatten each sample so X becomes a 2-D (n_samples, n_features) matrix.
            X=np.reshape(X, (X.shape[0], -1))
    elif mode=='iris':
        X, y = itmaldataloaders.IRIS_GetDataSet()
    else:
        raise ValueError(f"could not load data for that particular mode='{mode}', only 'moon'/'mnist'/'iris' supported")

    # Record the mode only after a successful load, so an unsupported mode
    # does not overwrite the name of the dataset that is actually in use.
    currmode = mode

    print(f'  org. data:  X.shape      ={ShapeToString(X)}, y.shape      ={ShapeToString(y)}')

    assert X.ndim==2
    assert X.shape[0]==y.shape[0]
    # Labels are either a flat vector or a single-column matrix; the earlier
    # `y.shape[1]==0` check could only be satisfied by an empty array.
    assert y.ndim==1 or (y.ndim==2 and y.shape[1]==1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0, shuffle=True
    )

    print(f'  train data: X_train.shape={ShapeToString(X_train)}, y_train.shape={ShapeToString(y_train)}')
    print(f'  test data:  X_test.shape ={ShapeToString(X_test)}, y_test.shape ={ShapeToString(y_test)}')
    print()

    return X_train, X_test, y_train, y_test

# Qa
(Code review)

The `GridSearchCV` model is constructed, and the tuning parameters are passed into the constructor.
The grid search is then performed by calling the `fit` method on the grid object.
The best model is printed with the help of the `FullReport` function.

# Qb


In [9]:
from sklearn.linear_model import SGDClassifier

# Setup data
X_train, X_test, y_train, y_test = LoadAndSetupData(
    'iris')  # 'iris', 'moon', or 'mnist'

# Setup search parameters
model = SGDClassifier(
)

# Full grid: 3 * 2 * 2 * 3 * 5 * 5 * 5 = 4500 candidates, each fitted CV times.
# NOTE(review): newer scikit-learn releases renamed the 'log' loss to 'log_loss';
# confirm against the installed version before changing the value below.
tuning_parameters = {
    'penalty': ('l1', 'l2', 'elasticnet'),
    'l1_ratio': [0, 1], #Only used if 'elasticnet'
    'fit_intercept': [True, False],
    'max_iter': [100, 1000, 5000],
    'loss': ('hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'),
    'alpha': [0.1, 1, 10, 100, 1000],
    'epsilon': [0.1, 1, 10, 100, 1000],

}

CV = 5
VERBOSE = 0

# Run GridSearchCV for the model.
# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn and later removed, so supplying it raises a TypeError there.
start = time()
grid_tuned = GridSearchCV(model,
                          tuning_parameters,
                          cv=CV,
                          scoring='f1_micro',
                          verbose=VERBOSE,
                          n_jobs=-1)
grid_tuned.fit(X_train, y_train)
t = time() - start

# Report result
b0, m0 = FullReport(grid_tuned, X_test, y_test, t)
print('OK(grid-search)')


DATA: iris..
  org. data:  X.shape      =(  150;    4), y.shape      =(  150)
  train data: X_train.shape=(  105;    4), y_train.shape=(  105)
  test data:  X_test.shape =(   45;    4), y_test.shape =(   45)

SEARCH TIME: 30.87 sec

Best model set found on train set:

	best parameters={'alpha': 0.1, 'epsilon': 0.1, 'fit_intercept': True, 'l1_ratio': 1, 'loss': 'squared_hinge', 'max_iter': 1000, 'penalty': 'l1'}
	best 'f1_micro' score=1.0
	best index=75

Best estimator CTOR:
	SGDClassifier(alpha=0.1, l1_ratio=1, loss='squared_hinge', penalty='l1')

Grid scores ('f1_micro') on development set:
	[ 0]: 0.762 (+/-0.120) for {'alpha': 0.1, 'epsilon': 0.1, 'fit_intercept': True, 'l1_ratio': 0, 'loss': 'hinge', 'max_iter': 100, 'penalty': 'l1'}
	[ 1]: 0.867 (+/-0.203) for {'alpha': 0.1, 'epsilon': 0.1, 'fit_intercept': True, 'l1_ratio': 0, 'loss': 'hinge', 'max_iter': 100, 'penalty': 'l2'}
	[ 2]: 0.876 (+/-0.205) for {'alpha': 0.1, 'epsilon': 0.1, 'fit_intercept': True, 'l1_ratio': 0, 'loss': 



# Qc)
Investigate the `n_iter` parameter in code and write a conceptual explanation in text.

In [14]:
from sklearn.linear_model import SGDClassifier

# Setup data
X_train, X_test, y_train, y_test = LoadAndSetupData(
    'iris')  # 'iris', 'moon', or 'mnist'

# Setup search parameters
model = SGDClassifier(
)

# Same parameter space as the grid search (4500 combinations); the randomized
# search below only samples n_iter of them.
# NOTE(review): newer scikit-learn releases renamed the 'log' loss to 'log_loss';
# confirm against the installed version before changing the value below.
tuning_parameters = {
    'penalty': ('l1', 'l2', 'elasticnet'),
    'l1_ratio': [0, 1], #Only used if 'elasticnet'
    'fit_intercept': [True, False],
    'max_iter': [100, 1000, 5000],
    'loss': ('hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'),
    'alpha': [0.1, 1, 10, 100, 1000],
    'epsilon': [0.1, 1, 10, 100, 1000],
}

CV = 5
VERBOSE = 0

# Run RandomizedSearchCV for the model.
# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn and later removed, so supplying it raises a TypeError there.
start = time()
random_tuned = RandomizedSearchCV(model,
                          tuning_parameters,
                          n_iter=20,
                          random_state=42,
                          cv=CV,
                          scoring='f1_micro',
                          verbose=VERBOSE,
                          n_jobs=-1)
random_tuned.fit(X_train, y_train)
t = time() - start

# Report result
b0, m0 = FullReport(random_tuned, X_test, y_test, t)
print('OK(grid-search)')


DATA: iris..
  org. data:  X.shape      =(  150;    4), y.shape      =(  150)
  train data: X_train.shape=(  105;    4), y_train.shape=(  105)
  test data:  X_test.shape =(   45;    4), y_test.shape =(   45)

SEARCH TIME: 0.35 sec

Best model set found on train set:

	best parameters={'penalty': 'l1', 'max_iter': 5000, 'loss': 'modified_huber', 'l1_ratio': 0, 'fit_intercept': False, 'epsilon': 10, 'alpha': 0.1}
	best 'f1_micro' score=0.9714285714285714
	best index=17

Best estimator CTOR:
	SGDClassifier(alpha=0.1, epsilon=10, fit_intercept=False, l1_ratio=0,
              loss='modified_huber', max_iter=5000, penalty='l1')

Grid scores ('f1_micro') on development set:
	[ 0]: 0.838 (+/-0.143) for {'penalty': 'elasticnet', 'max_iter': 1000, 'loss': 'hinge', 'l1_ratio': 1, 'fit_intercept': False, 'epsilon': 1000, 'alpha': 0.1}
	[ 1]: 0.505 (+/-0.328) for {'penalty': 'l2', 'max_iter': 100, 'loss': 'perceptron', 'l1_ratio': 1, 'fit_intercept': False, 'epsilon': 0.1, 'alpha': 1000}
	[ 2]: 0.



# Qd)

In [None]:
from sklearn.linear_model import SGDClassifier

# Setup data
X_train, X_test, y_train, y_test = LoadAndSetupData(
    'mnist')  # 'iris', 'moon', or 'mnist'

# Setup search parameters
model = SGDClassifier(
)

# Reduced parameter space (3 * 2 * 2 = 12 combinations) for the larger dataset.
tuning_parameters = {
    'penalty': ('l1', 'l2', 'elasticnet'),
    'l1_ratio': [0, 1], #Only used if 'elasticnet'
    'fit_intercept': [True, False],
}

CV = 5
VERBOSE = 0

# Run RandomizedSearchCV for the model; only n_iter=2 candidates are sampled.
# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn and later removed, so supplying it raises a TypeError there.
start = time()
random_tuned = RandomizedSearchCV(model,
                          tuning_parameters,
                          n_iter=2,
                          random_state=42,
                          cv=CV,
                          scoring='f1_micro',
                          verbose=VERBOSE,
                          n_jobs=-1)
random_tuned.fit(X_train, y_train)

t = time() - start

# Report result
b0, m0 = FullReport(random_tuned, X_test, y_test, t)
print('OK(grid-search)')