> **tomato juice dataset**
<br>` 'quality' is the target feature for classification `
<br>` the other features are chemical properties of our product `

**Import the main libraries**

In [None]:
import numpy as np
import pandas as pd

from time import time

_import the local library_

In [None]:
# make the parent folder (the one that holds the lib folder) importable
import sys

if ".." not in sys.path:
    sys.path.insert(0, '..')

In [None]:
from mylib import show_labels_dist, show_metrics, bias_var_metrics

**Import the Dataset**

In [None]:
## file path: forward slashes are accepted on Windows as well as on
## unix-like systems, so this one relative path works on every platform
## (a back-slash path only resolves on Windows)
df = pd.read_csv('../datasets/tomatjus.csv')

# DataFrame.shape is a (rows, columns) tuple
n_rows, n_cols = df.shape
print('Dataset dimensions: {} rows, {} columns'.format(n_rows, n_cols))

In [None]:
# print pandas' built-in summary of the dataframe (DataFrame.info)
df.info()

***
**Data Preparation and EDA** (unique to this dataset)
* _Check for missing values_
* _Quick visual check of unique values_
* _Split the classification feature out of the dataset_
* _Check column names of categorical attributes ( for get_dummies() )_
* _Check column names of numeric attributes ( for Scaling )_

**_Let's skip the checking_**

**<br>Classification target feature**
<br>_Make it a multi-class problem, using text labels_

In [None]:
##  Turn the numeric quality score into three text classes.
##  pd.cut intervals are right-inclusive, so bins (2, 5, 6, 8) give
##     (2, 5] -> 'Average'    i.e. scores {3,4,5}
##     (5, 6] -> 'Premium'    i.e. score  {6}
##     (6, 8] -> 'Special'    i.e. scores {7,8}
##  A score outside (2, 8] would become NaN.
bins = (2, 5, 6, 8)
group_names = ['Average', 'Premium', 'Special']

# Only bin while the column is still numeric: this cell overwrites
# df['quality'], so without the guard a second run would try to bin the
# text labels produced by the first run and fail.
if pd.api.types.is_numeric_dtype(df['quality']):
    df['quality'] = pd.cut(df['quality'], bins=bins, labels=group_names)

* Split the classification feature out of the dataset 

In [None]:
## Target: the feature being predicted ("the Right Answer")
labels_col = 'quality'
y = df[labels_col]

## Predictors: every other column.
# drop() without inplace hands back a new dataframe,
# so X can be changed later without touching df
X = df.drop(labels_col, axis=1)

***
**<br>Create Test // Train Datasets**
> Split X and y datasets into Train and Test subsets,<br>keeping relative proportions of each class (stratify)

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split; stratify=y keeps the class proportions in both subsets
split_parts = train_test_split(X, y,
                               test_size=0.2,
                               random_state=50,
                               stratify=y)

# train_test_split selects rows at random,
#      so renumber the index of every subset from zero
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True) for part in split_parts
)

**<br>Scaling** comes _after_ test // train split

In [None]:
# names of the float64 / int64 predictor columns (the ones to be scaled)
numeric_part = X.select_dtypes(include=['float64', 'int64'])
numeri = numeric_part.columns
print(list(numeri))

In [None]:
# scaling the Numeric columns
# StandardScaler: centres each column on zero with unit variance
#                 (the result is NOT bounded to a fixed range)
# MinMaxScaler:   maps each column's fitted min..max onto zero..1

# from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# sklearn docs say
#   "Don't cheat - fit only on training data, then transform both"
# The scaler works column by column on a 2D input, so all numeric
# columns can be fitted in one call; no per-column reshape is needed.
# Test values outside the training min..max land outside zero..1.

num_cols = list(numeri)
if num_cols:    # nothing to scale when there are no numeric columns
    scale = MinMaxScaler().fit(X_train[num_cols])
    X_train[num_cols] = scale.transform(X_train[num_cols])
    X_test[num_cols] = scale.transform(X_test[num_cols])

**<br>Classifier Selection**

 **_Hyperparameter tuning is specific to each classifier_**

In [None]:
# Classifier Selection - for these examples
from sklearn.neighbors import KNeighborsClassifier

# each entry is a (display name, estimator) pair
models = [("K-NNeighbors", KNeighborsClassifier())]

first_name, first_estimator = models[0]
print(first_name)
print(first_estimator)

**<br>Target Label Distributions** (standard block)

In [None]:
# from our local library (imported from mylib above);
# it receives the train / test feature sets and their label series
show_labels_dist(X_train,X_test,y_train,y_test)

**<br>Fit and Predict** (standard block)

In [None]:
# evaluate each model in turn
results = []

# legend for the metrics reported below
print('macro average: unweighted mean per label')
print('weighted average: support-weighted mean per label')
print('MCC: correlation between prediction and ground truth')
print('     (+1 perfect, 0 random prediction, -1 inverse)\n')

for name, clf in models:
    trs = time()
    print('Confusion Matrix:', name)

    # train on the training subset, then predict the held-out test subset
    ygx = clf.fit(X_train, y_train).predict(X_test)
    results.append((name, ygx))

    # the timer covers fit + predict only
    tre = time() - trs
    print("Run Time {} seconds\n".format(round(tre, 2)))

    # Passing clf.classes_ (note the trailing underscore) labels the
    #   confusion matrix rows and columns exactly the way the
    #   classifier itself has coded the classes
    show_metrics(y_test, ygx, clf.classes_)   # from our local library
    print('\nParameters: ', clf.get_params(), '\n\n')

**Bias - Variance Decomposition** (standard block)

In [None]:
# Bias / variance report for every model in `models`,
# using bias_var_metrics from our local library (mylib).
# reduce (cross-validation) folds for faster results
# NOTE(review): how `folds` is used is decided inside mylib.bias_var_metrics
#               -- confirm there before changing the value
folds = 20
for name, clf in models:
    print('Bias // Variance Decomposition:', name)
    bias_var_metrics(X_train,X_test,y_train,y_test,clf,folds)

***

***
**Hyperparameter Tuning**
General pattern:<br>
_baseline model_<br>
>    1. Classifier selection<br> 
   2. Fit and Predict<br>
   3. Bias-Variance Tradeoff<br>
    
_optimised model_<br>
>    4. Select strategy and hyperparameters<br>
   5. Plug in the best parameter values<br>
   6. Fit and Predict<br>
   7. Bias-Variance Tradeoff

***
***

**<br>Imports** for parameter testing

In [None]:
# Default scorer for classification is sklearn.metrics.accuracy_score 
# For imbalanced classification, the accuracy score is often uninformative
# For the list of options see
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

# 'weighted' averaging: average for each label weighted by support 
#        (number of true instances for each label)
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score

# eval_metric alternates a display label with the scoring string that
# follows it:
#     [0] label  -> [1] 'balanced_accuracy'
#     [2] label  -> [3] 'f1_weighted'
# The cells below pass eval_metric[1] as their scoring= argument, so use
# an odd index (1 or 3) whenever a scorer name is needed.
eval_metric = ['wtd.avg.accuracy', 
               'balanced_accuracy', 
               'wtd.avg.f1_score', 
               'f1_weighted']

# for graphs
import matplotlib.pyplot as plt

 ***

**<br>Parameter Testing**
> * Strategy: Simple loop (no CV)
> * Strategy: Single Parameter Cross-Validation Curve
> * Strategy: Bias-Variance Decomposition 
> * Strategy: Parameter grid search

 ***

**<br>Strategy: simple loop (no CV)**

In [None]:
# ---- Specific to each classifier! ---- #
clf = models[0][1]

# hyperparameter values to test: one constant, one set
cname = 'n_neighbors'
cval = 3

paramname = 'weights'
paramrange = ['uniform', 'distance']
# ----    ---- #

# set_params() changes the estimator it is called on and returns that
# same object, so each candidate is configured on a clone; the model
# stored in `models` stays untouched until the "plug in" step.
from sklearn.base import clone

# NOTE: this strategy scores every candidate on X_test (no CV), so the
#       numbers are only a quick comparison between candidates.
for param in paramrange:
    print(models[0][0],':',cname,'=',cval,' ',paramname,'=',param)

    # use a dict to set multiple parameters
    pdict = {cname: cval, paramname: param}
    clfp = clone(clf).set_params(**pdict)

    # fit and predict with the new values
    clfp.fit(X_train, y_train)
    pred = clfp.predict(X_test)

    waa = balanced_accuracy_score(y_test, pred)
    print('Weighted Average Accuracy:  %.3f' % waa)

    waf = f1_score(y_test, pred, average='weighted')
    print('Weighted Average f1_score:  %.3f' % waf)

    print()

*<br>5. Plug in the best parameter values* [manual editing]

In [None]:
## -- put best parameters into the model --##
## be sure to use a valid value from the result above !!
param = 'distance'
# ----    ---- #

# cname, cval and paramname keep the values assigned in the test cell;
# build the parameter dict from them and apply it to the first model
best_vals = dict([(cname, cval), (paramname, param)])
model_name, model = models[0]
model.set_params(**best_vals)

print(model_name, ': Best Values ')
print(model)

In [None]:
## Copy & Paste blocks:
# 6. Fit and Predict (standard block)
# 7. Bias-Variance Tradeoff (standard block)

***
***

**<br>Strategy: Single Parameter Cross-Validation Curve**

In [None]:
from sklearn.model_selection import StratifiedKFold
# a variation of KFold that returns stratified folds,
#   preserving the percentage of samples for each class
from yellowbrick.model_selection import ValidationCurve

# ---- Specific to each classifier! ---- #
clf = models[0][1]

## hyperparameter (can only graph one)
paramname = "n_neighbors"
# candidate values: start, stop (exclusive), step of 1
paramrange = np.arange(2, 9)

## or, for example:
#paramname = "p"
#paramrange = [1, 2, 4]

## or, for example:
#paramname = 'weights'
#paramrange = ['uniform', 'distance']

# ----  Cross Validation  ---- #
folds = 3                   # number of rounds
scorer = eval_metric[1]     # choose a metric
# ----    ---- #

# start the timer
trs = time()

print(models[0][0], '\t(timer started)')
print('Validation Curve for parameter [ {} ], scoring = {}'.format(paramname, scorer))
print()

skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=11)

# Create the validation curve visualizer
# NOTE(review): logx=True is passed for the x axis -- confirm it suits
#               the chosen paramrange (e.g. the text-valued alternative above)
viz = ValidationCurve(
    clf,
    param_name=paramname,
    param_range=paramrange,
    logx=True,
    cv=skf,
    scoring=scorer,
    n_jobs=-1,
)

viz.fit(X_train, y_train)
viz.show()

tre = time() - trs
print("Run Time {} seconds\n".format(round(tre, 2)))

In [None]:
## Copy & Paste blocks:
# 5. Plug in the best parameter values [manual editing]
# 6. Fit and Predict (standard block)
# 7. Bias-Variance Tradeoff (standard block)

***
***

**<br>Strategy: Bias - Variance Decomposition**

In [None]:
from mlxtend.evaluate import bias_variance_decomp
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder

# ---- Specific to each classifier! ---- #
clf = models[0][1]

# hyperparameter values to test: one constant, one set
cname = 'n_neighbors'
cval = 3

paramname = "p"
paramrange = [1, 2, 4]

# ----  Bootstrap rounds  ---- #
# passed to bias_variance_decomp as num_rounds (default=200)
folds = 3
# ----    ---- #

# start the timer
trs = time()

## bias_variance_decomp() requires
##    1. numpy ndarrays
##    2. numeric targets
# Fit ONE encoder on the training labels and reuse it for the test labels,
# so both sets share the same label -> integer mapping. Two independently
# fitted encoders can disagree whenever the sets do not contain exactly
# the same classes. A test label unseen in training raises ValueError here.
label_encoder = LabelEncoder().fit(y_train)
ytrain = label_encoder.transform(y_train)
ytest = label_encoder.transform(y_test)

# for graphs
cn = models[0][0] + "  Parameter: " + paramname
rx = paramrange     # the x locations for the groups

bias, var, err = [], [], []
for parm in paramrange:
    print(models[0][0],'(',cname,'=',cval,' ',paramname,'=',parm,')',end='') # no newline at the end

    # use a dict to set multiple parameters; configure a clone so the
    # model stored in `models` is not changed by this experiment
    pdict = {cname: cval, paramname: parm}
    clfp = clone(clf).set_params(**pdict)

    # decompose the 0-1 loss of the configured candidate
    avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        clfp, X_train.values, ytrain, X_test.values, ytest,
        loss='0-1_loss', num_rounds=folds, random_seed=11)
    err.append(avg_expected_loss)
    bias.append(avg_bias)
    var.append(avg_var)

    msg = "  Bias: %0.3f  Variance: %0.3f  E.loss: %0.3f" % (avg_bias, avg_var, avg_expected_loss)
    print(msg)

tre = time() - trs
print ("\tRun Time {} seconds".format(round(tre,2)) + '\n')

In [None]:
# line plot: one line per quantity, one point per tested parameter value
fig, ax = plt.subplots()
ax.plot(rx, err, 'b', label='total_error')
ax.plot(rx, bias, 'k', label='bias')
ax.plot(rx, var, 'y', label='variance')
ax.set_xticks(rx)
# label both axes so the figure can be read without the surrounding cells
ax.set_xlabel(cn)       # classifier name and the parameter being varied
ax.set_ylabel('loss')
ax.set_title('Bias-Variance Decomposition')
ax.legend()
plt.show()
print(cn)

In [None]:
# grouped bar plot: one cluster of three bars per tested parameter value.
# Drawing all three series at the same x (as overlaid bars) lets a taller
# bar hide a shorter one, so each series gets its own offset instead.
bar_width = 0.25
positions = np.arange(len(rx))   # evenly spaced slots, one per parameter value

fig, ax = plt.subplots()
# the label on each bar series keeps the legend correct by construction
ax.bar(positions - bar_width, err, width=bar_width, color='b', label='E.loss')
ax.bar(positions, bias, width=bar_width, color='g', label='Bias')
ax.bar(positions + bar_width, var, width=bar_width, color='r', label='Var')
ax.set_xticks(positions)
ax.set_xticklabels(rx)           # show the actual parameter values
ax.set_xlabel(cn)                # classifier name and the parameter being varied
ax.set_ylabel('loss')
ax.set_title('Bias-Variance Decomposition')
ax.legend()
plt.show()
print(cn)

In [None]:
## Copy & Paste blocks:
# 5. Plug in the best parameter values [manual editing]
# 6. Fit and Predict (standard block)
# 7. Bias-Variance Tradeoff (standard block)

***
***

**<br>Strategy: Parameter grid search**

In [None]:
# Each parameter increases time exponentially!
from sklearn.model_selection import GridSearchCV

# ---- Specific to each classifier! ---- #
clf = models[0][1]

# hyperparameter: values to test (minimum 2x2 Grid)

value_grid = {'n_neighbors': [3, 4],
              'p': [1, 2, 4],
              'weights': ['uniform', 'distance']}

# ----  Cross Validation  ---- #
# number of rounds
folds = 3

# choose a metric
scorer = eval_metric[1]
# ----    ---- #

# Start the timer
trs = time()

print('GridSearchCV:',folds,'folds, timer started')
# print the whole scoring name (scorer[0] would be just its first letter)
print('%s with scoring = %s' % (clf, scorer))

grid_search = GridSearchCV(estimator=clf, param_grid=value_grid,
                           scoring=scorer, cv=folds, verbose=1, n_jobs= -1)
grid_search.fit(X_train, y_train)

## details: mean score (+/- two standard deviations) per combination;
## comment these lines out for a shorter report
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

# Print best parameters and score
print("\tBest parameters: {}".format(grid_search.best_params_))
print("\tBest CV score: {:.3f}".format(grid_search.best_score_))

tre = time() - trs
print ("Run Time {} seconds".format(round(tre,2)) + '\n')

Visualisations:<br>
    https://sklearn-evaluation.readthedocs.io/en/latest/user_guide/grid_search.html

*5. Plug in the best parameter values*

In [None]:
# grid_search.best_params_ is a dict of the best parameters;
# unpack it straight into the first model
model_name, model = models[0]
model.set_params(**grid_search.best_params_)

print(model_name, ': Best Values ')
print(model)

In [None]:
## Copy & Paste blocks:
# 6. Fit and Predict (standard block)
# 7. Bias-Variance Tradeoff (standard block)

***
***