> **tomato juice dataset**
<br>` 'quality' is the target feature for classification `
<br>` the other features are chemical properties of our product `

**Import the main libraries**

In [None]:
import numpy as np
import pandas as pd

import warnings
# suppress all warnings
warnings.filterwarnings("ignore")

**Import the Dataset**

In [None]:
## Build the file path with pathlib so the same cell runs on Windows and Unix
# (a hard-coded '..\\datasets\\...' string is only a valid path on Windows)
from pathlib import Path

data_file = Path('..') / 'datasets' / 'tomatjus.csv'
df = pd.read_csv(data_file)

# shape gives the dimensions of the dataset as (rows, columns)
print('Dataset dimensions: {} rows, {} columns'.format(df.shape[0], df.shape[1]))

In [None]:
# print pandas' summary of the dataframe that was just loaded
df.info()

***
**Data Preparation and EDA** (unique to this dataset)
* _Check for missing values_
* _Quick visual check of unique values_
* _Split the classification feature out of the dataset_
* _Check column names of categorical attributes ( for get_dummies() )_
* _Check column names of numeric attributes ( for Scaling )_

**_Let's skip the checking_**

**Classification target feature**
<br>_Make it a multi-class problem, using text labels_

In [None]:
##  Divide 'quality' into classes by giving a range of scores for each class.
##  Make it a multi-class problem: {3,4,5} {6} {7,8}
##  pd.cut intervals are closed on the right: (2,5] (5,6] (6,8];
##  a score outside those intervals becomes NaN.
bins = (2, 5, 6, 8)
group_names = ['Average', 'Premium', 'Special']

# Only bin while 'quality' is still numeric. After the first run the column
# holds the text labels, and running pd.cut on them again would fail, so the
# guard makes this cell safe to re-run.
if pd.api.types.is_numeric_dtype(df['quality']):
    df['quality'] = pd.cut(df['quality'], bins = bins, labels = group_names)

* Split the classification feature out of the dataset 

In [None]:
## Target: the feature being predicted ("the Right Answer")
labels_col = 'quality'
y = df[labels_col]

## Predictors: every column except the target
# drop() without inplace hands back a new dataframe, so X is independent
# of df and df itself keeps the target column
X = df.drop(labels_col, axis=1)

In [None]:
# generate a sorted list of unique labels to use later
from sklearn.utils.multiclass import unique_labels
targetlabels = unique_labels(y)

***
**<br>Create Test // Train Datasets**
> Split X and y datasets into Train and Test subsets,<br>keeping relative proportions of each class (stratify)

In [None]:
from sklearn.model_selection import train_test_split
# hold out 20% of the rows as the test subset;
# stratify=y keeps the relative proportions of each class in both subsets,
# random_state fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=50, stratify=y)

**<br>Target Label Distributions**

In [None]:
# shape gives the dimensions of each subset
print('X_train: {} rows, {} columns'.format(X_train.shape[0], X_train.shape[1]))
print('X_test:  {} rows, {} columns'.format(X_test.shape[0], X_test.shape[1]))
print()
# y_train / y_test are single columns, so only the row count is reported
print('y_train: {} rows, 1 column'.format(y_train.shape[0]))
print('y_test:  {} rows, 1 column'.format(y_test.shape[0]))
print()

## Report: how often each label occurs in the train and test targets
# 1. series to dataframe conversion
my_train = pd.DataFrame(y_train)
my_test = pd.DataFrame(y_test)
# 2. select the label column with [[ -- ]] (keeps a dataframe) and count
#    the occurrences of each label
av_train = my_train[[labels_col]].apply(lambda x: x.value_counts())
av_test = my_test[[labels_col]].apply(lambda x: x.value_counts())
# 3. add a new column: each count as a percentage of the subset total,
#    rounded to 2 decimals
av_train['pct_train'] = round((100 * av_train / av_train.sum()),2)
av_test['pct_test'] = round((100 * av_test / av_test.sum()),2)
# 4. combine the dataframes side by side (axis=1)
av_tt = pd.concat([av_train,av_test], axis=1) 
# 5. print the report
print('Frequency and Distribution of labels')
print(av_tt)

***
Next are standard steps for all datasets: _scaling, classifiers, results_

**Check column names of numeric attributes**
<br>Features with numeric values need to be normalised
<br>by changing them to small numbers in a specific range (scaling)

In [None]:
# names of the float64 / int64 columns of X: these are the ones to scale
numeri = X.select_dtypes(include=['float64','int64']).columns
print(numeri.to_list())

**Scaling** comes _after_ test // train split

In [None]:
# Scale the numeric columns
# (alternative scaler: from sklearn.preprocessing import StandardScaler)
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler maps the values it was fitted on onto the range zero to 1.
# ColumnTransformer returns a numpy.ndarray so we lose the feature names;
# we process one column at a time to preserve the dataframe.

# sklearn docs say
#   "Don't cheat - fit only on training data, then transform both"
#   fit() expects a 2D array, hence reshape(-1, 1) for a single column

for i in numeri:
    # fit on the training column only ...
    train_col = np.array(X_train[i]).reshape(-1, 1)
    scale = MinMaxScaler().fit(train_col)
    X_train[i] = scale.transform(train_col)

    # ... then reuse that fitted scaler for the matching test column
    test_col = np.array(X_test[i]).reshape(-1, 1)
    X_test[i] = scale.transform(test_col)
    

**<br>Imports** for performance metrics

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef

**<br>Function** to calculate performance metrics

In [None]:
def show_metrics(y_test,ygx,lbls):
    """Print a performance report for one set of predictions.

    The report contains the confusion matrix, the false positive / false
    negative rates (per label when there are more than two labels, plus
    macro and pooled figures), sklearn's classification report and the
    Matthews correlation coefficient (overall, and one-vs-rest per label
    when there are more than two labels).

    Parameters
    ----------
    y_test : the true labels
    ygx    : the predicted labels, in the same order as y_test
    lbls   : the class labels; fixes the row / column order of the
             confusion matrix and the row order of the classification report

    Returns
    -------
    The separator string '~~~~'.
    """
    # classification_report expects plain strings as display names
    names = [str(x) for x in lbls]

    # rows are the true labels, columns are the predicted labels
    tptn_df = pd.DataFrame(confusion_matrix(y_test, ygx, labels=lbls),
                           index=['true:{:}'.format(x) for x in lbls],
                           columns=['pred:{:}'.format(x) for x in lbls])
    print(tptn_df)
    print("\n~~~~")

    # one-vs-rest counts for every label, taken from the confusion matrix
    TP = np.diag(tptn_df.values)
    FP = tptn_df.values.sum(axis=0) - TP
    FN = tptn_df.values.sum(axis=1) - TP
    TN = np.sum(tptn_df.values) - (FP + FN + TP)

    # a label with no true (or no negative) samples gives a 0/0 rate;
    # report it as nan instead of emitting a numpy runtime warning
    with np.errstate(divide='ignore', invalid='ignore'):
        # false positive rates
        FPR = FP/(FP+TN)
        # false negative rates
        FNR = FN/(TP+FN)
        # overall: rates computed from the counts summed over all labels
        sfpr = FP.sum()/(FP.sum()+TN.sum())
        sfnr = FN.sum()/(TP.sum()+FN.sum())

    if len(lbls) >2:
        for x in range(len(lbls)):
            print('{:>12} : '.format(lbls[x]),
                  'FPR = %.3f   FNR = %.3f' % (FPR[x], FNR[x]))
        print()

    print('{:>12} : '.format('macro avg'),
          'FPR = %.3f   FNR = %.3f'  % (FPR.mean(), FNR.mean()))
    print('weighted avg :  FPR = %.3f   FNR = %.3f' % (sfpr, sfnr))

    print("\n~~~~")

    #    macro average: unweighted mean per label
    # weighted average: support-weighted mean per label
    # labels=lbls ties every display name to its own label, so the report
    # rows stay in the same order as the confusion matrix above
    print(classification_report(y_test, ygx, labels=lbls, digits=3,
                                target_names=names))

    print("~~~~")
    # Matthews correlation coefficient:
    #   correlation between prediction and ground truth
    #   (+1 perfect, 0 random prediction, -1 inverse)
    mcc = matthews_corrcoef(y_test, ygx)
    print('MCC: Overall :  %.3f' % mcc)
    if len(lbls) >2:
        # one-vs-rest MCC: each label against all the others
        for tc in lbls:
            bin_mcc = matthews_corrcoef(y_test == tc, ygx == tc)
            print('{:>12} :'.format(tc),' %.3f' % bin_mcc)

    return '~~~~'

***
**Hyperparameter Tuning**
> General pattern:<br>
    1. Classifier selection<br> 
    2. Fit and Predict<br>
    3. Bias-Variance Tradeoff<br>
    4. Select strategy and hyperparameters<br>
    5. Plug in the best parameter values<br>
    6. Fit and Predict<br>
    7. Bias-Variance Tradeoff

 ***
 **_These examples only work with one classifier_**
 ***

In [None]:
# Classifier Selection - Only One!
from sklearn.neighbors import KNeighborsClassifier 

# models is a list of (display name, estimator) pairs
models = [("K-NNeighbors", KNeighborsClassifier())]

# show the name and the estimator, one per line
print(*models[0], sep='\n')

**<br>Fit and Predict** (standard block)

In [None]:
# fit every model on the training data, predict the test data, report metrics
results = []
for name, clf in models:
    print('Confusion Matrix:', name)
    # fit() returns the fitted estimator, so predict() can be chained onto it
    ygx = clf.fit(X_train, y_train).predict(X_test)
    results.append((name, ygx))

    # Passing clf.classes_ (note the trailing _) is an easy way to ensure
    # that the confusion matrix rows and columns are labeled exactly as
    # the classifier has coded the classes
    show_metrics(y_test, ygx, clf.classes_)
    print('\nParameters: ', clf.get_params(), '\n\n')

***
***

**<br>Imports** for Bias - Variance Decomposition

In [None]:
from mlxtend.evaluate import bias_variance_decomp
## bias_variance_decomp() requires 
##    1. numpy ndarrays
##    2. numeric targets
from sklearn.preprocessing import LabelEncoder

**<br>Function** to calculate Bias - Variance Decomposition

In [None]:
def bias_var_metrics(clf,folds=200):
    """Print the bias / variance decomposition of the 0-1 loss for clf.

    Uses the notebook-level X_train, y_train, X_test and y_test.
    Slow, because bias_variance_decomp runs `folds` bootstrap rounds.

    Parameters
    ----------
    clf   : the classifier to evaluate
    folds : number of bootstrap rounds (num_rounds), default 200
    """
    # bias_variance_decomp() needs numeric targets.
    # One encoder, fitted on the training labels, encodes both subsets, so a
    # given label always maps to the same integer. Two independent encoders
    # would disagree whenever the test subset lacks one of the classes.
    encoder = LabelEncoder().fit(y_train)
    ytrain = encoder.transform(y_train)
    ytest = encoder.transform(y_test)

    print('Bias // Variance Decomposition:', clf)
    # .values: bias_variance_decomp() needs numpy ndarrays, not dataframes
    avg_loss, avg_bias, avg_var = bias_variance_decomp(
        clf, X_train.values, ytrain, X_test.values, ytest, 
        loss='0-1_loss', num_rounds=folds, random_seed=44)
    print('   Average bias: %.3f' % avg_bias)
    print('   Average variance: %.3f' % avg_var)
    print('   Average expected loss: %.3f  "Goodness": %.3f' % (avg_loss, (1-avg_loss)))
    print()

 ***

**<br>Bias - Variance Decomposition** (standard block)

In [None]:
# decomposition for the selected classifier; 20 rounds instead of the
# default 200 to keep the run time down
bias_var_metrics(clf=models[0][1], folds=20)

**<br>Imports** for parameter testing

In [None]:
# Default scorer for classification is sklearn.metrics.accuracy_score 
# In unbalanced classification, the accuracy score is often uninformative
# For the list of options see
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

# average for each label weighted by support (number of true instances for each label)
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score

#cross-validation ALWAYS takes a long time
from time import time

 ***

**<br>Parameter Testing**
>Select this block - Go to the Run menu - Run all Above
<br>Then pick a strategy and run the blocks one at a time

 ***

**<br>Strategy: simple loop (no CV), manual selection**

In [None]:
# ---- Specific to each classifier! ---- #
clf = models[0][1]

# hyperparameter values to test:
#   one held constant ...
cname = 'n_neighbors'
cval = 3

#   ... and one that takes every value in paramrange
paramname = 'weights'
paramrange = ['uniform', 'distance']
# ----    ---- #

for param in paramrange:
    print('{} : {} = {}   {} = {}'.format(models[0][0], cname, cval, paramname, param))

    # use a dict to set multiple parameters at once
    pdict = {cname:cval, paramname:param}
    clfp = clf.set_params(**pdict)

    # fit and predict with the new values
    pred = clfp.fit(X_train,y_train).predict(X_test)
    lbls = clfp.classes_

    # score the predictions, then print both scores
    waa = balanced_accuracy_score(y_test, pred)
    waf = f1_score(y_test, pred, average='weighted')
    print('Weighted Average Accuracy:  %.3f' % waa)
    print('Weighted Average f1_score:  %.3f' % waf)

    print()

*<br>MANUAL EDITING: Plug in the best parameter values*

In [None]:
## -- put best parameters into the model --##
## be sure to use a valid value from the result above !!
# cname, cval and paramname are reused from the parameter-testing cell,
# so that cell must have been run first
param = 'distance'
# ----    ---- #
# use a dict to set multiple parameters
best_vals = {cname:cval, paramname:param}
models[0][1].set_params(**best_vals)

# show the model with the chosen values in place
print(models[0][0],': Best Values ')
print(models[0][1])

**<br>Fit and Predict**

In [None]:
# paste in the standard block from above

In [None]:
# fit every model on the training data, predict the test data, report metrics
results = []
for name, clf in models:
    print('Confusion Matrix:', name)
    # fit() returns the fitted estimator, so predict() can be chained onto it
    ygx = clf.fit(X_train, y_train).predict(X_test)
    results.append((name, ygx))

    # Passing clf.classes_ (note the trailing _) is an easy way to ensure
    # that the confusion matrix rows and columns are labeled exactly as
    # the classifier has coded the classes
    show_metrics(y_test, ygx, clf.classes_)
    print('\nParameters: ', clf.get_params(), '\n\n')

**<br>Bias - Variance Decomposition**

In [None]:
# paste in the standard block from above

***
***

**<br>Strategy: Single Parameter Cross-Validation Curve**

In [None]:
# StratifiedKFold: a variation of KFold that returns stratified folds,
#   preserving the percentage of samples for each class
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import ValidationCurve

# ---- Specific to each classifier! ---- #
clf = models[0][1]

## hyperparameter (can only graph one) and the values to try for it
##   np.arange takes start, stop, and step
paramname = "n_neighbors"
paramrange = np.arange(2, 9, 1)

## or, for example:
#paramname = "p"
#paramrange = [1, 2, 4]

## or, for example:
#paramname = 'weights'
#paramrange = ['uniform', 'distance']

# ----  Cross Validation  ---- #
# number of rounds
folds = 3

# scorer = [name shown in the printout, sklearn scoring string]
scorer = ['wtd.avg.accuracy', 'balanced_accuracy']
#scorer = ['wtd.avg.f1_score', 'f1_weighted']
# ----    ---- #

# start the timer
trs = time()

print('{} \t(timer started)'.format(models[0][0]))
print('Validation Curve for parameter [ {} ], scoring = {}'.format(paramname, scorer[0]))
print()

# shuffled, stratified folds with a fixed seed so the curve is repeatable
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state = 11)

# Create the validation curve visualizer, then fit and draw it
viz = ValidationCurve(
    clf,
    param_name=paramname,
    param_range=paramrange,
    logx=True,
    cv=skf,
    scoring=scorer[1],
    n_jobs= -1)

viz.fit(X_train, y_train)
viz.show()

# elapsed time since the timer was started
tre = time() - trs
print("Run Time {} seconds\n".format(round(tre,2)))

In [None]:
## Copy & Paste blocks:
# 5. MANUAL EDITING: Plug in the best parameter values
# 6. Fit and Predict (standard block)
# 7. Bias-Variance Tradeoff (standard block)

***
***

**<br>Strategy: Bias - Variance Decomposition: Parameter testing**

In [None]:
# ---- Specific to each classifier! ---- #
clf = models[0][1]

# hyperparameter values to test: one constant, one set
cname = 'n_neighbors'
cval = 3

paramname = "p"
paramrange = [1, 2, 4]

# ----  Cross Validation  ---- #
# adjust num_rounds (default=200) for bootstrap cross validation
folds = 3
# ----    ---- #

# start the timer
trs = time()

## bias_variance_decomp() requires 
##    1. numpy ndarrays
##    2. numeric targets
# One encoder, fitted on the training labels, encodes both subsets, so a
# given label always maps to the same integer. Two independent encoders
# would disagree whenever the test subset lacks one of the classes.
encoder = LabelEncoder().fit(y_train)
ytrain = encoder.transform(y_train)
ytest = encoder.transform(y_test)

# one entry per tested value, in the same order as paramrange
bias, var, err = [], [], []

for parm in paramrange:
    print(models[0][0],'(',cname,'=',cval,' ',paramname,'=',parm,')',end='') # no newline at the end

    # use a dict to set multiple parameters
    pdict = {cname:cval, paramname:parm}
    clfp = clf.set_params(**pdict)

    # decompose the loss of the re-configured classifier
    avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        clfp, X_train.values, ytrain, X_test.values, ytest, 
        loss='0-1_loss', num_rounds=folds, random_seed=11)
    err.append(avg_expected_loss)
    bias.append(avg_bias)
    var.append(avg_var)

    msg="  Bias: %0.3f  Variance: %0.3f  E.loss: %0.3f" % (avg_bias, avg_var, avg_expected_loss)
    print(msg)

tre = time() - trs
print ("\tRun Time {} seconds".format(round(tre,2)) + '\n')

In [None]:
# line plot of the decomposition results: one point per tested value
import matplotlib.pyplot as plt

# err, bias and var hold exactly one entry per value in paramrange, so the
# tested values themselves are the x axis (a fixed range(3,17) has a
# different length and makes plot() fail)
xvals = list(paramrange)
plt.plot(xvals, err, 'b', label = 'total_error')
plt.plot(xvals, bias, 'k', label = 'bias')
plt.plot(xvals, var, 'y', label = 'variance')

# a figure should stand alone: name the axes and show the line labels
plt.xlabel(paramname)
plt.ylabel('0-1 loss')
plt.legend()
plt.show()

In [None]:
## Copy & Paste blocks:
# 5. MANUAL EDITING: Plug in the best parameter values
# 6. Fit and Predict (standard block)
# 7. Bias-Variance Tradeoff (standard block)

***
***

**<br>Strategy: Parameter grid search**

In [None]:
# Each parameter increases time exponentially!
from sklearn.model_selection import GridSearchCV

# ---- Specific to each classifier! ---- #
clf = models[0][1]

# hyperparameter: values to test (minimum 2x2 Grid)
value_grid = dict(n_neighbors=[3, 4],
                  p=[1, 2, 4],
                  weights=['uniform', 'distance'])

# ----  Cross Validation  ---- #
# number of rounds
folds = 3

# scorer = [name shown in the printout, sklearn scoring string]
scorer = ['wtd.avg.accuracy', 'balanced_accuracy']
#scorer = ['wtd.avg.f1_score', 'f1_weighted']
# ----    ---- #

# Start the timer
trs = time()

print('GridSearchCV: {} folds, timer started'.format(folds))
print('{} with scoring = {}'.format(clf, scorer[0]))

grid_search = GridSearchCV(
    estimator=clf,
    param_grid=value_grid,
    scoring=scorer[1],
    cv=folds,
    verbose=1,
    n_jobs= -1)
grid_search.fit(X_train, y_train)

## details: mean score (+/- two standard deviations) for every combination
cv_res = grid_search.cv_results_
means = cv_res['mean_test_score']
stds = cv_res['std_test_score']
for mean, std, params in zip(means, stds, cv_res['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

# Print best parameters and score
print("\tBest parameters: {}".format(grid_search.best_params_))
print("\tBest CV score: {:.3f}".format(grid_search.best_score_))

# elapsed time since the timer was started
tre = time() - trs
print("Run Time {} seconds\n".format(round(tre,2)))

*<br>Plug in the best parameter values*

In [None]:
# grid_search.best_params_ is a dict of the best parameters,
# so it can be unpacked straight into set_params()
models[0][1].set_params(**grid_search.best_params_)

# show the model with the best values in place
print(models[0][0],': Best Values ')
print(models[0][1])

In [None]:
## Copy & Paste blocks:
# 6. Fit and Predict (standard block)
# 7. Bias-Variance Tradeoff (standard block)

***
***