> **tomato juice dataset**
<br>` 'quality' is the target feature for classification `
<br>` the other features are chemical properties of our product `

**Import the main libraries**

In [None]:
import numpy as np
import pandas as pd

import warnings
# suppress all warnings for the whole session
# NOTE(review): this also hides convergence and deprecation warnings raised by
# the libraries used in later cells
warnings.filterwarnings("ignore")

**Import the Dataset**

In [None]:
## file path: forward slashes are accepted on Windows as well as macOS/Linux,
## so one spelling keeps the notebook runnable on every platform
## (the previous '..\\datasets\\...' form only resolves on Windows)
df = pd.read_csv('../datasets/tomatjus.csv')

# shape gives the dimensions of the dataset as (rows, columns)
print('Dataset dimensions: {} rows, {} columns'.format(
    df.shape[0], df.shape[1]))

In [None]:
# summary of the dataframe: column names, non-null counts and dtypes
df.info()

***
**Data Preparation and EDA** (unique to this dataset)
* _Check for missing values_
* _Quick visual check of unique values_
* _Split the classification feature out of the dataset_
* _Check column names of categorical attributes ( for get_dummies() )_
* _Check column names of numeric attributes ( for Scaling )_

**_Let's skip the checking_**

**Classification target feature**
<br>_Make it a multi-class problem, using text labels_

In [None]:
##  divide into classes by giving a range for quality
##  Make it a multi-class problem: {3,4,5} {6} {7,8}
# pd.cut intervals are right-inclusive here: (2,5]  (5,6]  (6,8]
bins = (2, 5, 6, 8)
group_names = ['Average', 'Premium', 'Special']

# Only bin while the column still holds the raw numeric scores, so that
# re-running this cell does not try to cut the already-labelled column.
if pd.api.types.is_numeric_dtype(df['quality']):
    df['quality'] = pd.cut(df['quality'], bins = bins, labels = group_names)

* Split the classification feature out of the dataset 

In [None]:
## Target: the feature being predicted ("the Right Answer")
labels_col = 'quality'
y = df[labels_col]

## Predictors: every remaining column
# drop() without inplace returns a new dataframe, so X is independent of df
X = df.drop(columns=[labels_col])

In [None]:
# generate a sorted list of unique labels to use later
# unique_labels returns an ordered array of the distinct values found in y
from sklearn.utils.multiclass import unique_labels
targetlabels = unique_labels(y)

***
**<br>Create Test // Train Datasets**
> Split X and y datasets into Train and Test subsets,<br>keeping relative proportions of each class (stratify)

In [None]:
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; stratify=y keeps the class
# proportions the same in both subsets, random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=50,
)

**<br>Target Label Distributions**

In [None]:
# shape gives the dimensions of each subset
print('X_train: {} rows, {} columns'.format(X_train.shape[0], X_train.shape[1]))
print('X_test:  {} rows, {} columns'.format(X_test.shape[0], X_test.shape[1]))
print()
print('y_train: {} rows, 1 column'.format(y_train.shape[0]))
print('y_test:  {} rows, 1 column'.format(y_test.shape[0]))
print()

## Here's a nice report:
# 1. series to dataframe conversion
my_train = y_train.to_frame()
my_test = y_test.to_frame()
# 2. count each label; sort_index() lists the classes in the same order for
#    both subsets, and the count columns get distinct names so the combined
#    report no longer has two columns both called 'quality'
av_train = my_train[labels_col].value_counts().sort_index().to_frame('n_train')
av_test = my_test[labels_col].value_counts().sort_index().to_frame('n_test')
# 3. add the percentage of each label within its own subset
av_train['pct_train'] = (100 * av_train['n_train'] / av_train['n_train'].sum()).round(2)
av_test['pct_test'] = (100 * av_test['n_test'] / av_test['n_test'].sum()).round(2)
# 4. combine the dataframes side by side (aligned on the label index)
av_tt = pd.concat([av_train, av_test], axis=1)
# 5. print the report
print('Frequency and Distribution of labels')
print(av_tt)

***
Next are standard steps for all datasets: _scaling, classifiers, results_

**Scaling** comes _after_ test // train split

In [None]:
# scaling the Numeric columns
# from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# StandardScaler: zero mean / unit variance (values are NOT bounded)
# MinMaxScaler:   rescales the fitted data to the range zero to 1
# ColumnTransformer returns a numpy.ndarray so we lose the feature names;
# we process one column at a time to preserve the dataframe

# sklearn docs say
#   "Don't cheat - fit only on training data, then transform both"
#   fit() expects a 2D array: reshape(-1, 1) turns one column into (n_rows, 1)

# The list of numeric columns was never built (the EDA checks were skipped),
# so derive it here; otherwise this cell fails with NameError on a fresh kernel.
# NOTE(review): assumes every numeric predictor should be scaled - confirm.
numeri = X_train.select_dtypes(include='number').columns.tolist()

# work on explicit copies so the column assignments below never write into
# an object that pandas still regards as a slice of X
X_train = X_train.copy()
X_test = X_test.copy()

for i in numeri:
    train_col = X_train[i].to_numpy().reshape(-1, 1)
    test_col = X_test[i].to_numpy().reshape(-1, 1)

    # fit on the training column only, then apply the same mapping to both
    scale = MinMaxScaler().fit(train_col)
    # ravel(): assign a flat 1-D array back to the column
    X_train[i] = scale.transform(train_col).ravel()
    X_test[i] = scale.transform(test_col).ravel()
    

**<br>Function** to calculate performance metrics

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef

In [None]:
def show_metrics(y_test,ygx,lbls):
    """Print a set of classification metrics for one model's predictions.

    Printed, in order: the confusion matrix, false positive / false negative
    rates (per class when there are more than two classes, then averaged),
    sklearn's classification report, and the Matthews correlation coefficient
    (overall, plus one-vs-rest per class when there are more than two classes).

    Parameters
    ----------
    y_test : array-like
        True labels.
    ygx : array-like
        Predicted labels, same length as ``y_test``.
    lbls : sequence
        Class labels; fixes the row/column order of the confusion matrix and
        the row order of the classification report.

    Returns
    -------
    str
        The separator string '~~~~'.
    """
    lbls = list(lbls)
    # rows of the confusion matrix are the true labels, columns the predictions
    tptn_df = pd.DataFrame(confusion_matrix(y_test, ygx, labels=lbls),
                           index=['true:{:}'.format(x) for x in lbls],
                           columns=['pred:{:}'.format(x) for x in lbls])
    print(tptn_df)
    print("\n~~~~")

    # one-vs-rest counts per class, derived from the confusion matrix
    TP = np.diag(tptn_df.values)
    FP = tptn_df.values.sum(axis=0) - TP
    FN = tptn_df.values.sum(axis=1) - TP
    TN = np.sum(tptn_df.values) - (FP + FN + TP)

    # a class with no negatives / no positives yields nan; silence numpy's
    # divide warnings locally instead of relying on a global warnings filter
    with np.errstate(divide='ignore', invalid='ignore'):
        # false positive rates
        FPR = FP/(FP+TN)
        # false negative rates
        FNR = FN/(TP+FN)
        # overall: pooled counts across all classes
        sfpr = FP.sum()/(FP.sum()+TN.sum())
        sfnr = FN.sum()/(TP.sum()+FN.sum())

    if len(lbls) >2:
        for lbl, fpr, fnr in zip(lbls, FPR, FNR):
            print('{:>12} : '.format(lbl),
                  'FPR = %.3f   FNR = %.3f' % (fpr, fnr))
        print()

    print('{:>12} : '.format('macro avg'),
          'FPR = %.3f   FNR = %.3f'  % (FPR.mean(), FNR.mean()))
    print('weighted avg :  FPR = %.3f   FNR = %.3f' % (sfpr, sfnr))

    print("\n~~~~")

    #    macro average: unweighted mean per label
    # weighted average: support-weighted mean per label
    # labels= is passed explicitly so that each name in target_names is
    # attached to the matching class, whatever order lbls arrives in
    print(classification_report(y_test, ygx, digits=3, labels=lbls,
                                target_names=[str(x) for x in lbls]))

    print("~~~~")
    # Matthews correlation coefficient:
    #   correlation between prediction and ground truth
    #   (+1 perfect, 0 random prediction, -1 inverse)
    mcc = matthews_corrcoef(y_test, ygx)
    print('MCC: Overall :  %.3f' % mcc)
    if len(lbls) >2:
        # asarray: element-wise comparison also works for plain Python lists
        y_true_arr = np.asarray(y_test)
        y_pred_arr = np.asarray(ygx)
        for tc in lbls:
            # one-vs-rest MCC for class tc
            bin_mcc = matthews_corrcoef(y_true_arr == tc, y_pred_arr == tc)
            print('{:>12} :'.format(tc),' %.3f' % bin_mcc)

    return '~~~~'

**<br>Classifier Selection**

In [None]:
# prepare list of (display name, unfitted estimator) pairs
models = []

# seed for the estimators that use randomness, so that a
# Restart-and-Run-All reproduces the same scores
model_seed = 50

##  --  Linear  --  ## 
from sklearn.linear_model import LogisticRegression 
models.append (("LogReg",LogisticRegression())) 
#from sklearn.linear_model import SGDClassifier 
#models.append (("StocGradDes",SGDClassifier())) 
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
#models.append(("LinearDA", LinearDiscriminantAnalysis())) 
#from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 
#models.append(("QuadraticDA", QuadraticDiscriminantAnalysis())) 

##  --  Support Vector  --  ## 
#from sklearn.svm import SVC 
#models.append(("SupportVectorClf", SVC())) 
#from sklearn.svm import LinearSVC 
#models.append(("LinearSVC", LinearSVC())) 
#from sklearn.linear_model import RidgeClassifier
#models.append (("RidgeClf",RidgeClassifier())) 

##  --  Non-linear  --  ## 
from sklearn.tree import DecisionTreeClassifier 
models.append (("DecisionTree",DecisionTreeClassifier(random_state=model_seed))) 
#from sklearn.naive_bayes import GaussianNB 
#models.append (("GaussianNB",GaussianNB())) 
from sklearn.neighbors import KNeighborsClassifier 
models.append(("K-NNeighbors", KNeighborsClassifier())) 

##  --  Ensemble: bagging  --  ## 
from sklearn.ensemble import RandomForestClassifier 
models.append(("RandomForest", RandomForestClassifier(random_state=model_seed))) 
##  --  Ensemble: boosting  --  ## 
#from sklearn.ensemble import AdaBoostClassifier 
#models.append(("AdaBoost", AdaBoostClassifier())) 
#from sklearn.ensemble import GradientBoostingClassifier 
#models.append(("GradientBoost", GradientBoostingClassifier())) 

##  --  NeuralNet (simplest)  --  ## 
#from sklearn.neural_network import MLPClassifier 
#models.append(("MultiLayerPtron", MLPClassifier())) 

print(models)

**<br>Fit and Predict**

In [None]:
# Train every candidate on the training split, predict the held-out split,
# and print its metrics and hyper-parameters.
results = []
for name, clf in models:
    print('Confusion Matrix:', name)

    # fit() returns the fitted estimator, so prediction can be chained onto it
    ygx = clf.fit(X_train, y_train).predict(X_test)
    results.append((name, ygx))

    # clf.classes_ (note the trailing underscore) holds the labels exactly as
    # the fitted classifier coded them, so the confusion matrix rows and
    # columns follow the classifier's own ordering
    show_metrics(y_test, ygx, clf.classes_)
    print('\nParameters: ', clf.get_params(), '\n\n')

***
**Model Comparison: Cross Validation**

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from time import time

# Strategy: StratifiedKFold is the KFold variant whose folds preserve
# the percentage of samples of each class
folds = 5
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=11)

# Scoring: the default classification scorer is accuracy, which is often
# uninformative for unbalanced classes. Options are listed at
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
#
# scorer = [display name, sklearn scoring key]; the '*_weighted' keys average
# the per-label score weighted by support (number of true instances per label)
#scorer = ['wtd.avg.Precision', 'precision_weighted']
#scorer = ['wtd.avg.Recall', 'recall_weighted']
scorer = ['wtd.avg.f1_Score', 'f1_weighted']

trs = time()
print('KFold CV: %i folds with scoring = %s \n\t timer started' % (folds, scorer[0]))

# per-model arrays of fold scores, and the matching model names
results, names = [], []
for name, model in models:
    print(name, end='')    # stay on the same line; the scores follow
    cv_results = cross_val_score(model, X_train, y_train, cv=skf, scoring=scorer[1])
    names.append(name)
    results.append(cv_results)
    msg = ":\t%s = %0.3f +/- (%0.3f)" % (scorer[0], cv_results.mean(), cv_results.std())
    print(msg)

tre = time() - trs
print ("\tRun Time {} seconds".format(round(tre,2)) + '\n')

In [None]:
# boxplot model comparison
# One box and whisker plot for each algorithm's sample of results.
# The box shows the middle 50 percent of the data,
# the orange line in the middle of each box shows the median of the sample,
# and the green triangle in each box shows the mean of the sample.

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
# draw on the explicit Axes so the tick labels below apply to this plot
ax.boxplot(results, showmeans=True)
# one box per entry in results, in the same order as names
ax.set_xticklabels(names)
plt.show()
print(names)

In [None]:
# individual model per-fold results
# one cv_scores call per model, reusing the same fold strategy (skf) and
# scoring key as the cross-validation comparison
from yellowbrick.model_selection import cv_scores
for name, model in models:
    viz = cv_scores(model, X_train, y_train, cv=skf, scoring=scorer[1])

***
**Model Comparison: Bias - Variance Decomposition**

In [None]:
from mlxtend.evaluate import bias_variance_decomp
from sklearn.preprocessing import LabelEncoder

## bias_variance_decomp() requires 
##    1. numpy ndarrays
##    2. numeric targets

# slow because every round refits the model on a bootstrap sample
# (the library default is num_rounds=200); here folds is the number of rounds
folds = 12

from time import time
trs = time()
print('Bias-Variance: Model Comparison \n\t timer started')

# requirement 2: a single encoder fitted on the training labels, so that the
# train and test targets are guaranteed to share the same integer codes
label_encoder = LabelEncoder().fit(y_train)
ytrain = label_encoder.transform(y_train)
ytest = label_encoder.transform(y_test)

# requirement 1: the bootstrap step indexes rows by position, which a
# dataframe does not support with plain [] indexing
Xtrain_arr = np.asarray(X_train)
Xtest_arr = np.asarray(X_test)

cn, bias, var, err = [], [], [], []

for name, clf in models:
    avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        clf, Xtrain_arr, ytrain, Xtest_arr, ytest, 
        loss='0-1_loss', num_rounds=folds, random_seed=150)
    err.append(avg_expected_loss)
    bias.append(avg_bias)
    var.append(avg_var)
    cn.append(name)
    
    print(name,end='')    # no newline at the end
    msg=": Bias: %0.3f  Variance: %0.3f  E.loss: %0.3f" % (avg_bias, avg_var, avg_expected_loss)
    print(msg)
    
tre = time() - trs
print ("\tRun Time {} seconds".format(round(tre,2)) + '\n')

In [None]:
# grouped bar plot: expected loss, bias and variance side by side per model
# (bars drawn at the same x position would hide one another)
import matplotlib.pyplot as plt

rx = np.arange(len(cn))     # the x locations for the groups
bar_w = 0.25                # three bars per group fit inside one x unit
fig, ax = plt.subplots()
ax.bar(rx - bar_w, err, color = 'b', width = bar_w, label = 'E.loss')
ax.bar(rx, bias, color = 'g', width = bar_w, label = 'Bias')
ax.bar(rx + bar_w, var, color = 'r', width = bar_w, label = 'Var')
ax.legend()
ax.set_title('Bias-Variance Decomposition')
ax.set_xticks(rx)
# name each group after its model instead of leaving bare positions
ax.set_xticklabels(cn)
plt.show()
print(cn)