> **tomato juice dataset**
<br>` 'quality' is the target feature for classification `
<br>` the other features are chemical properties of our product `

**Import the main libraries**

In [None]:
import numpy as np
import pandas as pd

from time import time

_import the local library_

In [None]:
# Put the parent folder (where the lib folder lives) at the front of the
# module search path, unless it has already been added.
import sys

if '..' not in sys.path:
    sys.path.insert(0, '..')

In [None]:
from mylib import show_labels_dist, show_metrics, bias_var_metrics

**Import the Dataset**

In [None]:
## file path: forward slashes are accepted by Python on Windows as well as on
## unix-like systems, so a single path works everywhere
## (the backslash form '..\\datasets\\tomatjus.csv' only resolves on Windows)
df = pd.read_csv('../datasets/tomatjus.csv')

# shape gives the dimensions of the dataset as (rows, columns)
print('Dataset dimensions: {} rows, {} columns'.format(
    df.shape[0], df.shape[1]))

In [None]:
# summary of the frame: column names, non-null counts and dtypes
df.info()

***
**Data Preparation and EDA** (unique to this dataset)
* _Check for missing values_
* _Quick visual check of unique values_
* _Split the classification feature out of the dataset_
* _Check column names of categorical attributes ( for get_dummies() )_
* _Check column names of numeric attributes ( for Scaling )_

**Check for missing values**

In [None]:
# Report every column that contains missing values.
# isnull() marks each missing cell as True; summing the mask per column
# gives the number of missing values in that column.
# (Comparing len(pd.notnull(col)) with len(df) can never detect anything,
#  because the mask always has one entry per row.)
cnt = 0
print('Missing Values - ')
for col, n_missing in df.isnull().sum().items():
    if n_missing > 0:
        cnt = cnt + 1
        print('\t', col, ':', int(n_missing), 'null values')
print('Total', cnt, 'features with null values')

# address missing values here

**Quick visual check of unique values, deal with unique identifiers**

In [None]:
# Count the distinct values of every column, then pick out
#   n_eq_one: columns holding a single value
#   n_eq_all: columns with as many distinct values as there are rows
n_rows = df.shape[0]
unique_counts = {col: len(df[col].unique()) for col in df.columns}

print('Unique value count (',df.shape[0],'Rows in the dataset )')
for col, n_unique in unique_counts.items():
    print(col, ' ::> ', n_unique)

n_eq_one = [col for col, n_unique in unique_counts.items() if n_unique == 1]
n_eq_all = [col for col, n_unique in unique_counts.items() if n_unique == n_rows]

In [None]:
# Remove the columns that hold only one value
if n_eq_one:
    print('Dropping single-valued features')
    print(n_eq_one)
    df.drop(columns=n_eq_one, inplace=True)

# Remove (or bin instead) the columns whose number of unique values
# equals the number of rows
if n_eq_all:
    print('Dropping unique identifiers')
    print(n_eq_all)
    df.drop(columns=n_eq_all, inplace=True)

# continue with feature selection / feature engineering

**<br>Classification target feature**
<br>"the Right Answers", or more formally "the desired outcome"
<br>Must be in a separate dataset for classification ...

_Make it a multi-class problem, using text labels_

In [None]:
##  divide into classes by giving a range for quality
##  Make it a multi-class problem: {3,4,5} {6} {7,8}
##  pd.cut intervals are closed on the right: (2,5] (5,6] (6,8]
bins = (2, 5, 6, 8)
group_names = ['Average', 'Premium', 'Special']

# Bin only while 'quality' still holds the raw scores, so that re-running
# this cell after the column has become categorical does not fail.
if df['quality'].dtype.name != 'category':
    df['quality'] = pd.cut(df['quality'], bins = bins, labels = group_names)

# Scores outside (2, 8] receive no label (NaN). Report them here, because
# missing labels would break or distort the later train/test split and fit.
n_unlabelled = int(df['quality'].isnull().sum())
if n_unlabelled > 0:
    print('Warning:', n_unlabelled,
          "rows have a 'quality' value outside the bins and are unlabelled")

* Split the classification feature out of the dataset 

In [None]:
## Target: the feature being predicted ("the Right Answer")
labels_col = 'quality'
y = df[labels_col]

## Predictors: every other column
# drop() without inplace returns a new dataframe, so X is built in one step
# and df itself keeps the target column
X = df.drop(columns=[labels_col])

**<br>Check column names of categorical attributes**
<br>Features with text or category values need to be encoded as numbers
before a classifier can use them (for example with `pd.get_dummies()`).

In [None]:
# names of the predictor columns stored as text or pandas categories
categorical_dtypes = ['object', 'category']
categori = X.select_dtypes(include=categorical_dtypes).columns
print(list(categori))

**<br>Check column names of numeric attributes**
<br>Features with numeric values need to be normalised by changing the values to
small numbers in a specific range (scaling). _Note that scaling comes_ after _the test//train split!_

In [None]:
# names of the predictor columns stored as 64-bit floats or integers;
# this is the column list the scaling step iterates over
numeric_dtypes = ['float64', 'int64']
numeri = X.select_dtypes(include=numeric_dtypes).columns
print(list(numeri))

***
**<br>Create Test // Train Datasets**
> Split X and y datasets into Train and Test subsets,<br>keeping relative proportions of each class (stratify)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# stratify=y keeps the relative proportion of each class in both subsets;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=50,
    stratify=y,
)

***
Next are standard steps for all datasets: _scaling, classifiers, results_

**Scaling** comes _after_ test // train split

In [None]:
# data before normalization: first rows of the test predictors,
# kept for comparison with the same rows after scaling
X_test.head()

In [None]:
# scaling the Numeric columns
# MinMaxScaler maps each feature to the range zero to 1 (as seen in the
#   training data); StandardScaler instead centres each feature on zero with
#   unit variance, and its output is not confined to a fixed range

# from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# sklearn docs say
#   "Don't cheat - fit only on training data, then transform both"

# Work on independent copies: the frames returned by train_test_split are
# taken from X, and assigning columns into them can raise
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

num_cols = list(numeri)
if num_cols:
    # One scaler for all numeric columns. MinMaxScaler scales every column
    # independently, so the result matches fitting one scaler per column.
    scale = MinMaxScaler().fit(X_train[num_cols])
    X_train[num_cols] = scale.transform(X_train[num_cols])
    X_test[num_cols] = scale.transform(X_test[num_cols])

In [None]:
# data after normalization: the same preview as before scaling,
# now showing the transformed numeric columns
X_test.head()

**<br>Classifier Selection**

In [None]:
# Build the list of (name, estimator) pairs that the later cells loop over.
# Each candidate is an import line followed by a models.append(...) line;
# uncomment both lines of a pair to include that classifier in the run.
# The visualisation cells further down only use the first entry of the list.

# prepare list
models = []

##  --  Linear  --  ## 
#from sklearn.linear_model import LogisticRegression 
#models.append (("LogReg",LogisticRegression())) 
#from sklearn.linear_model import SGDClassifier 
#models.append (("StocGradDes",SGDClassifier())) 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
models.append(("LinearDA", LinearDiscriminantAnalysis())) 
#from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 
#models.append(("QuadraticDA", QuadraticDiscriminantAnalysis())) 

##  --  Support Vector  --  ## 
#from sklearn.svm import SVC 
#models.append(("SupportVectorClf", SVC())) 
#from sklearn.svm import LinearSVC 
#models.append(("LinearSVC", LinearSVC())) 
#from sklearn.linear_model import RidgeClassifier
#models.append (("RidgeClf",RidgeClassifier())) 

##  --  Non-linear  --  ## 
#from sklearn.tree import DecisionTreeClassifier 
#models.append (("DecisionTree",DecisionTreeClassifier())) 
#from sklearn.naive_bayes import GaussianNB 
#models.append (("GaussianNB",GaussianNB())) 
#from sklearn.neighbors import KNeighborsClassifier 
#models.append(("K-NNeighbors", KNeighborsClassifier())) 

##  --  Ensemble: bagging  --  ## 
#from sklearn.ensemble import RandomForestClassifier 
#models.append(("RandomForest", RandomForestClassifier())) 
##  --  Ensemble: boosting  --  ## 
#from sklearn.ensemble import AdaBoostClassifier 
#models.append(("AdaBoost", AdaBoostClassifier())) 
#from sklearn.ensemble import GradientBoostingClassifier 
#models.append(("GradientBoost", GradientBoostingClassifier())) 

##  --  NeuralNet (simplest)  --  ## 
#from sklearn.linear_model import Perceptron 
#models.append (("SingleLayerPtron",Perceptron())) 
#from sklearn.neural_network import MLPClassifier 
#models.append(("MultiLayerPtron", MLPClassifier()))

# show which classifiers are active for this run
print(models)

**<br>Target Label Distributions** (standard block)

In [None]:
# from our local library (mylib)
# NOTE(review): presumably reports how the target classes are distributed
#   across the train and test subsets - confirm against mylib
show_labels_dist(X_train,X_test,y_train,y_test)

**<br>Fit and Predict** (standard block)

In [None]:
# Fit each selected classifier on the training data, predict the test data,
# remember the predictions in `results`, and report timing and metrics.
results = []

# legend for the metrics printed for every model
for legend_line in (
    'macro average: unweighted mean per label',
    'weighted average: support-weighted mean per label',
    'MCC: correlation between prediction and ground truth',
    '     (+1 perfect, 0 random prediction, -1 inverse)\n',
):
    print(legend_line)

for name, clf in models:
    started = time()
    print('Confusion Matrix:', name)

    clf.fit(X_train, y_train)
    ygx = clf.predict(X_test)
    results.append((name, ygx))

    # elapsed time covers fitting and predicting only
    elapsed = time() - started
    print ("Run Time {} seconds".format(round(elapsed,2)) + '\n')

    # Passing clf.classes_ (note the trailing underscore) labels the
    # confusion matrix rows and columns exactly as the classifier has
    # coded the classes.
    show_metrics(y_test, ygx, clf.classes_)   # from our local library
    print('\nParameters: ', clf.get_params(), '\n\n')

**Bias - Variance Decomposition** (standard block)

In [None]:
# from our local library
# reduce (cross-validation) folds for faster results
# NOTE(review): how `folds` is used is defined inside bias_var_metrics
#   in mylib - confirm there before changing the value
folds = 20
# one decomposition report per selected classifier
for name, clf in models:
    print('Bias // Variance Decomposition:', name)
    bias_var_metrics(X_train,X_test,y_train,y_test,clf,folds)

***
**<br>Visualisations**

In [None]:
import matplotlib.pyplot as plt

* Class Balance

In [None]:
from yellowbrick.target import ClassBalance
# The ClassBalance visualizer has a "compare" mode,
#   to create a side-by-side bar chart instead of a single bar chart;
#   here it is given both the train and the test labels

# Instantiate the visualizer
visualizer = ClassBalance()
visualizer.fit(y_train, y_test)        # Fit the data to the visualizer
_ = visualizer.show()                  # Finalize and render the figure
# the return value of show() is bound to _ so that the notebook does not
# echo it underneath the chart

 ***
 **_These examples only work with one classifier_** for example
>models[0][1]  <br>models[1][1]  <br>models[2][1]
 ***

* Confusion matrix _(there are many alternatives)_<br>A "normalised" confusion matrix shows percentages rather than quantity

In [None]:
# import matplotlib.pyplot as plt
from scikitplot.metrics import plot_confusion_matrix as splot_cm
# results[0][1] holds the test-set predictions of the first classifier;
# normalize=False requests the un-normalised matrix
splot_cm(y_test, results[0][1], normalize=False)
plt.show()

In [None]:
# mlxtend confusion_matrix can show absolute counts, normalised values, or both

# import matplotlib.pyplot as plt
from mlxtend.evaluate import confusion_matrix as mlx_cnfmtx
from mlxtend.plotting import plot_confusion_matrix

# build the matrix from the first classifier's test-set predictions
cnfmat = mlx_cnfmtx(y_test, results[0][1])
# show_absolute and show_normed are both switched on, so each cell shows both
fig, ax = plot_confusion_matrix(conf_mat=cnfmat,
                                show_absolute=True,
                                show_normed=True,
#                                colorbar=True,
                                figsize=(4, 4))
plt.show()

***
* Plot ROC, Report AUC
<br>Note: This **will not work** for these classifiers, because
<br>they do not have a _predict_proba()_ method:
> LinearSVC()
<br>RidgeClassifier()
<br>Perceptron()

In [None]:
# import matplotlib.pyplot as plt
from scikitplot.metrics import plot_precision_recall, plot_roc

# take the first classifier in the list, fit it on the training data and
# get its predict_proba() output for the test data
clf = models[0][1]
clf.fit(X_train, y_train)
probas = clf.predict_proba(X_test)

# `probas` and plot_precision_recall are reused by the Precision-Recall cell
plot_roc(y_test, probas)
plt.show()

* Precision-Recall Curve

In [None]:
# import matplotlib.pyplot as plt

# This cell relies on `probas` and `plot_precision_recall` created by the
# ROC cell; uncomment below if you do not have ROC_AUC above
#from scikitplot.metrics import plot_precision_recall, plot_roc

#clf = models[0][1]
#clf.fit(X_train, y_train)
#probas = clf.predict_proba(X_test)

plot_precision_recall(y_test, probas)
plt.show()