### WILL DUCKWORTH 09/08/2022

### This Program was designed for testing and training a K-Nearest Neighbour machine learning model on the built in SK-Learn "Wine" data set. There are three seperate types of wine, classified by 13 unique features. The K-Nearest Neighbour model is implemented to find the most suited number of neighbours, the best K fold, and the two optimum features to classify the different wines.


### Author - Will Duckworth

#### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import *
from matplotlib import pyplot as plt
import seaborn as sns

#### Create Python class to group the data with methods

In [2]:
class WineClassifier:

    data_path = datasets.load_wine()

    def __init__(self, data_frame=None, x=None, y=None):
        if data_frame is None:
            self.data_frame = pd.DataFrame(WineClassifier.data_path["data"],
                                           columns=WineClassifier.data_path["feature_names"])
        if x is None:
            self.x = self.data_frame.copy()
            self.x["target"] = WineClassifier.data_path["target"]
        if y is None:
            self.y = self.x.pop("target")
            self.x_train, self.x_test, self.y_train, self.y_test = model_selection.train_test_split(
                self.x, self.y, test_size=.2)

    def train_forrest(self, estimators=50):
        forrest = ensemble.RandomForestClassifier(n_estimators=estimators)
        forrest.fit(self.x_train, self.y_train)
        prediction = forrest.predict(self.x_test)
        forrest.score(self.x_test, self.y_test)
        self.get_cross_val(forrest, self.x_test, self.y_test)
        print(model_selection.cross_val_score(forrest, self.x, self.y, scoring="accuracy").mean())
        return prediction
        
    
    def train_lin_regression(self):
        lin_mod = linear_model.LogisticRegression()
        lin_mod.fit(self.x_train, self.y_train)
        prediction = lin_mod.predict(self.x_test)
        print(lin_mod.score(self.x_test, self.y_test))
        print(model_selection.cross_val_score(lin_mod, self.x, self.y, scoring="accuracy").mean())

    def find_knn_neighbours(self):
        k_range = range(1,20)
        scores = []
        scores_dict = {}
        for k in k_range:
            knn = neighbors.KNeighborsClassifier(n_neighbors=k)
            knn.fit(self.x_train, self.y_train)
            scores.append(knn.score(self.x_test, self.y_test))
        plt.figure()
        plt.xlabel("k neighbours")
        plt.ylabel("predictions")
        plt.scatter(k_range, scores)
        plt.grid()
        plt.xticks([i for i in range(0, 35, 5)])
        plt.xlim([0, 30])
        plt.ylim([0, 1])
        plt.show()

    def knn_train(self, *args):
        self.x = self.x.filter([args[0], args[1]])
        self.x["target"] = WineClassifier.data_path["target"]
        self.y = self.x.pop("target")
        self.x_train, self.x_test, self.y_train, self.y_test = model_selection.train_test_split(
            self.x, self.y, test_size=.2)
        knn = neighbors.KNeighborsClassifier(n_neighbors=4)
        knn.fit(self.x_train, self.y_train)
        prediction = knn.predict(self.x_test)
        score = knn.score(self.x_test, self.y_test)
        print(model_selection.cross_val_score(knn, self.x, self.y, scoring="accuracy").mean())
        print(metrics.classification_report(self.y_test, prediction))
        wine_prediction = [i for i in prediction]

        try:
            if args[2] == "prediction":
                return wine_prediction
        except Exception as e:
            print()
        finally:
            return wine_prediction

    def train_svm(self):
        svm_mod = svm.SVC()
        dd = svm_mod.fit(self.x_train, self.y_train)
        prediction = svm_mod.predict(self.x_test)
        print(svm_mod.score(self.x_test, self.y_test))
        print(model_selection.cross_val_score(svm_mod, self.x, self.y, scoring="accuracy").mean())

    def get_cross_val(self, function, x, y):
        method_score = model_selection.cross_val_score(function, x, y)
        print(method_score)
        return method_score

    def check_classifiers(self):
        for i in range(len(self.x)):
            score_dict = {}
            all_scores = []

            for i in self.data_frame.keys():
                for j in self.data_frame.keys():
                    if j != i:
                        check_x = self.x.filter([str(i), str(j)])
                        x_train, x_test, y_train, y_test = model_selection.train_test_split(
                            check_x, self.y, test_size=.2)
                        knn = neighbors.KNeighborsClassifier(n_neighbors=4)
                        knn.fit(x_train, y_train)
                        prediction = knn.predict(x_test)
                        score = knn.score(x_test, y_test)
                        score_dict[f'{str(self.x[j].name)}{str(self.x[i].name)}'] = score
                        all_scores.append(score)

            score_keys = list(score_dict.keys())
            score_values = list(score_dict.values())
            highest_score = score_values.index(max(all_scores))
            print(f'{score_keys[highest_score]}":"{max(all_scores)}')
            return f'{score_keys[highest_score]}":"{max(all_scores)}'

    def get_training_split(self):
        test_sizes = [round(float(i * .1), 2) for i in range(1, 9)].__reversed__()
        test_sizes = list(test_sizes)
        knn = neighbors.KNeighborsClassifier(n_neighbors=4)
        plt.figure()

        for test_size in test_sizes:
            scores = []
            for i in range(1, 100):
                self.x_train, self.x_test, self.y_train, self.y_test = model_selection.train_test_split(self.x, self.y, test_size=1 - test_size)
                knn.fit(self.x_train, self.y_train)
                scores.append(knn.score(self.x_test, self.y_test))
            plt.plot(test_size, np.mean(scores), "r+")
        plt.plot()
        plt.xlabel("training % split")
        plt.ylabel("predictions")
        plt.show()

    def plot_roc_curve(self, function=None):
        nclasses = 3
        classifier = function
        classifier.fit(self.x_test, self.y_test)
        x = self.x.filter(["flavanoids", "alcohol"])
        self.x["target"] = WineClassifier.data_path["target"]
        self.y = self.x.pop("target")
        self.x_train, self.x_test, self.y_train, self.y_test = model_selection.train_test_split(
            self.x, self.y, test_size=.2)
        y_score = classifier.predict_proba(self.x_test)
        y_test_bin = preprocessing.label_binarize(self.y_test, classes=[0, 1, 2])
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(nclasses):
            fpr[i], tpr[i], _ = metrics.roc_curve(y_test_bin[:, i], y_score[:, i])
            roc_auc[i] = metrics.auc(fpr[i], tpr[i])
        fig, ax = plt.subplots(figsize=(5, 5))
        colors = ['cyan', 'magenta', 'purple']
        plt.plot([0,1], [0,1], "r--")
        for i, color in zip(range(nclasses), colors):
            plt.plot(fpr[i], tpr[i], color=color,
                     label='ROC curve of class {0} (area = {1:0.2f})'
                           ''.format(i, roc_auc[i]))
        ax.set_xlabel("False Positives")
        ax.set_ylabel("True Positives")
        ax.set_title("ROC Curve of Classifications")
        ax.legend(["y=x", "Wine 0", "Wine 1", "Wine 2"])
        plt.show()

    def show_scatter(self, model=neighbors.KNeighborsClassifier(n_neighbors=4)):
        knn = model
        knn.fit(self.x_train, self.y_train)
        predictions = knn.predict(self.x_test)
        plt.scatter(predictions, self.y_test)
        plt.title('Predicted values v Actual values')
        plt.xlabel('Predictions')
        plt.ylabel('Actual values')
        plt.show()

    def get_confusion_matrix(self, prediction):
        cm = metrics.confusion_matrix(self.y_test, prediction)
        plt.figure(figsize=(8, 7))
        sns.heatmap(cm, annot=True)
        plt.title("confusion matrix")
        plt.ylabel("truth")
        plt.xlabel("prediction")
        plt.show()

#### Create an instance of the wine classifier class

In [3]:
wine = WineClassifier()

#### Find the optimal number of neighbours for the KNN model

In [4]:
%matplotlib notebook
wine.find_knn_neighbours()

<IPython.core.display.Javascript object>

#### Find the 2 best classifiers based on prediction score performance

In [5]:
wine.check_classifiers()

total_phenolscolor_intensity":"1.0


'total_phenolscolor_intensity":"1.0'

#### Plot a confusion matrix based on the best fitted neighbours and 2 best classifiers, as well as print the F1 Score, Precision and Recall

In [48]:
wine.get_confusion_matrix(wine.knn_train("flavanoids", "alcohol", "prediction"))

0.9161904761904763
              precision    recall  f1-score   support

           0       0.90      1.00      0.95         9
           1       1.00      0.94      0.97        18
           2       1.00      1.00      1.00         9

    accuracy                           0.97        36
   macro avg       0.97      0.98      0.97        36
weighted avg       0.98      0.97      0.97        36



<IPython.core.display.Javascript object>

#### Check the cross validation scores to the performance we have already calculated by using built in methods from SKlearn

In [49]:
wine.get_cross_val(neighbors.KNeighborsClassifier(n_neighbors=3),
                   x=wine.x_test.filter(["flavanoids", "alcohol"]), y=wine.y_test)

[1.         1.         1.         0.85714286 1.        ]


array([1.        , 1.        , 1.        , 0.85714286, 1.        ])

#### Plot an ROC Curve for the three different classes of wine to visualise the classification boundaries

In [50]:
wine.plot_roc_curve(neighbors.KNeighborsClassifier(n_neighbors=4))

<IPython.core.display.Javascript object>

#### Check against another classifer model with all 13 features for a comparison

In [51]:
wine.plot_roc_curve(ensemble.RandomForestClassifier(n_estimators=50))
wine.get_confusion_matrix(wine.train_forrest())

<IPython.core.display.Javascript object>

[0.875      1.         0.85714286 0.85714286 0.85714286]
0.8993650793650794


<IPython.core.display.Javascript object>