In [2]:
%matplotlib inline

from PIL import Image
import glob
import numpy as np
from itertools import product
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.datasets import load_digits
from sklearn.naive_bayes import GaussianNB

In [4]:
machine_name = 'local'

In [6]:
if machine_name == 'local':
    BASE_DIR = '/Users/duy/Documents/code/lund/EDAN95_applied_ai_lund/lab_5_nb/'
else:
    from google.colab import drive
    drive.mount("/content/drive")
    BASE_DIR = "content/drive/My Drive/Colab Notebooks/lab_5_nb/"
# Replace default BASE_DIR of the evaluate_classifier method

In [8]:
def evaluate_classifier(Classifier, is_naive_bayes_classifier=False, dataset="mnist", BASE_DIR="/Users/duy/Documents/code/lund/EDAN95_applied_ai_lund/lab_5_nb/MNIST_Light"):
  classifier = Classifier()
  data_prep = DataPreparator(BASE_DIR)
  train_features, test_features, train_labels, test_labels = data_prep.get_data(dataset, is_naive_bayes_classifier=is_naive_bayes_classifier)
  classifier.fit(train_features, train_labels)
  preds = classifier.predict(test_features)
  print("Classification report \n %s \n" % (metrics.classification_report(test_labels, preds)))
  print("Confusion matrix \n%s\n" %(metrics.confusion_matrix(test_labels, preds)))
  if dataset == "mnist":
    mnist = MNISTData(BASE_DIR + 'MNIST_Light/*/*.png')
    mnist.visualize_wrong_class(preds, 5)

In [10]:
# ---------------------------------------------------------------- #
# This code is mainly from the EDAN95 fall term lab session No 6,
# provided by Volker Krueger
# ---------------------------------------------------------------- #

class MNISTData():

    def __init__(self, dir):
        print('Loading MNIST data form %s' %dir)

        filelist = sorted(glob.glob(dir))
        self.x = np.array([np.array(Image.open(fname)) for fname in filelist])

        self.samples_per_class = 500
        self.number_of_classes = 10

        self.y = np.zeros(self.number_of_classes * self.samples_per_class,dtype=int)
        for cls in range(1,self.number_of_classes):
            self.y[(cls*500):(cls+1)*500] = cls


    def get_data(self):

        self.train_features, self.test_features, self.train_labels, self.test_labels = train_test_split(self.x, self.y, test_size=0.3,
                                                                                    random_state=42)
        self.train_normalised = self.train_features.reshape(3500, 400) / 255.0
        self.test_normalised = self.test_features.reshape(1500, 400) / 255.0

        return self.train_normalised, self.test_normalised, self.train_labels, self.test_labels


    def visualize_random(self):

        examples_per_class = 8
        for cls in range(self.number_of_classes):
            idxs = np.where(self.train_labels == cls)[0]
            idxs = np.random.choice(idxs, examples_per_class, replace=False)
            for i, idx in enumerate(idxs):
                plt.subplot(examples_per_class, self.number_of_classes, i * self.number_of_classes + cls + 1)
                plt.imshow(self.train_features[idx].astype('uint8'), cmap='gray')
                plt.axis('off')
                if i == 0:
                    plt.title(str(cls))
        plt.show()

    def visualize_wrong_class(self, pred, examples_per_class):
        for cls in range(self.number_of_classes):
            idxs = [idx for idx, value in enumerate(self.test_labels) if((value != cls) and (pred[idx] == cls))]
            if(len(idxs) > examples_per_class):
                idxs = np.random.choice(idxs, examples_per_class, replace=False)

            for i, idx in enumerate(idxs):
                plt.subplot(examples_per_class, self.number_of_classes, i * self.number_of_classes + cls + 1)
                plt.imshow(self.test_features[idx].astype('uint8'), cmap='gray')
                plt.axis('off')
                if i == 0:
                    plt.title(str(cls))
        plt.show()

In [12]:
class DataPreparator():
  def __init__(self, BASE_DIR):
    self.base_dir = BASE_DIR
    self.random_state = 10
  def get_data(self, dataset="mnist", is_naive_bayes_classifier=False):
    if dataset == "mnist":
      mnist = MNISTData(self.base_dir + '/*/*.png')
      train_features, test_features, train_labels, test_labels = mnist.get_data()
      mnist.visualize_random()
      if is_naive_bayes_classifier:
        n_train, pixels = np.shape(train_features)
        dim = int(np.sqrt(pixels))
        
        train_features = train_features.reshape([len(train_features), dim, dim])
        test_features = test_features.reshape([len(test_features), dim, dim])
    elif dataset == "digits":
      digits = load_digits()
      train_features, test_features, train_labels, test_labels = train_test_split(digits.images, digits.target, test_size=0.3, random_state=self.random_state)
      
      if not is_naive_bayes_classifier:
        train_features = train_features.reshape([len(train_features), -1])
        test_features = test_features.reshape([len(test_features), -1])

    elif dataset == "digits_summarized":
      digits = load_digits()
      train_features, test_features, train_labels, test_labels = train_test_split(digits.images, digits.target, test_size=0.3, random_state=self.random_state)

      train_features[train_features < 5] = 0
      train_features[(train_features >= 5) & (train_features < 11)] = 1.
      train_features[train_features >= 11] = 2.
      
      test_features[test_features < 5] = 0.
      test_features[(test_features >= 5) & (test_features < 11)] = 1.
      test_features[test_features >= 11] = 2.
      
      if not is_naive_bayes_classifier:
        train_features = train_features.reshape([len(train_features), -1])
        test_features = test_features.reshape([len(test_features), -1])
    return train_features, test_features, train_labels, test_labels

In [14]:
class NearestCentroidClassifier():
  def fit(self, X, y):
    n_observations, n_features = np.shape(X)
    classes = np.unique(y)
    n_classes = len(classes)
    # Contain the centroid of each class. Each centroid has n_features (just like an element from X)
    self.centroids = np.empty((n_classes, n_features), dtype=np.float64)

    for current_class in classes:
      self.centroids[current_class] = X[y == current_class].mean(axis=0)
    return self

  def predict(self, X):
    preds = np.empty(len(X), dtype= np.int32)

    for i, x in enumerate(X):
      # The class is the nearest centroid, which means the centroid with smallest value of L2 distance
      preds[i] = np.argmin([np.linalg.norm(x - centroid, 2) for centroid in self.centroids])

    self.preds = preds
    return self.preds

In [16]:
class NaiveBayesClassifier():
  def fit(self, X, y, data ="mnist"):
    self.classes, counts = np.unique(y, return_counts=True)
    n_obs, n_rows, n_cols = np.shape(X)
    pixel_values = np.unique(X.flatten())
    cond_prob = {}

    for current_class in self.classes:
      # Contain caracteristics of each pixel (i,j) of the samples in current_class
      class_pixel = {}
      # Grab all samples of current class
      X_current_class = X[y == current_class]

      # Loop through all pixels of the samples
      for k, (i,j) in enumerate(product(range(n_rows), range(n_cols))):
          
        pixel_val_counts = np.zeros(len(pixel_values))
        #Get unique the number of unique values and their position
        pos, n_uniques = np.unique(X[y == current_class][:,i,j], return_counts=True)
        #Set the number number of unique values at corresponding position
        pixel_val_counts[pos.astype(np.int8)] = n_uniques
        #Normalize to be probabilities using add-one or Laplace smoothing
        if data == "mnist":
          class_pixel[k] = (pixel_val_counts) / np.sum(pixel_val_counts)
        else:
          class_pixel[k] = (pixel_val_counts + 1) / np.sum(pixel_val_counts + 1)
          
      cond_prob[current_class] = class_pixel

      self.cond_prob = cond_prob
      self.priors = counts / np.sum(counts)

  def predict(self, X):
    preds = np.zeros(len(X), dtype=np.int32)
    n_obs, n_rows, n_cols = np.shape(X)

    for n, x in enumerate(X):
      # Probability on all classes
      probs = np.zeros(len(self.classes))

      for current_class in self.classes:
        # Prior term - Prior probability of current_class
        prob = self.priors[current_class]

        for k, (i,j) in enumerate(product(range(n_rows), range(n_cols))):
          prob *= self.cond_prob[current_class][k][int(x[i,j])]
        
        probs[current_class] = self.priors[current_class] * prob

      preds[n] = np.argmax(probs)

    self.preds = preds
    return preds

In [18]:
class GaussianNaiveBayesClassifier():
  def gauss(self, x, mu, sigma, epsilon = 1e-2):
    sigma += epsilon
    mu += epsilon
    return 1/(np.sqrt(2*np.pi)*sigma) * np.exp(-1/(2* (sigma ** 2)) * (x-mu) ** 2)

  def fit(self, X, y):
    self.classes = np.unique(y)

    means = {}
    stds = {}

    for current_class in self.classes:
      X_current_class = X[y == current_class]

      means[current_class] = np.mean(X_current_class, axis=0)
      stds[current_class] = np.std(X_current_class, axis=0)

    self.means = means
    self.stds = stds

  def predict(self, X):
    predictions = np.empty(len(X))
    gaussian = np.vectorize(self.gauss)

    for i, x in enumerate(X):
      probs = np.zeros(len(self.classes))
      
      for current_class in self.classes:
        prob = gaussian(x, self.means[current_class], self.stds[current_class])
        probs[current_class] = np.sum(prob)

      predictions[i] = np.argmax(probs)

    self.predictions = predictions
    return self.predictions

# Digits

In [20]:
dataset = "digits"

In [22]:
evaluate_classifier(GaussianNB, dataset=dataset)

Classification report 
               precision    recall  f1-score   support

           0       1.00      0.96      0.98        51
           1       0.62      0.93      0.74        57
           2       0.93      0.49      0.64        55
           3       0.75      0.91      0.82        56
           4       0.95      0.76      0.85        51
           5       0.90      0.90      0.90        51
           6       0.98      0.96      0.97        55
           7       0.78      0.98      0.87        60
           8       0.57      0.62      0.60        50
           9       0.97      0.57      0.72        54

    accuracy                           0.81       540
   macro avg       0.85      0.81      0.81       540
weighted avg       0.84      0.81      0.81       540
 

Confusion matrix 
[[49  1  0  0  1  0  0  0  0  0]
 [ 0 53  1  0  0  0  0  0  3  0]
 [ 0  8 27  8  0  0  0  0 12  0]
 [ 0  0  0 51  0  0  0  1  3  1]
 [ 0  8  0  0 39  1  0  2  1  0]
 [ 0  0  0  1  0 46  0  3  1  0]

In [24]:
evaluate_classifier(NearestCentroidClassifier, dataset=dataset)

Classification report 
               precision    recall  f1-score   support

           0       0.98      0.98      0.98        51
           1       0.81      0.75      0.78        57
           2       0.84      0.89      0.87        55
           3       0.96      0.93      0.95        56
           4       0.98      0.92      0.95        51
           5       0.90      0.84      0.87        51
           6       0.98      0.98      0.98        55
           7       0.92      1.00      0.96        60
           8       0.82      0.80      0.81        50
           9       0.78      0.85      0.81        54

    accuracy                           0.90       540
   macro avg       0.90      0.90      0.90       540
weighted avg       0.90      0.90      0.90       540
 

Confusion matrix 
[[50  0  0  0  1  0  0  0  0  0]
 [ 0 43  8  0  0  1  0  0  2  3]
 [ 1  1 49  2  0  0  0  0  2  0]
 [ 0  0  1 52  0  0  0  1  2  0]
 [ 0  2  0  0 47  0  0  0  2  0]
 [ 0  0  0  0  0 43  0  0  0  8]

In [26]:
evaluate_classifier(GaussianNaiveBayesClassifier, dataset=dataset)

Classification report 
               precision    recall  f1-score   support

           0       0.11      1.00      0.19        51
           1       0.00      0.00      0.00        57
           2       0.00      0.00      0.00        55
           3       0.00      0.00      0.00        56
           4       0.00      0.00      0.00        51
           5       0.00      0.00      0.00        51
           6       0.28      0.18      0.22        55
           7       0.72      0.22      0.33        60
           8       0.00      0.00      0.00        50
           9       0.00      0.00      0.00        54

    accuracy                           0.14       540
   macro avg       0.11      0.14      0.07       540
weighted avg       0.12      0.14      0.08       540
 

Confusion matrix 
[[51  0  0  0  0  0  0  0  0  0]
 [46  0  0  0  0  0 11  0  0  0]
 [41  2  0  0  0  0 12  0  0  0]
 [55  0  0  0  0  0  0  1  0  0]
 [49  0  0  0  0  0  0  2  0  0]
 [49  0  0  0  0  0  0  2  0  0]

In [28]:
evaluate_classifier(NaiveBayesClassifier, is_naive_bayes_classifier=True, dataset=dataset)

Classification report 
               precision    recall  f1-score   support

           0       0.21      0.98      0.35        51
           1       0.93      0.47      0.63        57
           2       0.88      0.55      0.67        55
           3       0.89      0.43      0.58        56
           4       0.97      0.67      0.79        51
           5       0.94      0.57      0.71        51
           6       1.00      0.64      0.78        55
           7       0.92      0.57      0.70        60
           8       0.60      0.50      0.54        50
           9       0.77      0.50      0.61        54

    accuracy                           0.58       540
   macro avg       0.81      0.59      0.64       540
weighted avg       0.82      0.58      0.64       540
 

Confusion matrix 
[[50  0  0  0  1  0  0  0  0  0]
 [20 27  3  0  0  1  0  0  6  0]
 [19  1 30  2  0  0  0  0  3  0]
 [21  0  0 24  0  0  0  0  3  8]
 [17  0  0  0 34  0  0  0  0  0]
 [19  0  0  0  0 29  0  1  2  0]

# Digits summarized

In [30]:
dataset = "digits_summarized"

In [32]:
evaluate_classifier(GaussianNB, dataset=dataset)

Classification report 
               precision    recall  f1-score   support

           0       0.98      0.98      0.98        51
           1       0.74      0.49      0.59        57
           2       0.81      0.40      0.54        55
           3       0.89      0.30      0.45        56
           4       0.98      0.86      0.92        51
           5       0.92      0.71      0.80        51
           6       0.96      0.95      0.95        55
           7       0.92      0.92      0.92        60
           8       0.26      0.92      0.41        50
           9       0.88      0.52      0.65        54

    accuracy                           0.70       540
   macro avg       0.83      0.70      0.72       540
weighted avg       0.84      0.70      0.72       540
 

Confusion matrix 
[[50  0  0  0  0  0  0  0  1  0]
 [ 0 28  4  0  0  1  0  0 23  1]
 [ 1  1 22  0  0  0  0  0 31  0]
 [ 0  2  1 17  0  0  0  1 33  2]
 [ 0  1  0  0 44  1  2  0  3  0]
 [ 0  0  0  1  0 36  0  2 11  1]

In [34]:
evaluate_classifier(NearestCentroidClassifier, dataset=dataset)

Classification report 
               precision    recall  f1-score   support

           0       0.98      0.98      0.98        51
           1       0.78      0.74      0.76        57
           2       0.84      0.84      0.84        55
           3       0.94      0.89      0.92        56
           4       0.96      0.92      0.94        51
           5       0.91      0.84      0.88        51
           6       0.96      0.98      0.97        55
           7       0.92      0.98      0.95        60
           8       0.72      0.78      0.75        50
           9       0.82      0.87      0.85        54

    accuracy                           0.88       540
   macro avg       0.88      0.88      0.88       540
weighted avg       0.88      0.88      0.88       540
 

Confusion matrix 
[[50  0  0  0  1  0  0  0  0  0]
 [ 0 42  7  0  0  1  1  0  6  0]
 [ 1  2 46  2  0  0  0  0  4  0]
 [ 0  1  1 50  0  0  0  1  2  1]
 [ 0  2  0  0 47  0  0  0  2  0]
 [ 0  0  0  1  0 43  0  0  0  7]

In [36]:
evaluate_classifier(NaiveBayesClassifier, dataset=dataset, is_naive_bayes_classifier=True)

Classification report 
               precision    recall  f1-score   support

           0       0.94      0.94      0.94        51
           1       0.82      0.79      0.80        57
           2       0.86      0.91      0.88        55
           3       0.94      0.89      0.92        56
           4       0.98      0.92      0.95        51
           5       0.89      0.82      0.86        51
           6       1.00      0.96      0.98        55
           7       0.87      1.00      0.93        60
           8       0.78      0.78      0.78        50
           9       0.79      0.81      0.80        54

    accuracy                           0.89       540
   macro avg       0.89      0.88      0.88       540
weighted avg       0.89      0.89      0.89       540
 

Confusion matrix 
[[48  0  0  1  1  0  0  0  1  0]
 [ 1 45  6  0  0  1  0  0  4  0]
 [ 1  1 50  1  0  0  0  0  2  0]
 [ 0  0  1 50  0  0  0  2  1  2]
 [ 0  2  0  0 47  0  0  1  1  0]
 [ 0  1  0  0  0 42  0  1  0  7]

In [38]:
evaluate_classifier(GaussianNaiveBayesClassifier, dataset=dataset)

Classification report 
               precision    recall  f1-score   support

           0       0.61      0.67      0.64        51
           1       0.00      0.00      0.00        57
           2       0.00      0.00      0.00        55
           3       0.00      0.00      0.00        56
           4       0.00      0.00      0.00        51
           5       0.00      0.00      0.00        51
           6       0.13      1.00      0.24        55
           7       1.00      0.02      0.03        60
           8       0.07      0.10      0.08        50
           9       0.00      0.00      0.00        54

    accuracy                           0.18       540
   macro avg       0.18      0.18      0.10       540
weighted avg       0.19      0.18      0.10       540
 

Confusion matrix 
[[34  0  0  0  0  0 17  0  0  0]
 [ 0  0  0  0  0  0 54  0  3  0]
 [ 1  0  0  0  0  0 43  0 11  0]
 [ 0  0  0  0  0  0 50  0  6  0]
 [ 4  0  0  0  0  0 44  0  3  0]
 [ 3  0  0  0  0  0 35  0 13  0]

# MNIST

In [40]:
dataset = "mnist"

In [42]:
evaluate_classifier(GaussianNB, dataset=dataset)

Loading MNIST data form /Users/duy/Documents/code/lund/EDAN95_applied_ai_lund/lab_5_nb/MNIST_LightMNIST_Light/*/*.png


ValueError: Found input variables with inconsistent numbers of samples: [0, 5000]

In [23]:
evaluate_classifier(NearestCentroidClassifier, dataset=dataset)

Classification report 
               precision    recall  f1-score   support

           0       0.98      0.98      0.98        51
           1       0.81      0.75      0.78        57
           2       0.84      0.89      0.87        55
           3       0.96      0.93      0.95        56
           4       0.98      0.92      0.95        51
           5       0.90      0.84      0.87        51
           6       0.98      0.98      0.98        55
           7       0.92      1.00      0.96        60
           8       0.82      0.80      0.81        50
           9       0.78      0.85      0.81        54

    accuracy                           0.90       540
   macro avg       0.90      0.90      0.90       540
weighted avg       0.90      0.90      0.90       540
 

Confusion matrix 
[[50  0  0  0  1  0  0  0  0  0]
 [ 0 43  8  0  0  1  0  0  2  3]
 [ 1  1 49  2  0  0  0  0  2  0]
 [ 0  0  1 52  0  0  0  1  2  0]
 [ 0  2  0  0 47  0  0  0  2  0]
 [ 0  0  0  0  0 43  0  0  0  8]

In [24]:
evaluate_classifier(NaiveBayesClassifier, dataset=dataset, is_naive_bayes_classifier=True)

Classification report 
               precision    recall  f1-score   support

           0       0.21      0.98      0.35        51
           1       0.93      0.47      0.63        57
           2       0.88      0.55      0.67        55
           3       0.89      0.43      0.58        56
           4       0.97      0.67      0.79        51
           5       0.94      0.57      0.71        51
           6       1.00      0.64      0.78        55
           7       0.92      0.57      0.70        60
           8       0.60      0.50      0.54        50
           9       0.77      0.50      0.61        54

    accuracy                           0.58       540
   macro avg       0.81      0.59      0.64       540
weighted avg       0.82      0.58      0.64       540
 

Confusion matrix 
[[50  0  0  0  1  0  0  0  0  0]
 [20 27  3  0  0  1  0  0  6  0]
 [19  1 30  2  0  0  0  0  3  0]
 [21  0  0 24  0  0  0  0  3  8]
 [17  0  0  0 34  0  0  0  0  0]
 [19  0  0  0  0 29  0  1  2  0]

In [25]:
evaluate_classifier(GaussianNaiveBayesClassifier, dataset=dataset)

Classification report 
               precision    recall  f1-score   support

           0       1.00      0.96      0.98        51
           1       0.74      0.84      0.79        57
           2       0.88      0.89      0.88        55
           3       0.96      0.82      0.88        56
           4       0.94      0.90      0.92        51
           5       0.85      0.90      0.88        51
           6       1.00      0.87      0.93        55
           7       0.92      0.98      0.95        60
           8       0.79      0.76      0.78        50
           9       0.81      0.89      0.85        54

    accuracy                           0.88       540
   macro avg       0.89      0.88      0.88       540
weighted avg       0.89      0.88      0.88       540
 

Confusion matrix 
[[49  0  0  0  2  0  0  0  0  0]
 [ 0 48  6  0  0  1  0  0  1  1]
 [ 0  4 49  0  0  0  0  0  2  0]
 [ 0  1  1 46  0  0  0  0  2  6]
 [ 0  1  0  0 46  2  0  0  2  0]
 [ 0  0  0  0  0 46  0  1  1  3]