# Import Statements

In [1]:
import numpy as np
import pandas as pd
import pickle
import gzip
from matplotlib import pyplot as plt
from tqdm import tqdm
import random

# Read Files

In [2]:
def images_file_read(file_name):
    """Read a gzip-compressed IDX3 image file (the MNIST image format).

    The file starts with four big-endian 32-bit integers (magic number, image
    count, row count, column count) followed by one unsigned byte per pixel
    with values 0 to 255.

    Args:
        file_name: Path to the ``.gz`` file.

    Returns:
        A ``uint8`` array of shape ``(image_count, row_count, column_count)``.
        It is created with ``np.frombuffer`` over the raw bytes, so it is
        read-only until copied or converted.

    Raises:
        ValueError: If the magic number is not 2051 (i.e. this is not an
            unsigned-byte, 3-dimensional IDX file) or if the amount of pixel
            data does not match the dimensions announced in the header.
    """
    # 0x00000803: data type 0x08 (unsigned byte), 3 dimensions.
    expected_magic = 2051

    with gzip.open(file_name, 'rb') as f:
        # first 4 bytes is a magic number
        magic_number = int.from_bytes(f.read(4), 'big')
        if magic_number != expected_magic:
            raise ValueError(
                '{!r} is not an IDX3 image file: magic number {} (expected {})'.format(
                    file_name, magic_number, expected_magic))
        # second 4 bytes is the number of images
        image_count = int.from_bytes(f.read(4), 'big')
        # third 4 bytes is the row count
        row_count = int.from_bytes(f.read(4), 'big')
        # fourth 4 bytes is the column count
        column_count = int.from_bytes(f.read(4), 'big')
        # rest is the image pixel data, each pixel is stored as an unsigned byte
        image_data = f.read()

    # Fail with an explicit message instead of an opaque reshape error when the
    # file is truncated or carries trailing bytes.
    expected_size = image_count * row_count * column_count
    if len(image_data) != expected_size:
        raise ValueError(
            '{!r} holds {} pixel bytes but its header announces {}x{}x{} = {}'.format(
                file_name, len(image_data), image_count, row_count, column_count,
                expected_size))

    images = np.frombuffer(image_data, dtype=np.uint8)
    return images.reshape((image_count, row_count, column_count))

In [3]:
def labels_file_read(file_name):
    """Read a gzip-compressed IDX1 label file (the MNIST label format).

    The file starts with two big-endian 32-bit integers (magic number, label
    count) followed by one unsigned byte per label.

    Args:
        file_name: Path to the ``.gz`` file.

    Returns:
        A 1-D ``uint8`` array with one entry per label. It is created with
        ``np.frombuffer`` over the raw bytes, so it is read-only until copied
        or converted.

    Raises:
        ValueError: If the magic number is not 2049 (i.e. this is not an
            unsigned-byte, 1-dimensional IDX file) or if the number of label
            bytes differs from the count announced in the header.
    """
    # 0x00000801: data type 0x08 (unsigned byte), 1 dimension.
    expected_magic = 2049

    with gzip.open(file_name, 'rb') as f:
        # first 4 bytes is a magic number
        magic_number = int.from_bytes(f.read(4), 'big')
        if magic_number != expected_magic:
            raise ValueError(
                '{!r} is not an IDX1 label file: magic number {} (expected {})'.format(
                    file_name, magic_number, expected_magic))
        # second 4 bytes is the number of labels
        label_count = int.from_bytes(f.read(4), 'big')
        # rest is the label data, each label is stored as unsigned byte
        label_data = f.read()

    # The header count was previously ignored; checking it catches truncated
    # downloads that would otherwise yield a silently shorter label vector.
    if len(label_data) != label_count:
        raise ValueError(
            '{!r} holds {} label bytes but its header announces {}'.format(
                file_name, len(label_data), label_count))

    return np.frombuffer(label_data, dtype=np.uint8)

In [4]:
# Training images, returned as (image count, rows, columns).
train_x = images_file_read('Dataset/mnist/train-images-idx3-ubyte.gz')
print(np.shape(train_x))

(60000, 28, 28)


In [5]:
# Flatten every image into a single feature row (28x28 -> 784 here).
# The sample count is taken from the array and the feature width is inferred
# with -1, so the cell no longer depends on a hard-coded 60000/784 and keeps
# working on a subset or on images of another size.
train_x = train_x.reshape(train_x.shape[0], -1)
print(train_x.shape)

(60000, 784)


In [6]:
# Training labels, one entry per training image.
train_y = labels_file_read('Dataset/mnist/train-labels-idx1-ubyte.gz')
print(np.shape(train_y))

(60000,)


In [7]:
# Held-out test images, returned as (image count, rows, columns).
test_x = images_file_read('Dataset/mnist/t10k-images-idx3-ubyte.gz')
print(np.shape(test_x))

(10000, 28, 28)


In [8]:
# Flatten every test image into a single feature row (28x28 -> 784 here).
# Deriving the shape from the array avoids the hard-coded 10000/784 and keeps
# the cell valid for a subset or for images of another size.
test_x = test_x.reshape(test_x.shape[0], -1)
print(test_x.shape)

(10000, 784)


In [9]:
# Held-out test labels, one entry per test image.
test_y = labels_file_read('Dataset/mnist/t10k-labels-idx1-ubyte.gz')
print(np.shape(test_y))

(10000,)


# Preprocessing

In [10]:
# Scale the raw 0-255 pixel bytes into float32 values in [0, 1].
# NOTE: this rebinds train_x/test_x, so running the cell a second time without
# reloading the data would divide by 255 again.
train_x, test_x = (
    pixels.astype('float32') / 255
    for pixels in (train_x, test_x)
)

# Sanity check: feature and label arrays must agree on the sample count.
print(np.shape(train_x), np.shape(train_y))
print(np.shape(test_x), np.shape(test_y))

(60000, 784) (60000,)
(10000, 784) (10000,)


# Bagging Model

In [11]:
from sklearn.tree import DecisionTreeClassifier

In [12]:
def bagging(DTC, train_x, train_y, test_x, test_y, num_of_bags):
    """Train a bootstrap-aggregated (bagged) ensemble of classifiers.

    For every bag, as many row indices as there are training samples are drawn
    uniformly at random *with replacement* (via the global ``np.random``
    state), and an independent copy of ``DTC`` is fitted on those rows.

    Args:
        DTC: Estimator template exposing ``fit(X, y)``. It is deep-copied once
            per bag; the object passed in is never fitted or modified.
        train_x: Training features, one sample per row (array-like).
        train_y: Training labels aligned with the rows of ``train_x``.
        test_x: Unused; kept so existing call sites keep working.
        test_y: Unused; kept so existing call sites keep working.
        num_of_bags: Number of bootstrap models to train.

    Returns:
        A list of ``num_of_bags`` distinct fitted estimators, one per bootstrap
        sample.
    """
    # Local import keeps this block self-contained; `copy` is standard library.
    import copy

    features = np.asarray(train_x)
    labels = np.asarray(train_y)
    sample_count = features.shape[0]

    bag_of_models = []
    for _ in range(num_of_bags):
        indexes = np.random.randint(low=0, high=sample_count, size=sample_count)
        # A fresh copy per bag is essential: refitting one shared estimator and
        # appending it each time would leave every list entry pointing at the
        # same object, i.e. only the model fitted on the final bag.
        model = copy.deepcopy(DTC)
        # Fancy indexing builds the bootstrap sample in one vectorised step and
        # keeps features and labels aligned.
        model.fit(features[indexes], labels[indexes])
        bag_of_models.append(model)
    return bag_of_models

In [13]:
# DTC = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, random_state=0)
# bag_of_models = bagging(DTC, train_x, train_y, test_x, test_y, 3)

# Saving Model

In [14]:
# pi_file = open('Q4_pkl_file.pkl', 'wb')
# pickle.dump(bag_of_models, pi_file)
# pi_file.close()

# Loading Model

In [15]:
# Load the previously trained ensemble from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# only load a Q4_pkl_file.pkl that comes from a trusted source.
# The context manager closes the handle even if unpickling raises, which the
# manual open()/close() pair did not guarantee.
with open('Q4_pkl_file.pkl', 'rb') as pi_file:
    bag_of_models = pickle.load(pi_file)

# Testing

In [16]:
# Predicted labels for the whole test set, one array per model.
# Only the first three models are queried; the voting cell that follows reads
# exactly entries 0, 1 and 2.
bag_of_predictions = [
    bag_of_models[model_idx].predict(test_x)
    for model_idx in range(3)
]

In [17]:
# Majority vote over the three models, one test sample at a time.
# NOTE: the three-way tie-break draws from the global np.random state, so the
# result is only repeatable if that state was seeded beforehand.
final_prediction = []
for sample in range(len(test_y)):
    vote_0 = bag_of_predictions[0][sample]
    vote_1 = bag_of_predictions[1][sample]
    vote_2 = bag_of_predictions[2][sample]

    if vote_1 == vote_2:
        # Models 1 and 2 agree: that label has at least two of the three votes.
        winner = vote_1
    elif vote_0 == vote_1 or vote_0 == vote_2:
        # Model 0 sides with one of the two disagreeing models.
        winner = vote_0
    else:
        # All three disagree: take one model's answer uniformly at random.
        choice = np.random.randint(0, 3)
        winner = (vote_0, vote_1, vote_2)[choice]

    final_prediction.append(winner)

In [18]:
# Empty 10x10 integer count matrix, one row and one column per label 0-9.
my_confusion_matrix = np.zeros(shape=(10, 10), dtype=np.int64)

In [19]:
# Tally each test sample: row = true label, column = voted prediction.
# NOTE: counts accumulate in place, so re-running this cell without first
# re-creating the matrix would double the totals.
for sample, true_label in enumerate(test_y):
    my_confusion_matrix[true_label, final_prediction[sample]] += 1

In [20]:
# Show the confusion matrix: rows are true labels, columns are predicted labels.
print(my_confusion_matrix)

[[ 902    0   12   13    9   15   15    5    6    3]
 [   2 1070    8    7    5    4    7    3   22    7]
 [  18   22  841   39   16   11   24   25   25   11]
 [   5    8   26  834    4   44    7   22   43   17]
 [   4    7   13   10  858   11   14    9   11   45]
 [  12   12    8   57    8  715   19   10   32   19]
 [  16    9   18    8   16   31  834    3   17    6]
 [   1    7   27   14   15    8    1  923   14   18]
 [  15   11   24   33   30   28   23   12  771   27]
 [  12    7   15   19   54   10    3   21   15  853]]


In [21]:
# Per-class score: correctly predicted samples of a class (diagonal entry)
# divided by all samples whose true label is that class (row total).
correct_per_class = np.diag(my_confusion_matrix)
samples_per_class = my_confusion_matrix.sum(axis=1)
class_wise_accuracies = correct_per_class / samples_per_class

for label, score in enumerate(class_wise_accuracies):
    print('Accuracy of class {}: {}%'.format(label, score*100))

# Overall accuracy: all correct predictions (trace) over all test samples.
total_correct = my_confusion_matrix.trace()
total_samples = my_confusion_matrix.sum()
print('\nOverall accuracy: {}%'.format(100*total_correct/total_samples))

Accuracy of class 0: 92.04081632653062%
Accuracy of class 1: 94.27312775330397%
Accuracy of class 2: 81.4922480620155%
Accuracy of class 3: 82.57425742574257%
Accuracy of class 4: 87.37270875763747%
Accuracy of class 5: 80.15695067264575%
Accuracy of class 6: 87.05636743215031%
Accuracy of class 7: 89.78599221789884%
Accuracy of class 8: 79.15811088295688%
Accuracy of class 9: 84.53914767096134%

Overall accuracy: 86.01%
