- # Data loading Function sets
    - ## load train function
    - ## load test function

In [6]:
import os
import numpy as np
import matplotlib.pyplot as plt

directory = 'C:\\Users\\went1\\Desktop\\ml_practice'

def loadTrainData():
    trainImgPath = os.path.join(directory, 'train-images.idx3-ubyte')
    trainLablePath = os.path.join(directory, 'train-labels.idx1-ubyte')
    
    with open(trainImgPath, 'rb') as trainImgRawData:
        header = []
        for i in range(4):
            byte = trainImgRawData.read(4)
            header.append(int.from_bytes(byte, byteorder="big"))
        magic, trainImgLength, rows, col = header
        trainImages = np.fromfile(trainImgRawData, dtype=np.uint8).reshape(trainImgLength, 784)
        
    with open(trainLablePath, 'rb') as trainLableRawData:
        header = []
        for i in range(2):
            byte = trainLableRawData.read(4)
            header.append(int.from_bytes(byte, byteorder="big"))
        magic, trainLabelLength = header
        trainLabels = np.fromfile(trainLableRawData, dtype=np.uint8)
        
    return trainImages, trainLabels ##train img is 60000*784 array.
        
def loadTestData():
    testImgPath = os.path.join(directory, 't10k-images.idx3-ubyte')
    testLablePath = os.path.join(directory, 't10k-labels.idx1-ubyte')
    
    with open(testImgPath, 'rb') as testImgRawData:
        header = []
        for i in range(4):
            byte = testImgRawData.read(4)
            header.append(int.from_bytes(byte, byteorder="big"))
        magic, testImgLength, rows, col = header
        testImages = np.fromfile(testImgRawData, dtype=np.uint8).reshape(testImgLength, 784)
        
    with open(testLablePath, 'rb') as testLableRawData:
        header = []
        for i in range(2):
            byte = testLableRawData.read(4)
            header.append(int.from_bytes(byte, byteorder="big"))
        magic, testLabelLength = header
        testLabels = np.fromfile(testLableRawData, dtype=np.uint8)
        
    return testImages, testLabels

- # Naive Bayes function sets
    * ## normalizing pixel value between 1 to 32.
    * ## putting pixels into tally bins.(createBinstable)
    * ## calculate likelihood
    * ## calculate prior
    * ## fit
    * ## predict

In [2]:
class naiveBayes:
    def __init__(self, toggle, smoothing):
        self.toggle = toggle
        self.smoothing = smoothing
        
    def normalizePixels(self, data):
        for i in range(len(data)):
            data[i] = np.floor(data[i]/8)

    def createBinsAndCounts(self, X, y):
        self.counts = np.empty(10)
        self.bins = np.zeros([10, 32, 784])
        self.bins += self.smoothing
        for i in range(10):
            imgs = X[y == i]
            l = len(imgs)
            self.counts[i] = l
            for j in range(l):
                for p in range(784):
                    self.bins[i][imgs[j][p]][p] += 1
   
    def discreteModelTrain(self, X_train, y_train):
        self.createBinsAndCounts(X_train, y_train)
        ## Calculate likelihood.
        for i in range(10):
            for j in range(32):
                    self.bins[i][j] = (self.bins[i][j]) / (self.counts[i]) ## bins will become likelihood.
        ## Calculate prior.
        self.priors = np.zeros(10)
        for i in range(10):
            self.priors[i] = self.counts[i] / 60000

    def discreteModelPredict(self, X_test, y_test):
        self.posteriorList = np.zeros([10000, 10])
        error = 0
        for i in range(10000):
            for j in range(10):
                for k in range(784):
                      self.posteriorList[i][j] += np.log(self.bins[j][X_test[i][k]][k])
            predict = np.argmax(self.posteriorList[i])
            if predict != y_test[i]:
                error += 1
        self.errorRate = (error / 10000) * 100 # percentage format

    def continuousModelTrain(self, X_train, y_train):
        pixelValueSum = np.zeros([10, 784])
        self.pixelValueMean = np.empty([10, 784]) ## means
        self.pixelValueVar = np.empty([10, 784]) ## variances
        self.priors = np.zeros(10)
        ## This for loop generate every pixels mean
        for i in range(10):
            imgs = X_train[y_train == i]
            l = len(imgs)
            self.priors[i] = l / 60000
            for j in range(l):
                pixelValueSum[i] += imgs[j]
            self.pixelValueMean[i] = pixelValueSum[i] / l

        ## This for loop generate every pixels var
        pixelValueSum = np.zeros([10, 784])
        for i in range(10):
            imgs = X_train[y_train == i]
            l = len(imgs)
            for j in range(l):
                pixelValueSum[i] += np.power((imgs[j] - self.pixelValueMean[i]), 2)
            self.pixelValueVar[i] = pixelValueSum[i] / l

    def continuousModelPredict(self, X_test, y_test):
        constant = 1 / np.sqrt(2 * np.pi)
        self.posteriorList = np.zeros([10000, 10])
        error = 0
        for i in range(10000):
            for j in range(10):
                var = self.pixelValueVar[j] + self.smoothing
                power = -((X_test[i] - self.pixelValueMean[j]) ** 2) / (2 * var)
                prob = constant * (1 / np.sqrt(var)) * np.exp(power)
                self.posteriorList[i][j] = np.sum(np.log(prob))        
            predict = np.argmax(self.posteriorList[i])
            if predict != y_test[i]:
                error += 1
        self.errorRate = (error / 10000) * 100 # percentage format
    
    def fit(self, X_train, y_train):        
        if self.toggle == 0:
            self.normalizePixels(X_train)
            self.normalizePixels(X_test)
            self.discreteModelTrain(X_train, y_train)
        else:
            self.continuousModelTrain(X_train, y_train)
            
    def predict(self, X_test, y_test):
        if self.toggle == 0:
            self.discreteModelPredict(X_test, y_test)
            return self.posteriorList, self.errorRate
        else:
            self.continuousModelPredict(X_test, y_test)
            return self.posteriorList, self.errorRate

# Implement Machine Learning below

In [3]:
X_train, y_train = loadTrainData()
X_test, y_test = loadTestData()

In [4]:
model_continuous = naiveBayes(toggle = 1, smoothing = 1000)
model_continuous.fit(X_train, y_train)
posteriorList, errorRate = model_continuous.predict(X_test, y_test)
print('Error Rate for Gaussian MLE naiveBayes: ', errorRate, '%')
model_discrete = naiveBayes(toggle = 0, smoothing = 0.1)
model_discrete.fit(X_train, y_train)
posteriorList, errorRate = model_discrete.predict(X_test, y_test)
print('Error Rate for discrete naiveBayes: ', errorRate, '%')

Error Rate for Gaussian MLE naiveBayes:  18.509999999999998 %
Error Rate for discrete naiveBayes:  14.99 %


# Homework2-2 online learning

In [5]:
binaryFilePath = os.path.join(directory, 'hw2_data.txt')

with open(binaryFilePath) as f:
    data = f.read().splitlines()

print('Initial beta distribution parameter a and b for online learning')

a = int(input('a:'))
b = int(input('b:'))

def factorial(n):
    if n == 0: return 1 
    return n * factorial(n-1)
    
def combination(N, m):
    return factorial(N) / (factorial(m) * factorial(N-m))

def gamma(n):
    return factorial(n-1)

def betaFunction(p):
    return np.power(p, a-1) * np.power(p, b-1) * (gamma(a) * gamma(b) / gamma(a+b))

def onlineLearning(data, a, b):
    for i, val in enumerate(data):
        print('Line:', i)
        m = 0
        N = len(val)
        for bit in val:
            if bit == '1':
                m += 1
        mleProb = m / N
        likelihood = round(combination(N, m) * np.power(mleProb, m) * np.power((1 - mleProb), N-m), 5)
        prior = betaFunction(mleProb)
        a = a + m
        b = b + N - m
        print('Likelihood:', likelihood)
        print('Prior:', prior)
        print('New posterior parameter:', a, b)
        
onlineLearning(data, a, b)

Initial beta distribution parameter a and b for online learning
a:5
b:5
Line: 0
Likelihood: 0.17971
Prior: 1.0402539682539685e-06
New posterior parameter: 13 17
Line: 1
Likelihood: 0.24385
Prior: 4.85286153482938e-07
New posterior parameter: 17 24
Line: 2
Likelihood: 0.25496
Prior: 0.0001945864666413107
New posterior parameter: 27 27
Line: 3
Likelihood: 0.19361
Prior: 2.2754541004711918e-05
New posterior parameter: 37 34
Line: 4
Likelihood: 0.21769
Prior: 3.2683054275421163e-06
New posterior parameter: 43 41
Line: 5
Likelihood: 0.23845
Prior: 6.193403541368789e-05
New posterior parameter: 51 45
Line: 6
Likelihood: 0.21154
Prior: 1.8044924157034524e-05
New posterior parameter: 59 51
Line: 7
Likelihood: 0.19787
Prior: 2.1305116307404307e-06
New posterior parameter: 66 60
Line: 8
Likelihood: 0.18213
Prior: 6.193403541368789e-05
New posterior parameter: 80 67
Line: 9
Likelihood: 0.22768
Prior: 0.00013276110527092505
New posterior parameter: 91 71
Line: 10
Likelihood: 0.24385
Prior: 4.26876