# Naive Bayes

In [120]:
import numpy as np 
def read_txt(txt_file: str):
    # read all words and replace '\n'
    with open(txt_file, "r") as data:
        txt = data.read()
    return txt.replace('\n', '').lower()

In [121]:
eng_cha = ''
jap_cha = ''
spa_cha = ''
num_train_documents = 10 

for ii in range(num_train_documents):
    txt_file = './languageID/{}.txt'.format('e'+str(ii))
    eng_cha += read_txt(txt_file)
    
for ii in range(num_train_documents):
    txt_file = './languageID/{}.txt'.format('j'+str(ii))
    jap_cha += read_txt(txt_file)
    
for ii in range(num_train_documents):
    txt_file = './languageID/{}.txt'.format('s'+str(ii))
    spa_cha += read_txt(txt_file)

print('Total number of characters in the English documents: {}'.format(len(eng_cha)))
print('Total number of characters in the Japanese documents: {}'.format(len(jap_cha)))
print('Total number of characters in the Spanish documents: {}'.format(len(spa_cha)))
print('Total number of characters: {}'.format(len(eng_cha+jap_cha+spa_cha)))

Total number of characters in the English documents: 15119
Total number of characters in the Japanese documents: 14296
Total number of characters in the Spanish documents: 16202
Total number of characters: 45617


In [122]:
import string
cha_list = list(string.ascii_lowercase) + [' ']

Ks = 27
Kl = 3
alpha = 1/2 
j = ['a', 'b']
ccp_e = [] 
for cha in cha_list:
    cha_occurance = len([i for i in eng_cha if i==cha])
    ccp_e.append((cha_occurance+alpha)/(len(eng_cha)+Ks*alpha))
print('theta_e is {}'.format(ccp_e))

theta_e is [0.0601685114819098, 0.011134974392863043, 0.021509995043779945, 0.021972575582355856, 0.1053692383941847, 0.018932760614571286, 0.017478936064761277, 0.047216256401784236, 0.055410540227986124, 0.001420783082768875, 0.0037336857756484387, 0.028977366595076822, 0.020518751032545846, 0.057921691723112505, 0.06446390219725756, 0.01675202378985627, 0.0005617049396993227, 0.053824549810011564, 0.06618205848339666, 0.08012555757475633, 0.026664463902197257, 0.009284652238559392, 0.015496448042293078, 0.001156451346439782, 0.013844374690236246, 0.0006277878737815959, 0.1792499586981662]


In [123]:
ccp_j = [] 
for cha in cha_list:
    cha_occurance = len([i for i in jap_cha if i==cha])
    ccp_j.append((cha_occurance+alpha)/(len(jap_cha)+Ks*alpha))
print('theta_j is {}'.format(ccp_j))
 
ccp_s = [] 
for cha in cha_list:
    cha_occurance = len([i for i in spa_cha if i==cha])
    ccp_s.append((cha_occurance+alpha)/(len(spa_cha)+Ks*alpha))
print('theta_s is {}'.format(ccp_s))

theta_j is [0.1317656102589189, 0.010866906600510151, 0.005485866033054963, 0.01722631818022992, 0.06020475907613823, 0.003878542227191726, 0.014011670568503443, 0.03176211607673224, 0.09703343932352633, 0.0023411020650616725, 0.05740941332681086, 0.001432614696530277, 0.03979873510604843, 0.05671057688947902, 0.09116321324993885, 0.0008735455466648031, 0.00010482546559977637, 0.04280373178657535, 0.0421747789929767, 0.056990111464411755, 0.07061742199238269, 0.0002445927530661449, 0.01974212935462455, 3.4941821866592126e-05, 0.01415143785596981, 0.00772214263251686, 0.12344945665466997]
theta_s is [0.10456045141993771, 0.008232863618143134, 0.03752582405722919, 0.039745922111559924, 0.1138108599796491, 0.00860287996053159, 0.0071844839813758445, 0.0045327001942585795, 0.049859702136844375, 0.006629459467793161, 0.0002775122567913416, 0.052943171656748174, 0.02580863988159477, 0.054176559464709693, 0.07249236841293824, 0.02426690512164287, 0.007677839104560451, 0.05929511886774999, 0.0

In [124]:
txt_file = './languageID/{}.txt'.format('e10')
test_txt = read_txt(txt_file)

wcv = [] # word count vector 
for cha in cha_list:
    cha_occurance = len([i for i in test_txt if i==cha])
    wcv.append(cha_occurance)
print('bag-of-words count vector is {}'.format(wcv))

bag-of-words count vector is [164, 32, 53, 57, 311, 55, 51, 140, 140, 3, 6, 85, 64, 139, 182, 53, 3, 141, 186, 225, 65, 31, 47, 4, 38, 2, 498]


In [125]:
lnpx_e = np.sum(wcv * np.log(ccp_e))
lnpx_j = np.sum(wcv * np.log(ccp_j))
lnpx_s = np.sum(wcv * np.log(ccp_s))

print('log of conditional probability p(x|y=English) is {:.1f}'.format(lnpx_e))
print('log of conditional probability p(x|y=Japanese) is {:.1f}'.format(lnpx_j))
print('log of conditional probability p(x|y=Spanish) is {:.1f}'.format(lnpx_s))

log of conditional probability p(x|y=English) is -7841.9
log of conditional probability p(x|y=Japanese) is -8771.4
log of conditional probability p(x|y=Spanish) is -8467.3


In [126]:
lnpe_x = lnpx_e + np.log(1/3) 
lnpj_x = lnpx_j + np.log(1/3) 
lnps_x = lnpx_s + np.log(1/3) 

print('log of posterior probability p(y=English|x) is {:.1f}'.format(lnpe_x))
print('log of posterior probability p(y=Japanese|x) is {:.1f}'.format(lnpj_x))
print('log of posterior probability p(y=Spanish|x) is {:.1f}'.format(lnps_x))

log of posterior probability p(y=English|x) is -7843.0
log of posterior probability p(y=Japanese|x) is -8772.5
log of posterior probability p(y=Spanish|x) is -8468.4


In [127]:
# English 
pred_class_english = []
for ii in np.arange(10, 20):
    txt_file = './languageID/e{}.txt'.format(ii)
    test_txt = read_txt(txt_file)

    wcv = [] # word count vector 
    for cha in cha_list:
        cha_occurance = len([i for i in test_txt if i==cha])
        wcv.append(cha_occurance)
    lnpe_x = np.sum(wcv * np.log(ccp_e)) + np.log(1/3) 
    lnpj_x = np.sum(wcv * np.log(ccp_j)) + np.log(1/3) 
    lnps_x = np.sum(wcv * np.log(ccp_s)) + np.log(1/3) 
    pred_class_english.append(np.argmax([lnpe_x, lnpj_x, lnps_x]))


In [129]:
# Japanese 
pred_class_japanese = []
for ii in np.arange(10, 20):
    txt_file = './languageID/j{}.txt'.format(ii)
    test_txt = read_txt(txt_file)

    wcv = [] # word count vector 
    for cha in cha_list:
        cha_occurance = len([i for i in test_txt if i==cha])
        wcv.append(cha_occurance)
    lnpe_x = np.sum(wcv * np.log(ccp_e)) + np.log(1/3) 
    lnpj_x = np.sum(wcv * np.log(ccp_j)) + np.log(1/3) 
    lnps_x = np.sum(wcv * np.log(ccp_s)) + np.log(1/3) 
    pred_class_japanese.append(np.argmax([lnpe_x, lnpj_x, lnps_x]))


In [130]:
# Spanish 
pred_class_spanish = []
for ii in np.arange(10, 20):
    txt_file = './languageID/s{}.txt'.format(ii)
    test_txt = read_txt(txt_file)

    wcv = [] # word count vector 
    for cha in cha_list:
        cha_occurance = len([i for i in test_txt if i==cha])
        wcv.append(cha_occurance)
    lnpe_x = np.sum(wcv * np.log(ccp_e)) + np.log(1/3) 
    lnpj_x = np.sum(wcv * np.log(ccp_j)) + np.log(1/3) 
    lnps_x = np.sum(wcv * np.log(ccp_s)) + np.log(1/3) 
    pred_class_spanish.append(np.argmax([lnpe_x, lnpj_x, lnps_x]))


In [134]:
from sklearn.metrics import confusion_matrix
y_true = [0] * 10 + [2] * 10 + [1] * 10
y_pred = pred_class_english + pred_class_spanish + pred_class_japanese
confusion_matrix(y_true, y_pred)

array([[10,  0,  0],
       [ 0, 10,  0],
       [ 0,  0, 10]])

# Neural Network

In [2]:
import os
import torch
from torch import nn, optim
import torchvision
from torchvision import transforms, datasets
from torch.utils.data import random_split, DataLoader 
import torch.optim as optim
from tqdm import tqdm 
import torch.nn.functional as F
import matplotlib.pylab as plt 
use_cuda = torch.cuda.is_available()

In [3]:
trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
# load mnist dataset
root = './data/'
train_set = datasets.MNIST(root=root, train=True, transform=trans, download=True)
test_set = datasets.MNIST(root=root, train=False, transform=trans, download=True)

batch_size = 1024
nepoch = 100
train_subset, val_subset = random_split(
        train_set, [0.9, 0.1], generator=torch.Generator().manual_seed(1))


train_loader = DataLoader(
                 dataset=train_subset,
                 batch_size=batch_size,
                 shuffle=True, drop_last=False)

val_loader = DataLoader(
                 dataset=val_subset,
                 batch_size=batch_size,
                 shuffle=True, drop_last=False)

test_loader = DataLoader(
                dataset=test_set,
                batch_size=batch_size,
                shuffle=False, drop_last=False)


# Self-implemented version 

In [104]:
# network
class NN_self(): 
    def __init__(self, d1, d2, alpha, out_class=10):
        self.W1 = torch.rand(d1, 28*28) * 2 - 1
        self.W2 = torch.rand(d2, d1) * 2 - 1 
        self.W3 = torch.rand(out_class, d2) * 2 - 1
        self.b1 = torch.zeros(1, d1)
        self.b2 = torch.zeros(1, d2)
        self.b3 = torch.zeros(1, out_class)
        self.alpha = alpha # learning rate 

    def sigmoid(self, x):
        return 1 / (1 + torch.exp(-x))
       
    def softmax(self, x):
        beta = torch.max(x, dim=1, keepdim=True)[0] # avoid overflow 
        x_exp = torch.exp(x-beta)
        x_exp_sum = torch.sum(x_exp, 1, keepdim=True)
        return x_exp / x_exp_sum
    
    def cross_entropy(self, yhat, ytrue):
        return - yhat[range(ytrue.shape[0]), ytrue].log().mean()
    
    def forward(self, x):
        x = x.view(-1, 28*28) # batch*(28*28)
        z1 = self.sigmoid(torch.matmul(x, self.W1.T) + self.b1) 
        z2 = self.sigmoid(torch.matmul(z1, self.W2.T) + self.b2) 
        out = self.softmax(torch.matmul(z2, self.W3.T) + self.b3)
        return out, z2, z1, x
    
    def backward(self, out, ref, z2, z1, x):
        batchsize = out.shape[0]
        dz3 = - (ref - out) # z3 = output 
        db3 = torch.mean(dz3, dim=0, keepdim=True) 
        dW3 = torch.matmul(dz3.T, z2) / batchsize # normalized by the batch size 
        
        dz2 = torch.matmul(dz3, self.W3) * z2 * (1 - z2)
        db2 = torch.mean(dz2, dim=0, keepdim=True)
        dW2 = torch.matmul(dz2.T, z1) / batchsize
        
        dz1 = torch.matmul(dz2, self.W2) * z1 * (1 - z1)
        db1 = torch.mean(dz1, dim=0, keepdim=True)
        dW1 = torch.matmul(dz1.T, x) / batchsize
        
        self.b3 -= self.alpha * db3
        self.W3 -= self.alpha * dW3
        self.b2 -= self.alpha * db2
        self.W2 -= self.alpha * dW2
        self.b1 -= self.alpha * db1
        self.W1 -= self.alpha * dW1


In [108]:
acc_max = 0.0
#criterion = nn.CrossEntropyLoss(reduction='mean')

## training
model_self = NN_self(300, 200, alpha=0.2) # alpha/learning rate 
train_loss_epochs = []
val_loss_epochs = [] 
val_acc_epochs = []
for epoch in range(nepoch):
    # trainning
    train_loss = 0
    count = 0 
    for x, target in train_loader:
        out, z2, z1, x = model_self.forward(x)
        loss = model_self.cross_entropy(out, target) 
        #loss = F.cross_entropy(out.log(), target) # cross entropy loss accepts logits rather than softmax output 
        train_loss += loss
        # to one-hot 
        ref = F.one_hot(target, num_classes=10).to(out)
        model_self.backward(out, ref, z2, z1, x)
        count += 1
        
    train_loss /= count 
    # validation
    val_loss = 0
    correct_count = 0
    tot_count = 0 
    count = 0 
    for x, target in val_loader:
        out, _, _, _ = model_self.forward(x) 
        loss = model_self.cross_entropy(out, target) 
        val_loss += loss
        count += 1
        _, pred_label = torch.max(out, 1)
        correct_count += (pred_label == target.data).sum()
        tot_count += pred_label.shape[0]
    
    val_loss /= count 
    acc = correct_count / tot_count * 100 
    print('==>>> epoch: {}, train loss: {:.3f}'.format(epoch, train_loss))
    print('==>>> epoch: {}, validation loss: {:.3f}, accuracy: {:.3f}%'.format(epoch, val_loss, acc)) 
    
    train_loss_epochs.append(train_loss)
    val_loss_epochs.append(val_loss)
    val_acc_epochs.append(acc)
    
    plt.figure(figsize=(8,6))
    plt.plot(train_loss_epochs, label='Training loss')
    plt.plot(val_loss_epochs, label='Validation loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.rc('font', size=18) # controls default text sizes

    plt.savefig('./figures/model_self_loss_curve.png')
    plt.close()
    
    plt.figure(figsize=(8,6))
    plt.plot(val_acc_epochs, label='Validation accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.rc('font', size=18)
    plt.savefig('./figures/model_self_accuracy_curve.png')
    plt.close()
    
    if acc > acc_max:
        acc_max = acc 
        torch.save(model_self, './model_weights/model_self_best.pth.tar')


==>>> epoch: 0, train loss: 2.128
==>>> epoch: 0, validation loss: 1.320, accuracy: 58.150%
==>>> epoch: 1, train loss: 1.079
==>>> epoch: 1, validation loss: 0.932, accuracy: 70.267%
==>>> epoch: 2, train loss: 0.829
==>>> epoch: 2, validation loss: 0.780, accuracy: 75.550%
==>>> epoch: 3, train loss: 0.705
==>>> epoch: 3, validation loss: 0.688, accuracy: 78.417%
==>>> epoch: 4, train loss: 0.627
==>>> epoch: 4, validation loss: 0.625, accuracy: 80.350%
==>>> epoch: 5, train loss: 0.572
==>>> epoch: 5, validation loss: 0.581, accuracy: 81.750%
==>>> epoch: 6, train loss: 0.530
==>>> epoch: 6, validation loss: 0.550, accuracy: 82.783%
==>>> epoch: 7, train loss: 0.497
==>>> epoch: 7, validation loss: 0.531, accuracy: 83.367%
==>>> epoch: 8, train loss: 0.471
==>>> epoch: 8, validation loss: 0.503, accuracy: 84.167%
==>>> epoch: 9, train loss: 0.448
==>>> epoch: 9, validation loss: 0.486, accuracy: 84.850%
==>>> epoch: 10, train loss: 0.428
==>>> epoch: 10, validation loss: 0.470, accu

==>>> epoch: 88, train loss: 0.128
==>>> epoch: 88, validation loss: 0.265, accuracy: 92.033%
==>>> epoch: 89, train loss: 0.127
==>>> epoch: 89, validation loss: 0.266, accuracy: 92.067%
==>>> epoch: 90, train loss: 0.125
==>>> epoch: 90, validation loss: 0.263, accuracy: 92.050%
==>>> epoch: 91, train loss: 0.124
==>>> epoch: 91, validation loss: 0.263, accuracy: 92.150%
==>>> epoch: 92, train loss: 0.123
==>>> epoch: 92, validation loss: 0.264, accuracy: 92.167%
==>>> epoch: 93, train loss: 0.122
==>>> epoch: 93, validation loss: 0.264, accuracy: 92.150%
==>>> epoch: 94, train loss: 0.121
==>>> epoch: 94, validation loss: 0.263, accuracy: 92.233%
==>>> epoch: 95, train loss: 0.120
==>>> epoch: 95, validation loss: 0.261, accuracy: 92.133%
==>>> epoch: 96, train loss: 0.119
==>>> epoch: 96, validation loss: 0.261, accuracy: 92.167%
==>>> epoch: 97, train loss: 0.118
==>>> epoch: 97, validation loss: 0.261, accuracy: 92.150%
==>>> epoch: 98, train loss: 0.116
==>>> epoch: 98, validati

In [109]:
# Testing     
# To load model weights
model_self = torch.load('./model_weights/model_self_best.pth.tar')
correct_count = 0
wrong_count = 0
tot_count = 0
for x, target in test_loader:
    out, _, _, _ = model_self.forward(x) 
    _, pred_label = torch.max(out, 1)
    correct_count += (pred_label == target.data).sum()
    wrong_count += (pred_label != target.data).sum() 
    tot_count += pred_label.shape[0]
    
test_acc = correct_count / tot_count * 100 
test_err = wrong_count / tot_count * 100 
print('==>>> Testing accuracy {:.3f}%'.format(test_acc)) 
print('==>>> Testing error {:.3f}%'.format(test_err)) 


==>>> Testing accuracy 92.620%
==>>> Testing error 7.380%


# Pytorch version

In [112]:
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        #nn.init.zeros_(tensor)
        nn.init.zeros_(m.bias)

# network
class NN(nn.Module):
    def __init__(self, d1, d2, out_class=10):
        super(NN, self).__init__()
        model = nn.Sequential(nn.Linear(28*28, d1),
                                   nn.Sigmoid(),
                                   nn.Linear(d1, d2), 
                                   nn.Sigmoid(),
                                   nn.Linear(d2, out_class), 
                                  )
        self.model = model.apply(init_weights)
    
    def forward(self, x):
        x = x.view(-1, 28*28)
        return self.model(x)

In [95]:
## training
model = NN(300, 200)

if use_cuda:
    model = model.cuda()

optimizer = optim.SGD(model.parameters(), lr=1)
criterion = nn.CrossEntropyLoss(reduction='mean')
acc_max = 0.0
train_loss_epochs = []
val_loss_epochs = [] 
val_acc_epochs = []

for epoch in range(nepoch):
    # trainning
    train_loss = 0
    count = 0 
    for x, target in train_loader:
        optimizer.zero_grad()
        if use_cuda:
            x, target = x.cuda(), target.cuda()
            
        out = model(x)
        loss = criterion(out, target)
        train_loss += loss.data
        loss.backward()
        optimizer.step()
        count += 1
    train_loss /= count 
    
    # validation
    val_loss = 0
    correct_count = 0
    tot_count = 0 
    count = 0 
    with torch.no_grad():
        for x, target in val_loader:
            if use_cuda:
                x, target = x.cuda(), target.cuda()
            out = model(x)
            loss = criterion(out, target)
            val_loss += loss.data
            count += 1 
            
            _, pred_label = torch.max(out.data, 1)
            correct_count += (pred_label == target.data).sum()
            tot_count += pred_label.shape[0]
            
    val_loss /= count         
    acc = correct_count / tot_count * 100 
    print('==>>> epoch: {}, train loss: {:.3f}'.format(epoch, train_loss))
    print('==>>> epoch: {}, validation loss: {:.3f}, accuracy: {:.3f}%'.format(epoch, val_loss, acc)) 
    
    train_loss_epochs.append(train_loss.detach().cpu().numpy())
    val_loss_epochs.append(val_loss.detach().cpu().numpy())
    val_acc_epochs.append(acc.detach().cpu().numpy())
    
    plt.figure(figsize=(8,6))
    plt.plot(train_loss_epochs, label='Training loss')
    plt.plot(val_loss_epochs, label='Validation loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.rc('font', size=18) # controls default text sizes

    plt.savefig('./figures/model_loss_curve.png')
    plt.close()
    
    plt.figure(figsize=(8,6))
    plt.plot(val_acc_epochs, label='Validation accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.rc('font', size=18)
    plt.savefig('./figures/model_accuracy_curve.png')
    plt.close()
    
    if acc > acc_max:
        acc_max = acc 
        torch.save(model.state_dict(), './model_weights/model_best.pth.tar')
        

==>>> epoch: 0, train loss: 2.019
==>>> epoch: 0, validation loss: 1.084, accuracy: 58.033%
==>>> epoch: 1, train loss: 0.784
==>>> epoch: 1, validation loss: 0.515, accuracy: 84.150%
==>>> epoch: 2, train loss: 0.415
==>>> epoch: 2, validation loss: 0.346, accuracy: 90.367%
==>>> epoch: 3, train loss: 0.333
==>>> epoch: 3, validation loss: 0.300, accuracy: 91.317%
==>>> epoch: 4, train loss: 0.280
==>>> epoch: 4, validation loss: 0.270, accuracy: 92.250%
==>>> epoch: 5, train loss: 0.248
==>>> epoch: 5, validation loss: 0.236, accuracy: 92.983%
==>>> epoch: 6, train loss: 0.221
==>>> epoch: 6, validation loss: 0.221, accuracy: 93.300%
==>>> epoch: 7, train loss: 0.197
==>>> epoch: 7, validation loss: 0.195, accuracy: 94.200%
==>>> epoch: 8, train loss: 0.179
==>>> epoch: 8, validation loss: 0.179, accuracy: 94.467%
==>>> epoch: 9, train loss: 0.161
==>>> epoch: 9, validation loss: 0.167, accuracy: 95.167%
==>>> epoch: 10, train loss: 0.149
==>>> epoch: 10, validation loss: 0.156, accu

==>>> epoch: 88, train loss: 0.004
==>>> epoch: 88, validation loss: 0.070, accuracy: 97.883%
==>>> epoch: 89, train loss: 0.004
==>>> epoch: 89, validation loss: 0.070, accuracy: 97.900%
==>>> epoch: 90, train loss: 0.003
==>>> epoch: 90, validation loss: 0.070, accuracy: 97.917%
==>>> epoch: 91, train loss: 0.003
==>>> epoch: 91, validation loss: 0.070, accuracy: 97.933%
==>>> epoch: 92, train loss: 0.003
==>>> epoch: 92, validation loss: 0.069, accuracy: 97.833%
==>>> epoch: 93, train loss: 0.003
==>>> epoch: 93, validation loss: 0.069, accuracy: 97.900%
==>>> epoch: 94, train loss: 0.003
==>>> epoch: 94, validation loss: 0.070, accuracy: 97.867%
==>>> epoch: 95, train loss: 0.003
==>>> epoch: 95, validation loss: 0.070, accuracy: 97.850%
==>>> epoch: 96, train loss: 0.003
==>>> epoch: 96, validation loss: 0.070, accuracy: 97.917%
==>>> epoch: 97, train loss: 0.003
==>>> epoch: 97, validation loss: 0.069, accuracy: 97.883%
==>>> epoch: 98, train loss: 0.003
==>>> epoch: 98, validati

In [100]:
# Testing     
# To load model weights
#model.load_state_dict(torch.load('./model_weights/model_best.pth.tar'))
correct_count = 0
wrong_count = 0
tot_count = 0
with torch.no_grad():
        for x, target in test_loader:
            if use_cuda:
                x, target = x.cuda(), target.cuda()
            out = model(x)
            _, pred_label = torch.max(out, 1)
            correct_count += (pred_label == target.data).sum()
            wrong_count += (pred_label != target.data).sum() 
            tot_count += pred_label.shape[0]
    
test_acc = correct_count / tot_count * 100 
test_err = wrong_count / tot_count * 100 
print('==>>> Testing accuracy {:.3f}%'.format(test_acc)) 
print('==>>> Testing error {:.3f}%'.format(test_err)) 


==>>> Testing accuracy 98.190%
==>>> Testing error 1.810%


# All weights initialized to 0

In [110]:
# network
class NN_self_zero_init(): 
    def __init__(self, d1, d2, alpha, out_class=10):
        self.W1 = torch.zeros(d1, 28*28) 
        self.W2 = torch.zeros(d2, d1) 
        self.W3 = torch.zeros(out_class, d2) 
        self.b1 = torch.zeros(1, d1)
        self.b2 = torch.zeros(1, d2)
        self.b3 = torch.zeros(1, out_class)
        self.alpha = alpha # learning rate 

    def sigmoid(self, x):
        return 1 / (1 + torch.exp(-x))
       
    def softmax(self, x):
        beta = torch.max(x, dim=1, keepdim=True)[0] # avoid overflow 
        x_exp = torch.exp(x-beta)
        x_exp_sum = torch.sum(x_exp, 1, keepdim=True)
        return x_exp / x_exp_sum
    
    def cross_entropy(self, yhat, ytrue):
        return - yhat[range(ytrue.shape[0]), ytrue].log().mean()
    
    def forward(self, x):
        x = x.view(-1, 28*28) # batch*(28*28)
        z1 = self.sigmoid(torch.matmul(x, self.W1.T) + self.b1) 
        z2 = self.sigmoid(torch.matmul(z1, self.W2.T) + self.b2) 
        out = self.softmax(torch.matmul(z2, self.W3.T) + self.b3)
        return out, z2, z1, x
    
    def backward(self, out, ref, z2, z1, x):
        batchsize = out.shape[0]
        dz3 = - (ref - out) # z3 = output 
        db3 = torch.mean(dz3, dim=0, keepdim=True) 
        dW3 = torch.matmul(dz3.T, z2) / batchsize # normalized by the batch size 
        
        dz2 = torch.matmul(dz3, self.W3) * z2 * (1 - z2)
        db2 = torch.mean(dz2, dim=0, keepdim=True)
        dW2 = torch.matmul(dz2.T, z1) / batchsize
        
        dz1 = torch.matmul(dz2, self.W2) * z1 * (1 - z1)
        db1 = torch.mean(dz1, dim=0, keepdim=True)
        dW1 = torch.matmul(dz1.T, x) / batchsize
        
        self.b3 -= self.alpha * db3
        self.W3 -= self.alpha * dW3
        self.b2 -= self.alpha * db2
        self.W2 -= self.alpha * dW2
        self.b1 -= self.alpha * db1
        self.W1 -= self.alpha * dW1
        
acc_max = 0.0
#criterion = nn.CrossEntropyLoss(reduction='mean')

## training
model_self = NN_self_zero_init(300, 200, alpha=0.2) # alpha/learning rate 
train_loss_epochs = []
val_loss_epochs = [] 
val_acc_epochs = []
for epoch in range(nepoch):
    # trainning
    train_loss = 0
    count = 0 
    for x, target in train_loader:
        out, z2, z1, x = model_self.forward(x)
        loss = model_self.cross_entropy(out, target) 
        #loss = F.cross_entropy(out.log(), target) # cross entropy loss accepts logits rather than softmax output 
        train_loss += loss
        # to one-hot 
        ref = F.one_hot(target, num_classes=10).to(out)
        model_self.backward(out, ref, z2, z1, x)
        count += 1
        
    train_loss /= count 
    # validation
    val_loss = 0
    correct_count = 0
    tot_count = 0 
    count = 0 
    for x, target in val_loader:
        out, _, _, _ = model_self.forward(x) 
        loss = model_self.cross_entropy(out, target) 
        val_loss += loss
        count += 1
        _, pred_label = torch.max(out, 1)
        correct_count += (pred_label == target.data).sum()
        tot_count += pred_label.shape[0]
    
    val_loss /= count 
    acc = correct_count / tot_count * 100 
    print('==>>> epoch: {}, train loss: {:.3f}'.format(epoch, train_loss))
    print('==>>> epoch: {}, validation loss: {:.3f}, accuracy: {:.3f}%'.format(epoch, val_loss, acc)) 
    
    train_loss_epochs.append(train_loss)
    val_loss_epochs.append(val_loss)
    val_acc_epochs.append(acc)
    
    plt.figure(figsize=(8,6))
    plt.plot(train_loss_epochs, label='Training loss')
    plt.plot(val_loss_epochs, label='Validation loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.rc('font', size=18) # controls default text sizes

    plt.savefig('./figures/model_self_zero_init_loss_curve.png')
    plt.close()
    
    plt.figure(figsize=(8,6))
    plt.plot(val_acc_epochs, label='Validation accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.rc('font', size=18)
    plt.savefig('./figures/model_self_zero_init_accuracy_curve.png')
    plt.close()
    
    if acc > acc_max:
        acc_max = acc 
        torch.save(model_self, './model_weights/model_self_zero_init_best.pth.tar')


==>>> epoch: 0, train loss: 2.306
==>>> epoch: 0, validation loss: 2.306, accuracy: 9.317%
==>>> epoch: 1, train loss: 2.306
==>>> epoch: 1, validation loss: 2.308, accuracy: 11.300%
==>>> epoch: 2, train loss: 2.305
==>>> epoch: 2, validation loss: 2.312, accuracy: 9.950%
==>>> epoch: 3, train loss: 2.306
==>>> epoch: 3, validation loss: 2.307, accuracy: 9.700%
==>>> epoch: 4, train loss: 2.305
==>>> epoch: 4, validation loss: 2.316, accuracy: 10.717%
==>>> epoch: 5, train loss: 2.305
==>>> epoch: 5, validation loss: 2.308, accuracy: 9.933%
==>>> epoch: 6, train loss: 2.305
==>>> epoch: 6, validation loss: 2.305, accuracy: 10.183%
==>>> epoch: 7, train loss: 2.305
==>>> epoch: 7, validation loss: 2.304, accuracy: 11.300%
==>>> epoch: 8, train loss: 2.305
==>>> epoch: 8, validation loss: 2.305, accuracy: 9.933%
==>>> epoch: 9, train loss: 2.304
==>>> epoch: 9, validation loss: 2.304, accuracy: 11.300%
==>>> epoch: 10, train loss: 2.304
==>>> epoch: 10, validation loss: 2.304, accuracy:

==>>> epoch: 88, train loss: 1.498
==>>> epoch: 88, validation loss: 1.488, accuracy: 40.850%
==>>> epoch: 89, train loss: 1.487
==>>> epoch: 89, validation loss: 1.476, accuracy: 41.883%
==>>> epoch: 90, train loss: 1.476
==>>> epoch: 90, validation loss: 1.465, accuracy: 42.333%
==>>> epoch: 91, train loss: 1.462
==>>> epoch: 91, validation loss: 1.451, accuracy: 42.883%
==>>> epoch: 92, train loss: 1.448
==>>> epoch: 92, validation loss: 1.437, accuracy: 43.933%
==>>> epoch: 93, train loss: 1.432
==>>> epoch: 93, validation loss: 1.418, accuracy: 45.067%
==>>> epoch: 94, train loss: 1.414
==>>> epoch: 94, validation loss: 1.400, accuracy: 46.967%
==>>> epoch: 95, train loss: 1.396
==>>> epoch: 95, validation loss: 1.381, accuracy: 46.700%
==>>> epoch: 96, train loss: 1.376
==>>> epoch: 96, validation loss: 1.362, accuracy: 47.250%
==>>> epoch: 97, train loss: 1.357
==>>> epoch: 97, validation loss: 1.344, accuracy: 48.500%
==>>> epoch: 98, train loss: 1.338
==>>> epoch: 98, validati

In [111]:
# Testing      
model_self = torch.load('./model_weights/model_self_zero_init_best.pth.tar')
correct_count = 0
wrong_count = 0
tot_count = 0
for x, target in test_loader:
    out, _, _, _ = model_self.forward(x) 
    _, pred_label = torch.max(out, 1)
    correct_count += (pred_label == target.data).sum()
    wrong_count += (pred_label != target.data).sum() 
    tot_count += pred_label.shape[0]
    
test_acc = correct_count / tot_count * 100 
test_err = wrong_count / tot_count * 100 
print('==>>> Testing accuracy {:.3f}%'.format(test_acc)) 
print('==>>> Testing error {:.3f}%'.format(test_err)) 


==>>> Testing accuracy 49.790%
==>>> Testing error 50.210%
