# Adversarial Dropout Code
#### This file contains the code used to replicate the results of Adversarial Dropout

In [1]:
import json
import re
import numpy as np
import tensorflow as tf
from keras.datasets import cifar10
from keras.utils import to_categorical
from keras.datasets import mnist
from matplotlib import pyplot as plt
from google.colab import files

Using TensorFlow backend.


## Layers

In [0]:
SEED = 123456
rng = np.random.RandomState(SEED)


def __createWeights(shape, seed=None, name='weight'):
    w_init = tf.contrib.layers.variance_scaling_initializer(seed=seed)
    return tf.get_variable(name + '_w', shape=shape, initializer=w_init)
    

def __createBiases(size, name='bias'):
    return tf.get_variable(name + '_b', shape=[size], initializer=tf.constant_initializer(0.0))


def LeakyReLU(x, alpha=0.1):
    x = tf.nn.leaky_relu(x, alpha=alpha)
    return x


def MaxPooling(x, ksize=2, stride_length=2, padding='SAME', data_format='NHWC'):
    x = tf.nn.max_pool(x, (1, ksize, ksize, 1), (1, stride_length, stride_length, 1), padding, data_format)
    return x


def GlobalAveragePooling(x):
    x = tf.reduce_mean(x, [1, 2])
    return x


def Dense(x, input_dim, output_dim, seed=None, name='dense'):
    W = __createWeights([input_dim, output_dim], seed, name) 
    b = __createBiases(output_dim, name) 
    x = tf.nn.xw_plus_b(x, W, b)
    return x


def Conv2D(x, filter_size, n_channels, n_filters, stride_length=1, padding='SAME', data_format='NHWC', name='conv'):
    shape = [filter_size, filter_size, n_channels, n_filters]
    W = __createWeights(shape, name=name)
    b = __createBiases(n_filters, name=name)
    x = tf.nn.conv2d(x, filter=W, strides=(1, stride_length, stride_length, 1), padding=padding, data_format=data_format)
    x += b
    return x


def Dropout(x, probability=0.5):
    x = tf.nn.dropout(x, keep_prob=probability, seed=rng.randint(SEED))
    return x


def GaussianNoise(x, sigma=0.15):
    noise = tf.random_normal(shape=tf.shape(x), stddev=sigma)
    x += noise
    return x


def SoftMax(x):
    x = tf.nn.softmax(x)
    return x



'''
Loss functions. Arg 1: Approximation, Arg 2: Labels
'''
def CrossEntropyWithLogits(logits, labels):
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=labels))
    return loss


# Formula: sum(p_i * log(p_i) - p_i * log(q_i))
def KLDivergenceWithLogits(q, p):
    p_soft = SoftMax(p)
    # plogp = tf.reduce_mean(tf.reduce_sum(p_soft * tf.nn.log_softmax(p), 1))
    # plogq = tf.reduce_mean(tf.reduce_sum(p_soft * tf.nn.log_softmax(q), 1))
    distance = tf.reduce_sum(p_soft * tf.nn.log_softmax(p) - p_soft * tf.nn.log_softmax(q))
    # distance = plogp - plogq
    return distance

## Building blocks for creating the networks

In [0]:
'''
The model before AdD is applied
'''
def upperBlock(x, conv_size=[128, 256, 512, 256,128], n_channels=3):
    x = GaussianNoise(x)
    x = Conv2D(x, filter_size=3, n_channels=n_channels, n_filters=conv_size[0], padding='SAME', name='1a')
    x = LeakyReLU(x)
    x = Conv2D(x, filter_size=3, n_channels=conv_size[0], n_filters=conv_size[0], name='1b')
    x = LeakyReLU(x)
    x = Conv2D(x, filter_size=3, n_channels=conv_size[0], n_filters=conv_size[0], name='1c')
    x = MaxPooling(x, ksize=2, stride_length=2)
    x = Dropout(x, probability=0.5)
    
    x = Conv2D(x, filter_size=3, n_channels=conv_size[0], n_filters=conv_size[1], name='2a')
    x = LeakyReLU(x)
    x = Conv2D(x, filter_size=3, n_channels=conv_size[1], n_filters=conv_size[1], name='2b')
    x = LeakyReLU(x)
    x = Conv2D(x, filter_size=3, n_channels=conv_size[1], n_filters=conv_size[1], name='2c')
    x = LeakyReLU(x)
    x = MaxPooling(x, ksize=2, stride_length=2)

    x = Conv2D(x, filter_size=3, n_channels=conv_size[1], n_filters=conv_size[2], padding='VALID', name='3a')
    x = LeakyReLU(x)
    x = Conv2D(x, filter_size=1, n_channels=conv_size[2], n_filters=conv_size[3], name='3b')
    x = LeakyReLU(x)
    x = Conv2D(x, filter_size=1, n_channels=conv_size[3], n_filters=conv_size[4], name='3c')
    x = LeakyReLU(x)
    x = GlobalAveragePooling(x) 
    
    # x = Dropout(x, probability=0.5)
    # x = Dense(x, conv_size[4], 10)
    # x = SoftMax(x)

    return x


'''
The model after AdD is applied
'''
def lowerBlock(x, n_in=128, n_out=10, name='fc'):
    x = Dense(x, n_in, n_out, name=name)
    return x;


'''
Apply adv dropout
'''
def advDropout(x, mask, Jacobian, sigma=0.05, dim=128):
    # y: output 
    # mask: current sampled dropout mask 
    # sigma: hyper-parameter for boundary 
    # Jabocian: Jacobian vector (gradient of divergence (or loss function))
    # dim: layer dimension 

    Jacobian = tf.reshape(Jacobian, [-1, dim])

    # mask = 0 --> -1 
    mask = 2 * mask - tf.ones_like(mask)

    adv_mask = mask 

    # extract the voxels for which the update conditions hold 
    # mask = 0 and J > 0 
    # or
    # mask = 1 and J < 1 
    abs_jac = tf.abs(Jacobian)
    temp = tf.cast(tf.greater(abs_jac, 0), tf.float32)
    temp = 2 * temp - 1 
    # interested in the cases when temp * mask = -1
    ext = tf.cast(tf.less(mask, temp), tf.float32)

    # keep the voxels that you want to update 
    candidates = abs_jac * ext 
    thres = tf.nn.top_k(candidates, int(dim * sigma * sigma)  + 1)[0][:,-1]

    targets = tf.cast(tf.greater(candidates, tf.expand_dims(thres, -1)), tf.float32)

    # get new mask 
    adv_mask = (mask - targets * 2 * mask + tf.ones_like(mask)) / 2.0

    output = adv_mask * x

    return output, adv_mask

## Models

In [0]:
''' Preprocesses the data '''
def preprocess(data):
    # Mean normalization
    data = (data - 127.5) / 255.
    # Find principal component
    shape = data.shape
    data = data.transpose(0, 2, 3, 1)
    flatx = np.reshape(data, (data.shape[0], data.shape[1] * data.shape[2] * data.shape[3]))
    sigma = np.dot(flatx.T, flatx) / flatx.shape[1]
    U, S, V = np.linalg.svd(sigma)
    pc = np.dot(np.dot(U, np.diag(1. / np.sqrt(S + 0.0001))), U.T)
    # Apply ZCA whitening
    whitex = np.dot(flatx, pc)
    data = np.reshape(whitex, (shape[0], shape[1], shape[2], shape[3]))
    return data


'''
Returns a model without adversarial dropout
'''
def modelWithRandD(x, n_channels=3):
    x = upperBlock(x, n_channels=n_channels)
    x = lowerBlock(x)
    return x


'''
Returns a model with adversarial dropout
'''
def modelWithAdD(x, y, fn_loss=KLDivergenceWithLogits, n_channels=3):
    x = upperBlock(x, n_channels=n_channels)
    y_no_adD = lowerBlock(x)
    loss_no_adD = fn_loss(y_no_adD, y)

    # Derivative of loss fn wrt x
    DLoss = tf.gradients(loss_no_adD, [x])
    DLoss = tf.squeeze(tf.stop_gradient(DLoss)) # Stops backpropagation

    Jacobian_approx = DLoss * x
    mask = tf.ones_like(x)

    x, _ = advDropout(x, mask, Jacobian_approx)
    x = lowerBlock(x)

    return x



def CreateBaseModel(x, y, learning_rate=0.001, optimizer=tf.train.AdamOptimizer, n_channels=3):
    logit_rand = modelWithRandD(x, n_channels=n_channels)
    loss = CrossEntropyWithLogits(logit_rand, y)

    opt = optimizer(learning_rate=learning_rate)
    gradients = opt.compute_gradients(loss, tf.trainable_variables())
    train_op = opt.apply_gradients(gradients)

    return train_op, loss, logit_rand


'''
Create the AdD model for training
'''
def CreateAdDModel(x, y, learning_rate=0.001, optimizer=tf.train.AdamOptimizer, lmb=0.01, n_channels=3):
    logit_rand = modelWithRandD(x, n_channels=n_channels)
    logit_rand_loss = CrossEntropyWithLogits(logit_rand, y)

    with tf.variable_scope(tf.get_variable_scope(), reuse=True) as scope:
        # With adversarial dropout
        logit_adD = modelWithAdD(x, y, n_channels=n_channels)
        logit_adD_loss = CrossEntropyWithLogits(logit_adD, y)

        # Total loss
        loss = logit_rand_loss + lmb * logit_adD_loss


    opt = optimizer(learning_rate=learning_rate)
    gradients = opt.compute_gradients(loss, tf.trainable_variables())
    train_op = opt.apply_gradients(gradients)        

    return train_op, loss, logit_rand


def CreateTestModel(x, y, n_channels=3):  
    with tf.variable_scope(tf.get_variable_scope(), reuse=True) as scope:
        logit_rand = modelWithRandD(x, n_channels=n_channels)
        logit_rand_loss = CrossEntropyWithLogits(logit_rand, y)

        return logit_rand_loss, logit_rand


def Accuracy(logits, labels):
    y_pred = tf.argmax(logits, 1)
    y_true = tf.argmax(labels, 1)
    equality = tf.equal(y_pred, y_true)
    accuracy = tf.reduce_mean(tf.cast(equality, tf.float32))
    return accuracy


#### Visualization functions

In [0]:
'''
Uncomment to download graphs from google colab
'''
from google.colab import files

def visualize(trend, param):
    name = "Baseline" if param['BASELINE'] else "Adversarial"
    
    for key, val in trend.items():
        # X axis
        x = np.arange(len(val))
        # Acc or loss
        title = re.sub("_", " ", key).capitalize()
        plt.title(title + " " + name)
        plt.plot(x, val)
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy' if 'acc' in key else 'Loss')
        filename = str(key + re.sub("{|}|:|'|,| ", "", str(param)) + ".png")
        print(filename)
        plt.savefig(filename)
        files.download(filename)
        plt.close()
        
        
def visualize_both(trend_base, trend_adv, param_base, param_adv):
    for key in trend_base.keys():
        # X axis
        x = np.arange(len(trend_base[key]))
        # Acc or loss
        title = re.sub("_", " ", key).capitalize()
        plt.title(title)
        plt.plot(x, trend_base[key], label='Base')
        plt.plot(x, trend_adv[key], label='Adversarial')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy' if 'acc' in key else 'Loss')
        plt.legend()
        filename = str(key + '_both' + ".png")
        print(filename)
        plt.savefig(filename)
        files.download(filename)
        plt.close()

#### Creates the graph with input/output nodes for training

In [0]:
def prepareTrainingModel(param):
    
    n_channels = param['CHANNELS']
    
    with tf.variable_scope('Baseline' if param['BASELINE'] else 'Adversarial'):
        # Graph
        x_train_ph = tf.placeholder(tf.float32)
        x_test_ph = tf.placeholder(tf.float32)
        y_train_ph = tf.placeholder(tf.float32)
        y_test_ph = tf.placeholder(tf.float32)

        train_op, train_loss, train_logit = CreateBaseModel(x_train_ph, y_train_ph, n_channels=n_channels) if param['BASELINE'] else CreateAdDModel(
            x_train_ph, y_train_ph, n_channels=n_channels)
        test_loss, test_logit = CreateTestModel(x_test_ph, y_test_ph, n_channels=n_channels)

        # Accuracy Train
        train_accuracy = Accuracy(train_logit, y_train_ph)

        # Accuracy Test
        test_accuracy = Accuracy(test_logit, y_test_ph)  
    
    nodes = {
        'x_train_ph': x_train_ph,
        'x_test_ph': x_test_ph,
        'y_train_ph': y_train_ph,
        'y_test_ph': y_test_ph,
        'train_op': train_op,
        'train_loss': train_loss,
        'train_logit': train_logit,
        'test_logit': test_logit,
        'test_loss': test_loss,
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy
    }
    
    return nodes
    

#### Can perform training incrementally i.e. you do not need to run all epochs at once

In [0]:
def doTraining(x_train, y_train, x_test, y_test, sess, nodes, param, trend, n_epochs=None):
    # Training setup

    batch_size = param['BATCH_SIZE']
    epochs = param['EPOCHS'] if n_epochs is None else n_epochs
    n_channels = x_train.shape[-1]

    STEPS = len(x_train) // batch_size if param['STEPS'] is None else param['STEPS']
    TEST_STEPS = len(x_test) // batch_size if param['STEPS'] is None else param['STEPS']
     
    # Training
    for epoch in range(epochs):
        # Train model
        acc_train, loss_train = 0, 0
        for i in range(STEPS):
            _, loss_, acc = sess.run([nodes['train_op'], nodes['train_loss'], nodes['train_accuracy']],
                                     feed_dict={nodes['x_train_ph']: x_train[batch_size * i: batch_size * (i + 1)],
                                                nodes['y_train_ph']: y_train[batch_size * i: batch_size * (i + 1)]})
            acc_train += acc
            loss_train += loss_

        trend['acc_train_trend'].append(acc_train / STEPS)
        trend['loss_train_trend'].append(loss_train / STEPS)

        # Test model
        acc_test, loss_test = 0, 0
        for i in range(TEST_STEPS):
            loss_t, acc_t = sess.run([nodes['test_loss'], nodes['test_accuracy']],
                                     feed_dict={nodes['x_test_ph']: x_test[batch_size * i: batch_size * (i + 1)],
                                                nodes['y_test_ph']: y_test[batch_size * i: batch_size * (i + 1)]})
            acc_test += acc_t
            loss_test += loss_t

        trend['acc_test_trend'].append(acc_test / TEST_STEPS)
        trend['loss_test_trend'].append(loss_test / TEST_STEPS)

        print('Epoch: {} || Train Loss: {}, Train Acc: {} || Test Loss: {}, Test Accuracy: {}'.format(epoch,
                                                                                                      loss_train / STEPS,
                                                                                                      acc_train / STEPS,
                                                                                                      loss_test / TEST_STEPS,
                                                                                                      acc_test / TEST_STEPS))
                  

#### Data I/O and preprocessing

In [0]:
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

In [0]:
# with tf.Session() as sess:
#     x_train = sess.run(tf.image.rgb_to_grayscale(x_train)) / 255
#     x_test = sess.run(tf.image.rgb_to_grayscale(x_test)) / 255
x_train, x_test = preprocess(x_train), preprocess(x_test)
    
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

## Adversarial Dropout training

In [0]:
param_adv = {
    'CHANNELS': 3,
    'BATCH_SIZE': 128,
    'EPOCHS': 50,
    'STEPS': None,
    'BASELINE': False
}

In [10]:
'''
Run this to delete default graph w/o having to restart notebook
'''
# tf.reset_default_graph()

'\nRun this to delete default graph w/o having to restart notebook\n'

In [0]:
nodes_adv = prepareTrainingModel(param_adv)

In [0]:
trend_adv = {
    'acc_train_trend': [],
    'loss_train_trend': [],
    'acc_test_trend': [],
    'loss_test_trend': []
}

In [0]:
sess_adv = tf.Session()
sess_adv.run(tf.global_variables_initializer())

In [14]:
doTraining(x_train, y_train, x_test, y_test, sess_adv, nodes_adv, param_adv, trend_adv, n_epochs=50)

Epoch: 0 || Train Loss: 2.5771095373691657, Train Acc: 0.10995592948717949 || Test Loss: 2.4128510921429367, Test Accuracy: 0.11247996794871795
Epoch: 1 || Train Loss: 2.2514486991442166, Train Acc: 0.1556690705128205 || Test Loss: 2.2027685122612195, Test Accuracy: 0.18990384615384615
Epoch: 2 || Train Loss: 2.0354786573312222, Train Acc: 0.24619391025641027 || Test Loss: 1.9825021685698094, Test Accuracy: 0.31079727564102566
Epoch: 3 || Train Loss: 1.801573171065404, Train Acc: 0.3384815705128205 || Test Loss: 2.698420839431958, Test Accuracy: 0.27403846153846156
Epoch: 4 || Train Loss: 1.5996402392020592, Train Acc: 0.42175480769230766 || Test Loss: 1.714689551255642, Test Accuracy: 0.42728365384615385
Epoch: 5 || Train Loss: 1.425567360719045, Train Acc: 0.4928886217948718 || Test Loss: 1.5586481950221918, Test Accuracy: 0.48127003205128205
Epoch: 6 || Train Loss: 1.2638479770758213, Train Acc: 0.5542067307692308 || Test Loss: 1.3721737433702519, Test Accuracy: 0.5361578525641025
E

Epoch: 36 || Train Loss: 0.368885912383214, Train Acc: 0.8725761217948718 || Test Loss: 2.901487695865142, Test Accuracy: 0.6588541666666666
Epoch: 37 || Train Loss: 0.3580057885020207, Train Acc: 0.8780248397435897 || Test Loss: 2.8073677328916697, Test Accuracy: 0.6696714743589743
Epoch: 38 || Train Loss: 0.3486704012904412, Train Acc: 0.87890625 || Test Loss: 2.6595843709432163, Test Accuracy: 0.6926081730769231
Epoch: 39 || Train Loss: 0.35195534794758526, Train Acc: 0.8795272435897435 || Test Loss: 2.2024638744500966, Test Accuracy: 0.7007211538461539
Epoch: 40 || Train Loss: 0.3294345988295017, Train Acc: 0.8846754807692307 || Test Loss: 3.093789206101344, Test Accuracy: 0.6664663461538461
Epoch: 41 || Train Loss: 0.32800829337957577, Train Acc: 0.8884014423076924 || Test Loss: 2.6140012053342967, Test Accuracy: 0.698417467948718
Epoch: 42 || Train Loss: 0.32935327536020526, Train Acc: 0.8871394230769231 || Test Loss: 3.0422634726915603, Test Accuracy: 0.6743790064102564
Epoch: 4

In [0]:
with open('adv_trend.json', 'w') as fp:
    json.dump(trend_adv, fp)
files.download('adv_trend.json')

In [63]:
visualize(trend_adv, param_adv)

acc_train_trendCHANNELS3BATCH_SIZE128EPOCHS50STEPSNoneBASELINEFalse.png
loss_train_trendCHANNELS3BATCH_SIZE128EPOCHS50STEPSNoneBASELINEFalse.png
acc_test_trendCHANNELS3BATCH_SIZE128EPOCHS50STEPSNoneBASELINEFalse.png
loss_test_trendCHANNELS3BATCH_SIZE128EPOCHS50STEPSNoneBASELINEFalse.png


## Baseline Training

In [0]:
param_base = {
    'CHANNELS': 3,
    'BATCH_SIZE': 128,
    'EPOCHS': 50,
    'STEPS': None,
    'BASELINE': True
}

In [0]:
nodes_base = prepareTrainingModel(param_base)

In [0]:
trend_base = {
    'acc_train_trend': [],
    'loss_train_trend': [],
    'acc_test_trend': [],
    'loss_test_trend': []
}

In [0]:
sess_base = tf.Session()
sess_base.run(tf.global_variables_initializer())

In [38]:
doTraining(x_train, y_train, x_test, y_test, sess_base, nodes_base, param_base, trend_base, n_epochs=50)

Epoch: 0 || Train Loss: 44.99179334395971, Train Acc: 0.10512820512820513 || Test Loss: 2.3833683759738236, Test Accuracy: 0.10166266025641026
Epoch: 1 || Train Loss: 2.293228856111184, Train Acc: 0.11704727564102564 || Test Loss: 2.3043171075674205, Test Accuracy: 0.1505408653846154
Epoch: 2 || Train Loss: 2.265076929483658, Train Acc: 0.13956330128205127 || Test Loss: 2.25803411923922, Test Accuracy: 0.15474759615384615
Epoch: 3 || Train Loss: 2.2147713490021537, Train Acc: 0.1685096153846154 || Test Loss: 2.2322454880445433, Test Accuracy: 0.1853966346153846
Epoch: 4 || Train Loss: 2.1570706783196867, Train Acc: 0.1953525641025641 || Test Loss: 2.248872834902543, Test Accuracy: 0.2088341346153846
Epoch: 5 || Train Loss: 2.1051469075374114, Train Acc: 0.22313701923076923 || Test Loss: 2.2102826711459036, Test Accuracy: 0.23046875
Epoch: 6 || Train Loss: 2.060809125655737, Train Acc: 0.24026442307692308 || Test Loss: 2.588299158291939, Test Accuracy: 0.19921875
Epoch: 7 || Train Loss:

Epoch: 37 || Train Loss: 0.5819737969300686, Train Acc: 0.7939302884615385 || Test Loss: 2.0561563204496336, Test Accuracy: 0.6394230769230769
Epoch: 38 || Train Loss: 0.5578603613070953, Train Acc: 0.8011818910256411 || Test Loss: 2.120261222888262, Test Accuracy: 0.6533453525641025
Epoch: 39 || Train Loss: 0.5442426409476843, Train Acc: 0.8080929487179487 || Test Loss: 1.7516240691527343, Test Accuracy: 0.6608573717948718
Epoch: 40 || Train Loss: 0.5324783491018491, Train Acc: 0.8145032051282052 || Test Loss: 1.9945228818135383, Test Accuracy: 0.6502403846153846
Epoch: 41 || Train Loss: 0.5256997774044673, Train Acc: 0.8159655448717948 || Test Loss: 2.057412557112865, Test Accuracy: 0.6411258012820513
Epoch: 42 || Train Loss: 0.49584715832502413, Train Acc: 0.8248597756410256 || Test Loss: 1.9321143886981866, Test Accuracy: 0.6734775641025641
Epoch: 43 || Train Loss: 0.48976458547971186, Train Acc: 0.8287059294871795 || Test Loss: 2.45268685848285, Test Accuracy: 0.6215945512820513
E

In [0]:
with open('base_trend.json', 'w') as fp:
    json.dump(trend_base, fp)
files.download('base_trend.json')

In [68]:
visualize(trend_base, param_base)

acc_train_trendCHANNELS3BATCH_SIZE128EPOCHS50STEPSNoneBASELINETrue.png
loss_train_trendCHANNELS3BATCH_SIZE128EPOCHS50STEPSNoneBASELINETrue.png
acc_test_trendCHANNELS3BATCH_SIZE128EPOCHS50STEPSNoneBASELINETrue.png
loss_test_trendCHANNELS3BATCH_SIZE128EPOCHS50STEPSNoneBASELINETrue.png


In [0]:
visualize_both(trend_base, trend_adv, param_base, param_adv)

## Trying more epochs for baseline

In [66]:
doTraining(x_train, y_train, x_test, y_test, sess_base, nodes_base, param_base, trend_base, n_epochs=15)

Epoch: 0 || Train Loss: 0.4114689419666926, Train Acc: 0.8565705128205128 || Test Loss: 2.62478596277726, Test Accuracy: 0.6821915064102564
Epoch: 1 || Train Loss: 0.4116022842434736, Train Acc: 0.8560296474358975 || Test Loss: 2.397842292602246, Test Accuracy: 0.6944110576923077
Epoch: 2 || Train Loss: 0.40900135078491306, Train Acc: 0.8569711538461539 || Test Loss: 2.573611721014365, Test Accuracy: 0.6738782051282052
Epoch: 3 || Train Loss: 0.3950696151990157, Train Acc: 0.8607371794871795 || Test Loss: 2.2602377350513754, Test Accuracy: 0.7008213141025641
Epoch: 4 || Train Loss: 0.3882401616909565, Train Acc: 0.864863782051282 || Test Loss: 2.585665913728567, Test Accuracy: 0.6643629807692307
Epoch: 5 || Train Loss: 0.37742966432601976, Train Acc: 0.8680889423076923 || Test Loss: 2.890182140545967, Test Accuracy: 0.6627604166666666
Epoch: 6 || Train Loss: 0.3734817562194971, Train Acc: 0.8680889423076923 || Test Loss: 3.031246014130421, Test Accuracy: 0.6494391025641025
Epoch: 7 || 