In [46]:
# import pytorch libraries
%matplotlib inline
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

from torch.utils.data import Dataset, DataLoader

# Text Classification
In this part of the tutorial we develop a continuous bag of words (CBOW) model for a text classification task described [here](https://people.cs.umass.edu/~miyyer/pubs/2015_acl_dan.pdf). The CBOW model was first described [here](https://arxiv.org/pdf/1301.3781.pdf).

## Subjectivity Dataset
The subjectivity dataset has 5000 subjective and 5000 objective processed sentences. To get the data:
```
wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
```

In [2]:
def unpack_dataset():
    """Download the subjectivity dataset archive and extract it into ./data.

    The body consists of three shell commands issued through IPython's `!`
    syntax, so it only works inside a notebook / IPython session.
    """
    # fetch rotten_imdb.tar.gz into the current working directory
    ! wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
    # create the directory the archive is extracted into
    ! mkdir data
    # extract the archive contents into data/
    ! tar -xvf rotten_imdb.tar.gz -C data

In [3]:
#unpack_dataset()

In [4]:
!ls data

glove.6B.100d.txt   glove.6B.50d.txt    plot.tok.gt9.5000   tripAdvisor.zip
glove.6B.200d.txt   [34mml-latest-small[m[m     quote.tok.gt9.5000
glove.6B.300d.txt   ml-latest-small.zip subjdata.README.1.0


In [5]:
! head -2 data/plot.tok.gt9.5000

the movie begins in the past where a young boy named sam attempts to save celebi from a hunter . 
emerging from the human psyche and showing characteristics of abstract expressionism , minimalism and russian constructivism , graffiti removal has secured its place in the history of modern art while being created by artists who are unconscious of their artistic achievements . 


In [6]:
from pathlib import Path
PATH = Path("data")
list(PATH.iterdir())

[PosixPath('data/glove.6B.300d.txt'),
 PosixPath('data/glove.6B.100d.txt'),
 PosixPath('data/ml-latest-small.zip'),
 PosixPath('data/ml-latest-small'),
 PosixPath('data/glove.6B.50d.txt'),
 PosixPath('data/plot.tok.gt9.5000'),
 PosixPath('data/subjdata.README.1.0'),
 PosixPath('data/tripAdvisor.zip'),
 PosixPath('data/quote.tok.gt9.5000'),
 PosixPath('data/glove.6B.200d.txt')]

## Tokenization
Tokenization is the task of chopping up text into pieces, called tokens.

spaCy is an open-source software library for advanced Natural Language Processing. Here we will use it for tokenization.  

### Simple Tokenization

In [7]:
# We need each line in the file 
def read_file(path):
    """Return every line of the text file at `path` as a list of strings.

    The file is decoded as ISO-8859-1 and each returned line keeps its
    trailing newline character.
    """
    with open(path, encoding = "ISO-8859-1") as handle:
        return list(handle)

In [8]:
obj_lines = read_file(PATH/"plot.tok.gt9.5000")

In [9]:
obj_lines[0]

'the movie begins in the past where a young boy named sam attempts to save celebi from a hunter . \n'

In [10]:
np.array(obj_lines[0].strip().lower().split(" "))

array(['the', 'movie', 'begins', 'in', 'the', 'past', 'where', 'a',
       'young', 'boy', 'named', 'sam', 'attempts', 'to', 'save', 'celebi',
       'from', 'a', 'hunter', '.'], dtype='<U8')

### Much better tokenization with Spacy

In [14]:
#!pip install -U spacy

In [15]:
import spacy

In [16]:
# first time run this
#!python3 -m spacy download en

In [17]:
# The 'en' shortcut link was removed in spaCy 3; loading by package name works in
# both spaCy 2 and 3 (install once with: python3 -m spacy download en_core_web_sm)
tok = spacy.load('en_core_web_sm')

In [18]:
obj_lines = read_file(PATH/"plot.tok.gt9.5000")

In [19]:
len(obj_lines)

5000

In [20]:
obj_lines[0]

'the movie begins in the past where a young boy named sam attempts to save celebi from a hunter . \n'

In [21]:
test = tok(obj_lines[0])

In [22]:
np.array([x for x in test])

array([the, movie, begins, in, the, past, where, a, young, boy, named,
       sam, attempts, to, save, celebi, from, a, hunter, ., 
], dtype=object)

## Split dataset in train and validation

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Subjective sentences come from the quote file (label 0) and objective ones
# from the plot file (label 1); every sentence is stripped and lower-cased.
sub_content = np.array([line.strip().lower() for line in read_file(PATH/"quote.tok.gt9.5000")])
obj_content = np.array([line.strip().lower() for line in read_file(PATH/"plot.tok.gt9.5000")])
sub_y = np.zeros(len(sub_content))
obj_y = np.ones(len(obj_content))
# subjective examples first, objective examples second, in both X and y
X = np.concatenate([sub_content, obj_content])
y = np.concatenate([sub_y, obj_y])

In [13]:
X[0], y[0]

('smart and alert , thirteen conversations about one thing is a small gem .',
 0.0)

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
X_train[:5], y_train[:5]

(array(['will god let her fall or give her a new path ?',
        "the director's twitchy sketchbook style and adroit perspective shifts grow wearisome amid leaden pacing and indifferent craftsmanship ( most notably wretched sound design ) .",
        "welles groupie/scholar peter bogdanovich took a long time to do it , but he's finally provided his own broadside at publishing giant william randolph hearst .",
        'based on the 1997 john king novel of the same name with a rather odd synopsis : " a first novel about a seasoned chelsea football club hooligan who represents a disaffected society operating by brutal rules .',
        'yet , beneath an upbeat appearance , she is struggling desperately with the emotional and physical scars left by the attack .'],
       dtype='<U691'), array([1., 0., 0., 1., 1.]))

## Word to index mapping
In the interest of time we will tokenize without spaCy. Here we will compute a vocabulary of words based on the training set and a mapping from each word to an index.

In [16]:
from collections import defaultdict

In [17]:
def get_vocab(content):
    """Count, for every word, how many lines of `content` contain it.

    Each line is split on whitespace and a word is counted at most once per
    line, so the values are document frequencies rather than raw term counts.

    Returns a defaultdict(float) mapping word -> number of lines containing it.
    """
    doc_freq = defaultdict(float)
    for sentence in content:
        # the set removes repeats so a word scores once per sentence
        for token in set(sentence.split()):
            doc_freq[token] = doc_freq[token] + 1
    return doc_freq

In [18]:
#Getting the vocabulary from the training set
word_count = get_vocab(X_train)

In [19]:
#word_count

In [20]:
len(word_count.keys())

21415

In [21]:
# let's delete words that are very infrequent
for word in list(word_count):
    if word_count[word] < 5:
        del word_count[word]

In [22]:
len(word_count.keys())

4065

In [23]:
## Finally we need an index for each word in the vocab
# index 0 is reserved for padding and index 1 for unknown words
words = ["<PAD>", "UNK"]
vocab2index = {"<PAD>":0, "UNK":1}
for word in word_count:
    words.append(word)
    # the word was just appended, so its index is the last position in `words`
    vocab2index[word] = len(words) - 1

In [24]:
#vocab2index

## Sentence encoding
Here we encode each sentence as a sequence of indices corresponding to each word.

In [25]:
x_train_len = np.array([len(x.split()) for x in X_train])
x_val_len = np.array([len(x.split()) for x in X_val])

In [26]:
np.percentile(x_train_len, 95) # let set the max sequence len to N=40

43.0

In [27]:
X_train[0]

'will god let her fall or give her a new path ?'

In [28]:
# returns the index of the word or the index of "UNK" otherwise
vocab2index.get("?", vocab2index["UNK"])

7

In [29]:
np.array([vocab2index.get(w, vocab2index["UNK"]) for w in X_train[0].split()])

array([ 4,  2,  6, 11,  8,  9,  3, 11, 10,  5, 12,  7])

In [30]:
def encode_sentence(s, N=40, vocab=None):
    """Encode a sentence as a fixed-length array of word indices.

    Args:
        s: sentence string; it is split on whitespace.
        N: length of the returned array. Longer sentences are truncated,
            shorter ones are right-padded with 0.
        vocab: mapping word -> index that contains an "UNK" key. When omitted,
            the notebook-level `vocab2index` is looked up at call time, which
            is the original behaviour.

    Returns:
        (enc, l): `enc` is an int32 array of length N and `l` is the number of
        positions actually filled, i.e. min(N, number of words).
    """
    if vocab is None:
        vocab = vocab2index
    unk = vocab["UNK"]  # words missing from the vocabulary map to this index
    enc = np.zeros(N, dtype=np.int32)
    ids = [vocab.get(w, unk) for w in s.split()]
    l = min(N, len(ids))
    enc[:l] = ids[:l]
    return enc, l

In [31]:
encode_sentence(X_train[0])

(array([ 4,  2,  6, 11,  8,  9,  3, 11, 10,  5, 12,  7,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0], dtype=int32), 12)

In [32]:
x_train_len = np.minimum(x_train_len, 40)
x_val_len = np.minimum(x_val_len, 40)

In [33]:
# encode_sentence returns (indices, length); stack only the index arrays so the
# result is a dense (num_sentences, 40) integer matrix rather than an object array
x_train = np.vstack([encode_sentence(x)[0] for x in X_train])
x_train.shape

(8000, 40)

In [34]:
# encode_sentence returns (indices, length); stack only the index arrays so the
# result is a dense (num_sentences, 40) integer matrix rather than an object array
x_val = np.vstack([encode_sentence(x)[0] for x in X_val])
x_val.shape

(2000, 40)

## Embedding layer
Most deep learning models use dense vectors of real numbers as representations of words (word embeddings), as opposed to one-hot encoded representations. The module torch.nn.Embedding is used to represent word embeddings. It takes two arguments: the vocabulary size and the dimensionality of the embeddings. The embeddings are initialized with random vectors. 

In [35]:
# an Embedding module containing 10 words with embedding size 4
# embedding will be initialized at random
embed = nn.Embedding(10, 4, padding_idx=0)
embed.weight

Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.3243,  0.0803, -0.8527,  0.4445],
        [ 0.3077,  0.5857, -1.3859,  0.6905],
        [-1.2525, -0.4846, -0.6517,  1.3722],
        [ 0.6615,  0.7037, -0.4998,  0.6022],
        [ 0.4344,  1.3441, -0.6744,  2.7256],
        [ 0.7420,  1.6063,  0.0864,  0.3109],
        [-0.3397,  0.1183, -0.9083,  2.0052],
        [-0.4423, -1.0187,  0.5727, -0.1020],
        [ 1.9286, -1.2615, -0.3954,  0.2758]], requires_grad=True)

Note that the `padding_idx` has embedding vector 0.

In [36]:
# given a list of ids we can "look up" the embedding corresponding to each id
# can you see that some vectors are the same?
a = torch.LongTensor([[1,4,1,5,1,0]])
embed(a)

tensor([[[ 1.3243,  0.0803, -0.8527,  0.4445],
         [ 0.6615,  0.7037, -0.4998,  0.6022],
         [ 1.3243,  0.0803, -0.8527,  0.4445],
         [ 0.4344,  1.3441, -0.6744,  2.7256],
         [ 1.3243,  0.0803, -0.8527,  0.4445],
         [ 0.0000,  0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward>)

This would be the representation of a sentence with words with indices [1,4,1,5,1] and a padding at the end. Below we have an example with two sentences: the first sentence has length 3 and the second sentence has length 2. In order to store both in a single tensor we pad the end of the second sentence. 

In [37]:
a = torch.LongTensor([[1,4,1], [1,3,0]])

Our model takes an average of the word embedding of each word. Here is how we do it.

In [38]:
s = torch.FloatTensor([3, 2]) # number of real (non-padding) words in each of the two sentences in `a`

In [39]:
embed(a)

tensor([[[ 1.3243,  0.0803, -0.8527,  0.4445],
         [ 0.6615,  0.7037, -0.4998,  0.6022],
         [ 1.3243,  0.0803, -0.8527,  0.4445]],

        [[ 1.3243,  0.0803, -0.8527,  0.4445],
         [-1.2525, -0.4846, -0.6517,  1.3722],
         [ 0.0000,  0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward>)

In [40]:
embed(a).sum(dim=1)

tensor([[ 3.3101,  0.8643, -2.2052,  1.4912],
        [ 0.0718, -0.4043, -1.5044,  1.8167]], grad_fn=<SumBackward1>)

In [41]:
sum_embs = embed(a).sum(dim=1) 
sum_embs/ s.view(s.shape[0], 1)

tensor([[ 1.1034,  0.2881, -0.7351,  0.4971],
        [ 0.0359, -0.2022, -0.7522,  0.9083]], grad_fn=<DivBackward0>)

## Continuous Bag of Words Model

In [42]:
class CBOW(nn.Module):
    """Continuous bag-of-words classifier: mean word embedding -> one logit.

    Args:
        vocab_size: number of rows in the embedding table; index 0 is the
            padding index and always embeds to the zero vector.
        emb_size: dimensionality of the word embeddings.
    """
    def __init__(self, vocab_size, emb_size=100):
        super(CBOW, self).__init__()
        self.word_emb = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        self.linear = nn.Linear(emb_size, 1)
        
    def forward(self, x, s):
        """Return one logit per sentence.

        Args:
            x: LongTensor of word indices, shape (batch, seq_len), padded with 0.
            s: number of real words per sentence, shape (batch, 1) or (batch,).

        Returns:
            Tensor of shape (batch, 1) holding the raw (pre-sigmoid) logits.
        """
        # padding rows are zero vectors, so the sum only covers real words
        emb_sum = self.word_emb(x).sum(dim=1)
        lengths = torch.as_tensor(s, dtype=emb_sum.dtype, device=emb_sum.device)
        if lengths.dim() == 1:
            # make (batch,) lengths divide row-wise instead of along the embedding axis
            lengths = lengths.unsqueeze(1)
        # an empty sentence has length 0; clamp so the mean is 0 instead of NaN
        lengths = lengths.clamp(min=1)
        return self.linear(emb_sum / lengths)

# Training the CBOW model 

In [43]:
V = len(words)
model = CBOW(vocab_size=V, emb_size=50)
print(V)

4067


In [44]:
class SubjectivityDataset(Dataset):
    """Dataset of raw sentences and labels; sentences are encoded on access."""
    def __init__(self, X, y):
        # X holds the sentence strings and y the matching labels
        self.x, self.y = X, y
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        # encode lazily: returns (padded word indices, label, sentence length)
        encoded, length = encode_sentence(self.x[idx])
        return encoded, self.y[idx], length
    
train_ds = SubjectivityDataset(X_train, y_train)
valid_ds = SubjectivityDataset(X_val, y_val)

NameError: name 'Dataset' is not defined

In [131]:
train_dl = DataLoader(train_ds, batch_size=5, shuffle=True)
x, y, s = next(iter(train_dl))

In [132]:
x, y, s

(tensor([[   1,   19, 2081,   80,   11,    1,    1,   34,  662,  124,   19,   36,
            62,  132, 1111, 1387,  119,  976, 2731,   56,   19,  191,   38,   19,
           871, 2427,  289,   80, 2438,   22,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0],
         [   1,   25, 4060,   16,   71, 3303,   34,  790,   34, 1280,   34,    1,
            34,   21,    1, 1205,  954,   22,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0],
         [  19,  687,  261,   11,  152,   62,   69, 1631,  252,   66, 1178,   49,
            40,  299,   21, 1049, 1157,   80,   19,    2, 1344,   22,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0],
         [1021,    1,   34,    1,   74,    1,   80,  482,    1,    1,  919,   34,
             1,  305,   11,  617,   96,  327,  496,   86,    1,   92,   22,

In [133]:
model = CBOW(vocab_size=V, emb_size=50)

In [135]:
train_dl = DataLoader(train_ds, batch_size=500, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=500)

In [147]:
def valid_metrics(model, dl=None):
    """Compute the mean loss and accuracy of `model` over a validation loader.

    Args:
        model: module called as `model(x, s)` that returns one logit per row.
        dl: iterable of `(x, y, s)` batches. When omitted, the notebook-level
            `valid_dl` is used, which is the original behaviour.

    Returns:
        (val_loss, val_acc): binary cross-entropy averaged over all samples
        and the fraction of samples whose thresholded prediction matches y.
    """
    if dl is None:
        dl = valid_dl
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    # evaluation needs no gradients; skipping the autograd graph saves memory
    with torch.no_grad():
        for x, y, s in dl:
            x = x.long()  #.cuda()
            y = y.float().unsqueeze(1)
            s = s.float().view(s.shape[0], 1)
            batch = y.shape[0]
            out = model(x, s)
            loss = F.binary_cross_entropy_with_logits(out, y)
            # weight by batch size so a short final batch is averaged correctly
            sum_loss += batch*(loss.item())
            total += batch
            # a logit above 0 corresponds to a probability above 0.5
            pred = (out > 0).float()
            correct += (pred == y).float().sum().item()
    val_loss = sum_loss/total
    val_acc = correct/total
    return val_loss, val_acc

In [168]:
def train_epocs(model, optimizer, epochs=10):
    """Train `model` on the notebook-level `train_dl` for `epochs` epochs.

    After every epoch one line is printed with the sample-weighted mean
    training loss plus the validation loss and accuracy from `valid_metrics`.
    """
    for _ in range(epochs):
        model.train()
        running_loss, seen = 0, 0
        for tokens, labels, lengths in train_dl:
            tokens = tokens.long()
            labels = labels.float().unsqueeze(1)
            lengths = lengths.float().view(lengths.shape[0], 1)
            n = tokens.size(0)
            logits = model(tokens, lengths)
            batch_loss = F.binary_cross_entropy_with_logits(logits, labels)
            # standard update: clear old gradients, backpropagate, step
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            # weight by batch size so the epoch mean is per-sample
            running_loss += n*batch_loss.item()
            seen += n
        val_loss, val_accuracy = valid_metrics(model)
        
        print("train_loss %.3f val_loss %.3f val_accuracy %.3f" % (
            running_loss/seen, val_loss, val_accuracy))

In [169]:
model = CBOW(vocab_size=V, emb_size=50)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
train_epocs(model, optimizer, epochs=10)

train_loss 0.637 val_loss 0.564 val_accuracy 0.793
train_loss 0.459 val_loss 0.392 val_accuracy 0.866
train_loss 0.281 val_loss 0.289 val_accuracy 0.899
train_loss 0.181 val_loss 0.247 val_accuracy 0.910
train_loss 0.124 val_loss 0.229 val_accuracy 0.914
train_loss 0.088 val_loss 0.223 val_accuracy 0.909
train_loss 0.064 val_loss 0.224 val_accuracy 0.912
train_loss 0.047 val_loss 0.227 val_accuracy 0.909
train_loss 0.036 val_loss 0.233 val_accuracy 0.908
train_loss 0.027 val_loss 0.237 val_accuracy 0.909


## Using pre-trained embeddings

To get glove pre-trained embeddings:
    `wget http://nlp.stanford.edu/data/glove.6B.zip` or use this function:

In [170]:
def unpack_glove():
    ! wget http://nlp.stanford.edu/data/glove.6B.zip
    ! mkdir data
    ! unzip glove.6B.zip -C data
    
# unpack_glove()

In [171]:
! head -2 data/glove.6B.50d.txt

the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581
, 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709 -0.42852 -0.55641 -0.364 -0.23938 0.13001 -0.063734 -0.39575 -0.48162 0.23291 0.090201 -0.13324 0.078639 -0.41634 -0.15428 0.10068 0.48891 0.31226 -0.1252 -0.037512 -1.5179 0.12612 -0.02442 -0.042961 -0.28351 3.5416 -0.11956 -0.014533 -0.1499 0.21864 -0.33412 -0.13872 0.31806 0.70358 0.44858 -0.080262 0.63003 0.32111 -0.46765 0.22786 0.36034 -0.37818 -0.56657 0.044691 0.30392


We would like to initialize the embeddings of our model with the pre-trained GloVe embeddings. After initializing we should "freeze" the embeddings, at least initially. The rationale is that we first want the network to learn weights for the other parameters, which were randomly initialized. After that phase we can fine-tune the embeddings to our task. 

`embed.weight.requires_grad = False` freezes the embedding parameters.

The following code initializes the embedding. Here `V` is the vocabulary size and `emb_size` is the embedding size. `pretrained_weight` is a numpy matrix of shape `(V, emb_size)`.

In [172]:
def loadGloveModel(gloveFile=PATH/"glove.6B.50d.txt"):
    """ Loads word vectors into a dictionary.

    Args:
        gloveFile: path to a GloVe text file in which every line holds a word
            followed by its whitespace-separated vector components.

    Returns:
        dict mapping each word to a 1-D numpy float array.
    """
    word_vecs = {}
    # GloVe files are UTF-8; the context manager closes the handle even on error
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # skip blank lines (e.g. a trailing newline) instead of failing
                continue
            word = splitLine[0]
            word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs

In [173]:
word_vecs = loadGloveModel()

In [174]:
# let's compute the vocab again (the earlier pruning removed words from word_count)
word_count = get_vocab(X_train)

In [175]:
print(len(word_vecs.keys()), len(word_count.keys()))

400000 21415


In [176]:
def delete_rare_words(word_vecs, word_count, min_df=4):
    """Drop rare words that have no pre-trained vector.

    A word is removed from `word_count` when its count is below `min_df`
    AND it is missing from `word_vecs`. `word_count` is modified in place
    and the same object is returned.
    """
    # collect first: a dict cannot be resized while it is being iterated
    doomed = [w for w, df in word_count.items()
              if df < min_df and w not in word_vecs]
    for w in doomed:
        del word_count[w]
    return word_count

In [177]:
word_count = delete_rare_words(word_vecs, word_count)
print(len(word_count.keys()))

18425


In [180]:
def create_embedding_matrix(word_vecs, word_count, min_df=4, emb_size=50):
    """Build the embedding matrix, vocabulary array and word -> index map.

    `word_count` is first pruned in place with `delete_rare_words`. Row 0 of
    the matrix is all zeros (padding) and row 1 is a random vector for "UNK".
    Every remaining word gets its vector from `word_vecs` when present and a
    random uniform(-0.25, 0.25) vector otherwise.

    Returns:
        (W, vocab, vocab2index): float32 matrix of shape
        (len(word_count) + 2, emb_size), numpy array of words whose position
        is the row index, and dict word -> row index (with "UNK" -> 1).
    """
    word_count = delete_rare_words(word_vecs, word_count, min_df)
    V = len(word_count.keys()) + 2
    # np.zeros already leaves row 0 (padding) as the zero vector
    W = np.zeros((V, emb_size), dtype="float32")
    # row 1 is shared by all unknown / rare words
    W[1] = np.random.uniform(-0.25, 0.25, emb_size)
    vocab = ["", "UNK"]
    vocab2index = {"UNK": 1}
    for row, word in enumerate(word_count, start=2):
        if word in word_vecs:
            W[row] = word_vecs[word]
        else:
            W[row] = np.random.uniform(-0.25,0.25, emb_size)
        vocab2index[word] = row
        vocab.append(word)
    return W, np.array(vocab), vocab2index

In [181]:
pretrained_weight, vocab, vocab2index = create_embedding_matrix(word_vecs, word_count)

In [182]:
len(pretrained_weight)

18427

In [183]:
# creating an embedding matrix with Glove embeddings
emb_size = 50
V = len(pretrained_weight)
emb = nn.Embedding(V, emb_size)
emb.weight.data.copy_(torch.from_numpy(pretrained_weight))

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-2.0158e-01,  7.5476e-02,  6.5178e-02,  ..., -1.1120e-01,
          9.3024e-02,  1.7373e-02],
        [ 1.9511e-01,  5.0739e-01,  1.4709e-03,  ...,  3.0488e-02,
         -1.4272e-01,  4.5411e-01],
        ...,
        [ 1.5865e+00,  6.3620e-02, -9.0932e-01,  ...,  3.3619e-01,
         -2.2531e-01,  4.4413e-01],
        [-3.3969e-01,  1.7529e-01, -9.7424e-01,  ...,  3.9837e-01,
          2.6775e-01, -1.3950e+00],
        [-4.7284e-01,  5.0374e-01,  4.0125e-01,  ..., -1.1791e+00,
         -7.2486e-01,  4.3227e-01]])

## Model with pre-trained embeddings

In [184]:
class CBOW(nn.Module):
    """Continuous bag-of-words classifier with optional pre-trained embeddings.

    Args:
        vocab_size: number of rows in the embedding table; index 0 is the
            padding index.
        emb_size: dimensionality of the word embeddings.
        glove_weights: optional numpy array copied into the embedding table.
            When given, the embeddings are frozen (requires_grad = False).
    """
    def __init__(self, vocab_size, emb_size=50, glove_weights=None):
        super(CBOW, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        if glove_weights is not None:
            self.embedding.weight.data.copy_(torch.from_numpy(glove_weights))
            self.embedding.weight.requires_grad = False ## freeze embeddings
        
        self.linear = nn.Linear(emb_size, 1)
        
    def forward(self, x, s):
        """Return one logit per sentence.

        Args:
            x: LongTensor of word indices, shape (batch, seq_len), padded with 0.
            s: number of real words per sentence, shape (batch, 1) or (batch,).

        Returns:
            Tensor of shape (batch, 1) holding the raw (pre-sigmoid) logits.
        """
        emb_sum = self.embedding(x).sum(dim=1)
        lengths = torch.as_tensor(s, dtype=emb_sum.dtype, device=emb_sum.device)
        if lengths.dim() == 1:
            # make (batch,) lengths divide row-wise instead of along the embedding axis
            lengths = lengths.unsqueeze(1)
        # an empty sentence has length 0; clamp so the mean is 0 instead of NaN
        lengths = lengths.clamp(min=1)
        return self.linear(emb_sum / lengths)

In [185]:
def set_learning_rate(optimizer, lr):
    """Set `lr` on every parameter group of an existing optimizer, in place."""
    for group in optimizer.param_groups:
        group.update(lr=lr)

In [193]:
model = CBOW(V, emb_size=50, glove_weights=pretrained_weight)
parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01)

In [194]:
train_epocs(model, optimizer, epochs=5)

train_loss 0.648 val_loss 0.607 val_accuracy 0.848
train_loss 0.569 val_loss 0.546 val_accuracy 0.868
train_loss 0.513 val_loss 0.501 val_accuracy 0.870
train_loss 0.471 val_loss 0.466 val_accuracy 0.871
train_loss 0.439 val_loss 0.440 val_accuracy 0.871


In [195]:
# unfreezing the embeddings
model.embedding.weight.requires_grad = True

In [196]:
parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01)

In [197]:
train_epocs(model, optimizer, epochs=5)

train_loss 0.352 val_loss 0.287 val_accuracy 0.908
train_loss 0.209 val_loss 0.225 val_accuracy 0.921
train_loss 0.138 val_loss 0.201 val_accuracy 0.931
train_loss 0.095 val_loss 0.192 val_accuracy 0.929
train_loss 0.067 val_loss 0.191 val_accuracy 0.926


In [198]:
set_learning_rate(optimizer, 0.001)
train_epocs(model, optimizer, epochs=10)

train_loss 0.052 val_loss 0.192 val_accuracy 0.926
train_loss 0.050 val_loss 0.191 val_accuracy 0.926
train_loss 0.049 val_loss 0.192 val_accuracy 0.925
train_loss 0.047 val_loss 0.192 val_accuracy 0.925
train_loss 0.046 val_loss 0.192 val_accuracy 0.925
train_loss 0.044 val_loss 0.192 val_accuracy 0.926
train_loss 0.043 val_loss 0.193 val_accuracy 0.925
train_loss 0.041 val_loss 0.193 val_accuracy 0.925
train_loss 0.040 val_loss 0.193 val_accuracy 0.924
train_loss 0.038 val_loss 0.194 val_accuracy 0.924
