In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the
# project's imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

#### Make sure you have downloaded the COCO, Flickr8k, and GloVe data. If not, uncomment and run the cell below.

In [3]:
#!bash ../load_flickr8k.sh
#!bash ../load_glove.sh
#!bash ../load_coco.sh

In [4]:
import os
import sys
# Make the parent of the current working directory importable so that the
# project modules imported below resolve when the notebook is run from its
# own folder. The membership check keeps the cell idempotent: re-running it
# no longer appends a duplicate entry to sys.path each time.
project_root = os.path.dirname(os.getcwd())
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
import nltk
# Fetch NLTK's 'punkt' resource (a no-op download if it is already present).
# NOTE(review): presumably required by the caption tokenization in the
# dataset/metrics code -- confirm against those modules.
nltk.download('punkt')

In [6]:
import torch

# Prefer the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cpu


In [None]:
from utils_torch import check_create_dir
from metrics import *
from utils_torch import words_from_tensors_fn
import numpy as np

In [7]:
import os.path
import pickle

from datasets.flickr8k import Flickr8kDataset


DATASET_BASE_PATH = '../data/flickr8k/'
VOCAB = 'vocab_set.pkl'

# Keyword arguments that are identical for every split.
shared_dataset_kwargs = dict(
    dataset_base_path=DATASET_BASE_PATH,
    device=device,
    load_img_to_memory=False,
)

# The tensor-returning training split is built first because it supplies the
# vocabulary that every other split is constructed with.
train_set = Flickr8kDataset(dist='train', return_type='tensor', **shared_dataset_kwargs)

# Persist the vocabulary tuple next to the notebook.
vocab_set = train_set.get_vocab()
with open(VOCAB, 'wb') as vocab_file:
    pickle.dump(vocab_set, vocab_file)

vocab, word2idx, idx2word, max_len = vocab_set
vocab_size = len(vocab)

# Corpus-returning splits used for BLEU evaluation; `train_eval_set` covers the
# training images again but in the evaluation format.
corpus_dataset_kwargs = dict(shared_dataset_kwargs, vocab_set=vocab_set, return_type='corpus')
val_set = Flickr8kDataset(dist='val', **corpus_dataset_kwargs)
test_set = Flickr8kDataset(dist='test', **corpus_dataset_kwargs)
train_eval_set = Flickr8kDataset(dist='train', **corpus_dataset_kwargs)

summary_lines = [
    "The number of samples in:",
    f"train: {len(train_set)}; validation: {len(val_set)}; test: {len(test_set)}",
    f"Vocabulary size: {vocab_size}; Max length of a sentence: {max_len};",
]
print("\n".join(summary_lines))

The number of samples in:
train: 30000; validation: 1000; test: 1000
Vocabulary size: 7708; Max length of a sentence: 40;


In [8]:
from torchvision import transforms
from torch.utils.data import DataLoader


BATCH_SIZE = 50

# Per-channel mean/std handed to Normalize. They are declared once so the
# training and evaluation pipelines cannot drift apart.
NORMALIZE_MEAN = (0.485, 0.456, 0.406)
NORMALIZE_STD = (0.229, 0.224, 0.225)

# Training pipeline: random crop + random horizontal flip as augmentation.
train_transformations = transforms.Compose([
    transforms.Resize(256),  # smaller edge of image resized to 256
    transforms.RandomCrop(224),  # get 224x224 crop from random location
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),  # convert the PIL Image to a tensor
    transforms.Normalize(NORMALIZE_MEAN, NORMALIZE_STD),  # normalize image for pre-trained model
])

# Evaluation pipeline: no randomness, so repeated evaluations see the same pixels.
eval_transformations = transforms.Compose([
    transforms.Resize(256),  # smaller edge of image resized to 256
    transforms.CenterCrop(224),  # get 224x224 crop from the center of the image
    transforms.ToTensor(),  # convert the PIL Image to a tensor
    transforms.Normalize(NORMALIZE_MEAN, NORMALIZE_STD),  # normalize image for pre-trained model
])

# Only the training split is augmented; every evaluation split (including the
# evaluation view of the training data) uses the deterministic pipeline.
train_set.transformations = train_transformations
val_set.transformations = eval_transformations
test_set.transformations = eval_transformations
train_eval_set.transformations = eval_transformations

def eval_collate_fn(batch):
    """Collate evaluation samples into ``(stacked_tensor, list, list)``.

    Written as a ``def`` instead of a lambda bound to a name (as PEP 8
    recommends) so the function carries a real name in tracebacks and can be
    documented.

    Args:
        batch: sequence of samples, each indexable with at least three fields.
            Field 0 must be a tensor with the same shape in every sample;
            fields 1 and 2 are passed through untouched (presumably the
            reference captions and an image identifier -- confirm against
            Flickr8kDataset with return_type='corpus').

    Returns:
        A tuple of (field 0 of every sample stacked along a new leading
        dimension, list of field 1 values, list of field 2 values).
    """
    stacked_first = torch.stack([sample[0] for sample in batch])
    second_fields = [sample[1] for sample in batch]
    third_fields = [sample[2] for sample in batch]
    return stacked_first, second_fields, third_fields
# Training batches are shuffled and use DataLoader's default collation.
train_loader = DataLoader(
    dataset=train_set,
    batch_size=BATCH_SIZE,
    shuffle=True,
    sampler=None,
    pin_memory=False,
)

# All evaluation loaders share one configuration: fixed order and the custom
# collate function defined for the corpus-style samples.
eval_loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    shuffle=False,
    sampler=None,
    pin_memory=False,
    collate_fn=eval_collate_fn,
)
val_loader = DataLoader(val_set, **eval_loader_kwargs)
test_loader = DataLoader(test_set, **eval_loader_kwargs)
train_eval_loader = DataLoader(train_eval_set, **eval_loader_kwargs)


In [9]:
# Vocabulary indices of the special caption tokens, plus the sequence-length
# limit taken from the vocabulary tuple.
start_token, end_token, pad_token = (
    word2idx[special] for special in ('<start>', '<end>', '<pad>')
)
max_seq_len = max_len

In [10]:
import random
import torch
import numpy as np
from models.torch.vgg16_transformer import Captioner

# Seed every random number generator the notebook relies on with one value.
# A loop is used (rather than ending the cell on a call that returns a
# Generator object) so the cell still produces no output.
SEED = 0
for _seed_fn in (np.random.seed, torch.manual_seed, random.seed):
    _seed_fn(SEED)

In [11]:
from glove import embedding_matrix_creator
# Dimensionality of the GloVe vectors and the tag used in checkpoint file names.
EMBEDDING_DIM = 50
EMBEDDING = f"GLV{EMBEDDING_DIM}"

# Build the embedding matrix for the vocabulary from the GloVe files on disk.
glove_dir = '../data/glove.6B/'
embedding_matrix = embedding_matrix_creator(
    embedding_dim=EMBEDDING_DIM,
    word2idx=word2idx,
    GLOVE_DIR=glove_dir,
)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

100%|██████████| 7708/7708 [00:00<00:00, 229882.07it/s]

Embedding matrix shape: (7708, 50)





In [12]:
# Build the captioning model and move it to the selected device.
# NOTE(review): the positional arguments (10, 6, ..., False) are not
# self-explanatory; their meaning is defined by Captioner in
# models/torch/vgg16_transformer -- confirm there and consider passing them
# as keywords.
final_model = Captioner(10, 6, EMBEDDING_DIM, vocab_size, embedding_matrix, False).to(device)

In [13]:
from train_torch import evaluate_model
from train_transformer import train_model

In [None]:
MODEL = final_model.name
train_from_scratch = False  # change to `True` if you want to train your model

# Evaluation helpers are created unconditionally. The evaluation cells further
# down use `corpus_bleu_score_fn` and `tensor_to_word_fn` even when training is
# skipped; defining them only inside the training branch made a fresh-kernel
# "Run All" fail with NameError when `train_from_scratch` is False.
sentence_bleu_score_fn = bleu_score_fn(4, 'sentence')
corpus_bleu_score_fn = bleu_score_fn(4, 'corpus')
tensor_to_word_fn = words_from_tensors_fn(idx2word=idx2word)

if train_from_scratch:
    check_create_dir(f'saved_models/{MODEL}')
    # Common prefix of every checkpoint file written by this run.
    MODEL_NAME = f'saved_models/{MODEL}/{MODEL}_b{BATCH_SIZE}_emd{EMBEDDING}'
    NUM_EPOCHS = 20
    print(MODEL_NAME)

    # Padding positions are excluded from both the loss and the accuracy.
    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=train_set.pad_value)
    acc_fn = accuracy_fn(ignore_value=train_set.pad_value)
    optimizer = torch.optim.Adam(final_model.parameters(), lr=0.0005)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.8, patience=2, verbose=True)

    # Best values seen so far; they decide when the "best" checkpoints are rewritten.
    train_loss_min = float('Inf')
    val_bleu4_max = 0.0

    for epoch in range(NUM_EPOCHS):
        train_loss = train_model(
            desc=f'Epoch {epoch + 1}/{NUM_EPOCHS}',
            model=final_model,
            optimizer=optimizer,
            loss_fn=loss_fn,
            train_loader=train_loader,
            acc_fn=acc_fn,
            pad_token=pad_token
        )

        with torch.no_grad():
            train_bleu = evaluate_model(
                desc='\tTrain Bleu Score: ',
                model=final_model,
                bleu_score_fn=corpus_bleu_score_fn,
                tensor_to_word_fn=tensor_to_word_fn,
                data_loader=train_eval_loader,
                device=device
            )

            val_bleu = evaluate_model(
                desc='\tValidation Bleu Score: ',
                model=final_model,
                bleu_score_fn=corpus_bleu_score_fn,
                tensor_to_word_fn=tensor_to_word_fn,
                data_loader=val_loader,
                device=device
            )

            print(f'Epoch {epoch + 1}/{NUM_EPOCHS}',
                ''.join([f'train_bleu{i}: {train_bleu[i]:.4f} ' for i in (1, 4)]),
                ''.join([f'val_bleu{i}: {val_bleu[i]:.4f} ' for i in (1, 4)]),
                )

            # Snapshot of this epoch; the running best values are folded in
            # before the trackers themselves are updated below.
            state = {
                'epoch': epoch + 1,
                'state_dict': final_model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'train_loss_latest': train_loss,
                'val_bleu4_latest': val_bleu[4],
                'train_loss_min': min(train_loss, train_loss_min),
                'val_bleu4_max': max(val_bleu[4], val_bleu4_max),
                'train_bleus': train_bleu,
                'val_bleus': val_bleu,
            }

            # Always keep the most recent epoch on disk.
            torch.save(state, f'{MODEL_NAME}_latest.pt')

            if train_loss < train_loss_min:
                train_loss_min = train_loss
                torch.save(state, f'{MODEL_NAME}_best_train_loss.pt')

            if val_bleu[4] > val_bleu4_max:
                val_bleu4_max = val_bleu[4]
                torch.save(state, f'{MODEL_NAME}_best_val_bleu.pt')
        # The learning-rate schedule is driven by the training loss.
        scheduler.step(train_loss)

    # Final snapshot, tagged with the number of epochs trained.
    torch.save(state, f'{MODEL_NAME}_ep{NUM_EPOCHS:02d}_weights.pt')

In [None]:
# Restore the weights of the checkpoint with the best validation BLEU and
# switch the model to evaluation mode.
PATH = 'saved_models/vgg16_transformer/vgg16_transformer_b50_emdGLV50_best_val_bleu.pt'
checkpoint = torch.load(PATH, map_location=torch.device(device))
final_model.load_state_dict(checkpoint['state_dict'])
final_model.eval()

In [None]:
# Read the checkpoint from disk once and show the BLEU scores stored in it
# (validation first, then training); the original expression deserialized
# the same file twice.
best_checkpoint = torch.load(PATH, map_location=torch.device(device))
best_checkpoint['val_bleus'], best_checkpoint['train_bleus']

([0.0,
  0.5279028008819513,
  0.3307541635152168,
  0.20134296556512604,
  0.1231079127594675],
 [0.0,
  0.5313176308552883,
  0.3363814838274679,
  0.21012488737622093,
  0.1321288381144432])

In [None]:
# Keyword arguments shared by every evaluate_model call in the next cell.
inter_params = dict(
    model=final_model,
    bleu_score_fn=corpus_bleu_score_fn,
    tensor_to_word_fn=tensor_to_word_fn,
    device=device,
)

In [None]:
# Score the restored model on the training, validation and test splits
# without tracking gradients.
with torch.no_grad():
    train_bleu, val_bleu, test_bleu = [
        evaluate_model(desc=progress_label, data_loader=loader, **inter_params)
        for progress_label, loader in (
            ('Train: ', train_eval_loader),
            ('Val: ', val_loader),
            ('Test: ', test_loader),
        )
    ]

# Report BLEU-1..4 per split, one n-gram order per line.
bleu_by_split = {'train': train_bleu, 'val': val_bleu, 'test': test_bleu}
for setname, result in bleu_by_split.items():
    report = ''.join(f'Bleu-{ngram}: {result[ngram]}\n ' for ngram in (1, 2, 3, 4))
    print(setname + ' ' + report)

Train:   0%|          | 0/120 [00:00<?, ?it/s]

Val:   0%|          | 0/20 [00:00<?, ?it/s]

Test:   0%|          | 0/20 [00:00<?, ?it/s]

train Bleu-1: 0.5313176308552883
 Bleu-2: 0.3363814838274679
 Bleu-3: 0.21012488737622093
 Bleu-4: 0.1321288381144432
 
val Bleu-1: 0.5279028008819513
 Bleu-2: 0.3307541635152168
 Bleu-3: 0.20134296556512604
 Bleu-4: 0.1231079127594675
 
test Bleu-1: 0.5276105108829945
 Bleu-2: 0.3304151787552659
 Bleu-3: 0.20273477808085913
 Bleu-4: 0.12388007698440762
 
