# devise against custom sentence embedding
Now that we have a method of creating reasonably good sentence embeddings, we need to learn the mapping from sentence embedding space to image feature vector space. This process is essentially the same as what we've demonstrated before.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (20, 14)

import os
import json
import nltk
import pickle
import itertools
import numpy as np 
import pandas as pd
from PIL import Image
from scipy.spatial.distance import cdist
from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm as tqdm_
tqdm_.pandas()

import io
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import models, transforms

import spacy
# spaCy pipeline; used further down for part-of-speech tagging of captions
nlp = spacy.load('en')

# tokeniser data needed by nltk's word_tokenize / sent_tokenize
nltk.download('punkt')
# prefer the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# load data
We'll load in some of the bits of data we saved at the end of the last notebook

In [None]:
# Word-vector matrix plus the word <-> index lookups saved to disk earlier.
index_to_wordvec = np.load('/mnt/efs/models/index_to_wordvec.npy')

# NOTE: pickle can execute arbitrary code on load; only load files we wrote
# ourselves. Context managers make sure the file handles are closed.
with open('/mnt/efs/models/word_to_index.pkl', 'rb') as f:
    word_to_index = pickle.load(f)

with open('/mnt/efs/models/index_to_word.pkl', 'rb') as f:
    index_to_word = pickle.load(f)

# utils
We'll also define a few utility functions which will come in handy later on. They all contribute to being able to embed a query sentence with a single function call.

In [None]:
def sentence_to_indexes(sentence):
    """Map a sentence to the vocabulary indexes of its known tokens.

    The sentence is lower-cased and split with nltk's `word_tokenize`;
    tokens which are missing from `word_to_index` are silently dropped.
    """
    known_indexes = []
    for token in word_tokenize(sentence.lower()):
        if token in word_to_index:
            known_indexes.append(word_to_index[token])
    return known_indexes


def embed(sentence):
    """Embed a single sentence with the module-level sentence encoder.

    The sentence is wrapped in '<s>' / '</s>' markers, converted to word
    vectors and passed through `model` as a batch of one.

    Returns the encoder output as a numpy array with singleton dimensions
    squeezed out.
    """
    indexes = ([word_to_index['<s>']] +
               sentence_to_indexes(sentence) +
               [word_to_index['</s>']])
    wvs = np.stack([index_to_wordvec[i] for i in indexes])

    # Build the (1, n_tokens, wordvec_size) float batch on whichever device
    # was selected at the top of the notebook, rather than assuming CUDA.
    batch = torch.from_numpy(wvs).float().unsqueeze(0).to(device)

    # Inference only: no need to track gradients.
    with torch.no_grad():
        embedding = model(batch).cpu().numpy()
    return embedding.squeeze()

def embed_paragraph(paragraph):
    """Embed a paragraph as the element-wise max of its sentence embeddings.

    The paragraph is split into sentences with nltk's `sent_tokenize` and
    each sentence is embedded with `embed`. If no sentences are found, the
    embedding of '.' is used as a stand-in.
    """
    sentences = nltk.sent_tokenize(paragraph)
    if not sentences:
        # Keep the fallback inside a list so that the max below is still
        # taken across sentences and not across the embedding's own values.
        sentences = ['.']
    embeddings = [embed(sentence) for sentence in sentences]
    return np.array(embeddings).max(axis=0)

# sentence embedding models
We define the sentence embedding model in exactly the same way as we did before so that its learned weights from the last notebook can be overlaid without any issues.

In [None]:
hidden_size = 2048


class SentenceEncoder(nn.Module):
    """Bidirectional single-layer LSTM followed by a max-pool over dim 1.

    Input word vectors have 300 features; each direction of the LSTM has
    `hidden_size` units, so the pooled output has 2 * hidden_size features.
    """

    def __init__(self):
        super().__init__()
        # NOTE(review): the LSTM is built without batch_first=True while the
        # pooling in forward() reduces over dim 1 - confirm this matches the
        # layout used when the saved weights were trained before changing it.
        self.enc_lstm = nn.LSTM(
            input_size=300,
            hidden_size=hidden_size,
            num_layers=1,
            bidirectional=True,
        )

    def forward(self, wv_batch):
        lstm_output, _ = self.enc_lstm(wv_batch)
        pooled, _ = lstm_output.max(dim=1)
        return pooled


class NLINet(nn.Module):
    """Sentence-pair classifier built on top of a shared SentenceEncoder.

    Both sentences are encoded with the same encoder, and the features
    [u, v, |u - v|, u * v] are fed to a small classifier with 3 outputs.
    """

    def __init__(self, index_to_wordvec):
        super(NLINet, self).__init__()
        self.index_to_wordvec = index_to_wordvec
        self.encoder = SentenceEncoder()
        # Each encoded sentence has 2 * hidden_size features (bidirectional
        # LSTM), and four such vectors are concatenated in forward().
        self.classifier = nn.Sequential(nn.Dropout(0.2),
                                        nn.Linear(hidden_size*8, 128),
                                        nn.ReLU(),
                                        nn.Dropout(0.2),
                                        nn.Linear(128, 3),
                                       )

    def forward(self, s1, s2):
        u, v = self.encoder(s1), self.encoder(s2)
        features = torch.cat((u, v, torch.abs(u - v), u * v), 1)
        return self.classifier(features)

    def encode(self, sentences):
        """Encode a single sentence string with this network's encoder."""
        indexes = sentence_to_indexes(sentences)
        wvs = torch.Tensor(np.stack([self.index_to_wordvec[i] for i in indexes]))
        # The LSTM needs a tensor, not a python list of tensors: add the
        # leading singleton dimension explicitly (the same layout `embed`
        # feeds to the encoder) and move it alongside the model's weights.
        wvs = wvs.unsqueeze(0).to(next(self.parameters()).device)
        return self.encoder(wvs)

In [None]:
# Sentence encoder with the weights saved to model_path.
model = SentenceEncoder().to(device)

model_path = '/mnt/efs/models/sentence-encoder-2018-10-08.pt'
# map_location lets the checkpoint load on whichever device was selected
# above, including CPU-only machines.
model.load_state_dict(torch.load(model_path, map_location=device))

# get image data
We can now start loading in the image data we'll use to learn mappings from image to sentence space. This follows the same pattern as before - building a pandas dataframe of paths and captions, whose embeddings and images can be calculated or looked up at train time for each batch, as needed.

In [None]:
wellcome_image_path = '/mnt/efs/images/wellcome_images/'

# Images sit one directory deep: <root>/<subdir>/<image file>
wellcome_image_paths = []
for subdir in os.listdir(wellcome_image_path):
    subdir_path = wellcome_image_path + subdir
    for wellcome_image_id in os.listdir(subdir_path):
        wellcome_image_paths.append(subdir_path + '/' + wellcome_image_id)

# The image id is the file name up to its first '.'
wellcome_image_ids = [path.rsplit('/', 1)[-1].split('.', 1)[0]
                      for path in wellcome_image_paths]

# id -> path lookup, later joined against the captions on its index
wellcome_path_series = pd.Series({
    image_id: path
    for image_id, path in zip(wellcome_image_ids, wellcome_image_paths)
})

In [None]:
# One JSON record per line
meta = pd.read_json('/mnt/efs/other/works.json', lines=True)

# Index each record by the value of its first identifier; rename(None)
# drops the 'identifiers' name which the index would otherwise inherit.
meta.index = (meta['identifiers']
              .map(lambda identifiers: identifiers[0]['value'])
              .rename(None))

# Records without a title get an empty caption
wellcome_title_series = meta['title'].fillna('')

In [None]:
# Line paths and titles up side by side on their shared index (image id).
wellcome_df = pd.concat([wellcome_path_series, wellcome_title_series], axis=1)
wellcome_df.columns = ['file_name', 'caption']

# Rows with a missing path or a missing title are of no use to us.
# (The original cell also re-assigned the caption column to itself here,
# which did nothing and has been removed.)
wellcome_df = wellcome_df.dropna()

In [None]:
with open('/mnt/efs/images/coco/annotations/captions_val2014.json') as f:
    meta = json.load(f)

# Join image records to their caption annotations on the image id
coco_df = pd.merge(left=pd.DataFrame(meta['images']).set_index('id'),
                   right=pd.DataFrame(meta['annotations']).set_index('image_id'),
                   left_index=True,
                   right_index=True)
coco_df = coco_df.reset_index()[['caption', 'file_name']]

coco_df['file_name'] = '/mnt/efs/images/coco/val2014/' + coco_df['file_name']


def _normalise_caption(text):
    """Keep letters and whitespace only, lower-case, collapse whitespace."""
    kept = ''.join(ch for ch in text if ch.isalpha() or ch.isspace())
    return ' '.join(kept.lower().split())


coco_df['caption'] = coco_df['caption'].apply(_normalise_caption)

In [None]:
# These intermediates are no longer needed; drop the references.
del (wellcome_image_paths,
     wellcome_path_series,
     wellcome_title_series,
     meta)

# find nouns and adjective-noun pairs in sentences
We want to double down on the inclusion and good interpretation of short sequences, so we'll preprocess this core dataframe to find nouns and adjective-noun pairs too, matching them with the same image paths as their source sequences. This new short-sequence dataframe will then be appended to the original.

In [None]:
# Random sample of 50,000 (caption, file_name) rows from both datasets.
# The two frames store their columns in opposite orders, and the column
# order pd.concat produces for such frames depends on the pandas version.
# Selecting the columns explicitly guarantees that every row of
# source_data is laid out as (caption, file_name).
source_data = (pd.concat([wellcome_df, coco_df])
               .fillna('')
               [['caption', 'file_name']]
               .sample(50000)
               .values)

We extract the nouns from the sequence (using spacy's POS tagger) and add them to a dictionary, paired with their original image path. We'll also grab any adjective-noun pairs while we're there.

In [None]:
# Short captions (single nouns and adjective-noun pairs) keyed by a running
# integer, each paired with the image path of the caption they came from.
subjects, i = {}, 0

for caption, path in tqdm(source_data):
    words = nlp(caption)

    # every noun becomes a caption of its own
    for word in words:
        if word.pos_ == 'NOUN':
            subjects[i] = {'caption': word.text,
                           'file_name': path}
            i += 1

    # ...as does every adjective immediately followed by a noun. The
    # position index is deliberately NOT called `i`: reusing that name
    # would reset the running key and overwrite earlier entries.
    for position in range(len(words) - 1):
        word_1, word_2 = words[position], words[position + 1]
        if word_1.pos_ == 'ADJ' and word_2.pos_ == 'NOUN':
            subjects[i] = {'caption': ' '.join([word_1.text, word_2.text]),
                           'file_name': path}
            i += 1

We'll now transform that dictionary into a dataframe so that it can be combined with the ones we loaded in before.

In [None]:
# One row per entry of the dictionary, one column per field
subjects = pd.DataFrame.from_dict(subjects, orient='index')

# train test splits
We'll train on the coco and wellcome data combined (with their short sequence counterparts), and test against just the original wellcome captions.

In [None]:
# Roughly 80% of the wellcome rows go to training, the rest are held out
mask = np.random.rand(len(wellcome_df)) < 0.8

test_df = wellcome_df.loc[~mask]
train_df = pd.concat([wellcome_df.loc[mask], coco_df, subjects])

len(train_df), len(test_df)

In [None]:
# Everything we need now lives in train_df / test_df
del coco_df, source_data, subjects

In [None]:
# Embed every caption up front: train first, then test
train_embeddings, test_embeddings = (
    np.array([embed(text) for text in tqdm(frame['caption'])])
    for frame in (train_df, test_df)
)

# datasets and dataloaders
### dataset

In [None]:
class CaptionsDataset(Dataset):
    """Pairs each image with the precomputed embedding of its caption.

    Parameters
    ----------
    path_df : dataframe with a 'file_name' column of image paths
    caption_embeddings : indexable of embeddings, in the same row order
        as `path_df`
    transform : optional callable applied to each loaded PIL image
    """

    def __init__(self, path_df, caption_embeddings,
                 transform=transforms.ToTensor()):
        self.ids = path_df.index.values
        self.image_paths = path_df['file_name'].values
        self.caption_embeddings = caption_embeddings
        self.transform = transform

    def __getitem__(self, index):
        # convert() returns a new, fully loaded image, so the file can be
        # closed straight away instead of lingering until garbage collection.
        with Image.open(self.image_paths[index]) as image_file:
            image = image_file.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        target = self.caption_embeddings[index]
        return image, target

    def __len__(self):
        return len(self.ids)

In [None]:
# Training images: random crop + flip + occasional greyscale
train_steps = [
    transforms.RandomResizedCrop(224, scale=[0.65, 0.9]),
    transforms.RandomHorizontalFlip(),
    transforms.RandomGrayscale(0.35),
    transforms.ToTensor(),
]
train_transform = transforms.Compose(train_steps)

# NOTE(review): the test pipeline also uses a *random* crop, so repeated
# passes over the same image can differ - confirm this is intended.
test_steps = [
    transforms.RandomResizedCrop(224, scale=[0.65, 0.9]),
    transforms.ToTensor(),
]
test_transform = transforms.Compose(test_steps)

In [None]:
train_dataset = CaptionsDataset(path_df=train_df,
                                caption_embeddings=train_embeddings,
                                transform=train_transform)

test_dataset = CaptionsDataset(path_df=test_df,
                               caption_embeddings=test_embeddings,
                               transform=test_transform)

### dataloader

In [None]:
batch_size = 64

# Only the training data is shuffled
train_loader = DataLoader(train_dataset, batch_size=batch_size,
                          shuffle=True, num_workers=5)

test_loader = DataLoader(test_dataset, batch_size=batch_size,
                         num_workers=5)

# create DeViSE model
This is the same devise model as we've seen before, with the only difference being the target size. Our sentence space is now 4096d, rather than 300d, and our model adapts accordingly.

In [None]:
# Convolutional feature extractor of a pretrained VGG16 (with batch norm)
backbone = models.vgg16_bn(pretrained=True).features

# Freeze the first 37 layers; whatever follows stays trainable
frozen_layers = backbone[:37]
for weight in frozen_layers.parameters():
    weight.requires_grad_(False)

In [None]:
class DeViSE(nn.Module):
    """Learn to map images into sentence space.

    `backbone` turns an image batch into feature maps which flatten to
    512 * 7 * 7 values per image; a two-layer head then projects those
    onto `target_size` dimensions.
    """

    def __init__(self, backbone, target_size):
        super().__init__()
        self.backbone = backbone
        head_layers = [
            nn.Linear(512 * 7 * 7, target_size),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(target_size, target_size),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, x):
        feature_maps = self.backbone(x)
        flattened = feature_maps.view(feature_maps.size(0), -1)
        projected = self.head(flattened)
        # normalise the output to keep mse sane: the whole batch is divided
        # by its single largest value
        return projected / projected.max()

In [None]:
# DeViSE model projecting images into the 4096d sentence space, resumed
# from the checkpoint at devise_model_path.
devise_model = DeViSE(backbone, target_size=4096).to(device)
devise_model_path = '/mnt/efs/models/devise-2018-10-09.pt'
# map_location lets the checkpoint load on whichever device was selected
# above, including CPU-only machines.
devise_model.load_state_dict(torch.load(devise_model_path, map_location=device))

# train
Let's do some training!

In [None]:
# [loss, additional metric] for every training batch, across calls to train()
losses = []


def train(model, train_loader, n_epochs, loss_function,
          additional_metric, optimiser, device=device):
    """Train `model` for `n_epochs` passes over `train_loader`.

    Parameters
    ----------
    model : module to optimise
    train_loader : iterable yielding (data, target) batches
    n_epochs : number of passes over train_loader
    loss_function : called as loss_function(prediction, target, flags),
        where flags is a vector of ones with one entry per target
    additional_metric : called as additional_metric(prediction, target);
        recorded and displayed but not optimised
    optimiser : optimiser already bound to the model's trainable parameters
    device : device each batch is moved to before the forward pass

    Appends [loss, additional_metric] for every batch to the module-level
    `losses` list.
    """
    for epoch in range(n_epochs):
        model.train()
        loop = tqdm(train_loader)
        for data, target in loop:
            # honour the `device` argument instead of assuming CUDA
            data = data.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)
            flags = torch.ones(len(target), device=device)

            optimiser.zero_grad()
            prediction = model(data)

            loss = loss_function(prediction, target, flags)
            mean_sq_error = additional_metric(prediction, target)
            losses.append([loss.item(), mean_sq_error.item()])

            loss.backward()
            optimiser.step()

            loop.set_description('Epoch {}/{}'.format(epoch + 1, n_epochs))
            loop.set_postfix(loss=loss.item(), mse=mean_sq_error.item())

In [None]:
torch.backends.cudnn.benchmark = True

# cosine embedding loss is what we optimise; mse is tracked alongside it
loss_function = nn.CosineEmbeddingLoss()
mse = nn.MSELoss()

In [None]:
# Only hand the optimiser the parameters which have not been frozen
trainable_parameters = (parameter
                        for parameter in devise_model.parameters()
                        if parameter.requires_grad)
optimiser = optim.Adam(trainable_parameters, lr=1e-4)

In [None]:
# three passes over the training data
train(model=devise_model,
      train_loader=train_loader,
      n_epochs=3,
      loss_function=loss_function,
      additional_metric=mse,
      optimiser=optimiser)

In [None]:
# Smooth the per-batch values with a 15-batch rolling mean before plotting
loss_data = pd.DataFrame(losses).rolling(15).mean()
loss_data.columns = ['cosine loss', 'mse']
ax = loss_data.plot(subplots=True);

# one subplot per column: cosine loss first, mse second
ax[0].set_xlim(left=0);
ax[0].set_ylim(bottom=0, top=0.6);
ax[1].set_ylim(bottom=0);

# predict
We can now make some predictions on the test set, checking our loss metric along the way.

In [None]:
preds = []       # one array of predicted embeddings per batch
test_loss = []   # one loss value per batch

# Switch to evaluation mode once, rather than on every batch
devise_model.eval()

with torch.no_grad():
    test_loop = tqdm(test_loader)
    test_loop.set_description('Test set')
    for data, target in test_loop:
        # use the device selected at the top of the notebook, not bare CUDA
        data, target = data.to(device), target.to(device)
        flags = torch.ones(len(target), device=device)

        prediction = devise_model(data)
        loss = loss_function(prediction, target, flags)

        preds.append(prediction.cpu().numpy())
        test_loss.append(loss.item())

        test_loop.set_postfix(loss=loss.item())

In [None]:
# Stack the per-batch predictions into one (n_images, 4096) array
preds = np.reshape(np.concatenate(preds), (-1, 4096))

# mean of the per-batch test losses
np.asarray(test_loss).mean()

In [None]:
# sanity check: expect one 4096d row per test image
preds.shape

In [None]:
# `search` below reads the module-level name `embeddings`
embeddings = preds

# search
We can now run proper searches against our wellcome data! We'll brute force the search here, but in the real world we'll precompute a search index using `nmslib`.

In [None]:
def search(query, n_results=20, n_columns=5, image_size=224):
    """Return a grid of the images whose embeddings lie nearest to `query`.

    The query is embedded with `embed` and compared (cosine distance)
    against every row of the module-level `embeddings`; the matching paths
    are taken from `test_df['file_name']`, so both must share a row order.

    Parameters
    ----------
    query : text to search for
    n_results : maximum number of images to show
    n_columns : number of images per row of the grid
    image_size : side length, in pixels, of each square thumbnail

    Returns a PIL image. If fewer than a whole number of rows are
    available, the grid is padded with black tiles.
    """
    query_embedding = embed(query).reshape(1, -1)

    # cdist returns a (1, n_images) array; take its only row
    distances = cdist(query_embedding, embeddings, 'cosine')[0]
    nearest = np.argsort(distances)[:n_results]
    nearby_image_paths = test_df['file_name'].values[nearest]

    nearby_images = []
    for path in nearby_image_paths:
        # close each file as soon as its pixels have been copied out
        with Image.open(path) as image_file:
            thumbnail = (image_file
                         .convert('RGB')
                         .resize((image_size, image_size), Image.BILINEAR))
            nearby_images.append(np.array(thumbnail))

    # pad the final row so that every row has the same width
    n_rows = max(1, -(-len(nearby_images) // n_columns))
    blank = np.zeros((image_size, image_size, 3), dtype=np.uint8)
    nearby_images += [blank] * (n_rows * n_columns - len(nearby_images))

    rows = [np.concatenate(nearby_images[start:start + n_columns], axis=1)
            for start in range(0, len(nearby_images), n_columns)]
    return Image.fromarray(np.concatenate(rows, axis=0))

In [None]:
# Example queries: each cell displays the image grid returned by `search`
search('simulations of protein structure')

In [None]:
search('text written in hindi')

In [None]:
search('photograph of stone pillars in a church')

In [None]:
search('portrait of a man')

In [None]:
search('portrait of a woman')

In [None]:
search('mri scan of a brain')

In [None]:
search('pretty drawings of plants and flowers')

In [None]:
search('really horrible , disgusting drawings of burns and skin diseases')

In [None]:
search('surgical instruments')

In [None]:
search('astronomical charts of the moons')

In [None]:
search('a cat preparing for surgery')

In [None]:
search('dancing skeletons')

In [None]:
search('giraffe')

In [None]:
search('a man dancing')

In [None]:
search('a collection of blood cells')

In [None]:
search('a waterfall')

In [None]:
search('anatomical details of the tendons in hands and fingers')

In [None]:
search('buddhist man sitting with folded legs')

In [None]:
search('AIDS posters')

In [None]:
search('fractured bone')

In [None]:
# Persist the trained weights (only the state dict, not the whole module)
torch.save(devise_model.state_dict(), '/mnt/efs/models/devise-2018-10-09.pt')

# save devise'd embeddings
We want to use these embeddings in our demo app, so we'll save them here so that they can be moved over to the app's data directory by hand.

In [None]:
# Caption embeddings for every wellcome row, in dataframe order
all_caption_embeddings = np.array(
    list(map(embed, tqdm(wellcome_df['caption'].values)))
)

In [None]:
# The whole wellcome collection, unshuffled so rows keep dataframe order
full_dataset = CaptionsDataset(path_df=wellcome_df,
                               caption_embeddings=all_caption_embeddings,
                               transform=test_transform)

full_loader = DataLoader(full_dataset, batch_size=batch_size, num_workers=5)

In [None]:
embeddings = []  # one array of image embeddings per batch

# Switch to evaluation mode once, rather than on every batch
devise_model.eval()

with torch.no_grad():
    loop = tqdm(full_loader)
    # Only the images are needed here; the caption targets are ignored, so
    # they are no longer copied to the device.
    for data, _ in loop:
        embedding = devise_model(data.to(device))
        embeddings.append(embedding.cpu().numpy())

In [None]:
# The DeViSE model above is built with target_size=4096, so each image is
# one 4096d row. Reshaping to 1024 columns would split every embedding
# across four rows and break the alignment with the saved image ids.
embeddings = np.concatenate(embeddings).reshape(-1, 4096)

In [None]:
def path_to_id(x):
    """Return the image id for path `x`.

    The id is the final '/'-separated component of the path, cut off at
    its first '.' (e.g. '/a/b/V0001.jpg' -> 'V0001').
    """
    file_name = x.rsplit('/', 1)[-1]
    return file_name.split('.', 1)[0]

In [None]:
# Embeddings and their image ids are saved in matching row order
np.save('/mnt/efs/models/embeddings.npy', embeddings)
np.save('/mnt/efs/models/image_ids.npy',
        wellcome_df['file_name'].map(path_to_id).values)