# scaling up to imagenet
We've seen how effective DeViSE can be on a small set of data, but it's equally good when applied to a large dataset. Here I'll step through the exact same process, but with a more complete dump of imagenet: the [ImageNet Large Scale Visual Recognition Challenge 2014 (ILSVRC2014)](http://image-net.org/challenges/LSVRC/2014/download-images-5jj5.php).

We'll only use the validation dataset here, which alone is already more than 6GB. The training set is almost 140GB, which feels like overkill given the already good performance we achieved on tiny imagenet. Again, the fact that we're not entering the competition frees us up to use their data in a way which is appropriate for us - splitting the original competition's validation data into new train and test sets is perfectly valid.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (20, 20)

import os
import io
import numpy as np
import pandas as pd
from PIL import Image
from scipy.spatial.distance import cdist
from scipy.io import loadmat

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms

from tqdm._tqdm_notebook import tqdm_notebook as tqdm
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# wordvectors

In [None]:
wv_path = '/mnt/efs/nlp/word_vectors/fasttext/crawl-300d-2M.vec'

# Load the fastText vectors into a {word: float64 vector} dictionary.
# The file is streamed line by line inside a context manager, so the handle is
# always closed and the text file is never held in memory as a list of lines.
fasttext = {}
with io.open(wv_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as wv_file:
    # The first line is skipped, as before: it is not a word vector. In the
    # fastText .vec format it is a "<n_words> <dimension>" header, so its first
    # field (when numeric) is used as the progress-bar total.
    header = wv_file.readline().split()
    n_words = int(header[0]) if header and header[0].isdigit() else None

    for line in tqdm(wv_file, total=n_words):
        # Split once: the first token is the word, the rest are its components.
        tokens = line.split()
        if not tokens:
            continue  # ignore blank lines instead of failing on tokens[0]
        # `np.float` was removed in NumPy 1.24; `np.float64` is the same dtype.
        fasttext[tokens[0]] = np.array(tokens[1:]).astype(np.float64)

In [None]:
# Element-wise average of every word vector; used further down as the
# stand-in vector for words that are missing from the fastText vocabulary.
mean_wv = np.mean(np.array(list(fasttext.values())), axis=0)

In [None]:
# Every word that has a fastText vector (iterating a dict yields its keys).
vocabulary = set(fasttext)

# wordnet

In [None]:
def clean(x):
    """Lower-case and strip the string `x`, then split it on ', ' into a list."""
    normalised = x.lower().strip()
    return normalised.split(', ')

In [None]:
mat = loadmat('/home/jupyter/ILSVRC2012_devkit_t12/data/meta.mat')

# Walk the synset table once, filling both lookups from each record:
#   wnid_to_words          : wnid -> list of cleaned words for that synset
#   competition_id_to_wnid : competition id -> wnid
wnid_to_words = {}
competition_id_to_wnid = {}
for row in mat['synsets']:
    record = row[0]
    synset_wnid = record[1][0]
    wnid_to_words[synset_wnid] = clean(record[2][0])
    competition_id_to_wnid[record[0][0][0]] = synset_wnid

In [None]:
# One target vector per wnid: the mean of the fastText vectors of its words,
# where any word without a fastText entry contributes `mean_wv` instead.
wnid_to_wordvector = {}
for synset_wnid, synset_words in wnid_to_words.items():
    word_vectors = [fasttext.get(word, mean_wv) for word in synset_words]
    wnid_to_wordvector[synset_wnid] = np.array(word_vectors).mean(axis=0)

wnids = list(wnid_to_wordvector)

# example data

In [None]:
# Ground-truth competition ids, read as a single headerless column.
id_path = '/mnt/efs/images/ILSVRC2012_validation_ground_truth.txt'
ground_truth = pd.read_csv(id_path, header=None)
competition_ids = ground_truth.values.squeeze()

# Full paths of every file in the image directory, in sorted order.
image_path = '/mnt/efs/images/ILSVRC2012/'
file_names = os.listdir(image_path)
image_paths = np.sort([image_path + name for name in file_names])

In [None]:
# Pick a random example. The range is derived from the data actually loaded
# (rather than a hard-coded 50000) so a partial image dump cannot produce an
# out-of-range index.
n_examples = min(len(image_paths), len(competition_ids))
index = np.random.choice(n_examples)
competition_id = competition_ids[index]
wnid = competition_id_to_wnid[competition_id]

# Show the words for the image's class, then the image itself.
print(' '.join(wnid_to_words[wnid]))
Image.open(image_paths[index])

# datasets and dataloaders

In [None]:
# Two-column frame pairing each image path with its label.
# NOTE: the 'wnid' column holds the *competition id*; ImageDataset maps it to
# a wnid through `competition_id_to_wnid`.
path_to_id = dict(zip(image_paths, competition_ids))
df = pd.Series(path_to_id).rename_axis('path').reset_index(name='wnid')

In [None]:
# Shuffle every row (frac=1 samples the whole frame without replacement) ...
df = df.sample(frac=1)
# ... and renumber the index 0..n-1, discarding the pre-shuffle index.
df = df.reset_index(drop=True)

In [None]:
split_ratio = 0.8
train_size = int(split_ratio * len(df))

# Positional slicing keeps the two sets disjoint. `.loc[:train_size]` is
# label-based and includes its end label, which put row `train_size` in both
# the train and the test set.
train_df = df.iloc[:train_size]
test_df = df.iloc[train_size:]

In [None]:
class ImageDataset(Dataset):
    """Dataset yielding (image, word-vector target) pairs.

    Args:
        dataframe: frame with a 'path' column (image file paths) and a 'wnid'
            column (the competition id of each image).
        competition_id_to_wnid: mapping from competition id to wnid.
        wnid_to_wordvector: mapping from wnid to its target word vector.
        transform: callable applied to the RGB PIL image, or None to return
            the image untransformed.
    """

    def __init__(self, dataframe,
                 competition_id_to_wnid, wnid_to_wordvector,
                 transform):
        self.image_paths = dataframe['path'].values
        self.wnids = dataframe['wnid'].values
        self.competition_id_to_wnid = competition_id_to_wnid
        self.wnid_to_wordvector = wnid_to_wordvector
        self.transform = transform

    def _target(self, index):
        """Return the word-vector target of item `index` as a float tensor."""
        # Use the mappings handed to this instance; the previous code read the
        # module-level globals of the same name and ignored the arguments.
        wnid = self.competition_id_to_wnid[self.wnids[index]]
        return torch.Tensor(self.wnid_to_wordvector[wnid])

    def __getitem__(self, index):
        """Load image `index` as RGB, transform it, and pair it with its target."""
        image = Image.open(self.image_paths[index]).convert('RGB')
        if self.transform is not None:
            image = self.transform(image)

        return image, self._target(index)

    def __len__(self):
        """Number of items (one per row of the source dataframe)."""
        return len(self.wnids)

In [None]:
# torchvision's ImageNet-pretrained models are documented as expecting inputs
# normalised with these channel statistics; the frozen VGG backbone used below
# is one of those models.
imagenet_normalise = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                          std=[0.229, 0.224, 0.225])

# Training: random crop / flip / rotation / grayscale augmentation.
train_transform = transforms.Compose([transforms.RandomResizedCrop(224, scale=[0.5, 0.9]),
                                      transforms.RandomHorizontalFlip(),
                                      transforms.RandomRotation(15),
                                      transforms.RandomGrayscale(0.25),
                                      transforms.ToTensor(),
                                      imagenet_normalise])

# Evaluation: a deterministic resize + centre crop, so test loss and the
# predictions used for search are reproducible from run to run (the previous
# RandomResizedCrop gave each image a different crop on every pass).
test_transform = transforms.Compose([transforms.Resize(256),
                                     transforms.CenterCrop(224),
                                     transforms.ToTensor(),
                                     imagenet_normalise])

In [None]:
# Both datasets share the same id -> wnid -> word-vector lookups; they differ
# only in the rows they cover and the transform they apply.
shared_lookups = dict(competition_id_to_wnid=competition_id_to_wnid,
                      wnid_to_wordvector=wnid_to_wordvector)

train_dataset = ImageDataset(dataframe=train_df,
                             transform=train_transform,
                             **shared_lookups)

test_dataset = ImageDataset(dataframe=test_df,
                            transform=test_transform,
                            **shared_lookups)

In [None]:
batch_size = 128

# Settings common to both loaders. Pinned host memory is what allows the
# `non_blocking=True` host-to-GPU copies in the train/eval loops to actually
# overlap with compute; it is only requested when CUDA is available.
loader_settings = dict(batch_size=batch_size,
                       num_workers=5,
                       pin_memory=torch.cuda.is_available())

train_loader = DataLoader(dataset=train_dataset,
                          shuffle=True,
                          **loader_settings)

# Left unshuffled: the search step indexes `test_df` rows with positions taken
# from the predictions, so batch order must follow the row order of `test_df`.
test_loader = DataLoader(dataset=test_dataset,
                         **loader_settings)

# building the model

In [None]:
# Keep only the `.features` sub-module of torchvision's pretrained VGG16
# (batch-norm variant) as the image backbone; the rest of the model is dropped.
# NOTE(review): `pretrained=` is presumably deprecated in newer torchvision
# releases in favour of `weights=` - confirm against the installed version.
backbone = models.vgg16_bn(pretrained=True).features

In [None]:
# Freeze the backbone: none of its parameters will receive gradients, so only
# the DeViSE head is trained.
for frozen_param in backbone.parameters():
    frozen_param.requires_grad_(False)

In [None]:
class DeViSE(nn.Module):
    """Image backbone followed by an MLP head that predicts a word vector.

    Args:
        backbone: module mapping an image batch to a feature tensor; its output
            is flattened to (batch, in_features) before entering the head.
        target_size: dimensionality of the predicted word vectors.
        in_features: flattened size of the backbone output. The default, 25088,
            is the value that was previously hard-coded.
    """

    def __init__(self, backbone, target_size=300, in_features=25088):
        super(DeViSE, self).__init__()
        self.backbone = backbone
        self.head = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=target_size * 2),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(in_features=target_size * 2, out_features=target_size),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(in_features=target_size, out_features=target_size),
        )

    def forward(self, x):
        """Return one unit-length predicted word vector per input image."""
        x = self.backbone(x)
        x = x.view(x.size(0), -1)
        x = self.head(x)
        # L2-normalise each row independently. The previous `x / x.max()`
        # divided by a single batch-wide maximum, so a sample's output depended
        # on the other samples in its batch, and a negative or zero maximum
        # flipped the direction of the vectors or divided by zero.
        return nn.functional.normalize(x, p=2, dim=1)

In [None]:
# Build the model around the frozen backbone, predicting 300-d vectors (the
# size of the fastText vectors used as targets), and move it to `device`.
devise_model = DeViSE(backbone, target_size=300).to(device)

# train loop

In [None]:
# Per-batch training losses, appended to by `train` and plotted afterwards.
losses = []

def train(model, train_loader, loss_function, optimiser, n_epochs):
    """Train `model` for `n_epochs` passes over `train_loader`.

    Each batch's loss is appended to the module-level `losses` list.

    Args:
        model: module mapping an image batch to predicted word vectors.
        train_loader: iterable of (images, targets) batches.
        loss_function: callable taking (predictions, targets, flags), e.g.
            `nn.CosineEmbeddingLoss`.
        optimiser: optimiser over the model's trainable parameters.
        n_epochs: number of passes over `train_loader`.
    """
    # Follow the device the model already lives on instead of hard-coding
    # `.cuda()`, so the loop also runs on CPU-only machines.
    model_device = next(model.parameters()).device

    for epoch in range(n_epochs):
        model.train()
        loop = tqdm(train_loader,
                    desc='Epoch {}/{}'.format(epoch + 1, n_epochs))
        for images, targets in loop:
            images = images.to(model_device, non_blocking=True)
            targets = targets.to(model_device, non_blocking=True)
            # A flag of 1 per pair tells the embedding loss that every
            # (prediction, target) pair should be pulled together.
            flags = torch.ones(len(targets), device=model_device)

            optimiser.zero_grad()
            predictions = model(images)

            loss = loss_function(predictions, targets, flags)
            loss.backward()
            optimiser.step()

            # Read the scalar once; each `.item()` call forces a device sync.
            batch_loss = loss.item()
            loop.set_postfix(loss=batch_loss)
            losses.append(batch_loss)

In [None]:
# Cosine-based loss between predicted and target word vectors.
loss_function = nn.CosineEmbeddingLoss()

# Optimise only the parameters that still require gradients (i.e. skip the
# frozen backbone). This is a one-shot iterator, consumed by Adam.
trainable_parameters = (parameter
                        for parameter in devise_model.parameters()
                        if parameter.requires_grad)
optimiser = optim.Adam(trainable_parameters, lr=0.001)

In [None]:
# Three passes over the training set.
train(model=devise_model,
      train_loader=train_loader,
      loss_function=loss_function,
      optimiser=optimiser,
      n_epochs=3)

In [None]:
# Smooth the per-batch losses with a 15-batch rolling mean before plotting.
loss_data = pd.Series(losses).rolling(window=15).mean()
ax = loss_data.plot()

# Anchor both axes at zero; cap the y-axis at 1.
ax.set_xlim(left=0)
ax.set_ylim(bottom=0, top=1);

# evaluate on test set

In [None]:
# Collected per batch: predicted word vectors (numpy arrays) and loss values.
preds = []
test_loss = []

devise_model.eval()
with torch.no_grad():
    test_loop = tqdm(test_loader, desc='Test set')
    for images, targets in test_loop:
        # Use the notebook-wide `device` (the one the model was moved to)
        # rather than hard-coding `.cuda()`, which fails on CPU-only machines.
        images = images.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)
        flags = torch.ones(len(targets), device=device)

        predictions = devise_model(images)
        loss = loss_function(predictions, targets, flags)

        # Inside `no_grad` the output carries no graph, so the legacy `.data`
        # indirection is unnecessary.
        preds.append(predictions.cpu().numpy())
        test_loss.append(loss.item())

        # Progress bar shows the mean loss over the five most recent batches.
        test_loop.set_postfix(loss=np.mean(test_loss[-5:]))

In [None]:
# Stack the per-batch predictions into a single array with 300 columns.
preds = np.concatenate(preds)
preds = preds.reshape(-1, 300)

# Mean of the per-batch test losses.
np.mean(test_loss)

# run a search on the predictions

In [None]:
# Sanity check: expect (number of test images, 300) after the reshape above.
preds.shape

In [None]:
def search(query, n=5):
    """Return the `n` test images whose predicted vectors best match `query`.

    Args:
        query: a single word; it must have a fastText vector.
        n: number of images to return (at least 1).

    Returns:
        A PIL image with the matches, each resized to 224x224, placed side by
        side from closest (left) to furthest (right).

    Raises:
        KeyError: if `query` has no fastText vector.
        ValueError: if `n` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be at least 1, got {}'.format(n))
    if query not in fasttext:
        raise KeyError('{!r} is not in the fastText vocabulary'.format(query))

    image_paths = test_df['path'].values
    query_vector = fasttext[query].reshape(1, -1)

    # Rank by cosine distance: the model is trained with a cosine loss, so only
    # the direction of a prediction is meaningful. Euclidean distance also
    # ranked predictions by their (arbitrary) length.
    distances = cdist(query_vector, preds, metric='cosine')[0]

    # Indexing the 1-d distance row directly (instead of squeezing a 2-d
    # result) also works when there is only a single prediction.
    closest_n_paths = image_paths[np.argsort(distances)[:n]]
    close_images = [np.array(Image.open(image_path).resize((224, 224)).convert('RGB'))
                    for image_path in closest_n_paths]
    return Image.fromarray(np.concatenate(close_images, axis=1))

In [None]:
# Example query: the five test images (default n=5) closest to the word 'bridge'.
search('bridge')

Great, that works too. We're now working with much larger, more complex data but the network is still able to make inferences about the interactions between written and visual language.