# broadening the scope of our classes
So far, we've used ImageNet data as the basis for teaching our machine about the relationship between language and visual features. The 200 classes of tiny ImageNet and the 1000 classes of ImageNet are aggregations of images described by a number of WordNet nouns - that's where all of our `wordnet_id`s come from.  
In this notebook, we test the hypothesis that we needn't confine ourselves to the 1000 classes of ImageNet. Instead of a large number of images associated with a small number of classes, we can invert the relationship to obtain a small number of images for each of a large number of classes, thereby mapping the word-vector space more completely. When using ImageNet, we precisely define the points in word vector space which map to certain visual features, but the rest of the space must be geometrically inferred.  By reducing the precision but increasing the breadth, the hope is that we'll see a more informed network.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (20, 20)

import os
import io
import requests
import numpy as np
import pandas as pd
from PIL import Image
from scipy.spatial.distance import cdist
from scipy.io import loadmat
from bs4 import BeautifulSoup

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms

from tqdm._tqdm_notebook import tqdm_notebook as tqdm
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# get wordnet nouns

In [None]:
# Map each WordNet id to its noun, using the class-id listing hosted by fast.ai.
id_to_word = {}
wordnet_url = 'http://files.fast.ai/data/classids.txt'

response = requests.get(wordnet_url)
# Fail loudly on an HTTP error instead of parsing an error page as data.
response.raise_for_status()

for line in response.text.split('\n'):
    fields = line.split()
    # A usable row is exactly "<wordnet_id> <word>"; blank or malformed
    # rows are skipped, as before, without a catch-all `except`.
    if len(fields) != 2:
        continue
    wordnet_id, word = fields
    id_to_word[wordnet_id] = word

In [None]:
# Lower-case every noun and swap underscores for hyphens.
wordnet_nouns = []
for noun in id_to_word.values():
    wordnet_nouns.append(noun.lower().replace('_', '-'))

# load word vectors

In [None]:
# Load the fastText vectors into a {word: vector} dictionary.
wv_path = '/mnt/efs/nlp/word_vectors/fasttext/crawl-300d-2M.vec'

word_vectors = {}
# The context manager closes the file; iterating it directly streams the
# lines instead of holding the whole file in a list first.
with io.open(wv_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as wv_file:
    for line in tqdm(wv_file):
        fields = line.split()
        # Skip blank lines and the "<count> <dim>" header row that .vec
        # files start with; otherwise the header is stored as a bogus word.
        if len(fields) < 3:
            continue
        # `np.float` was removed in NumPy 1.24; float64 is what it aliased.
        word_vectors[fields[0]] = np.array(fields[1:]).astype(np.float64)

In [None]:
# Only WordNet nouns that also have a word vector are usable as queries.
word_vector_set = set(word_vectors)
wordnet_set = set(wordnet_nouns)

valid_queries = list(word_vector_set & wordnet_set)

# get images of the valid wordnet nouns from google
We'll use Google Images to obtain the corresponding image sets for our WordNet nouns. By appending the WordNet noun to a Google search query string and parsing the response with BeautifulSoup, we can build up a broad set of small images relatively quickly, as demonstrated below with a random query.

In [None]:
# Demonstrate the scrape with one random query and show its thumbnails.
query = np.random.choice(valid_queries)
base_url = 'https://www.google.com/search'

# Let requests build the query string so the query is URL-encoded.
response = requests.get(base_url, params={'tbm': 'isch', 'q': query})
# Name the parser explicitly so the result does not depend on which
# parsers happen to be installed.
soup = BeautifulSoup(response.content, 'html.parser')
# Keep only <img> tags with an absolute http(s) src; a missing src raises
# KeyError and a relative one cannot be fetched by requests.
urls = [img['src'] for img in soup.find_all('img', src=True)
        if img['src'].startswith('http')]

print(query)

images = [(Image.open(io.BytesIO(requests.get(url).content))
           .resize((64, 64), resample=Image.BILINEAR)
           .convert('RGB'))
          for url in urls]

# Tile the thumbnails horizontally into a single strip.
Image.fromarray(np.concatenate(images, axis=1))

We can wrap up that functionality in a function for convenience.

In [None]:
def image_search(query):
    """Return the thumbnails from a Google image search as RGB PIL images.

    Parameters
    ----------
    query : str
        Search term; it is URL-encoded before being sent.

    Returns
    -------
    list of PIL.Image.Image
        One RGB image per thumbnail that could be downloaded and decoded.
        Thumbnails that fail to download or decode are skipped, so the list
        may be empty.

    Raises
    ------
    requests.RequestException
        If the search request itself fails or times out.
    """
    # Let requests build the query string so the query is URL-encoded.
    response = requests.get('https://www.google.com/search',
                            params={'tbm': 'isch', 'q': query},
                            timeout=10)
    # Name the parser explicitly so the result does not depend on which
    # parsers happen to be installed.
    soup = BeautifulSoup(response.content, 'html.parser')
    # Keep only <img> tags with an absolute http(s) src; a missing src raises
    # KeyError and a relative one cannot be fetched by requests.
    urls = [img['src'] for img in soup.find_all('img', src=True)
            if img['src'].startswith('http')]

    images = []
    for url in urls:
        try:
            payload = requests.get(url, timeout=10).content
            images.append(Image.open(io.BytesIO(payload)).convert('RGB'))
        except (requests.RequestException, OSError):
            # One broken thumbnail should not abort the whole query; PIL
            # raises an OSError subclass for undecodable data.
            continue

    return images

In [None]:
# Fetch thumbnails for 'dog', resize each to 224x224 and tile them side by side.
images = list(map(lambda thumb: thumb.resize((224, 224)), image_search('dog')))
Image.fromarray(np.concatenate(images, axis=1))

# save the data
Let's churn through our wordnet nouns and save a collection of images for each

In [None]:
save_path = '/mnt/efs/images/google_scraping/'
# Create the output directory up front so the first save cannot fail on it.
os.makedirs(save_path, exist_ok=True)

# Sample without replacement: a repeated query would be scraped again only
# to overwrite its own '<query>_<i>.jpg' files. Cap the sample size so small
# vocabularies do not raise.
n_queries = min(2000, len(valid_queries))

for query in tqdm(np.random.choice(valid_queries, n_queries, replace=False)):
    for i, image in enumerate(image_search(query)):
        # File name is '<query>_<index>.jpg'; the loading cell recovers the
        # word by splitting on '_'.
        image.save(save_path + '{}_{}.jpg'.format(query, i))

From here onwards, the process is much the same as before. We'll define our data loading processes, build a simple model with a pre-trained feature-extracting backbone, and train it until the loss bottoms out. Then we'll evaluate how well it has generalised against a pre-defined test set and run some test queries using out-of-vocabulary words.

# datasets and dataloaders

In [None]:
# Build a (path, word) table from the saved files; the word is the part of
# the file name before the first underscore.
file_names = os.listdir(save_path)

df = pd.DataFrame({
    'path': [save_path + name for name in file_names],
    'word': [name.split('_')[0] for name in file_names],
})

In [None]:
# Shuffle all rows (frac=1 samples every row) and renumber the index from 0.
df = df.sample(frac=1).reset_index(drop=True)

In [None]:
split_ratio = 0.8
train_size = int(split_ratio * len(df))

# Positional slicing is end-exclusive. Label-based `.loc[:train_size]`
# includes row `train_size`, which then lands in both train and test.
train_df = df.iloc[:train_size]
test_df  = df.iloc[train_size:]

In [None]:
class ImageDataset(Dataset):
    """Dataset of (image, word-vector) pairs read from a dataframe.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must have a 'path' column (image file paths) and a 'word' column
        (the label word for each image).
    word_vectors : mapping
        Maps each word in `dataframe['word']` to its vector.
    transform : callable or None
        Applied to the RGB PIL image; with None the PIL image is returned
        as-is.
    """

    def __init__(self, dataframe, word_vectors,
                 transform=transforms.ToTensor()):
        self.image_paths = dataframe['path'].values
        self.words = dataframe['word'].values
        self.word_vectors = word_vectors
        self.transform = transform

    def __getitem__(self, index):
        """Return `(image, target)` for row `index`."""
        # `convert` returns a fully loaded copy, so the file can be closed
        # straight away rather than when it is garbage-collected.
        with Image.open(self.image_paths[index]) as image_file:
            image = image_file.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)

        # Use the mapping given to the constructor, not whatever global
        # named `word_vectors` happens to exist.
        target = torch.Tensor(self.word_vectors[self.words[index]])
        return image, target

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.words)

In [None]:
# Training: random crop covering 60-90% of the image, plus light augmentation.
train_transform = transforms.Compose([transforms.RandomResizedCrop(224, scale=(0.6, 0.9)),
                                      transforms.RandomHorizontalFlip(),
                                      transforms.RandomRotation(15),
                                      transforms.RandomGrayscale(0.25),
                                      transforms.ToTensor()])

# Evaluation must be deterministic, otherwise the test loss and search
# results change on every run. A 224 centre crop of a 256x256 resize covers
# roughly 77% of the image area, inside the 60-90% range used for training.
test_transform = transforms.Compose([transforms.Resize((256, 256)),
                                     transforms.CenterCrop(224),
                                     transforms.ToTensor()])

In [None]:
# Wrap each split in a dataset with its matching transform.
train_dataset = ImageDataset(dataframe=train_df,
                             word_vectors=word_vectors,
                             transform=train_transform)
test_dataset = ImageDataset(dataframe=test_df,
                            word_vectors=word_vectors,
                            transform=test_transform)

In [None]:
batch_size = 128

# Settings common to both loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=batch_size, num_workers=5)

train_loader = DataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(dataset=test_dataset, **loader_kwargs)

# building the model

In [None]:
# Keep only the `features` sub-module of a pretrained VGG16-BN as the
# feature extractor.
backbone = models.vgg16_bn(pretrained=True).features

In [None]:
# Freeze the backbone so no gradients are computed for its weights.
for frozen_weight in backbone.parameters():
    frozen_weight.requires_grad_(False)

In [None]:
class DeViSE(nn.Module):
    """Feature-extracting backbone followed by an MLP head.

    Parameters
    ----------
    backbone : nn.Module
        Its output is flattened per sample and must then have 25088
        features, the input width of the head's first layer.
    target_size : int
        Dimensionality of the output vectors (default 300).
    """

    def __init__(self, backbone, target_size=300):
        super().__init__()
        self.backbone = backbone
        self.head = nn.Sequential(
            nn.Linear(in_features=(25088), out_features=target_size*2),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(in_features=target_size*2, out_features=target_size),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(in_features=target_size, out_features=target_size),
        )

    def forward(self, x):
        """Return one unit-length vector of size `target_size` per sample."""
        x = self.backbone(x)
        x = x.view(x.size(0), -1)
        x = self.head(x)
        # Normalise each sample on its own. Dividing by the batch-wide max
        # made every output depend on the rest of the batch and flipped the
        # sign of all vectors whenever that max was negative.
        x = nn.functional.normalize(x, p=2, dim=1)
        return x

In [None]:
# Build the model around the backbone and move it to the selected device.
devise_model = DeViSE(backbone).to(device)

# train loop

In [None]:
# Per-batch training losses, appended to by `train` and plotted afterwards.
losses = []

def train(model, train_loader, loss_function, optimiser, n_epochs):
    """Train `model` for `n_epochs` passes over `train_loader`.

    Every batch loss is appended to the module-level `losses` list.

    Parameters
    ----------
    model : nn.Module
        Model to optimise; batches are moved to the device of its parameters.
    train_loader : iterable
        Yields `(images, targets)` batches.
    loss_function : callable
        Called as `loss_function(predictions, targets, flags)`, with `flags`
        a vector of ones of length `len(targets)`.
    optimiser : torch.optim.Optimizer
        Optimiser over the trainable parameters.
    n_epochs : int
        Number of passes over `train_loader`.
    """
    # Follow the model's own device rather than hard-coding CUDA, so the
    # loop also runs on a CPU-only machine.
    model_device = next(model.parameters()).device

    for epoch in range(n_epochs):
        model.train()
        loop = tqdm(train_loader)
        loop.set_description('Epoch {}/{}'.format(epoch + 1, n_epochs))
        for images, targets in loop:
            images = images.to(model_device, non_blocking=True)
            targets = targets.to(model_device, non_blocking=True)
            flags = torch.ones(len(targets), device=model_device)

            optimiser.zero_grad()
            predictions = model(images)

            loss = loss_function(predictions, targets, flags)
            loss.backward()
            optimiser.step()

            batch_loss = loss.item()
            loop.set_postfix(loss=batch_loss)
            losses.append(batch_loss)

In [None]:
# Only the parameters left unfrozen are handed to the optimiser.
trainable_parameters = [weight for weight in devise_model.parameters()
                        if weight.requires_grad]

loss_function = nn.CosineEmbeddingLoss()
optimiser = optim.Adam(params=trainable_parameters, lr=0.001)

In [None]:
# Run three epochs of training.
train(devise_model, train_loader, loss_function, optimiser, n_epochs=3)

In [None]:
# Smooth the per-batch losses with a 15-step rolling mean before plotting.
loss_data = pd.Series(losses).rolling(window=15).mean()
ax = loss_data.plot()

ax.set_xlim(left=0)
ax.set_ylim(bottom=0, top=1);

# evaluate on test set

In [None]:
# Collect the predictions and loss for every test batch.
preds = []
test_loss = []

# Follow the model's own device rather than hard-coding CUDA, so the
# evaluation also runs on a CPU-only machine.
eval_device = next(devise_model.parameters()).device

devise_model.eval()
with torch.no_grad():
    test_loop = tqdm(test_loader)
    test_loop.set_description('Test set')
    for images, targets in test_loop:
        images = images.to(eval_device, non_blocking=True)
        targets = targets.to(eval_device, non_blocking=True)
        flags = torch.ones(len(targets), device=eval_device)

        predictions = devise_model(images)
        loss = loss_function(predictions, targets, flags)

        # No graph is built under no_grad, so the legacy `.data` is not needed.
        preds.append(predictions.cpu().numpy())
        test_loss.append(loss.item())

        # Show the mean of the last five batch losses in the progress bar.
        test_loop.set_postfix(loss=np.mean(test_loss[-5:]))

In [None]:
# Stack the per-batch predictions into one (n_samples, 300) array and report
# the mean batch loss.
preds = np.vstack(preds).reshape(-1, 300)
np.mean(test_loss)

# run a search on the predictions

In [None]:
# Display the shape of the stacked predictions.
preds.shape

In [None]:
def search(query, n=5):
    """Show the `n` test images whose predictions are closest to `query`.

    Parameters
    ----------
    query : str
        A word present in `word_vectors`.
    n : int
        Number of images to return (default 5).

    Returns
    -------
    PIL.Image.Image
        The closest images, each resized to 224x224 and tiled horizontally.

    Raises
    ------
    KeyError
        If `query` has no entry in `word_vectors`.
    """
    image_paths = test_df['path'].values
    query_vector = np.asarray(word_vectors[query]).reshape(1, -1)
    # Rank by cosine distance, the quantity the model was trained on.
    # Euclidean distance also depends on vector length, which the training
    # loss never constrained.
    distances = cdist(query_vector, preds, metric='cosine').ravel()
    # `ravel` keeps this 1-D even for a single test row, where the old
    # `squeeze()` gave a 0-d array that could not be sliced.
    closest_n_paths = image_paths[np.argsort(distances)[:n]]
    close_images = [np.array(Image.open(image_path).convert('RGB').resize((224,224)))
                    for image_path in closest_n_paths]
    return Image.fromarray(np.concatenate(close_images, axis=1))

In [None]:
# Example query: show the test images closest to the word 'bridge'.
search('bridge')

Again, this works! We're getting somewhere now, and we're making significant changes to the established approach set out in the original DeViSE paper.