#Please visit the word_embedding_linear.ipynb notebook for comments about paths and options

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init
import torchvision.models as models
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.backends.cudnn as cudnn
from torch.nn.utils import clip_grad_norm
import pickle
import numpy as np
#!pip install "nltk==3.4.5"
from nltk.stem.cistem import Cistem
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords_german = stopwords.words('german')

#!pip install fasttext
import fasttext
from random import random

import pandas as pd
from PIL import Image
import torchvision.transforms as transforms
from torch.utils import data
from torch.utils.data import Dataset, TensorDataset
import os

In [None]:
# Load the fastText model that supplies the word / n-gram vectors used by the
# dataset classes below. The path is a placeholder and must be adapted.
#text_emb_model = fasttext.load_model('path/to/wiki.de.bin')
text_emb_model = fasttext.load_model('path/to/newspaper.bin')

# Alternative (disabled): train an unsupervised fastText model on a custom vocabulary file.
#text_emb_model = fasttext.train_unsupervised('/content/drive/MyDrive/Code/PJDS/custom_MAXHAL/mediaeval_vocab.npy', minn=3, maxn=5, dim=300)



In [None]:

def l2norm(X, eps=1e-12):
    """Scale the slices of ``X`` along dim 1 to unit L2 norm.

    Args:
        X: tensor with at least two dimensions; the norm is computed over
            dim 1 (``keepdim=True``) and broadcast back for the division.
        eps: lower bound for the norm used as divisor. Rows whose norm is
            at least ``eps`` are normalised exactly as before; an all-zero
            row now yields zeros instead of NaN.

    Returns:
        Tensor of the same shape as ``X``.
    """
    sq_norm = torch.pow(X, 2).sum(dim=1, keepdim=True)
    # Clamp the *squared* norm before the square root: sqrt has an infinite
    # derivative at 0, which would otherwise turn gradients into NaN.
    norm = sq_norm.clamp(min=eps * eps).sqrt()
    return torch.div(X, norm)

def cosine_sim(im, s):
    """Return the matrix of dot products between the rows of ``im`` and ``s``.

    Entry ``[i, j]`` is ``im[i] . s[j]``. This equals the cosine similarity
    only when the rows of both inputs already have unit L2 norm.
    """
    return torch.mm(im, s.t())

def padding(max_length, word_list):
    """Lower-case ``word_list`` and force it to ``max_length`` entries.

    Longer lists are truncated; shorter ones are right-padded with empty
    strings.
    """
    lowered = [word.lower() for word in word_list]
    kept = lowered[:max_length]
    # A non-positive repeat count yields an empty list, so no padding is
    # added once the list is already long enough.
    return kept + [""] * (max_length - len(lowered))

def create_n_grams(n, word):
    """Return all contiguous character n-grams of ``word``.

    Words with at most ``n`` characters are returned unchanged as a
    single-element list.
    """
    last_start = len(word) - n
    if last_start <= 0:
        return [word]
    return [word[start:start + n] for start in range(last_start + 1)]

def embed_with_3_4_5_grams(emb_model, word):
    """Embed ``word`` as the mean of its own vector and its 3/4/5-gram vectors.

    ``emb_model`` is indexed with strings and must return a numpy array for
    each of them (the arrays are converted with ``torch.from_numpy``).

    Returns:
        Float tensor of shape ``(1, dim)``.
    """
    pieces = [word]
    for size in (3, 4, 5):
        pieces.extend(create_n_grams(size, word))

    vectors = torch.stack(
        [torch.from_numpy(emb_model[piece]).float() for piece in pieces]
    )
    # keepdim=True keeps the leading singleton dimension of the result.
    return vectors.mean(dim=0, keepdim=True)


#Model

In [None]:
class MAXHAL(object):
    """Joint image/text embedding model with its loss and optimizer.

    Bundles an ``ImageEncoder``, a ``TextEncoder``, a ``ContrastiveLoss`` and
    an Adam optimizer. Only the text encoder's parameters and the image
    encoder's final ``fc`` layer are handed to the optimizer.

    Args:
        learning_rate: initial learning rate for Adam.
    """

    def __init__(self, learning_rate):
        # Decide the device once so that inputs follow the encoders; the
        # encoders are only moved to CUDA when it is actually available.
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        self.img_enc = ImageEncoder()
        self.txt_enc = TextEncoder()
        if torch.cuda.is_available():
            self.img_enc.cuda()
            self.txt_enc.cuda()

        # Loss and Optimizer
        self.criterion = ContrastiveLoss()
        params = list(self.txt_enc.parameters())
        params += list(self.img_enc.fc.parameters())

        self.params = params

        self.optimizer = torch.optim.Adam(params, lr=learning_rate)

    def train_start(self):
        """Put both encoders into training mode."""
        self.img_enc.train()
        self.txt_enc.train()

    def val_start(self):
        """Put both encoders into evaluation mode."""
        self.img_enc.eval()
        self.txt_enc.eval()

    def forward_emb(self, images, article, title, category):
        """Embed a batch of images and the three text fields.

        All inputs are moved to the device chosen in ``__init__``.

        Returns:
            Tuple ``(img_emb, txt_emb)`` as produced by the two encoders.
        """
        article = article.to(self.device)
        title = title.to(self.device)
        category = category.to(self.device)

        # as_tensor keeps an existing float tensor as-is and also accepts
        # array-like input.
        images = torch.as_tensor(images, dtype=torch.float32).to(self.device)

        img_emb = self.img_enc(images)
        txt_emb = self.txt_enc(article, title, category)

        return img_emb, txt_emb

    def forward_loss(self, img_emb, txt_emb, indices):
        """Return the total contrastive loss for a batch.

        ``indices`` is accepted for call compatibility but not used.
        """
        loss, loss_text, loss_img = self.criterion(img_emb, txt_emb)
        return loss

    def train_emb(self, images, article, title, category, ids):
        """Run one optimisation step on a batch and print the loss."""
        img_emb, txt_emb = self.forward_emb(images, article, title, category)

        self.optimizer.zero_grad()
        loss = self.forward_loss(img_emb, txt_emb, ids)
        print(loss)
        loss.backward()
        self.optimizer.step()

In [None]:
class TextEncoder(nn.Module):
    """Fuse article, title and category word vectors into one text embedding.

    Each field is max-pooled over its last dimension, the three pooled
    vectors are max-pooled element-wise, and the result is projected by a
    linear layer and L2-normalised.
    """

    def __init__(self):
        super().__init__()
        self.word_dim = 300

        # Number of words kept per field; stored for reference only, the
        # forward pass does not read them.
        self.title_dim = 5
        self.article_dim = 25
        self.category_dim = 1

        self.embed_size = 512
        self._pool = nn.AdaptiveMaxPool1d(1)
        self.fuse_pool = nn.AdaptiveMaxPool1d(1)

        self.fc = nn.Linear(self.word_dim, self.embed_size)

    def forward(self, article, title, category):
        """Return the normalised joint-space embedding of the three fields.

        Each input is expected as ``(batch, word_dim, n_words)``: the pooled
        channel dimension feeds ``fc``, whose input size is ``word_dim``.
        """
        pooled = [
            self._pool(field).squeeze(2)
            for field in (article, title, category)
        ]

        # (batch, word_dim, 3): one slot per field along the last axis.
        fused = torch.stack(pooled, dim=2)
        fused = self.fuse_pool(fused).squeeze(2)

        return l2norm(self.fc(fused))

In [None]:
class ImageEncoder(nn.Module):
    """CNN image encoder that projects image features into the joint space.

    The CNN's last classifier layer is removed and replaced by a separate
    linear layer ``fc`` mapping to ``embed_size``. The constructor reads
    ``classifier._modules['6']``, so the architecture must expose a
    ``classifier`` with a layer at index 6 (as the default ``vgg19`` does).
    """

    def __init__(self, embed_size=512, finetune=False, cnn_type='vgg19',
                 use_abs=False, no_imgnorm=False):
        """Load pretrained VGG19 and replace top fc layer.

        Args:
            embed_size: dimensionality of the output embedding.
            finetune: whether the CNN weights receive gradients.
            cnn_type: name of a constructor in ``torchvision.models``.
            use_abs: take the absolute value of the final embedding.
            no_imgnorm: skip L2 normalisation of the final embedding.
        """
        super(ImageEncoder, self).__init__()
        self.embed_size = embed_size
        self.no_imgnorm = no_imgnorm
        self.use_abs = use_abs

        # Load a pre-trained model
        self.cnn = self.get_cnn(cnn_type, True)

        # For efficient memory usage.
        for param in self.cnn.parameters():
            param.requires_grad = finetune

        # Replace the last fully connected layer of CNN with a new one
        self.fc = nn.Linear(self.cnn.classifier._modules['6'].in_features,
                            embed_size)
        self.cnn.classifier = nn.Sequential(
            *list(self.cnn.classifier.children())[:-1])

        self.init_weights()

    def get_cnn(self, arch, pretrained):
        """Build the CNN ``arch`` and parallelize its feature extractor.

        Args:
            arch: name of a constructor in ``torchvision.models``.
            pretrained: whether to load pretrained weights. The argument is
                now honoured (it used to be ignored and treated as True).

        Returns:
            The model, moved to CUDA when CUDA is available.
        """
        if pretrained:
            print("=> using pre-trained model '{}'".format(arch))
        else:
            print("=> creating model '{}'".format(arch))
        model = models.__dict__[arch](pretrained=pretrained)
        model.features = nn.DataParallel(model.features)
        # Only move to the GPU when there is one, so the encoder can also be
        # built on CPU-only machines.
        if torch.cuda.is_available():
            model.cuda()

        return model

    def init_weights(self):
        """Xavier initialization for the fully connected layer
        """
        r = np.sqrt(6.) / np.sqrt(self.fc.in_features +
                                  self.fc.out_features)
        self.fc.weight.data.uniform_(-r, r)
        self.fc.bias.data.fill_(0)

    def forward(self, images):
        """Extract image feature vectors."""
        # The features stay on whatever device the CNN produced them on; no
        # explicit transfer is required between the sub-modules.
        features = self.cnn(images)

        # normalization in the image embedding space
        features = l2norm(features)

        # linear projection to the joint embedding space
        features = self.fc(features)

        # normalization in the joint embedding space
        if not self.no_imgnorm:
            features = l2norm(features)

        # take the absolute value of the embedding (used in order embeddings)
        if self.use_abs:
            features = torch.abs(features)

        return features

In [None]:
class ContrastiveLoss(nn.Module):
    """Sum-of-hinges ranking loss over a batch of image/text pairs.

    The pair at position ``i`` of both inputs is the positive; every other
    combination in the batch is a negative. Each violated margin is raised
    to the power ``sim_power`` before summation.

    Args:
        margin: required gap between a positive score and a negative score.
    """

    def __init__(self, margin=0.4):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin
        self.sim = cosine_sim
        # Stored for configuration purposes; forward() does not read it.
        self.max_violation = False
        self.sim_power = 1.5

    def forward(self, image, text):
        """Compute the loss.

        Args:
            image: image embeddings, one row per sample.
            text: text embeddings, row ``i`` matching ``image[i]``.

        Returns:
            Tuple ``(total, text_cost, image_cost)`` of scalar tensors where
            ``total = text_cost + image_cost``.
        """
        scores = self.sim(image, text)
        diagonal = scores.diag().view(image.size(0), 1)

        # d1[i, j] = scores[i, i]; d2[i, j] = scores[j, j]
        d1 = diagonal.expand_as(scores)
        d2 = diagonal.t().expand_as(scores)

        cost_text = (self.margin + scores - d1).clamp(min=0)
        cost_im = (self.margin + scores - d2).clamp(min=0)

        # Build the positive-pair mask directly on the device of `scores`
        # so CPU tensors also work on machines where CUDA is available.
        positives = torch.eye(scores.size(0), dtype=torch.bool,
                              device=scores.device)
        cost_text = cost_text.masked_fill(positives, 0)
        cost_im = cost_im.masked_fill(positives, 0)

        cost_text = torch.pow(cost_text, self.sim_power)
        cost_im = torch.pow(cost_im, self.sim_power)

        text_total = cost_text.sum()
        im_total = cost_im.sum()
        return text_total + im_total, text_total, im_total

In [None]:
class ArticleDataset(data.Dataset):
    """Image/article pairs read from a DataFrame.

    Each row must provide the columns ``imgFile``, ``title``, ``text`` and
    ``category``. Text fields are split on spaces, stemmed, padded or
    truncated to a fixed word count and embedded word by word.

    Args:
        dataset: pandas DataFrame with the columns listed above.
        text_emb_model: embedding model passed to ``embed_with_3_4_5_grams``.
        split: ``'test'`` selects deterministic resizing/cropping; any other
            value adds random resized cropping and horizontal flipping.
    """

    def __init__(self, dataset, text_emb_model, split):
        self.basepath = 'path/to/images'

        normalizer = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                          std=[0.229, 0.224, 0.225])
        if split == 'test':
            t_list = [transforms.Resize(250), transforms.CenterCrop(250)]
        else:
            t_list = [transforms.Resize(250), transforms.CenterCrop(250),
                      transforms.RandomResizedCrop(250),
                      transforms.RandomHorizontalFlip()]
        t_end = [transforms.ToTensor(), normalizer]

        self.stemmer = Cistem()
        self.transform = transforms.Compose(t_list + t_end)
        self.dataset = dataset
        self.text_emb_model = text_emb_model

    def _embed_field(self, text, max_words):
        """Return the ``(word_dim, max_words)`` embedding matrix of one field."""
        # Missing CSV cells arrive as NaN (a float); treat them as empty text
        # instead of failing on ``.split``.
        if not isinstance(text, str):
            text = '' if pd.isna(text) else str(text)

        words = [self.stemmer.stem(w) for w in text.split(' ')]
        words = padding(max_words, words)

        # Every word vector is flattened to (1, word_dim) and the vectors are
        # concatenated into a 2-D (max_words, word_dim) matrix. Stacking the
        # (1, word_dim) vectors would create a 3-D tensor, on which the
        # transpose below raises.
        vectors = [
            embed_with_3_4_5_grams(self.text_emb_model, w).reshape(1, -1)
            for w in words
        ]
        return torch.cat(vectors, dim=0).t()

    def __getitem__(self, index):
        """Return ``(image, article, title, category, index)`` for one row."""
        row = self.dataset.iloc[index]
        path = row['imgFile']
        image = Image.open(self.basepath + '/' + path).convert('RGB')
        image = self.transform(image)

        title = self._embed_field(row['title'], 5)
        article = self._embed_field(row['text'], 25)
        category = self._embed_field(row['category'], 1)

        return image, article, title, category, index

    def __len__(self):
        return self.dataset.shape[0]

class TextMediaeval(data.Dataset):
    """Text-only dataset yielding embedded article, title and category.

    Each row must provide the columns ``title``, ``text`` and ``category``.

    Args:
        dataset: pandas DataFrame with the columns listed above.
        text_emb_model: embedding model passed to ``embed_with_3_4_5_grams``.
    """

    def __init__(self, dataset, text_emb_model):
        self.dataset = dataset
        self.text_emb_model = text_emb_model
        self.stemmer = Cistem()

    def _embed_field(self, text, max_words):
        """Return the ``(word_dim, max_words)`` embedding matrix of one field."""
        # Missing CSV cells arrive as NaN (a float); treat them as empty text
        # instead of failing on ``.split``.
        if not isinstance(text, str):
            text = '' if pd.isna(text) else str(text)

        words = [self.stemmer.stem(w) for w in text.split(' ')]
        words = padding(max_words, words)

        # Concatenate the flattened (1, word_dim) word vectors into a 2-D
        # matrix; stacking them would give a 3-D tensor that cannot be
        # transposed with ``.t()``.
        vectors = [
            embed_with_3_4_5_grams(self.text_emb_model, w).reshape(1, -1)
            for w in words
        ]
        return torch.cat(vectors, dim=0).t()

    def __getitem__(self, index):
        """Return ``(article, title, category)`` for one row."""
        row = self.dataset.iloc[index]

        title = self._embed_field(row['title'], 5)
        article = self._embed_field(row['text'], 25)
        category = self._embed_field(row['category'], 1)

        return article, title, category

    def __len__(self):
        return self.dataset.shape[0]

class ImageMediaeval(data.Dataset):
    """Every file found in the test image directory, as normalised tensors.

    Items follow the order returned by ``os.listdir``; ``self.dataset``
    holds the file names in that same order.
    """

    def __init__(self):
        self.basepath = '/path/to/image/test/'

        self.transform = transforms.Compose([
            transforms.Resize(250),
            transforms.CenterCrop(250),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])
        self.dataset = os.listdir(self.basepath)

    def __getitem__(self, index):
        """Return ``(image_tensor, index)`` for the file at ``index``."""
        image_path = self.basepath + self.dataset[index]
        picture = Image.open(image_path).convert('RGB')
        return self.transform(picture), index

    def __len__(self):
        return len(self.dataset)

#Validation

In [None]:
def validate(val_loader, model, text='', metrics_only=True):
    """Encode ``val_loader`` with ``model`` and run text-to-image retrieval.

    Args:
        val_loader: loader handed to ``encode_data``.
        model: object exposing ``val_start`` and the interface that
            ``encode_data`` requires.
        text: unused.
        metrics_only: when True return the result of
            ``t2i_article_metrics``, otherwise that of
            ``t2i_article_indexes``.
    """
    model.val_start()

    # Embeddings of every image and text in the loader.
    img_embs, cap_embs = encode_data(model, val_loader)

    evaluate = t2i_article_metrics if metrics_only else t2i_article_indexes
    return evaluate(img_embs, cap_embs)

def encode_data(model, data_loader, log_step=10, logging=print):
    """Encode all images and captions loadable by `data_loader`

    Args:
        model: object with ``val_start()`` and
            ``forward_emb(images, article, title, category)`` returning a
            pair of 2-D tensors.
        data_loader: iterable of ``(images, article, title, category, ids)``
            batches with a ``dataset`` attribute supporting ``len``.
        log_step: unused, kept for interface compatibility.
        logging: unused, kept for interface compatibility.

    Returns:
        ``(img_embs, cap_embs)``: numpy arrays with one row per dataset
        item, placed at the row given by the batch ``ids``. Both are
        ``None`` when the loader yields no batch.
    """
    # switch to evaluate mode
    model.val_start()

    # numpy array to keep all the embeddings
    img_embs = None
    cap_embs = None
    print("data_loader.dataset length:", len(data_loader.dataset))

    # Inference only: disabling autograd avoids building a graph per batch.
    with torch.no_grad():
        for images, article, title, category, ids in data_loader:
            # compute the embeddings
            img_emb, cap_emb = model.forward_emb(
                images, article, title, category)

            if img_embs is None:
                total = len(data_loader.dataset)
                img_embs = np.zeros((total, img_emb.size(1)))
                cap_embs = np.zeros((total, cap_emb.size(1)))

            # Index the numpy buffers with a numpy array rather than
            # relying on implicit tensor-to-index conversion.
            if torch.is_tensor(ids):
                rows = ids.cpu().numpy()
            else:
                rows = np.asarray(ids)

            # preserve the embeddings by copying from gpu to numpy
            img_embs[rows] = img_emb.detach().cpu().numpy()
            cap_embs[rows] = cap_emb.detach().cpu().numpy()

    return img_embs, cap_embs

In [None]:
def t2i_article_indexes(images, captions, npts=None, measure='cosine',
                        return_ranks=False, top_k=100):
    """
    Text->Images (Image Search)
    Images: (N, K) matrix of images
    Captions: (N, K) matrix of captions

    For each of the first ``npts`` captions, rank all images by dot product
    with the caption and keep the best ones.

    Args:
        images: 2-D numpy array, one image embedding per row.
        captions: 2-D numpy array, one caption embedding per row.
        npts: number of captions to process; defaults to all of them.
        measure: unused, kept for interface compatibility.
        return_ranks: unused, kept for interface compatibility.
        top_k: number of image indices kept per caption (default 100).

    Returns:
        List with one integer array per caption holding the image indices
        in order of decreasing score (at most ``top_k`` entries each).
    """
    if npts is None:
        npts = captions.shape[0]

    gallery_t = images.T
    total_index = []
    for index in range(npts):
        # Score a single query caption against every image.
        scores = np.dot(captions[index:index + 1], gallery_t)
        for row in scores:
            total_index.append(np.argsort(row)[::-1][:top_k])

    return total_index

def t2i_article_metrics(images, captions, npts=None, measure='cosine', return_ranks=False):
    """
    Text->Images (Image Search)
    Images: (N, K) matrix of images
    Captions: (N, K) matrix of captions

    Caption ``i`` is scored against every image by dot product; its rank is
    the 0-based position of image ``i`` in the descending score order.

    Returns:
        ``((r1, r5, r10, r100, r1000, medr, meanr), (ranks, top1))`` where
        the ``r*`` values are recall percentages, ``medr``/``meanr`` are the
        1-based median/mean rank, ``ranks`` holds the 0-based rank per
        caption and ``top1`` the best-scoring image index per caption.
        ``measure`` and ``return_ranks`` are not used.
    """
    if npts is None:
        npts = captions.shape[0]
    print(npts)

    ranks = np.zeros(npts)
    top1 = np.zeros(npts)
    gallery_t = images.T

    for index in range(npts):
        # One query at a time keeps the score matrix at a single row.
        scores = np.dot(captions[index:index + 1], gallery_t)
        for offset, row in enumerate(scores):
            order = np.argsort(row)[::-1]
            ranks[index + offset] = np.where(order == index)[0][0]
            top1[index + offset] = order[0]

    def recall_at(cutoff):
        # Percentage of queries whose match is ranked below `cutoff`.
        return 100.0 * int(np.count_nonzero(ranks < cutoff)) / len(ranks)

    r1 = recall_at(1)
    r5 = recall_at(5)
    r10 = recall_at(10)
    r100 = recall_at(100)
    r1000 = recall_at(1000)

    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1

    return (r1, r5, r10, r100, r1000, medr, meanr), (ranks, top1)

In [None]:
# Evaluate retrieval on the held-out split. NOTE: `model` and
# `test_data_loader` are created further down in this file (training section),
# so those cells have to be executed before this one.
(r1, r5, r10,r100, r1000,medr, meanr),(ranks,top1) = validate(test_data_loader,model)

In [None]:
# Report recall at each cutoff, rounded to two decimals.
for label, value in (("R@1", r1), ("R@5", r5), ("R@10", r10),
                     ("R@100", r100), ("R@1000", r1000)):
    print(label + ": " + str(round(value, 2)) + "%")

# Train

In [None]:
import pandas as pd

In [None]:
# Train / test splits as CSV files. The paths look like a mounted Google Drive
# (presumably from a Colab session) and must be adapted to the local setup.
#test = pd.read_csv('/content/drive/MyDrive/Code/PJDS/custom_MAXHAL/mediaeval_data.csv') #Full dataset
dataset_train = pd.read_csv('/content/drive/MyDrive/Code/PJDS/custom_MAXHAL/mediaeval_data_train_2.csv')
dataset_test = pd.read_csv('/content/drive/MyDrive/Code/PJDS/custom_MAXHAL/mediaeval_data_test_2.csv')

In [None]:
# Training split: augmented images, reshuffled every epoch.
datamodel_train = ArticleDataset(dataset_train, text_emb_model, split='train')
train_data_loader = data.DataLoader(
    datamodel_train, batch_size=128, shuffle=True, num_workers=4)

# Test split: deterministic transforms, fixed order.
datamodel_test = ArticleDataset(dataset_test, text_emb_model, split='test')
test_data_loader = data.DataLoader(
    datamodel_test, batch_size=300, shuffle=False, num_workers=4)

In [None]:
# Initial learning rate for Adam; the training loop below halves it every 4 epochs.
learning_rate = 0.002
# Builds both encoders, the loss and the optimizer.
model = MAXHAL(learning_rate)

=> using pre-trained model 'vgg19'


In [None]:
for epoch in range(30):
    print(f"training for epoch {epoch}")

    # Halve the learning rate at epochs 4, 8, 12, ... (never at epoch 0) and
    # push the new value into every optimizer parameter group.
    if epoch > 0 and epoch % 4 == 0:
        print("LEARNING RATE DECREASE!")
        learning_rate /= 2.0
        optimizer = model.optimizer
        for param_group in optimizer.param_groups:
            param_group['lr'] = learning_rate

    for i, train_data in enumerate(train_data_loader):
        model.train_start()
        model.train_emb(*train_data)

#Mediaeval Evaluation

In [None]:
dataset_text = pd.read_csv('path/to/test/set')

# The category is the '/'-separated URL piece at position len - 2, i.e. the
# one right before the last piece.
categories = [
    pieces[len(pieces) - 2]
    for pieces in (url.split('/') for url in dataset_text["url"])
]
dataset_text["category"] = categories

In [None]:
# All texts are served as one single batch (batch size = dataset size).
datamodel_text = TextMediaeval(dataset_text, text_emb_model)
text_data_loader = data.DataLoader(
    datamodel_text, batch_size=len(datamodel_text), shuffle=False,
    num_workers=2)

# Images are served in file-listing order, 100 per batch.
datamodel_image = ImageMediaeval()
image_data_loader = data.DataLoader(
    datamodel_image, batch_size=100, shuffle=False, num_workers=2)

In [None]:
# Embed every text of the evaluation set.
model.val_start()
# Follow the device the text encoder actually lives on instead of assuming CUDA.
txt_device = next(model.txt_enc.parameters()).device
text_batches = []
# Inference only: no autograd graph is needed.
with torch.no_grad():
  for i,train_data in enumerate(text_data_loader):
    text_batches.append(model.txt_enc(train_data[0].to(txt_device),
                                      train_data[1].to(txt_device),
                                      train_data[2].to(txt_device)))
# Keep every batch (previously only the last batch survived the loop).
text_embedding = torch.cat(text_batches, dim=0)

In [None]:
text_embedding = text_embedding.detach().cpu().numpy()

In [None]:
# Embed every evaluation image, batch by batch.
model.val_start()
# Follow the device the image encoder actually lives on instead of assuming CUDA.
img_device = next(model.img_enc.parameters()).device
img_emb_list = []
# Inference only: no autograd graph is needed.
with torch.no_grad():
  for i,train_data in enumerate(image_data_loader):
    img_embedding = model.img_enc(train_data[0].to(img_device))
    img_emb_list.append(img_embedding.detach().cpu().numpy())

In [None]:
# Stack all per-batch embedding arrays in one call instead of re-copying the
# growing result once per batch.
vec = np.vstack(img_emb_list)

In [None]:
vec.shape

In [None]:
img_embedding_full = vec

In [None]:
indexs_final = t2i_article_indexes(img_embedding_full, text_embedding)

In [None]:
len(np.unique(indexs_final))

In [None]:
indexs_final = np.array(indexs_final)

In [None]:
# Image ids are the file names with every '.jpg' occurrence removed; an array
# is used so that it can be indexed with the ranked index arrays.
img_list = np.array(
    [file_name.replace('.jpg', '') for file_name in datamodel_image.dataset]
)
dataset_text = dataset_text.astype({"articleID": int})

In [None]:
# One output row per article: the article id followed by its ranked image ids.
# The rows are collected first and stacked once; this also keeps the result
# two-dimensional (one row) when there is only a single article.
result_rows = [
    np.hstack((article_id, img_list[ranked]))
    for article_id, ranked in zip(dataset_text['articleID'], indexs_final)
]
csv_list = np.vstack(result_rows)
dd = pd.DataFrame(csv_list)
dd.to_csv('results.csv',index=False, sep ='\t')