# setup

In [51]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from efficientnet_pytorch import EfficientNet
from PIL import Image
from torchvision import transforms


from transformers import AutoModel, AutoTokenizer
import torch

In [20]:
HOME = '/data/git/shopee-product-matching'
pdata = f'{HOME}/data/shopee-product-matching'
!ls $pdata

sample_submission.csv  test.csv  test_images  train.csv  train_images


# data

In [21]:
fnm = 'train'
df = pd.read_csv(f'{pdata}/{fnm}.csv')
assert len(df) == df.posting_id.nunique()

df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069


# vision

In [57]:
mdlv = EfficientNet.from_pretrained('efficientnet-b0')

# https://github.com/lukemelas/EfficientNet-PyTorch
tfms = transforms.Compose([transforms.Resize(224), transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),])

imgfn = df.image.iloc[0]
img = tfms(Image.open(f'{pdata}/train_images/{imgfn}')).unsqueeze(0)
print(img.shape) # torch.Size([1, 3, 224, 224])
features = mdlv.extract_features(img)
print(features.shape) # torch.Size([1, 1280, 7, 7])


Loaded pretrained weights for efficientnet-b0
torch.Size([1, 3, 224, 224])
torch.Size([1, 1280, 7, 7])


In [58]:
vecv = features.mean(dim=(-1,-2))
vecv.shape

torch.Size([1, 1280])

In [59]:
def mkvec_v(imgfn, mdlv, tfms):
    img = tfms(Image.open(f'{pdata}/train_images/{imgfn}')).unsqueeze(0)
    features = mdlv.extract_features(img)
    return features.mean(dim=(-1,-2))

In [60]:
mkvec_v(imgfn, mdlv, tfms)

tensor([[0.1484, 0.1114, 0.1691,  ..., 0.2296, 0.0968, 0.1339]],
       grad_fn=<MeanBackward1>)

In [None]:
vecs_v = []
for imgfn in tqdm(df.image):
    vecs_v.append(mkvec_v(imgfn, mdlv, tfms))

  3%|▎         | 1027/34250 [05:26<21:47:16,  2.36s/it]

# text

In [44]:
# 'bert-base-multilingual-cased',
# 'xlm-roberta-base',
# 'pvl/labse_bert'
nm_mdl = 'pvl/labse_bert'
mdlt = AutoModel.from_pretrained(nm_mdl)
tokenizer = AutoTokenizer.from_pretrained(nm_mdl, do_lower_case=False)

def mk_tensors(txt, tokenizer, maxlen):
    tok_res = tokenizer(
        txt, truncation=True, padding=True, max_length=maxlen
    )
    input_ids = tok_res["input_ids"]
    token_type_ids = tok_res["token_type_ids"]
    attention_mask = tok_res["attention_mask"]
    input_ids = torch.tensor(input_ids)
    token_type_ids = torch.tensor(token_type_ids)
    attention_mask = torch.tensor(attention_mask)
    return input_ids, attention_mask, token_type_ids

txt = df.title.iloc[0]
input_ids, attention_mask, token_type_ids = mk_tensors([txt], tokenizer, 32)
output = mdlt(input_ids, attention_mask, token_type_ids, output_hidden_states=True)

In [45]:
output.keys()

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])

In [48]:
embt = output.last_hidden_state.mean(dim=(-2))
embt.shape

torch.Size([1, 768])

https://huggingface.co/transformers/model_doc/bert.html#bertmodel

    last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)) – Sequence of hidden-states at the output of the last layer of the model.

    pooler_output (torch.FloatTensor of shape (batch_size, hidden_size)) – Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during pretraining.

https://github.com/huggingface/transformers/issues/1328#issuecomment-534956703

    Both would probably work, but I agree that streamlining is a good idea. In their paper, BERT gets the best results by concatenating the last four layers, so what I always use is something like this (from the top of my head):

    outputs = self.bert(input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        position_ids=position_ids, 
                        head_mask=head_mask)

    hidden_states = outputs[1]
    pooled_output = torch.cat(tuple([hidden_states[i] for i in [-4, -3, -2, -1]]), dim=-1)
    pooled_output = pooled_output[:, 0, :]
    pooled_output = self.dropout(pooled_output)
    # classifier of course has to be 4 * hidden_dim, because we concat 4 layers
    logits = self.classifier(pooled_output)

In [49]:
def mkvec_t(txt, mdlt, tokenizer):
    input_ids, attention_mask, token_type_ids = mk_tensors([txt], tokenizer, 32)
    output = mdlt(input_ids, attention_mask, token_type_ids, output_hidden_states=True)
    return output.last_hidden_state.mean(dim=(-2))

In [None]:
vecs_t = []
for txt in tqdm(df.title):
    vecs_t.append(mkvec_t(txt, mdlt, tokenizer))

# similarity