# setup

In [34]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from efficientnet_pytorch import EfficientNet
from PIL import Image
from torchvision import transforms

from transformers import AutoModel, AutoTokenizer
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
import torch, gc

HOME = '/data/git/shopee-product-matching'
pdata = f'{HOME}/data/shopee-product-matching'
!ls $pdata

BS = 256
NWKRS = 8
DEVICE = 'cuda'
PIN_MEMORY = True
MAXLEN = 128

sample_submission.csv  test.csv  test_images  train.csv  train_images


# data

In [2]:
fnm = 'train'
df = pd.read_csv(f'{pdata}/{fnm}.csv')
assert len(df) == df.posting_id.nunique()
p_imgs = f"{pdata}/train_images"
df.head(2)

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045


# vision

In [5]:
class VDataset(Dataset):
    def __init__(self, df, p_imgs, transforms):
        self.df = df
        self.transforms = transforms
        
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        img_id = self.df.image[idx]
        img_path = f"{p_imgs}/{img_id}"
        img = Image.open(img_path)
        img = self.transforms(img)
        return img

# https://github.com/lukemelas/EfficientNet-PyTorch
tfms = transforms.Compose([
    transforms.Resize((224,224)), 
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),])

ds = VDataset(df, p_imgs, tfms)

dl = DataLoader(dataset=ds, 
                batch_size=BS,
                num_workers=NWKRS,
                pin_memory=PIN_MEMORY,
                shuffle=False)


device = torch.device(DEVICE)
mdlv = EfficientNet.from_pretrained('efficientnet-b0').to(device)
mdlv.eval()

feats = np.zeros((len(ds), 1280))
i = 0
for dat in tqdm(dl):
    with torch.no_grad():
        fts = mdlv.extract_features(dat.to(device)).mean(dim=(-1,-2))
    l = len(fts)
    feats[i:i+l,:] = fts.cpu().detach().numpy()

Loaded pretrained weights for efficientnet-b0


100%|██████████| 134/134 [01:56<00:00,  1.15it/s]


In [8]:
df.shape, feats.shape

((34250, 5), (34250, 1280))

In [22]:
del dl, ds, dat, fts, mdlv

NameError: name 'dl' is not defined

In [24]:
gc.collect()
torch.cuda.empty_cache()

In [25]:
vfeats = feats

# text

In [32]:
def mk_tensors(txt, tokenizer, maxlen):
    tok_res = tokenizer(
        txt, truncation=True, padding=True, max_length=maxlen
    )
    input_ids = tok_res["input_ids"]
    token_type_ids = tok_res["token_type_ids"]
    attention_mask = tok_res["attention_mask"]
    input_ids = torch.tensor(input_ids)
    token_type_ids = torch.tensor(token_type_ids)
    attention_mask = torch.tensor(attention_mask)
    return input_ids, attention_mask, token_type_ids


def mk_dl(tensors):
    input_ids, token_type_ids, attention_mask = tensors
    ds = TensorDataset(input_ids, attention_mask, token_type_ids)
    dl = DataLoader(dataset=ds, 
                batch_size=BS,
                num_workers=NWKRS,
                pin_memory=PIN_MEMORY,
                shuffle=False)
    return dl

In [29]:
# 'bert-base-multilingual-cased',
# 'xlm-roberta-base',
# 'pvl/labse_bert'
nm_mdl = 'pvl/labse_bert'
tokenizer = AutoTokenizer.from_pretrained(nm_mdl, do_lower_case=False)

tensors = mk_tensors(list(df.title.values), tokenizer, maxlen=MAXLEN)

In [30]:
[o.shape for o in tensors]

[torch.Size([34250, 128]), torch.Size([34250, 128]), torch.Size([34250, 128])]

In [35]:
dl = mk_dl(tensors)

In [38]:
mdlt = AutoModel.from_pretrained(nm_mdl).to(device)
mdlt.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(501153, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [45]:
feats = np.zeros((len(df), 768))
i = 0
for dat in tqdm(dl):
    with torch.no_grad():
        dat = (o.to(device) for o in dat)
        output = mdlt(*dat)
        fts = output.last_hidden_state
        fts = fts.mean(dim=-2)
        l = len(fts)
        feats[i:i+l,:] = fts.cpu().detach().numpy()

100%|██████████| 134/134 [01:13<00:00,  1.82it/s]


In [51]:
del dl, dat, fts, mdlt

NameError: name 'dl' is not defined

In [49]:
gc.collect()
torch.cuda.empty_cache()

In [54]:
df.shape, feats.shape

((34250, 5), (34250, 768))

In [50]:
tfeats = feats