In [1]:
# import sys
# !cp ../input/rapids/rapids.0.18.0 /opt/conda/envs/rapids.tar.gz
# !cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
# sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
# sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# !cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [2]:
import os
from shutil import copyfile, copytree

# Source directory holding the bundled pretrained artefacts, and the two cache
# locations they are copied into.
data_dir = '../input/pretrained/'
cache_dir ='/root/.cache/torch/hub/checkpoints/'
hugging_path = '/root/.cache/huggingface/'

# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(cache_dir, exist_ok=True)

for fname in os.listdir(data_dir):
    src = os.path.join(data_dir, fname)

    if 'transform' in fname:
        # Entries whose name contains 'transform' are copied as directory trees
        # under hugging_path.
        dest = os.path.join(hugging_path, fname)
        # copytree raises FileExistsError when the target already exists, so
        # skip trees copied by an earlier run of this cell.
        if not os.path.exists(dest):
            copytree(src, dest)
    else:
        # Everything else is copied as a single file into cache_dir
        # (copyfile overwrites, so this branch is already idempotent).
        dest = os.path.join(cache_dir, fname)
        copyfile(src, dest)

In [3]:
# Install efficientnet_pytorch 0.7.0 and timm 0.4.8 from wheel files stored
# under ../input/effnetpytrochwhl (local paths, not package-index names).
!pip install ../input/effnetpytrochwhl/efficientnet_pytorch-0.7.0-py3-none-any.whl
!pip install ../input/effnetpytrochwhl/timm-0.4.8-py3-none-any.whl

Processing /kaggle/input/effnetpytrochwhl/efficientnet_pytorch-0.7.0-py3-none-any.whl
Installing collected packages: efficientnet-pytorch
Successfully installed efficientnet-pytorch-0.7.0
Processing /kaggle/input/effnetpytrochwhl/timm-0.4.8-py3-none-any.whl
Installing collected packages: timm
Successfully installed timm-0.4.8


In [4]:
import random, re, string, gc, math, os

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import cv2
import unicodedata


import sklearn

from tqdm.notebook import tqdm


import  cuml, cupy, cudf
from cuml import DBSCAN
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
from cuml.experimental.preprocessing import normalize


from PIL import Image

import torch
# torch.manual_seed(1010)


import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader


import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

import transformers


import timm
from efficientnet_pytorch import EfficientNet


import nltk
from nltk.corpus import stopwords

In [5]:
# ---------------------------------------------------------------------------
# Global inference settings
# ---------------------------------------------------------------------------
IMG_SIZE = 512      # images are resized to IMG_SIZE x IMG_SIZE
N_WORKERS = 4       # DataLoader worker processes
BATCH_SIZE = 12     # batch size used by both the image and the text loaders
SEED = 24           # seed handed to seed_torch() in run()
COMB_RATE = 0.1     # weight of the text embeddings when mixing with image embeddings

# ---------------------------------------------------------------------------
# Model configurations.
# Each entry has a checkpoint 'path' and the keyword arguments ('params')
# passed to the model class constructor by get_model().
# ---------------------------------------------------------------------------

# NOTE(review): the checkpoint filename contains "m03" while margin is 0.35 —
# confirm which value was used for training. ArcFace only applies the margin
# when a label is passed, so label-free inference is unaffected.
NFNET_CONFIG = dict(
    path='../input/checkpoints/l0_fc512_s12_m03_epoch9.pth',
    params=dict(
        model_name='eca_nfnet_l0',
        feature_space=512,
        out_features=11014,
        scale=12.0,
        margin=0.35,
    ),
)

EFFNET_CONFIG = dict(
    path='../input/checkpoints/b0_fc512_s12_m03_epoch9.pth',
    params=dict(
        model_name='efficientnet-b0',
        feature_space=512,
        out_features=11014,
        scale=12.0,
        margin=0.3,
    ),
)

STSB_CONFIG = dict(
    path='../input/checkpoints/stsb_fc512_s24.0_m0.35_epoch2.pth',
    params=dict(
        model_name='../input/stsb-xlm-r-multilingual/0_Transformer',
        feature_space=512,
        out_features=11014,
        scale=24.0,
        margin=0.35,
    ),
)

PHRASE_CONFIG = dict(
    path='../input/checkpoints/paraphrase_fc512_s12.0_m0.3_epoch2.pth',
    params=dict(
        model_name='../input/sentence-transformer-models/paraphrase-xlm-r-multilingual-v1/0_Transformer',
        feature_space=512,
        out_features=11014,
        scale=12.0,
        margin=0.3,
    ),
)


# **Utils**

In [6]:
def combine_for_sub(row):
    """Build the submission string for one row.

    Concatenates the row's ``comb_preds``, ``tf_preds`` and ``phash``
    sequences, removes duplicates (``np.unique`` also sorts the values) and
    joins what is left with single spaces.
    """
    candidate_lists = (row.comb_preds, row.tf_preds, row.phash)
    merged = np.concatenate(candidate_lists)
    distinct_ids = np.unique(merged)
    return " ".join(distinct_ids)

In [7]:
def seed_torch(seed=42):
    """Seed every random number generator used by the notebook.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy and PyTorch
            (CPU and all CUDA devices).
    """
    random.seed(seed)
    # Only interpreters started after this point pick up PYTHONHASHSEED;
    # the hash seed of the running process is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different (non-deterministic) kernels per run.
    torch.backends.cudnn.benchmark = False
    


In [8]:
# Extra filler words removed from titles on top of the NLTK English and
# Indonesian stop-word lists.
_EXTRA_TITLE_STOPWORDS = (
    # Sales
    'free', 'gift', 'give', 'get', 'ready', 'stock', 'stocks', 'stok',
    'ori', 'original', 'official', 'new', 'latest',
    'import', 'low', 'price', 'cheap', 'vip', 'discount', 'warranty',
    'promo', 'promotion', 'buy', 'buyer', 'shop', 'shopper', 'shopping',
    'bigsale', 'big', 'sale', 'sell', 'seller', 'resell', 'reseller',
    'all', 'any', 'full', 'include', 'includes', 'inclusive', 'tax',

    # Units
    'pieces', 'piece', 'pcs', 'pc', 'box', 'boxes', 'pack', 'packs', 'packet', 'packets', 'paket', 'package',
    'set', 'sets', 'size', 'roll', 'rolls', 'sachet', 'sachets',

    # Dimensions
    'ml', 'l', 'litre', 'liter', 'g', 'gr', 'gram', 'kg', 'kilo', 'kilogram',
    'mm', 'cm', 'm', 'meter', 'metre', 'yard', 'inch', 'x',

    # Miscellaneous alphabets
    'c', 'xe', 'f', 'b', 'v', 'xa',

    # Location words:
    'shopee', 'indonesia', 'indonesian', 'indo', 'id', 'jakarta', 'local', 'lokal',

    # English descriptors:
    'fashion', 'colour', 'color', 'design',
    'plus', 'pro', 'mini', 'premium', 'pro', 'super', 'extra', 'big', 'small',

    # Indonesian descriptors:
    'bpom', 'muat', 'cod', 'murah', 'isi', 'warna', 'pajak', 'garansi', 'beli', 'gratis',
    'terbaru', 'harga', 'resmi',
)

# Translation table deleting every character of string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Stop-word set, built lazily on the first clean_title() call and then reused.
_TITLE_STOPWORDS = None


def _title_stopwords():
    """Return the cached stop-word set, building it on first use."""
    global _TITLE_STOPWORDS
    if _TITLE_STOPWORDS is None:
        _TITLE_STOPWORDS = frozenset(
            stopwords.words('english')
            + stopwords.words('indonesian')
            + list(_EXTRA_TITLE_STOPWORDS)
        )
    return _TITLE_STOPWORDS


def clean_title(row):
    """Normalise a product title for TF-IDF vectorisation.

    Args:
        row: Object exposing a string ``title`` attribute (a DataFrame row
            when used with ``df.apply(clean_title, axis=1)``).

    Returns:
        The lower-cased title with backslash escapes decoded, acute accents
        and punctuation removed, digit runs split into their own tokens and
        stop words dropped; tokens are joined by single spaces.
    """
    title = row.title

    # Turn literal backslash escapes in the text (e.g. the six characters
    # "\xc3\xa9") into the characters they encode. Malformed escapes or
    # escapes outside Latin-1 make this round-trip raise, in which case the
    # raw title is kept instead of aborting the whole run.
    try:
        title = title.encode().decode('unicode_escape').encode('latin1').decode()
    except UnicodeError:
        pass

    # Decompose accented characters, then drop the combining acute accent.
    title = unicodedata.normalize('NFD', title)
    title = title.replace('\N{COMBINING ACUTE ACCENT}', '')
    title = title.lower()

    # '-', '+', '|' and '\' are deleted outright (joining the neighbours),
    # '&' becomes the word "and", every other non-word character a space.
    title = re.sub(r"[-+|\\]", "", title)
    title = re.sub(r"&", "and", title)
    title = re.sub(r"\W", " ", title)

    # Remove punctuation that is still present. The previous loop used the raw
    # string r"f{p}" (not an f-string), so it never matched and '_' — the one
    # punctuation character that \W does not cover — was left in place.
    title = title.translate(_PUNCTUATION_TABLE)

    title = re.sub(r"\s+", " ", title)

    # Surround digit runs with spaces so that "500ml" yields "500" and "ml".
    title = re.sub(r"\d+", lambda x: f" {x.group(0)} ", title)

    stop = _title_stopwords()
    return " ".join(token for token in title.split() if token not in stop)

# **ShopeeNet**

In [9]:
class ShopeeDataset(Dataset):
    """Image dataset backed by a DataFrame.

    Each row must provide an ``image`` file path and, in ``'train'`` mode, a
    ``label_group`` value.

    Args:
        df: Source DataFrame; its index is reset so positions 0..len-1 work.
        mode: ``'test'`` (items are images) or ``'train'`` (items are
            ``(image, label)`` pairs).
        transform: Callable invoked as ``transform(image=img)`` that returns a
            mapping with an ``'image'`` entry.

    Raises:
        ValueError: If ``mode`` is neither ``'test'`` nor ``'train'``.
    """

    _MODES = ('test', 'train')

    def __init__(self, df, mode, transform):
        # Fail early: an unknown mode used to make __getitem__ return None.
        if mode not in self._MODES:
            raise ValueError(f"mode must be one of {self._MODES}, got {mode!r}")
        self.df = df.reset_index(drop=True)
        self.mode = mode
        self.transform = transform

    def __getitem__(self, idx):
        row = self.df.loc[idx]
        img = cv2.imread(row.image)
        # cv2.imread signals failure by returning None instead of raising.
        if img is None:
            raise OSError(f"Could not read image file: {row.image}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = self.transform(image=img)['image']

        if self.mode == 'test':
            return img
        return img, torch.tensor(row.label_group).float()

    def __len__(self):
        return len(self.df)
    
class ShopeeTextDataset(Dataset):
    """Title dataset that tokenises one row at a time.

    Args:
        df: DataFrame with a ``title`` column (and ``label_group`` in
            ``'train'`` mode); its index is reset.
        mode: ``'test'`` (items are ``(input_ids, attention_mask)``) or
            ``'train'`` (items are ``((input_ids, attention_mask), label)``).
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Sequence length passed to the tokenizer (default 128).

    Raises:
        ValueError: If ``mode`` is neither ``'test'`` nor ``'train'``.
    """

    _MODES = ('test', 'train')

    def __init__(self, df, mode, tokenizer, max_length=128):
        # Fail early: an unknown mode used to make __getitem__ return None.
        if mode not in self._MODES:
            raise ValueError(f"mode must be one of {self._MODES}, got {mode!r}")
        self.df = df.reset_index(drop=True)
        self.mode = mode
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        row = self.df.loc[idx]
        text = row.title

        text = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # The tokenizer returns a batch of one; take that single sequence.
        input_ids = text['input_ids'][0]
        attention_mask = text['attention_mask'][0]

        if self.mode == 'test':
            return (input_ids, attention_mask)
        return (input_ids, attention_mask), torch.tensor(row.label_group).float()

    def __len__(self):
        return len(self.df)

In [10]:
class ArcFace(nn.Module):
    """Additive angular margin head.

    Without a label, ``forward`` returns the cosine similarity between the
    L2-normalised input and each L2-normalised row of ``weight``. With a
    label, ``margin`` is added to the angle of the labelled class (unless the
    angle already exceeds ``pi - margin``) and the cosines are multiplied by
    ``scale``.
    """

    def __init__(self, in_features, out_features, scale=30.0, margin=0.5, eps=1e-6):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        # Angles above this value would pass pi once the margin is added.
        self.threshold = math.pi - margin
        self.eps = eps

        class_weights = torch.FloatTensor(out_features, in_features)
        self.weight = nn.Parameter(class_weights)
        nn.init.xavier_uniform_(self.weight)

    def forward(self, input, label=None):
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        if label is None:
            return cosine

        # Clamp away from +/-1 before taking acos.
        lower, upper = -1.0 + self.eps, 1.0 - self.eps
        angle = torch.acos(cosine.clamp(lower, upper))

        # 1 at each sample's labelled class, 0 elsewhere.
        is_target = torch.zeros_like(cosine)
        is_target.scatter_(1, label.view(-1, 1).long(), 1)

        # Skip the margin where the angle is already past the threshold.
        too_wide = angle > self.threshold
        gets_margin = is_target.masked_fill(too_wide, 0).bool()

        shifted = torch.where(gets_margin, angle + self.margin, angle)
        return torch.cos(shifted) * self.scale
        

class ShopeeNet(nn.Module):
    """Image model: CNN backbone -> dropout -> linear -> batch norm -> ArcFace.

    Args:
        model_name: Backbone identifier. Names containing ``'efficientnet'``
            are built with ``EfficientNet.from_pretrained``; names containing
            ``'nfnet'`` with ``timm.create_model(..., pretrained=True)``.
        feature_space: Output size of the linear projection.
        out_features: Number of ArcFace classes.
        scale: ArcFace scale.
        margin: ArcFace angular margin.

    Raises:
        ValueError: If ``model_name`` matches neither supported family.
    """

    def __init__(self, model_name, feature_space, out_features, scale, margin):
        super(ShopeeNet, self).__init__()
        self.feature_space = feature_space
        self.out_features = out_features

        if 'efficientnet' in model_name:
            self.backbone = EfficientNet.from_pretrained(model_name)
            self.in_features = self.backbone._conv_head.out_channels
            self.timm = False

        elif 'nfnet' in model_name:
            self.backbone = timm.create_model(model_name, pretrained=True, num_classes=0)
            self.in_features = self.backbone.final_conv.out_channels
            self.timm = True

        else:
            # Previously this fell through and failed later with an
            # AttributeError on the missing `in_features`.
            raise ValueError(
                f"Unsupported model_name {model_name!r}: expected a name "
                "containing 'efficientnet' or 'nfnet'"
            )

        self.dropout = nn.Dropout(p=0.2)
        self.classifier = nn.Linear(self.in_features, self.feature_space)
        self.bn = nn.BatchNorm1d(self.feature_space)

        self.margin = ArcFace(
            in_features = self.feature_space,
            out_features = self.out_features,
            scale = scale,
            margin = margin
        )

        # A freshly constructed nn.Module is always in training mode, so the
        # former `if self.training` guard was always true.
        self._init_params()

    def _init_params(self):
        """Initialise the projection layer and the batch-norm affine terms."""
        nn.init.xavier_normal_(self.classifier.weight)
        nn.init.constant_(self.classifier.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, img, label=None):
        """Return ArcFace outputs; ``label`` is only used in training mode."""
        features = self.extract_features(img)

        if self.training:
            return self.margin(features, label)
        return self.margin(features)

    def extract_features(self, x):
        """Run backbone, dropout, projection and batch norm on a batch ``x``."""
        batch_size = x.shape[0]

        if self.timm:
            x = self.backbone(x).view(batch_size, -1)

        else:
            x = self.backbone.extract_features(x)
            x = self.backbone._avg_pooling(x).view(batch_size, -1)

        x = self.dropout(x)
        x = self.classifier(x)
        x = self.bn(x)

        return x
    

class ShopeeBert(nn.Module):
    """Text model: transformer -> dropout -> linear -> batch norm -> ArcFace.

    ``forward`` L2-normalises the ArcFace output before returning it. The
    tokenizer loaded from the same ``model_name`` is exposed as
    ``self.tokenizer``.
    """

    def __init__(self, model_name, feature_space, out_features, scale, margin):
        super().__init__()
        self.feature_space = feature_space
        self.out_features = out_features

        self.transformer = transformers.AutoModel.from_pretrained(model_name)
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        self.in_features = self.transformer.config.hidden_size

        self.dropout = nn.Dropout(p=0.2)
        self.classifier = nn.Linear(self.in_features, self.feature_space)
        self.bn = nn.BatchNorm1d(self.feature_space)

        self.margin = ArcFace(
            in_features=self.feature_space,
            out_features=self.out_features,
            scale=scale,
            margin=margin,
        )

        if self.training:
            self._init_params()

    def _init_params(self):
        """Initialise the projection layer and the batch-norm affine terms."""
        nn.init.xavier_normal_(self.classifier.weight)
        nn.init.constant_(self.classifier.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, input_ids, attention_mask, label=None):
        """Return the L2-normalised ArcFace output for a tokenised batch.

        ``label`` is forwarded to the ArcFace head only in training mode.
        """
        embedding = self.extract_text_feat(input_ids, attention_mask)
        head_args = (embedding, label) if self.training else (embedding,)
        return F.normalize(self.margin(*head_args))

    def extract_text_feat(self, input_ids, attention_mask):
        """Project the first-token hidden state into the feature space."""
        outputs = self.transformer(input_ids, attention_mask)

        # outputs[0] is indexed as [batch, token, hidden]; keep token 0.
        first_token_state = outputs[0][:, 0, :]

        projected = self.classifier(self.dropout(first_token_state))
        return self.bn(projected)

In [11]:
class EnsembleNet(nn.Module):
    """Average the outputs of two ShopeeNet models.

    ``m1`` and ``m2`` are config dicts with a ``'params'`` entry (constructor
    kwargs) and a ``'path'`` entry (checkpoint file); both models are built
    through ``get_model``.
    """

    def __init__(self, m1, m2):
        super().__init__()

        first_cfg, second_cfg = m1, m2
        self.m1 = get_model(ShopeeNet, first_cfg['params'], first_cfg['path'])
        self.m2 = get_model(ShopeeNet, second_cfg['params'], second_cfg['path'])

    def forward(self, img, label=None):
        out_first = self.m1(img, label)
        out_second = self.m2(img, label)

        averaged = (out_first + out_second) / 2
        return averaged
    

class EnsembleBert(nn.Module):
    """Average the outputs of two ShopeeBert models.

    ``m1`` and ``m2`` are config dicts with ``'params'`` and ``'path'``
    entries; both models are built through ``get_model``. The tokenizer of
    the first model is exposed as ``self.tokenizer``.
    """

    def __init__(self, m1, m2):
        super().__init__()

        first_cfg, second_cfg = m1, m2
        self.m1 = get_model(ShopeeBert, first_cfg['params'], first_cfg['path'])
        self.m2 = get_model(ShopeeBert, second_cfg['params'], second_cfg['path'])
        self.tokenizer = self.m1.tokenizer

    def forward(self, input_ids, attention_mask, label=None):
        out_first = self.m1(input_ids, attention_mask, label)
        out_second = self.m2(input_ids, attention_mask, label)

        averaged = (out_first + out_second) / 2
        return averaged

In [12]:
def get_model(model_class, model_params, model_path, device='cuda'):
    """Build a model, load its checkpoint and return it in eval mode.

    Args:
        model_class: Callable invoked as ``model_class(**model_params)``.
        model_params: Keyword arguments for ``model_class``.
        model_path: Path of a file holding the model's ``state_dict``.
        device: Device the model is moved to (default ``'cuda'``).

    Returns:
        The model on ``device`` with ``eval()`` applied.
    """
    model = model_class(**model_params)
    # Load onto the CPU first: this works regardless of the device the
    # checkpoint was saved from and avoids holding a second copy of the
    # weights on the GPU while they are copied into the model.
    state_dict = torch.load(model_path, map_location='cpu')
    model.load_state_dict(state_dict)
    model.to(device)

    model.eval()
    return model

# **Engine**

In [13]:
def get_image_embeddings(df, mode):
    """Compute image embeddings for every row of ``df`` on the GPU.

    Args:
        df: DataFrame handed to ``ShopeeDataset`` (needs an ``image`` column).
        mode: Forwarded to ``ShopeeDataset``. The loop treats every batch as
            an image tensor, so presumably only ``'test'`` is usable here —
            TODO confirm.

    Returns:
        A float16 CuPy array with one row-normalised row per input row.
    """
    model = EnsembleNet(NFNET_CONFIG, EFFNET_CONFIG)

    image_transforms = A.Compose([
        A.Resize(IMG_SIZE, IMG_SIZE),
        A.Normalize(),
        ToTensorV2()
    ])

    image_dataset = ShopeeDataset(df, mode, image_transforms)

    image_loader = torch.utils.data.DataLoader(
        image_dataset,
        batch_size = BATCH_SIZE,
        shuffle = False,
        pin_memory = True,
        drop_last = False,
        num_workers = N_WORKERS
    )

    image_embeddings = []

    with torch.no_grad():
        for img in tqdm(image_loader):
            img = img.cuda()
            feat = model(img)
            feat = feat.reshape(feat.shape[0], feat.shape[1])
            embed = cupy.asarray(feat)
            # astype() makes a float16 copy, so nothing kept in the list
            # refers to the torch tensor any more.
            embed = normalize(embed, copy=False).astype(dtype=cupy.float16)
            image_embeddings.append(embed)
            # Drop the batch temporaries here rather than after the loop:
            # a trailing `del` fails when the loader yields no batch.
            del img, feat, embed

    del model
    gc.collect()
    # Return PyTorch's cached GPU blocks to the driver so the RAPIDS steps
    # that follow can allocate them.
    torch.cuda.empty_cache()

    image_embeddings = cupy.concatenate(image_embeddings)
    print(f'Image embeddings shape is {image_embeddings.shape}')
    return image_embeddings

In [14]:
def get_bert_embeddings(df, mode):
    """Compute title embeddings for every row of ``df`` on the GPU.

    Args:
        df: DataFrame handed to ``ShopeeTextDataset`` (needs a ``title``
            column).
        mode: Forwarded to ``ShopeeTextDataset``. The loop unpacks every batch
            as ``(input_ids, attention_mask)``, so presumably only ``'test'``
            is usable here — TODO confirm.

    Returns:
        A float16 CuPy array with one row per input row.
    """
    model = EnsembleBert(PHRASE_CONFIG, STSB_CONFIG)

    text_dataset = ShopeeTextDataset(df, mode, tokenizer=model.tokenizer)

    text_loader = torch.utils.data.DataLoader(
        text_dataset,
        batch_size = BATCH_SIZE,
        shuffle = False,
        pin_memory = True,
        drop_last = False,
        num_workers = N_WORKERS
    )

    bert_embeddings = []

    with torch.no_grad():
        for input_ids, attention_mask in tqdm(text_loader):
            input_ids, attention_mask = input_ids.cuda(), attention_mask.cuda()
            feat = model(input_ids, attention_mask)

            # astype() makes a float16 copy, so nothing kept in the list
            # refers to the torch tensor any more.
            embed = cupy.asarray(feat).astype(dtype=cupy.float16)
            bert_embeddings.append(embed)
            # Drop the batch temporaries here rather than after the loop:
            # a trailing `del` fails when the loader yields no batch.
            del input_ids, attention_mask, feat, embed

    del model
    gc.collect()
    # Return PyTorch's cached GPU blocks to the driver so the RAPIDS steps
    # that follow can allocate them.
    torch.cuda.empty_cache()

    bert_embeddings = cupy.concatenate(bert_embeddings)
    print(f'Bert embeddings shape is {bert_embeddings.shape}')
    return bert_embeddings

In [15]:
def get_tf_embeddings(df, max_features=25_000, ngram_range=(1,1), min_df=1):
    """Vectorise cleaned titles with a binary TF-IDF model.

    Every row of ``df`` is passed through ``clean_title``; the resulting
    strings are fitted and transformed by ``TfidfVectorizer`` and returned as
    a dense float16 array.
    """
    print('Generating text embeddings')

    cleaned_titles = df.apply(clean_title, axis=1)
    gpu_titles = cudf.Series.from_pandas(cleaned_titles)

    vectorizer = TfidfVectorizer(
        stop_words=None,
        binary=True,
        max_features=max_features,
        ngram_range=ngram_range,
        min_df=min_df,
    )
    tfidf_matrix = vectorizer.fit_transform(gpu_titles)
    text_embeddings = tfidf_matrix.toarray().astype(dtype=cupy.float16)

    print(f'Text embeddins shape: {text_embeddings.shape}')

    return text_embeddings

In [16]:
def get_clust_preds(df, embeds):
    """Group rows with DBSCAN and return each row's cluster members.

    ``eps`` starts at 0.1 and grows in steps of 0.05 for as long as the number
    of distinct labels (the noise label -1 included) keeps increasing; the
    labels of the best fit are used for the predictions.

    Args:
        df: DataFrame with a ``posting_id`` column, row-aligned with ``embeds``.
        embeds: CuPy array of embeddings, one row per ``df`` row.

    Returns:
        A ``pd.Series`` (default integer index) of length ``len(df)``. Rows
        labelled as noise hold a one-element list with their own posting id;
        all other rows hold the array of posting ids of their cluster.
    """
    print('Finding best eps')
    # The float32 cast does not depend on eps, so do it once.
    features = embeds.astype(dtype=cupy.float32)

    max_clusters = 0
    best_eps = -1000
    best_labels = None
    eps = 0.1
    while True:
        clusterer = DBSCAN(eps=eps, min_samples=2, max_mbytes_per_batch=4000)
        clusterer.fit(features)
        n_clusters = len(cupy.unique(clusterer.labels_))
        if n_clusters > max_clusters:
            max_clusters = n_clusters
            best_eps = eps
            # Keep the labels of the best fit instead of refitting later.
            best_labels = clusterer.labels_
            eps += 0.05
        else:
            break

    print(f'Making prediction for eps:{best_eps} - n_clusters:{max_clusters}')

    # One device-to-host copy; the grouping below then runs on the host.
    labels = cupy.asnumpy(cupy.asarray(best_labels))
    posting_ids = df.posting_id.values

    # An object array stores each list/array as a single element, independent
    # of how pandas treats list-like values in element assignment.
    preds = np.empty(len(df), dtype=object)

    for cluster in tqdm(np.unique(labels)):
        idxs = np.where(labels == cluster)[0]
        if cluster == -1:
            # Noise points only match themselves.
            for idx in idxs:
                preds[idx] = [posting_ids[idx]]
        else:
            members = posting_ids[idxs]
            for idx in idxs:
                preds[idx] = members

    return pd.Series(preds)

In [17]:
def get_knn_preds(df, embeddings, KNN=50, th = 0.54):
    """Return, for every row, the posting ids of its close nearest neighbours.

    Args:
        df: DataFrame with a ``posting_id`` column, row-aligned with
            ``embeddings``.
        embeddings: Array of embeddings, one row per ``df`` row.
        KNN: Number of neighbours to query; capped at ``len(df)``.
        th: Neighbours at a distance below this value are kept.

    Returns:
        A list with one array of posting ids per row of ``df``.
    """
    n_samples = len(df)

    # Never ask for more neighbours than there are samples. The previous
    # special case only covered len(df) <= 3.
    KNN = min(KNN, n_samples)

    model = NearestNeighbors(n_neighbors = KNN)
    model.fit(embeddings)

    CHUNK = 1024*4
    posting_ids = df.posting_id.values

    predictions = []

    # Query in chunks of CHUNK rows.
    for left in tqdm(range(0, n_samples, CHUNK)):
        right = min(left + CHUNK, n_samples)

        distances, indices = model.kneighbors(embeddings[left:right])

        # Copy each chunk to the host once instead of slicing row by row on
        # the device.
        distances = cupy.asnumpy(distances)
        indices = cupy.asnumpy(indices)

        for dist, idx in zip(distances, indices):
            predictions.append(posting_ids[idx[dist < th]])

    # No `del` of loop variables: they are unbound when df is empty.
    del model
    gc.collect()
    return predictions

In [18]:
def run():
    """Run inference on the test set and write ``submission.csv``.

    Steps: seed the RNGs, load ``test.csv``, group postings by identical
    ``image_phash``, cluster a weighted mix of text and image embeddings,
    add TF-IDF nearest-neighbour matches, and merge the three match lists
    into the ``matches`` column.
    """
    seed_torch(SEED)

    # --- Data --------------------------------------------------------------
    root = '../input/shopee-product-matching/'
    mode = 'test'

    df = pd.read_csv(root + 'test.csv')
    df['image'] = root + 'test_images/' + df['image']

    # Postings sharing a perceptual hash are matched with each other.
    phash_groups = df.groupby('image_phash').posting_id.unique().to_dict()
    df['phash'] = df.image_phash.map(phash_groups)

    # --- Embeddings --------------------------------------------------------
    bert_embs = get_bert_embeddings(df, mode)
    img_embs = get_image_embeddings(df, mode)

    comb_embs = (COMB_RATE * bert_embs + (1 - COMB_RATE) * img_embs) / 2

    # Release the individual embeddings before clustering.
    del bert_embs, img_embs
    gc.collect()

    # --- Predictions -------------------------------------------------------
    df['comb_preds'] = get_clust_preds(df, comb_embs)

    del comb_embs
    gc.collect()

    min_df = 1 if len(df) <= 3 else 2

    tf_embs = get_tf_embeddings(df, max_features=None, min_df=min_df)
    df['tf_preds'] = get_knn_preds(df, tf_embs, th=0.7)

    # --- Submission --------------------------------------------------------
    df['matches'] = df.apply(combine_for_sub, axis = 1)
    submission = df[['posting_id', 'matches']]
    submission.to_csv('submission.csv', index = False)


In [19]:
# Execute the inference pipeline; run() writes its result to submission.csv.
run()

  0%|          | 0/1 [00:00<?, ?it/s]

Bert embeddings shape is (3, 11014)
Loaded pretrained weights for efficientnet-b0


  0%|          | 0/1 [00:00<?, ?it/s]

Image embeddings shape is (3, 11014)
Finding best eps
Making prediction for eps:0.1 - n_clusters:1


  0%|          | 0/1 [00:00<?, ?it/s]

Generating text embeddings
Text embeddins shape: (3, 22)


  0%|          | 0/1 [00:00<?, ?it/s]