In [None]:
# Copy the RAPIDS archive from the input dataset to /opt/conda/envs and unpack it there
# (tar's verbose listing is discarded).
import sys
!cp ../input/rapids/rapids.0.18.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Prepend the unpacked environment's directories so they are searched before
# the entries already on sys.path.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the environment's libxgboost.so into /opt/conda/lib/.
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/


In [1]:
# Copy every file from the pretrained-weights input dataset into the torch hub
# checkpoint cache (presumably so the backbone weights are found locally
# instead of being downloaded -- confirm against the runtime environment).
import os
from shutil import copyfile

data_dir = '../input/pretrained/'
cache_dir = '/root/.cache/torch/hub/checkpoints/'

# exist_ok=True replaces the separate exists() check and its check-then-create race.
os.makedirs(cache_dir, exist_ok=True)

for fname in os.listdir(data_dir):
    # os.path.join keeps the paths valid even if a directory constant loses its trailing slash.
    src = os.path.join(data_dir, fname)
    dest = os.path.join(cache_dir, fname)
    # copyfile only handles regular files; skip anything else (e.g. sub-directories).
    if not os.path.isfile(src):
        continue
    copyfile(src, dest)

In [3]:
# Install efficientnet_pytorch 0.7.0 from a wheel file shipped in the input dataset.
!pip install ../input/effnetpytrochwhl/efficientnet_pytorch-0.7.0-py3-none-any.whl

Processing /kaggle/input/effnetpytrochwhl/efficientnet_pytorch-0.7.0-py3-none-any.whl
Installing collected packages: efficientnet-pytorch
Successfully installed efficientnet-pytorch-0.7.0


In [4]:
import random, re, string, gc, math, os

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import cv2

import sklearn
# from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA


from tqdm.notebook import tqdm


import  cuml, cupy, cudf
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
from cuml.experimental.preprocessing import normalize


from PIL import Image

import torch
# torch.manual_seed(1010)


import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader


import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

from efficientnet_pytorch import EfficientNet


import nltk
from nltk.corpus import stopwords

In [5]:
# --- Inference configuration -------------------------------------------------
IMG_SIZE = 512     # images are resized to IMG_SIZE x IMG_SIZE
N_WORKERS = 4      # DataLoader worker processes
BATCH_SIZE = 12    # images per inference batch
SEED = 24          # value handed to seed_torch

# Checkpoint loaded into ShopeeNet when embeddings are computed with pretrained=True.
MODEL_PATH = '../input/checkpoints/arcface_epoch9.pth'

# Keyword arguments expanded into ShopeeNet(**MODEL_PARAMS).
MODEL_PARAMS = dict(
    feature_space=680,
    out_features=11014,
    scale=24.0,
    margin=0.4,
)

# **Utils**

In [6]:
def combine_for_sub(row):
    """Merge a row's image and text matches into one submission string.

    Args:
        row: object exposing ``img_preds`` and ``text_preds`` array-likes of
            posting ids.

    Returns:
        The distinct ids from both arrays, sorted by ``np.unique`` and joined
        with single spaces.
    """
    merged_ids = np.concatenate((row.img_preds, row.text_preds))
    distinct_ids = np.unique(merged_ids)
    return " ".join(distinct_ids)

In [7]:
def seed_torch(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects hash randomisation of Python processes started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not just the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels from run to run, which defeats
    # the deterministic flag above.
    torch.backends.cudnn.benchmark = False
    
# Seed all random number generators once with the notebook-wide SEED constant.
seed_torch(SEED)

# **Image**


In [8]:
# class ShopeeDataset(Dataset):
#     def __init__(self, img_path, transform):
#         self.img_path = img_path
#         self.transform = transform
        
#     def __getitem__(self, idx):
#         img = Image.open(self.img_path[idx]).convert('RGB')
#         img = self.transform(img)
#         return img
    
#     def __len__(self):
#         return len(self.img_path)

class ShopeeDataset(Dataset):
    """Dataset that loads the images listed in a dataframe.

    Args:
        df: dataframe with an ``image`` column of file paths and, in
            ``'train'`` mode, a ``label_group`` column. The index is reset so
            items are addressed by position 0..len-1.
        mode: ``'test'`` yields the transformed image only; ``'train'`` yields
            ``(image, label)`` with the label as a float tensor.
        transform: callable invoked as ``transform(image=img)`` that returns a
            mapping with an ``'image'`` entry.

    Raises:
        ValueError: if ``mode`` is neither ``'train'`` nor ``'test'``.
    """

    _MODES = ('train', 'test')

    def __init__(self, df, mode, transform):
        # Fail fast: an unknown mode would otherwise make __getitem__ silently
        # return None and break much later inside the DataLoader.
        if mode not in self._MODES:
            raise ValueError(f"mode must be one of {self._MODES}, got {mode!r}")
        self.df = df.reset_index(drop=True)
        self.mode = mode
        self.transform = transform

    def __getitem__(self, idx):
        """Return the transformed image (and its label in 'train' mode) at ``idx``.

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        row = self.df.loc[idx]
        img = cv2.imread(row.image)
        # cv2.imread signals failure by returning None rather than raising.
        if img is None:
            raise FileNotFoundError(f"Could not read image: {row.image}")
        # OpenCV loads images as BGR; convert to RGB before applying the transform.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = self.transform(image=img)['image']

        if self.mode == 'test':
            return img
        return img, torch.tensor(row.label_group).float()

    def __len__(self):
        """Number of rows in the underlying dataframe."""
        return len(self.df)

In [9]:
class ArcFace(nn.Module):
    """Additive angular margin head.

    Holds an ``(out_features, in_features)`` weight matrix. ``forward``
    computes the cosine between the L2-normalised input rows and the
    L2-normalised weight rows; when labels are supplied, ``margin`` is added
    to the angle of each sample's labelled class and the result is multiplied
    by ``scale``.
    """

    def __init__(self, in_features, out_features, scale=30.0, margin=0.5, eps=1e-6):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        # Angles above this value would exceed pi once the margin is added.
        self.threshold = math.pi - margin
        self.eps = eps

        class_weights = torch.empty(out_features, in_features, dtype=torch.float32)
        self.weight = nn.Parameter(class_weights)
        nn.init.xavier_uniform_(self.weight)

    def forward(self, input, label=None):
        """Return cosine similarities, or scaled margin logits when ``label`` is given."""
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        if label is None:
            return cosine

        # Clamp away from +-1 before taking acos.
        angle = torch.acos(cosine.clamp(-1.0 + self.eps, 1.0 - self.eps))

        # 1 at each sample's labelled class, 0 elsewhere.
        target = torch.zeros_like(cosine)
        target.scatter_(1, label.view(-1, 1).long(), 1)

        # No margin is applied where the angle already exceeds the threshold.
        apply_margin = torch.where(angle > self.threshold, torch.zeros_like(target), target)

        shifted_angle = torch.where(apply_margin.bool(), angle + self.margin, angle)
        return torch.cos(shifted_angle) * self.scale
        

class ShopeeNet(nn.Module):
    """EfficientNet-b0 backbone followed by a linear projection, batch norm and an ArcFace head.

    Args:
        feature_space: output width of the linear projection (ArcFace input size).
        out_features: number of ArcFace output classes.
        scale: ArcFace logit scale.
        margin: ArcFace angular margin.
    """

    def __init__(self, feature_space, out_features, scale, margin):
        super().__init__()
        self.feature_space = feature_space
        self.out_features = out_features

        self.backbone = EfficientNet.from_pretrained('efficientnet-b0')
        backbone_channels = self.backbone._conv_head.out_channels
        self.dropout = nn.Dropout(p=self.backbone._global_params.dropout_rate)
        self.classifier = nn.Linear(backbone_channels, feature_space)
        self.bn = nn.BatchNorm1d(feature_space)

        self.margin = ArcFace(
            in_features=feature_space,
            out_features=out_features,
            scale=scale,
            margin=margin,
        )

        if self.training:
            self._init_params()

    def _init_params(self):
        """Xavier-normal init for the projection weight; constants for its bias and the batch norm."""
        nn.init.xavier_normal_(self.classifier.weight)
        for tensor, value in (
            (self.classifier.bias, 0),
            (self.bn.weight, 1),
            (self.bn.bias, 0),
        ):
            nn.init.constant_(tensor, value)

    def forward(self, img, label=None):
        """Run the backbone and head; ``label`` is forwarded to ArcFace only in training mode."""
        feature_map = self.backbone.extract_features(img)
        pooled = self.backbone._avg_pooling(feature_map).view(img.shape[0], -1)
        embedding = self.bn(self.classifier(self.dropout(pooled)))

        if not self.training:
            return self.margin(embedding)
        return self.margin(embedding, label)
        
# class ShopeeNet(nn.Module):
#     def __init__(self):
#         super(ShopeeNet, self).__init__()
#         model = EfficientNet.from_pretrained('efficientnet-b0')
#         model.eval()
#         self.model = model
        
#     def forward(self, img):
#         out = self.model.extract_features(img)
#         out = self.model._avg_pooling(out)
#         return out

In [10]:
def get_image_predictions(df, embeddings, threshold = 0.54):
    """Find, for every row, the posting ids whose embeddings lie within ``threshold``.

    Args:
        df: dataframe with a ``posting_id`` column; row i corresponds to
            ``embeddings[i]``.
        embeddings: array of embeddings accepted by ``NearestNeighbors``.
        threshold: neighbours with a distance strictly below this value are kept.

    Returns:
        A list with one array of posting ids per row of ``df`` (empty list for
        an empty ``df``).
    """
    n_rows = len(df)
    if n_rows == 0:
        return []

    # Search at most 50 neighbours, but never more than there are samples;
    # the previous fixed 50/3 choice exceeded the sample count for small frames.
    n_neighbors = min(50, n_rows)
    model = NearestNeighbors(n_neighbors = n_neighbors)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Hoisted out of the loop: positional lookup table of posting ids.
    posting_ids = df.posting_id.values
    predictions = []
    for dist, idx in tqdm(zip(distances, indices), total = n_rows):
        predictions.append(posting_ids[cupy.asnumpy(idx[dist < threshold])])

    # Release the neighbour-search results before returning.
    del model, distances, indices
    gc.collect()
    return predictions

In [11]:
def get_image_embeddings(pretrained=False):
    """Compute normalised embeddings for every batch served by ``image_loader``.

    Builds ``ShopeeNet(**MODEL_PARAMS)``, optionally loads the checkpoint at
    ``MODEL_PATH``, moves the model to the GPU and runs it in evaluation mode
    over the module-level ``image_loader``.

    Args:
        pretrained: when true, load the state dict stored at ``MODEL_PATH``.

    Returns:
        A cupy array with the per-batch normalised outputs concatenated along
        the first axis.
    """
    model = ShopeeNet(**MODEL_PARAMS)
    if pretrained:
        model.load_state_dict(torch.load(MODEL_PATH, map_location='cuda:0'))
    model = model.to('cuda')
    # A freshly built module is in training mode. Without eval(), Dropout keeps
    # zeroing activations and BatchNorm uses per-batch statistics, so the
    # embeddings would be noisy and depend on batch composition.
    model.eval()

    image_embeddings = []
    with torch.no_grad():
        for img in tqdm(image_loader):
            img = img.cuda()
            feat = model(img)
            feat = feat.reshape(feat.shape[0], feat.shape[1])
            # Hand the GPU tensor to cupy and normalise it there.
            embed = normalize(cupy.asarray(feat))
            image_embeddings.append(embed)

    # Only names that are always bound are deleted, so an empty loader cannot
    # trigger a NameError here.
    del model
    gc.collect()

    image_embeddings = cupy.concatenate(image_embeddings)
    print(f'Image embeddings shape is {image_embeddings.shape}')
    return image_embeddings

# **Text**

In [12]:
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stop words as a frozenset, loading the list only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_title(row):
    """Normalise a product title for text matching.

    Lowercases the title, drops English stop words, removes or replaces a few
    symbols, turns every remaining non-word character into a space, strips
    leftover punctuation and collapses whitespace runs.

    Args:
        row: object exposing a string ``title`` attribute.

    Returns:
        The cleaned title string.
    """
    # Lowercase first: stop words are matched by exact string comparison, so
    # capitalised ones ("The", "AND") previously slipped through.
    title = row.title.lower()
    stop = _english_stopwords()
    title = " ".join(x for x in title.split() if x not in stop)
    title = re.sub(r"\-","",title)
    title = re.sub(r"\+","",title)
    title = re.sub (r"&","and",title)
    title = re.sub(r"\|","",title)
    title = re.sub(r"\\","",title)
    title = re.sub(r"\W"," ",title)
    # The old pattern r"f{p}" was a raw string, not an f-string, so this loop
    # never removed anything. After the \W pass only "_" can still be present.
    for p in string.punctuation:
        title = title.replace(p, "")

    title = re.sub(r"\s+"," ",title)

    return title

In [13]:
def get_text_predictions(df, max_features = 25_000, threshold = 0.54):
    """Find, for every row, the posting ids whose titles are similar.

    Titles are cleaned with ``clean_title``, vectorised with a binary TF-IDF
    model and compared chunk by chunk through a dot product.

    Args:
        df: dataframe with ``title`` and ``posting_id`` columns.
        max_features: vocabulary size limit passed to ``TfidfVectorizer``.
        threshold: rows whose similarity is strictly greater than this value
            count as matches.

    Returns:
        A list with one array of posting ids per row of ``df`` (empty list for
        an empty ``df``).
    """
    n_rows = len(df)
    if n_rows == 0:
        return []

    text = cudf.Series.from_pandas(df.apply(clean_title, axis=1))
    model = TfidfVectorizer(stop_words=None, binary=True, max_features=max_features)
    text_embeddings = model.fit_transform(text).toarray()
    # Hoisted out of the loops: positional lookup table of posting ids.
    posting_ids = df.posting_id.values
    preds = []
    CHUNK = 1024*4

    print('Finding similar titles...')
    # Stepping by CHUNK covers the trailing partial chunk without manual ceil arithmetic.
    for a in range(0, n_rows, CHUNK):
        b = min(a + CHUNK, n_rows)
        print('chunk',a,'to',b)

        # Similarity of rows a..b against all rows. This is the cosine
        # similarity provided the TF-IDF rows are L2-normalised -- presumably
        # the vectoriser's default; confirm against the cuML docs.
        distances = cupy.matmul( text_embeddings, text_embeddings[a:b].T).T

        for k in range(b-a):
            idx = cupy.where(distances[k] > threshold)[0]
            preds.append(posting_ids[cupy.asnumpy(idx)])

    # Only names that are always bound are deleted here.
    del text, model, text_embeddings
    gc.collect()
    return preds

# **Predictions**

In [14]:
# Root directory of the competition input data; file paths below are built by appending to it.
path = '../input/shopee-product-matching/'

In [15]:
# Load the test metadata and turn each image file name into a path under test_images/.
df = pd.read_csv(path + 'test.csv')
df['image'] = path + 'test_images/' + df['image']

# df = pd.read_csv(path + 'train.csv')
# df['image'] = path + 'train_images/' + df['image']
# df = pd.concat([df,df,df.iloc[:10_000]])

In [16]:
# Preprocessing applied to every image: resize to IMG_SIZE x IMG_SIZE,
# normalise, then convert to a tensor.
image_transforms = A.Compose(
    [
        A.Resize(IMG_SIZE, IMG_SIZE),
        A.Normalize(),
        ToTensorV2(),
    ]
)

# 'test' mode: the dataset yields images only, without labels.
image_dataset = ShopeeDataset(df, 'test', image_transforms)

# No shuffling and no dropped batch, so every row of df is served once, in order.
image_loader = DataLoader(
    image_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    pin_memory=True,
    drop_last=False,
    num_workers=N_WORKERS,
)

In [None]:
# Embed every image served by image_loader, loading the checkpoint at MODEL_PATH first.
image_embeddings = get_image_embeddings(pretrained=True)

Loaded pretrained weights for efficientnet-b0


  0%|          | 0/6542 [00:00<?, ?it/s]

In [18]:
# Collect, per row, the posting ids whose image embeddings lie within distance 0.94,
# and store them in the img_preds column.
image_predictions = get_image_predictions(df, image_embeddings, threshold=0.94)
df['img_preds'] = image_predictions

In [19]:
# Drop the names for the embeddings and the prediction list (the predictions were
# already stored in df['img_preds']) and run a garbage-collection pass.
del image_embeddings, image_predictions
gc.collect()

22

In [20]:
# Collect, per row, the posting ids whose title similarity exceeds 0.7,
# and store them in the text_preds column.
text_predictions = get_text_predictions(df, threshold=0.7)
df['text_preds'] = text_predictions

  "right", dtype_r, dtype_l, libcudf_join_type


Finding similar titles...
chunk 0 to 4096
chunk 4096 to 8192
chunk 8192 to 12288
chunk 12288 to 16384
chunk 16384 to 20480
chunk 20480 to 24576
chunk 24576 to 28672
chunk 28672 to 32768
chunk 32768 to 36864
chunk 36864 to 40960
chunk 40960 to 45056
chunk 45056 to 49152
chunk 49152 to 53248
chunk 53248 to 57344
chunk 57344 to 61440
chunk 61440 to 65536
chunk 65536 to 69632
chunk 69632 to 73728
chunk 73728 to 77824
chunk 77824 to 78500


# **Submission**

In [None]:
# Merge each row's image and text matches into one space-separated string,
# then write the posting_id / matches columns to submission.csv.
df['matches'] = df.apply(combine_for_sub, axis = 1)
df[['posting_id', 'matches']].to_csv('submission.csv', index = False)