In [None]:
# Unpack the RAPIDS 0.18.0 environment archive shipped in the input directory
# into /opt/conda/envs (tar's verbose listing is discarded to keep the cell quiet).
import sys
!cp ../input/rapids/rapids.0.18.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Prepend the unpacked environment's directories to sys.path so imports are
# looked up there before the default locations. Each statement prepends, so the
# directory added last ("lib") ends up first in the search order.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the environment's libxgboost.so next to the base conda libraries
# (presumably so the dynamic loader can find it -- TODO confirm it is still needed).
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

import os
from shutil import copyfile

# Checkpoints shipped with the notebook's input data, and the torch hub
# checkpoint directory they are copied into (presumably so that
# `from_pretrained` finds the weights locally instead of downloading them).
data_dir = '../input/pretrained/'
cache_dir ='/root/.cache/torch/hub/checkpoints/'

# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and the creation, and the cell can be re-run safely.
os.makedirs(cache_dir, exist_ok=True)

for fname in os.listdir(data_dir):
    src = os.path.join(data_dir, fname)
    dest = os.path.join(cache_dir, fname)
    # copyfile() only handles regular files; a stray sub-directory in
    # data_dir would otherwise abort the whole cell.
    if not os.path.isfile(src):
        continue
    copyfile(src, dest)

In [None]:
# Install efficientnet_pytorch 0.7.0 from the wheel file in the input directory.
!pip install ../input/effnetpytrochwhl/efficientnet_pytorch-0.7.0-py3-none-any.whl

In [None]:
import random, re, string, gc

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import cv2

import sklearn
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA


from tqdm.notebook import tqdm


import  cuml, cupy, cudf
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors


from PIL import Image

import torch
torch.manual_seed(1010)


import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data.dataset import Dataset

import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from efficientnet_pytorch import EfficientNet


import nltk
from nltk.corpus import stopwords

# **Utils**

In [None]:
def combine_for_sub(row):
    """Merge a row's image and text matches into one submission string.

    Reads ``row.img_preds`` and ``row.text_preds``, pools them, and returns
    the distinct values (in ``np.unique`` order, i.e. sorted) joined by a
    single space.
    """
    pooled = np.concatenate((row.img_preds, row.text_preds))
    distinct_ids = np.unique(pooled)
    return " ".join(distinct_ids)

In [None]:
def seed_torch(seed=42):
    """Seed every random number generator this notebook touches.

    Seeds Python's ``random``, NumPy's global generator, and torch's default
    and CUDA generators with ``seed``; stores ``str(seed)`` in the
    ``PYTHONHASHSEED`` environment variable; and switches cuDNN to
    deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


# Fix the seed once for the whole notebook run.
seed_torch(1010)

# **Image**


In [None]:
class ShopeeDataset(Dataset):
    """Dataset that yields transformed images from a sequence of file paths.

    ``img_path`` is an indexable collection of image paths and ``transform``
    is a callable applied to each image after it is opened and converted to RGB.
    """

    def __init__(self, img_path, transform):
        self.img_path, self.transform = img_path, transform

    def __len__(self):
        """Number of images, i.e. the length of ``img_path``."""
        return len(self.img_path)

    def __getitem__(self, idx):
        """Open image ``idx``, force RGB mode, and return the transformed result."""
        rgb_image = Image.open(self.img_path[idx]).convert('RGB')
        return self.transform(rgb_image)



In [None]:
class ShopeeNet(nn.Module):
    """Feature extractor built on a pretrained EfficientNet-B3.

    The backbone is loaded with ``EfficientNet.from_pretrained`` and put in
    eval mode at construction time. ``forward`` returns the backbone's
    extracted features after its own average-pooling layer.
    """

    def __init__(self):
        super().__init__()
        backbone = EfficientNet.from_pretrained('efficientnet-b3')
        # Module.eval() returns the module itself, so this stores the same
        # backbone object, already switched to inference mode.
        self.model = backbone.eval()

    def forward(self, img):
        """Run ``img`` through the backbone's feature extractor and pooling."""
        feature_maps = self.model.extract_features(img)
        return self.model._avg_pooling(feature_maps)

In [None]:
def get_image_predictions(df, embeddings, threshold = 0.54):
    """Find, for every row, the posting ids of its close neighbours in embedding space.

    Parameters
    ----------
    df : DataFrame
        Must have a ``posting_id`` column; row ``i`` corresponds to
        ``embeddings[i]``.
    embeddings : array-like
        One embedding row per row of ``df``; passed as-is to
        ``NearestNeighbors.fit`` / ``kneighbors``.
    threshold : float, default 0.54
        A neighbour is kept when its distance (as reported by
        ``NearestNeighbors.kneighbors``) is strictly below this value.
        Previously this argument was accepted but ignored in favour of a
        hard-coded 0.54.

    Returns
    -------
    list
        One array of posting ids per row of ``df`` (empty list for an empty
        ``df``).
    """
    n_rows = len(df)
    if n_rows == 0:
        return []

    # Ask for at most 50 neighbours, but never more than there are rows:
    # the old "50 if len(df) > 3 else 3" rule requested more neighbours than
    # exist for frames with 4..49 rows (and for frames with fewer than 3).
    KNN = min(50, n_rows)
    model = NearestNeighbors(n_neighbors = KNN)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Hoisted out of the loop: the id column does not change per row.
    posting_ids = df.posting_id.values

    predictions = []
    for dist, idx in tqdm(zip(distances, indices), total = n_rows):
        close = cupy.asnumpy(idx[dist < threshold])
        predictions.append(posting_ids[close])

    # Drop the large neighbour tables before returning; loop variables are
    # deliberately not deleted (they are undefined when the loop never runs).
    del model, distances, indices
    gc.collect()
    return predictions

In [None]:
def get_image_embeddings(image_paths, batch_size = 12, num_workers = 4, device = 'cuda'):
    """Embed every image in ``image_paths`` with ``ShopeeNet``.

    Each image is resized to 512x512, centre-cropped to 200x200, converted to
    a tensor and normalised with the mean/std constants below, then pushed
    through the network in batches without gradient tracking.

    Parameters
    ----------
    image_paths : sequence
        Image file paths, handed to ``ShopeeDataset``.
    batch_size : int, default 12
        Images per forward pass (previously hard-coded).
    num_workers : int, default 4
        DataLoader worker processes (previously hard-coded).
    device : str or torch.device, default 'cuda'
        Device the model and batches are moved to (previously always CUDA).

    Returns
    -------
    numpy.ndarray
        One row per image, in the same order as ``image_paths`` (the loader
        does not shuffle), passed through sklearn's ``normalize``.
    """
    device = torch.device(device)
    model = ShopeeNet().to(device)

    image_dataset = ShopeeDataset(
        image_paths,
        transforms.Compose([
            transforms.Resize((512, 512)),
            transforms.CenterCrop((200, 200)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
    )

    image_loader = torch.utils.data.DataLoader(
        image_dataset,
        batch_size = batch_size,
        shuffle = False,
        # Pinned host memory only helps host-to-GPU copies.
        pin_memory = (device.type == 'cuda'),
        drop_last = False,
        num_workers = num_workers
    )

    embeds = []
    with torch.no_grad():
        for img in tqdm(image_loader):
            feat = model(img.to(device))
            # Keep only the (batch, channel) axes of the network output.
            feat = feat.reshape(feat.shape[0], feat.shape[1])
            embeds.append(feat.cpu().numpy())

    del model
    image_embeddings = normalize(np.concatenate(embeds))
    print(f'Image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings

# **Text**

In [None]:
def clean_title(row):
    """Normalise ``row.title`` into a lowercase, punctuation-free string.

    Steps, in order:
      1. lowercase the title and drop English stop words (lowercasing first,
         because the stop-word list is lowercase -- filtering the raw title
         let capitalised stop words such as "The" through);
      2. delete ``-``, ``+``, ``|`` and backslashes; turn ``&`` into ``and``;
      3. replace every remaining non-word character with a space;
      4. delete any punctuation still present (only ``_`` survives step 3,
         since it counts as a word character);
      5. collapse runs of whitespace into a single space.

    The result is not stripped, so it may start or end with one space.
    """
    stop = _english_stopwords()
    title = row.title.lower()
    title = " ".join(word for word in title.split() if word not in stop)

    title = re.sub(r"[-+|\\]", "", title)
    title = title.replace("&", "and")
    title = re.sub(r"\W", " ", title)
    # The original loop used r"f{p}" (a raw string, not an f-string), which
    # matched the literal text "f{p}" and therefore never removed anything.
    title = _PUNCTUATION_RE.sub("", title)

    title = re.sub(r"\s+", " ", title)

    return title


# Filled on first use so the NLTK corpus is read once, not once per row.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


# Character class matching any single character of string.punctuation.
_PUNCTUATION_RE = re.compile("[" + re.escape(string.punctuation) + "]")

In [None]:
def get_text_predictions(df, max_features = 25_000, threshold = 0.7):
    """Find, for every row, the posting ids whose titles are similar to its own.

    Titles are cleaned with ``clean_title``, vectorised with a binary
    ``TfidfVectorizer`` and compared through the dot product of the resulting
    rows, computed in chunks of 4096 query rows to bound memory use.

    Parameters
    ----------
    df : DataFrame
        Must have the ``title`` column read by ``clean_title`` and a
        ``posting_id`` column.
    max_features : int, default 25_000
        Vocabulary cap forwarded to ``TfidfVectorizer``.
    threshold : float, default 0.7
        A title matches when its similarity is strictly above this value.
        The function used to ignore this argument and always compare against
        a hard-coded 0.7; the default is now 0.7 so that calls relying on the
        default keep producing the same result.

    Returns
    -------
    list
        One array of posting ids per row of ``df`` (empty list for an empty
        ``df``).
    """
    n_rows = len(df)
    if n_rows == 0:
        return []

    text = cudf.Series.from_pandas(df.apply(clean_title, axis=1))
    model = TfidfVectorizer(stop_words=None, binary=True, max_features=max_features)
    text_embeddings = model.fit_transform(text).toarray()

    # Hoisted out of the loops: the id column does not change per row.
    posting_ids = df.posting_id.values
    preds = []
    CHUNK = 1024*4

    print('Finding similar titles...')
    for a in range(0, n_rows, CHUNK):
        b = min(a + CHUNK, n_rows)
        print('chunk',a,'to',b)

        # Row k holds the similarity of title a+k to every title.
        similarities = cupy.matmul(text_embeddings, text_embeddings[a:b].T).T

        for k in range(b - a):
            idx = cupy.where(similarities[k] > threshold)[0]
            preds.append(posting_ids[cupy.asnumpy(idx)])

        # Release this chunk's matrix before the next one is allocated.
        del similarities

    del text, model, text_embeddings
    gc.collect()
    return preds

# **Predictions**

In [None]:
# Root directory of the competition input; the test CSV and image paths below are built from it.
path = '../input/shopee-product-matching/'

In [None]:
# Load the test metadata, then turn each image file name into a full path
# under <path>/test_images/ so it can be opened directly.
df = pd.read_csv(path + 'test.csv')
df['image'] = path + 'test_images/' + df['image']

In [None]:
# One embedding row per test image, in the same order as the rows of df.
image_embeddings = get_image_embeddings(df.image.values)

In [None]:
# Per-row candidate matches: one list from the image embeddings, one from the
# titles. Both calls rely on the functions' default arguments.
image_predictions = get_image_predictions(df, image_embeddings)
text_predictions = get_text_predictions(df)

# **Submission**

In [None]:
# Attach both prediction lists to df, merge them per row into a single
# space-separated string of unique posting ids (combine_for_sub), and write
# the posting_id / matches columns to submission.csv without the index.
df['img_preds'] = image_predictions
df['text_preds'] = text_predictions
df['matches'] = df.apply(combine_for_sub, axis = 1)
df[['posting_id', 'matches']].to_csv('submission.csv', index = False)