In [16]:
import random, os, re, string, gc

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import cv2

import sklearn
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors


from tqdm.notebook import tqdm


from PIL import Image

import torch

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.dataloader import DataLoader

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

from shopeenet import ShopeeNet
from shopeedataset import ShopeeDataset


In [14]:
# --- Inference configuration -------------------------------------------------
IMG_SIZE = 512       # side length images are resized to (square) before inference
N_WORKERS = 4        # worker processes used by the DataLoader
BATCH_SIZE = 12      # images per inference batch
SEED = 24            # value handed to seed_torch()

# Checkpoint that is loaded when embeddings are computed with pretrained=True.
MODEL_PATH = 'checkpoints/arcface_epoch9.pth'

# Keyword arguments forwarded verbatim to the ShopeeNet constructor.
MODEL_PARAMS = dict(
    feature_space=680,
    out_features=11014,
    scale=24.0,
    margin=0.4,
)

In [3]:
# Dataset locations; relative paths, so they resolve against the working directory.
path = 'data/'
train_path = f'{path}train_images/'
test_path = f'{path}test_images/'

In [4]:
# Read the training metadata and turn the bare image file names into paths
# under train_path.
df = pd.read_csv(f'{path}train.csv')
df = df.assign(image=train_path + df['image'])

# label_group -> array of every posting_id in that group; mapped back onto each
# row so that 'target' holds the row's full set of matching postings.
tmp = df.groupby('label_group')['posting_id'].unique().to_dict()
df = df.assign(target=df['label_group'].map(tmp))

# **Utils**

In [5]:
def getMetric(col):
    """Build a row-wise F1 scorer comparing ``row.target`` with ``row[col]``.

    Parameters
    ----------
    col : label of the row entry that holds the predicted items.

    Returns
    -------
    A function taking one row and returning
    ``2 * |target ∩ prediction| / (|target| + |prediction|)``.
    Suitable for ``DataFrame.apply(..., axis=1)``.
    """
    def f1score(row):
        truth, predicted = row.target, row[col]
        # intersect1d returns the sorted unique common values as a 1-D array.
        overlap = np.intersect1d(truth, predicted).size
        return (2 * overlap) / (len(truth) + len(predicted))
    return f1score

In [6]:
def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch RNGs with the same value.

    Also exports ``PYTHONHASHSEED`` into the process environment and switches
    cuDNN to deterministic mode.

    Parameters
    ----------
    seed : integer seed applied to every generator (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed all RNGs once, up front, using the notebook-level SEED constant.
seed_torch(SEED)

# **Image**

In [17]:
def get_image_embeddings(df, mode, pretrained=False):
    """Run every image of ``df`` through ShopeeNet and return L2-normalised outputs.

    Parameters
    ----------
    df : DataFrame handed to ``ShopeeDataset``.
    mode : forwarded unchanged to ``ShopeeDataset``; its meaning is defined there.
    pretrained : when True, load the weights stored at ``MODEL_PATH``; otherwise
        the model keeps whatever weights its constructor produced.

    Returns
    -------
    numpy.ndarray, 2-D: one row per dataset item, one column per model output,
    each row scaled to unit L2 norm.

    Notes
    -----
    Runs on the GPU when one is available and falls back to the CPU otherwise.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = ShopeeNet(**MODEL_PARAMS)
    if pretrained:
        # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
        model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
    model = model.to(device)
    # Inference mode: freezes BatchNorm statistics and disables Dropout so an
    # embedding does not depend on which other images share its batch.
    # NOTE(review): assumes ShopeeNet.forward(img) is valid in eval mode — confirm.
    model.eval()

    image_transforms = A.Compose([
        A.Resize(IMG_SIZE, IMG_SIZE),
        A.Normalize(),
        ToTensorV2()
    ])

    image_dataset = ShopeeDataset(df, mode, image_transforms)

    image_loader = torch.utils.data.DataLoader(
        image_dataset,
        batch_size = BATCH_SIZE,
        shuffle = False,                        # keep rows aligned with df order
        pin_memory = device.type == 'cuda',     # pinning only helps host->GPU copies
        drop_last = False,                      # every image needs an embedding
        num_workers = N_WORKERS
    )

    embeds = []
    with torch.no_grad():
        for img in tqdm(image_loader):
            feat = model(img.to(device))
            # Keep only the first two dimensions (batch, features).
            feat = feat.reshape(feat.shape[0], feat.shape[1])
            embeds.append(feat.detach().cpu().numpy())

    # Drop the model before building the full matrix to lower peak memory.
    del model
    gc.collect()
    if device.type == 'cuda':
        torch.cuda.empty_cache()

    image_embeddings = np.concatenate(embeds)
    del embeds
    image_embeddings = normalize(image_embeddings)
    print(f'Image embeddings shape is {image_embeddings.shape}')
    gc.collect()
    return image_embeddings

In [19]:
def get_image_predictions(df, embeddings, threshold = 0.54, max_neighbors = 50):
    """For every posting, list the postings whose embeddings are close to it.

    Parameters
    ----------
    df : DataFrame with a ``posting_id`` column; row ``i`` must correspond to
        ``embeddings[i]``.
    embeddings : 2-D array-like, one row per row of ``df``.
    threshold : neighbours at a distance strictly below this value are kept.
        Distances use scikit-learn's default ``NearestNeighbors`` metric.
    max_neighbors : upper bound on the neighbours inspected per posting.

    Returns
    -------
    list of numpy arrays; entry ``i`` holds the ``posting_id`` values matched to
    row ``i``. The row itself is normally included, because its distance to
    itself is 0.

    Raises
    ------
    ValueError : if ``df`` and ``embeddings`` differ in length, or
        ``max_neighbors`` is smaller than 1.
    """
    n_samples = len(df)
    if len(embeddings) != n_samples:
        raise ValueError(
            f'df has {n_samples} rows but embeddings has {len(embeddings)}'
        )
    if max_neighbors < 1:
        raise ValueError('max_neighbors must be at least 1')
    if n_samples == 0:
        return []

    # kneighbors() rejects n_neighbors > n_samples, so cap it for small frames.
    n_neighbors = min(max_neighbors, n_samples)
    knn = NearestNeighbors(n_neighbors = n_neighbors)
    knn.fit(embeddings)
    distances, indices = knn.kneighbors(embeddings)

    # Hoisted out of the loop; the neighbour indices are positional, so they
    # index this array directly.
    all_posting_ids = df.posting_id.values
    predictions = [
        all_posting_ids[idx[dist < threshold]]
        for dist, idx in tqdm(zip(distances, indices), total = n_samples)
    ]

    del knn, distances, indices
    gc.collect()
    return predictions

In [18]:
# Embed every image listed in df, loading the checkpoint weights (pretrained=True).
# NOTE(review): mode is 'test' although df is built from train.csv — confirm this
# is the intended ShopeeDataset mode.
img_embs = get_image_embeddings(df, 'test', pretrained=True)

Loaded pretrained weights for efficientnet-b0


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2855.0), HTML(value='')))


Image embeddings shape is (34250, 11014)


In [None]:
# Match postings by image-embedding distance, using the function's default threshold.
image_predictions = get_image_predictions(df, img_embs)