In [30]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import cv2
import sklearn

from tqdm.notebook import tqdm


In [2]:
path = 'data/'
train_path = 'data/train_images/'
test_path = 'data/test_images/'

In [3]:
traindf = pd.read_csv(path + 'train.csv')

In [4]:
traindf.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069


In [5]:
print(f"Train data shape: {traindf.shape}")
print(f"N_unique label_group: {traindf.label_group.nunique()}")

Train data shape: (34250, 5)
N_unique label_group: 11014


In [6]:
traindf['image'] = train_path + traindf['image']

In [7]:
traindf = traindf.drop_duplicates(subset=['image']).reset_index()

In [8]:
tmp = traindf.groupby('label_group').posting_id.unique().to_dict()
traindf['target'] = traindf.label_group.map(tmp)

In [9]:
tmp = traindf.groupby('image_phash').posting_id.unique().to_dict()
traindf['phash_dups'] = traindf.image_phash.map(tmp)

In [10]:
def combine(row):
    x = np.concatenate([row.phash_dups, row.cnn, row.knn])
    return np.unique(x)

**Baseline** 

In [11]:
def getMetric(col):
    def f1score(row):
        n = len(np.intersect1d(row.target, row[col]))
        return 2 * n / (len(row.target) + len(row[col]))
    return f1score

In [12]:
traindf['f1'] = traindf.apply(getMetric('phash_dups'), axis=1)
print('Baseline score =', traindf.f1.mean())

Baseline score = 0.5446525227019672


**CNN**

In [13]:
from PIL import Image

import torch
torch.manual_seed(1010)


import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data.dataset import Dataset

import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets


In [14]:
class ShopeeDataset(Dataset):
    def __init__(self, img_path, transform):
        self.img_path = img_path
        self.transform = transform
        
    def __getitem__(self, idx):
        img = Image.open(self.img_path[idx]).convert('RGB')
        img = self.transform(img)
        return img
    
    def __len__(self):
        return len(self.img_path)

In [15]:
shopee_dataset = ShopeeDataset(
    traindf.image.values,
    transforms.Compose([
        transforms.Resize((512, 512)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
)

shopee_loader = torch.utils.data.DataLoader(
    shopee_dataset,
    batch_size = 10,
    shuffle = False,
    num_workers = 0
)

In [16]:
class ShopeeNet(nn.Module):
    def __init__(self):
        super(ShopeeNet, self).__init__()
        

        model = models.resnet50(True)
        model.avgpool = nn.AdaptiveMaxPool2d(output_size=(1, 1))
        model = nn.Sequential(*list(model.children())[:-1])
        model.eval()
        self.model = model
        
    def forward(self, img):
        out = self.model(img)
        return out
        

In [17]:
shopee_model = ShopeeNet()
shopee_model = shopee_model.to('cuda')

In [18]:
embedings = []
with torch.no_grad():
    for data in tqdm(shopee_loader):
        data = data.to('cuda')
        emb = shopee_model(data)
        emb = emb.reshape(emb.shape[0], emb.shape[1])
        emb = emb.data.cpu().numpy()
        
        embedings.append(emb)

  0%|          | 0/3242 [00:00<?, ?it/s]

In [19]:
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS


In [20]:
embedings = np.vstack(embedings)
embedings = normalize(embedings)

In [21]:
KNN = 50
model = NearestNeighbors(n_neighbors=KNN)
model.fit(embedings)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=50, p=2,
                 radius=1.0)

In [24]:
preds = []
CHUNK = 1024*4

CTS = len(embedings) // CHUNK
if len(embedings) % CHUNK != 0: CTS += 1
    
for j in range (CTS):
    a = j * CHUNK
    b = (j + 1) * CHUNK
    b = min(b, len(embedings))
    print('chunk', a,'to', b)
    
    distances, indices = model.kneighbors(embedings[a:b])
    
    for dist, idx in zip(distances, indices):
        o = traindf.iloc[idx[dist > .6]].posting_id.values
        preds.append(o)
    

chunk 0 to 4096
chunk 4096 to 8192
chunk 8192 to 12288
chunk 12288 to 16384
chunk 16384 to 20480
chunk 20480 to 24576
chunk 24576 to 28672
chunk 28672 to 32412


In [25]:
traindf['knn'] = preds

In [26]:

preds = []
CHUNK = 1024*4

CTS = len(embedings) // CHUNK
if len(embedings) % CHUNK != 0: CTS += 1
for i in range(CTS):
    
    a = i * CHUNK
    b = (i + 1) * CHUNK
    b = min(b, len(embedings))
    print('chunk', a,'to', b)
    
    distances = np.matmul(embedings, embedings[a:b].T).T

    
    for k in range(b-a):
        IDX = np.where(distances[k,]>0.95)[0]
        o = traindf.iloc[IDX].posting_id.values
        preds.append(o)

chunk 0 to 4096
chunk 4096 to 8192
chunk 8192 to 12288
chunk 12288 to 16384
chunk 16384 to 20480
chunk 20480 to 24576
chunk 24576 to 28672
chunk 28672 to 32412


In [27]:
traindf['cnn'] = preds

traindf['f1_cnn'] = traindf.apply(getMetric('cnn'), axis=1)
print('CNN KNN score =', traindf.f1_cnn.mean())

CNN KNN score = 0.6436852975694984


In [28]:
traindf['matches'] = traindf.apply(combine, axis=1)

In [29]:
traindf['f1'] = traindf.apply(getMetric('matches'), axis =1)
print ('Hashes + CNN + KNN score =', traindf.f1.mean())

Hashes + CNN + KNN score = 0.6435139394912119


**Clustering**

In [55]:
clustering_DBSCAN = DBSCAN(eps=0.07, min_samples=2, metric='cosine').fit(embedings)


In [56]:
traindf['clusters_dbscan'] = clustering_DBSCAN.labels_
clustered = (traindf['clusters_dbscan'] != -1)
tmp = traindf.loc[clustered].groupby('clusters_dbscan').posting_id.unique().to_dict()
tmp[-1] = []
for key, value in tmp.items():
    if len(value) > 50:
        tmp[key] = value[:50]

        
traindf['cnn_dbscan'] = traindf['clusters_dbscan'].map(tmp)
traindf['f1_dbscan'] = traindf.apply(getMetric('cnn_dbscan'), axis=1)
print('CNN DBSCAN score =', traindf.f1_dbscan.mean())

CNN DBSCAN score = 0.4150762569220733


In [57]:
clustering_OPTICS = OPTICS(min_samples=2, max_eps=0.05, metric='cosine').fit(embedings)

In [58]:
traindf['clusters_optics'] = clustering_OPTICS.labels_
clustered = (traindf['clusters_optics'] != -1)
tmp = traindf.loc[clustered].groupby('clusters_optics').posting_id.unique().to_dict()
tmp[-1] = []
for key, value in tmp.items():
    if len(value) > 50:
        tmp[key] = value[:50]

        
traindf['cnn_optics'] = traindf['clusters_optics'].map(tmp)
traindf['f1_optics'] = traindf.apply(getMetric('cnn_optics'), axis=1)
print('CNN OPTICS score =', traindf.f1_optics.mean())

CNN OPTICS score = 0.33722592912940347
