# setup

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from efficientnet_pytorch import EfficientNet
from PIL import Image
from torchvision import transforms

from transformers import AutoModel, AutoTokenizer
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
import torch, gc

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

HOME = '/data/git/shopee-product-matching'
pdata = f'{HOME}/data/shopee-product-matching'
!ls $pdata

BS = 256
NWKRS = 8
DEVICE = 'cuda'
PIN_MEMORY = True
MAXLEN = 128

sample_submission.csv  test.csv  test_images  train.csv  train_images


# data

In [2]:
fnm = 'train'
df = pd.read_csv(f'{pdata}/{fnm}.csv')
assert len(df) == df.posting_id.nunique()
p_imgs = f"{pdata}/train_images"
df.head(2)

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045


# hash

In [3]:
def hamming(x1, x2): return sum(o1!=o2 for o1,o2 in zip(x1,x2))

hfeats = df.image_phash.apply(lambda x: bin(int(x, 16))).values

n = len(hfeats)
hdists = np.zeros((n, n))
for i in tqdm(range(n)):
    for j in range(i, n):
        x1 = hfeats[i]
        x2 = hfeats[j]
        hdists[i,j]=hdists[j,i]=hamming(x1, x2)

100%|██████████| 34250/34250 [1:23:10<00:00,  6.86it/s] 


In [4]:
hsims = 1/(1+hdists)

In [5]:
hsims

array([[1.        , 0.02325581, 0.02439024, ..., 0.02702703, 0.02857143,
        0.03225806],
       [0.02325581, 1.        , 0.03448276, ..., 0.03703704, 0.03030303,
        0.04347826],
       [0.02439024, 0.03448276, 1.        , ..., 0.03225806, 0.02564103,
        0.02857143],
       ...,
       [0.02702703, 0.03703704, 0.03225806, ..., 1.        , 0.03448276,
        0.03448276],
       [0.02857143, 0.03030303, 0.02564103, ..., 0.03448276, 1.        ,
        0.02857143],
       [0.03225806, 0.04347826, 0.02857143, ..., 0.03448276, 0.02857143,
        1.        ]])

# demo

In [18]:
i = np.random.choice(range(len(df)))
print(i)
df.iloc[i].to_dict()

34221


{'posting_id': 'train_2918683619',
 'image': 'ffbf2266ff17f4bc336002a00d82b647.jpg',
 'image_phash': 'afde42252f25c992',
 'title': 'Sticker Star Wall Glow In The Dark | Stiker Bintang',
 'label_group': 3251156563}

In [19]:
grp = df.iloc[i].label_group

df[df.label_group==grp][['title', 'image_phash', 'label_group']]

Unnamed: 0,title,image_phash,label_group
10216,Hiasan Dinding 100 PC Bintang Star Cantik LED ...,afde42250f25c9d2,3251156563
34221,Sticker Star Wall Glow In The Dark | Stiker Bi...,afde42252f25c992,3251156563


In [20]:
ntop = 10

In [21]:
idx_sim = np.argsort(-hsims[i])[:ntop]
df.iloc[idx_sim][['title', 'image_phash', 'label_group']]

Unnamed: 0,title,image_phash,label_group
34221,Sticker Star Wall Glow In The Dark | Stiker Bi...,afde42252f25c992,3251156563
10216,Hiasan Dinding 100 PC Bintang Star Cantik LED ...,afde42250f25c9d2,3251156563
25022,[isi 3] GMY | Celana dalam gt man isi 3 | CD g...,bd9ad2340f64cdc2,1611718867
26825,Sisir Besi Butterfly Balisong Training Knife C...,abcc12952e69c89f,3226463072
18039,Porto Sepatu Sneakers Sports Pria AA001M Size ...,af96c06b3f359842,1206386793
31260,Implora Cheek & Liptint,abce96258525cc7a,3627744656
755,SOREX cd Basic Katun premium 1232 Soft Comfor...,bc9e813d1e65c9c2,1468444411
32742,Mainan Bola balon Basket karet murah,bdc2c20dcf34cd32,69923982
28809,KAIN KATUN TOYOBO ROYAL MIX PER 1 METER,add0d26f0d2dd0d2,2057938818
32405,McDoDo 1.8M Kabel Gaming Fast Charging 3.0 Mic...,be4e80b72561c3f2,1124629473


# save 

In [10]:
pout = f"{HOME}/data/dev0006"
!mkdir -p $pout

In [11]:
np.save(f"{pout}/hdists.npy", hdists)
np.save(f"{pout}/hsims.npy", hsims)