# setup

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from efficientnet_pytorch import EfficientNet
from PIL import Image
from torchvision import transforms

from transformers import AutoModel, AutoTokenizer
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
import torch, gc

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

HOME = '/data/git/shopee-product-matching'
pdata = f'{HOME}/data/shopee-product-matching'
!ls $pdata

BS = 256
NWKRS = 8
DEVICE = 'cuda'
PIN_MEMORY = True
MAXLEN = 128

sample_submission.csv  test.csv  test_images  train.csv  train_images


# data

In [2]:
fnm = 'train'
df = pd.read_csv(f'{pdata}/{fnm}.csv')
assert len(df) == df.posting_id.nunique()
p_imgs = f"{pdata}/train_images"
df.head(2)

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045


# hash

In [85]:
def hamming(x1, x2): 
    x1,x2 = (bin(int(x, 16)) for x in (x1, x2))
    return sum(o1!=o2 for o1,o2 in zip(x1,x2))
n = 5
hdists = np.zeros((n, n))
for i in tqdm(range(n)):
    for j in range(i, n):
        x1 = df.image_phash[i]
        x2 = df.image_phash[j]
        hdists[i,j]=hdists[j,i]=hamming(x1, x2)
hdists

100%|██████████| 5/5 [00:00<00:00, 7845.69it/s]


array([[ 0., 42., 40., 30., 32.],
       [42.,  0., 28., 28., 34.],
       [40., 28.,  0., 34., 44.],
       [30., 28., 34.,  0., 34.],
       [32., 34., 44., 34.,  0.]])

In [88]:
hfeats = df[:5].image_phash.apply(lambda x: int(x, 16)).values

In [89]:
# https://leetcode.com/problems/hamming-distance/discuss/1105813/Python3-with-bit-manipulation-faster-than-99.96.
def hammingDistance(x: int, y: int) -> int:
    x,y=int(x),int(y)
    n=x^y     #taking XOR of inputs to capture the difference(at bit level) between them.
    counter=0
    while(n):           #to count the set bits
        counter+=1
        n&=(n-1)
    return counter



In [90]:
n = len(hfeats)
hdists = np.zeros((n, n))
for i in tqdm(range(n)):
    for j in range(i, n):
        x1 = hfeats[i]
        x2 = hfeats[j]
        hdists[i,j]=hdists[j,i]=hammingDistance(x1, x2)
        
hdists

100%|██████████| 5/5 [00:00<00:00, 18078.90it/s]


array([[ 0., 42., 40., 30., 32.],
       [42.,  0., 28., 28., 34.],
       [40., 28.,  0., 34., 44.],
       [30., 28., 34.,  0., 34.],
       [32., 34., 44., 34.,  0.]])

In [91]:
pairwise_distances(hfeats[:,None], metric=hammingDistance)

array([[ 0., 37., 35., 26., 25.],
       [37.,  0., 22., 23., 26.],
       [35., 22.,  0., 25., 34.],
       [26., 23., 25.,  0., 29.],
       [25., 26., 34., 29.,  0.]])

In [148]:
def hammingDistance(x, y):
    print(x, y)
    x,y = x.astype('uint64'),y.astype('uint64')
    print(x, y)
    n=np.bitwise_xor(x,y)     #taking XOR of inputs to capture the difference(at bit level) between them.
    counter=0
    while(n):           #to count the set bits
        counter+=1
        n&=(n-1)
    return counter


In [149]:
hfeats.dtype

dtype('uint64')

In [152]:
hfeats

array([10707114133977572403, 12627975023471333135, 13352240572977713016,
        9589566965408506499, 12029987587489624204], dtype=uint64)

In [150]:
pairwise_distances(hfeats[:,None], metric=hammingDistance)

[1.07071141e+19] [1.2627975e+19]
[10707114133977573376] [12627975023471333376]
[1.07071141e+19] [1.33522406e+19]
[10707114133977573376] [13352240572977713152]
[1.07071141e+19] [9.58956697e+18]
[10707114133977573376] [9589566965408505856]
[1.07071141e+19] [1.20299876e+19]
[10707114133977573376] [12029987587489624064]
[1.2627975e+19] [1.33522406e+19]
[12627975023471333376] [13352240572977713152]
[1.2627975e+19] [9.58956697e+18]
[12627975023471333376] [9589566965408505856]
[1.2627975e+19] [1.20299876e+19]
[12627975023471333376] [12029987587489624064]
[1.33522406e+19] [9.58956697e+18]
[13352240572977713152] [9589566965408505856]
[1.33522406e+19] [1.20299876e+19]
[13352240572977713152] [12029987587489624064]
[9.58956697e+18] [1.20299876e+19]
[9589566965408505856] [12029987587489624064]
[1.07071141e+19] [1.07071141e+19]
[10707114133977573376] [10707114133977573376]
[1.2627975e+19] [1.2627975e+19]
[12627975023471333376] [12627975023471333376]
[1.33522406e+19] [1.33522406e+19]
[133522405729777

array([[ 0., 37., 35., 26., 25.],
       [37.,  0., 22., 23., 26.],
       [35., 22.,  0., 25., 34.],
       [26., 23., 25.,  0., 29.],
       [25., 26., 34., 29.,  0.]])