In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import cv2
import sklearn

from tqdm.notebook import tqdm


In [2]:
path = 'data/'
train_path = 'data/train_images/'
test_path = 'data/test_images/'

In [3]:
traindf = pd.read_csv(path + 'train.csv')

In [4]:
traindf.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069


In [5]:
print(f"Train data shape: {traindf.shape}")
print(f"N_unique label_group: {traindf.label_group.nunique()}")

Train data shape: (34250, 5)
N_unique label_group: 11014


In [6]:
traindf['image'] = train_path + traindf['image']

In [7]:
traindf = traindf.drop_duplicates(subset=['image']).reset_index()

In [8]:
tmp = traindf.groupby('label_group').posting_id.unique().to_dict()
traindf['target'] = traindf.label_group.map(tmp)

In [9]:
tmp = traindf.groupby('image_phash').posting_id.unique().to_dict()
traindf['phash_dups'] = traindf.image_phash.map(tmp)

In [10]:
def combine(row):
    x = np.concatenate([row.cnn, row.nlp])
    return np.unique(x)

**Baseline** 

In [11]:
def getMetric(col):
    def f1score(row):
        n = len(np.intersect1d(row.target, row[col]))
        return 2 * n / (len(row.target) + len(row[col]))
    return f1score

In [12]:
traindf['f1'] = traindf.apply(getMetric('phash_dups'), axis=1)
print('Baseline score =', traindf.f1.mean())

Baseline score = 0.5446525227019672


**CNN**

In [13]:
from PIL import Image

import torch
torch.manual_seed(1010)


import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data.dataset import Dataset

import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets

from efficientnet_pytorch import EfficientNet



In [10]:
class ShopeeDataset(Dataset):
    def __init__(self, img_path, transform):
        self.img_path = img_path
        self.transform = transform
        
    def __getitem__(self, idx):
        img = Image.open(self.img_path[idx]).convert('RGB')
        img = self.transform(img)
        return img
    
    def __len__(self):
        return len(self.img_path)

In [11]:
shopee_dataset = ShopeeDataset(
    traindf.image.values,
    transforms.Compose([
        transforms.Resize((512, 512)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
)

shopee_loader = torch.utils.data.DataLoader(
    shopee_dataset,
    batch_size = 10,
    shuffle = False,
    num_workers = 0
)

In [12]:
class ShopeeNet(nn.Module):
    def __init__(self):
        super(ShopeeNet, self).__init__()
        

#         model = models.resnet50(True)
#         model.avgpool = nn.AdaptiveMaxPool2d(output_size=(1, 1))
#         model = nn.Sequential(*list(model.children())[:-1])
#         model.eval()

        model = EfficientNet.from_pretrained('efficientnet-b0')
        model.eval()
#         model = nn.Sequential(*list(model.children())[:-3])
        self.model = model
        
    def forward(self, img):
#         out = self.model(img)
        out = self.model.extract_features(img)
        out = self.model._avg_pooling(out)
        return out
        

In [13]:
shopee_model = ShopeeNet()
shopee_model = shopee_model.to('cuda')

Loaded pretrained weights for efficientnet-b0


In [71]:
embedings = []
with torch.no_grad():
    for data in tqdm(shopee_loader):
        data = data.to('cuda')
        emb = shopee_model(data)
        emb = emb.reshape(emb.shape[0], emb.shape[1])
        emb = emb.data.cpu().numpy()
        
        embedings.append(emb)

  0%|          | 0/3242 [00:00<?, ?it/s]

In [13]:
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
# from sklearn.cluster import DBSCAN
# from sklearn.cluster import OPTICS


In [72]:
embedings = np.vstack(embedings)
embedings = normalize(embedings)

In [14]:
import joblib

In [15]:
embedings = joblib.load("b0_eval.pkl")

In [73]:
joblib.dump(embedings, 'b0_eval.pkl')

['b0_eval.pkl']

In [21]:
KNN = 50
model = NearestNeighbors(n_neighbors=KNN)
model.fit(embedings)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=50, p=2,
                 radius=1.0)

In [22]:
distances, indices = model.kneighbors(embedings)

In [44]:
distances.max()

1.2164419893127687

In [46]:
# Find best threshold for KNN distance
thresholds = list(np.arange(0.5, 0.6, 0.02))
scores = []
for th in thresholds:
    preds = []
    for dist, idx in tqdm(zip(distances, indices)):
        o = traindf.iloc[idx[dist < th]].posting_id.values
        preds.append(o)
        
    traindf[f"knn_th{th}"] = preds
    traindf[f"f1_knn_th{th}"] = traindf.apply(getMetric(f"knn_th{th}"), axis=1)
    score = traindf[f'f1_knn_th{th}'].mean()
    scores.append(score)
    print (f"Threshold: {th}  Score : {score}")
    
print(f"Best threshold: {thresholds[np.argmax(scores)]} Score: {max(scores)}")

0it [00:00, ?it/s]

Threshold: 0.5  Score : 0.6644833684140724


0it [00:00, ?it/s]

Threshold: 0.52  Score : 0.6665841866164229


0it [00:00, ?it/s]

Threshold: 0.54  Score : 0.6675262906041246


0it [00:00, ?it/s]

Threshold: 0.56  Score : 0.6666021881127469


0it [00:00, ?it/s]

Threshold: 0.5800000000000001  Score : 0.6629005921444882
Best threshold: 0.54 Score: 0.6675262906041246


In [51]:
# Apply best threshold
preds = []
for dist, idx in tqdm(zip(distances, indices)):
    o = traindf.iloc[idx[dist < 0.54]].posting_id.values
    preds.append(o)
    

0it [00:00, ?it/s]

In [52]:
traindf['cnn_knn'] = preds
traindf['f1_cnn_knn'] = traindf.apply(getMetric('cnn_knn'), axis=1)
print('KNN score =', traindf.f1_cnn_knn.mean())

KNN score = 0.6675262906041246


In [46]:
distances_cnn = np.matmul(embedings, embedings.T).T

In [31]:
# Find best threshold for CNN distance
thresholds = list(np.arange(0.78, 0.91, 0.02))
scores = []
for th in thresholds:
    preds = []
    for k in tqdm(range(distances_cnn.shape[0])):
        idx = np.where(distances_cnn[k] > th)[0]
        o = traindf.iloc[idx].posting_id.values
        preds.append(o)
        
    traindf[f"cnn_th{th}"] = preds
    traindf[f"f1_cnn_th{th}"] = traindf.apply(getMetric(f"cnn_th{th}"), axis=1)
    score = traindf[f'f1_cnn_th{th}'].mean()
    scores.append(score)
    print (f"Threshold: {th}  Score : {score}")
    
print(f"Best threshold: {thresholds[np.argmax(scores)]} Score: {max(scores)}")
   

  0%|          | 0/32412 [00:00<?, ?it/s]

Threshold: 0.78  Score : 0.5974356716876668


  0%|          | 0/32412 [00:00<?, ?it/s]

Threshold: 0.8  Score : 0.6341609946778889


  0%|          | 0/32412 [00:00<?, ?it/s]

Threshold: 0.8200000000000001  Score : 0.6558892474205212


  0%|          | 0/32412 [00:00<?, ?it/s]

Threshold: 0.8400000000000001  Score : 0.6659086256822586


  0%|          | 0/32412 [00:00<?, ?it/s]

Threshold: 0.8600000000000001  Score : 0.6670456880086131


  0%|          | 0/32412 [00:00<?, ?it/s]

Threshold: 0.8800000000000001  Score : 0.6632291752465692


  0%|          | 0/32412 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [47]:
#Apply best threshold
preds = []
for k in tqdm(range(distances_cnn.shape[0])):
    idx = np.where(distances_cnn[k] > 0.85)[0]
    o = traindf.iloc[idx].posting_id.values
    preds.append(o)

  0%|          | 0/32412 [00:00<?, ?it/s]

In [48]:
traindf['cnn'] = preds

traindf['f1_cnn'] = traindf.apply(getMetric('cnn'), axis=1)
print('CNN score =', traindf.f1_cnn.mean())

CNN score = 0.66711318104611


**NLP**

In [18]:
import re,string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
def clean_title(row):
    title = row.title
    stop = stopwords.words('english')
    title = [x for x in title.split() if not x in stop]
    title = " ".join(title)
    title = title.lower()
    title = re.sub(r"\-","",title)
    title = re.sub(r"\+","",title)
    title = re.sub (r"&","and",title)
    title = re.sub(r"\|","",title)
    title = re.sub(r"\\","",title)
    title = re.sub(r"\W"," ",title)
    for p in string.punctuation :
        title = re.sub(r"f{p}","",title)
    
    title = re.sub(r"\s+"," ",title)
    
    return title

In [20]:
traindf['title_cleaned'] = traindf.apply(clean_title, axis=1)

In [21]:
tfidf = TfidfVectorizer(stop_words=None, binary=True)
text_embeddings = tfidf.fit_transform(traindf.title_cleaned).toarray()

In [13]:
pca = PCA(n_components=5000)
text_embeddings = pca.fit_transform(text_embeddings)
print('text embeddings shape',text_embeddings.shape)

text embeddings shape (32412, 5000)


In [20]:
pca_embs = text_embeddings

In [32]:
import gc
del tfidf, text_embeddings, pca
gc.collect()

61

In [25]:
distances_tfidf = np.matmul(text_embeddings, text_embeddings.T).T

In [24]:
thresholds = list(np.arange(0.50, 0.91, 0.02))
scores = []
for th in thresholds:
    preds = []
    for k in tqdm(range(distances_tfidf.shape[0])):
        idx = np.where(distances_tfidf[k] > th)[0]
        o = traindf.iloc[idx].posting_id.values
        preds.append(o)
        
    traindf[f"nlp_th{th}"] = preds
    traindf[f"f1_nlp_th{th}"] = traindf.apply(getMetric(f"nlp_th{th}"), axis=1)
    score = traindf[f'f1_nlp_th{th}'].mean()
    scores.append(score)
    print (f"Threshold: {th}  Score : {score}")
    
print(f"Best threshold: {thresholds[np.argmax(scores)]} Score: {max(scores)}")

  0%|          | 0/32412 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [26]:
#Apply best threshold
preds = []
for k in tqdm(range(distances_tfidf.shape[0])):
    idx = np.where(distances_tfidf[k] > 0.54)[0]
    o = traindf.iloc[idx].posting_id.values
    preds.append(o)

  0%|          | 0/32412 [00:00<?, ?it/s]

In [27]:
traindf['nlp'] = preds

traindf['f1_nlp'] = traindf.apply(getMetric('nlp'), axis=1)
print('NLP score =', traindf.f1_nlp.mean())

NLP score = 0.6665377543149971


In [54]:
KNN = 50
model = NearestNeighbors(n_neighbors=KNN)
model.fit(text_embeddings)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=50, p=2,
                 radius=1.0)

In [None]:
distances_text, indices_text = model.kneighbors(text_embeddings)

In [None]:
thresholds = list(np.arange(0.5, 0.6, 0.02))
scores = []
for th in thresholds:
    preds = []
    for dist, idx in tqdm(zip(distances_text, indices_text)):
        o = traindf.iloc[idx[dist < th]].posting_id.values
        preds.append(o)
        
    traindf[f"nlp_knn_th{th}"] = preds
    traindf[f"f1_nlp_knn_th{th}"] = traindf.apply(getMetric(f"nlp_knn_th{th}"), axis=1)
    score = traindf[f'f1_nlp_knn_th{th}'].mean()
    scores.append(score)
    print (f"Threshold: {th}  Score : {score}")
    
print(f"Best threshold: {thresholds[np.argmax(scores)]} Score: {max(scores)}")

In [49]:
traindf['matches'] = traindf.apply(combine, axis=1)

In [50]:
traindf['f1'] = traindf.apply(getMetric('matches'), axis=1)
print ('CNN + NLP score =', traindf.f1.mean())

CNN + NLP score = 0.73150613391276


In [53]:
import gc 
del distances_cnn, distances_tfidf
gc.collect()

2756

**Clustering**

In [55]:
clustering_DBSCAN = DBSCAN(eps=0.07, min_samples=2, metric='cosine').fit(embedings)


In [56]:
traindf['clusters_dbscan'] = clustering_DBSCAN.labels_
clustered = (traindf['clusters_dbscan'] != -1)
tmp = traindf.loc[clustered].groupby('clusters_dbscan').posting_id.unique().to_dict()
tmp[-1] = []
for key, value in tmp.items():
    if len(value) > 50:
        tmp[key] = value[:50]

        
traindf['cnn_dbscan'] = traindf['clusters_dbscan'].map(tmp)
traindf['f1_dbscan'] = traindf.apply(getMetric('cnn_dbscan'), axis=1)
print('CNN DBSCAN score =', traindf.f1_dbscan.mean())

CNN DBSCAN score = 0.4150762569220733


In [57]:
clustering_OPTICS = OPTICS(min_samples=2, max_eps=0.05, metric='cosine').fit(embedings)

In [58]:
traindf['clusters_optics'] = clustering_OPTICS.labels_
clustered = (traindf['clusters_optics'] != -1)
tmp = traindf.loc[clustered].groupby('clusters_optics').posting_id.unique().to_dict()
tmp[-1] = []
for key, value in tmp.items():
    if len(value) > 50:
        tmp[key] = value[:50]

        
traindf['cnn_optics'] = traindf['clusters_optics'].map(tmp)
traindf['f1_optics'] = traindf.apply(getMetric('cnn_optics'), axis=1)
print('CNN OPTICS score =', traindf.f1_optics.mean())

CNN OPTICS score = 0.33722592912940347


**OCR**

In [16]:
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [17]:
ocr_text = []
for i in tqdm(range(traindf.shape[0])):
    img = cv2.imread(traindf['image'].iloc[i])
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    text = pytesseract.image_to_string(img)
    text = " ".join(text.split())
    if len(text) != 0:
        ocr_text.append(text)
    else:
        ocr_text.append('-1')

traindf['ocr_text'] = ocr_text

  0%|          | 0/32412 [00:00<?, ?it/s]

KeyboardInterrupt: 