In [1]:
import numpy as np
import pandas as pd
import gc
import os
import sys
import yaml
import re
import random
import math
import matplotlib.pyplot as plt
import warnings
import pickle
from tqdm.notebook import tqdm
from typing import *
from pathlib import Path
from dataclasses import dataclass, field, asdict
from shutil import copyfile
from cuml.neighbors import NearestNeighbors
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml import PCA
from cuml.neighbors import NearestNeighbors
warnings.simplefilter('ignore')

In [2]:
# Resolve paths relative to the current working directory so the notebook
# does not depend on an absolute, machine-specific location.
base_dir = Path().resolve()
# Directory that holds `train_folds.csv`, which is read in the next cell.
data_dir = base_dir / '../data/tfrecord-skf'

In [3]:
# Function to get our f1 score
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of whitespace-separated id strings.

    Each string is split on whitespace and treated as a set of tokens.
    For every row the score is ``2 * |true & pred| / (|true| + |pred|)``.

    Args:
        y_true: pandas Series of strings holding the ground-truth ids.
        y_pred: pandas Series of strings holding the predicted ids.

    Returns:
        NumPy array with one F1 value per row (paired positionally).
    """
    true_sets = y_true.apply(lambda text: set(text.split()))
    pred_sets = y_pred.apply(lambda text: set(text.split()))

    # Size of the overlap for each positionally paired row.
    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])

    pred_sizes = pred_sets.apply(lambda s: len(s)).values
    true_sizes = true_sets.apply(lambda s: len(s)).values

    return 2 * overlap / (pred_sizes + true_sizes)

In [4]:
# Load the fold-annotated training table on the host (pandas) ...
df = pd.read_csv(data_dir / 'train_folds.csv')
# ... and build a cuDF copy from it; `df_cu.title` is what get_embeddings vectorizes.
df_cu = cudf.DataFrame(df)
df_cu.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches,f1,fold
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,0,train_129225211 train_2278313361,0.666667,1
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",1,train_3386243561 train_3423213080,0.666667,2
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2,train_2288590299 train_3803689425,0.666667,5
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,3,train_2406599165 train_3342059966,0.666667,7
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,4,train_3369186413 train_921438619,0.666667,9


In [5]:
def get_embeddings(df_cu, max_features: int, binary: bool = False):
    """Fit a TF-IDF vectorizer on ``df_cu.title`` and return the dense matrix.

    Args:
        df_cu: Frame exposing a ``title`` attribute with the text to vectorize.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
        binary: Forwarded to ``TfidfVectorizer(binary=...)``.

    Returns:
        The densified (``.toarray()``) TF-IDF matrix, one row per title.
    """
    vectorizer = TfidfVectorizer(
        stop_words='english',
        binary=binary,
        max_features=max_features,
    )
    # Densify immediately; the sparse intermediate is not kept around.
    embeddings = vectorizer.fit_transform(df_cu.title).toarray()
    print('text embeddings shape', embeddings.shape)

    # Only the matrix is needed by the caller, so release the fitted vectorizer.
    del vectorizer
    gc.collect()

    return embeddings


def get_neighbors(df, embeddings, thr):
    """Collect, for every row, the posting ids whose similarity exceeds ``thr``.

    Similarity is the dot product between embedding rows (this equals cosine
    similarity only if the rows are L2-normalised, which is assumed here and
    not checked). Rows are processed in chunks of 4096.

    Args:
        df: pandas DataFrame with a ``posting_id`` column; its length sets the
            number of rows processed.
        embeddings: CuPy 2-D array of row embeddings, positionally aligned with ``df``.
        thr: Strict lower bound on the similarity for a row to count as a match.

    Returns:
        List with one array of matching ``posting_id`` values per row of ``df``.
    """
    CHUNK = 1024 * 4
    n_rows = len(df)

    predictions = []
    # One progress tick per chunk of query rows.
    for start in tqdm(range(0, n_rows, CHUNK)):
        stop = min(start + CHUNK, n_rows)

        # Rows: the queries of this chunk; columns: every embedding.
        sims = cupy.matmul(embeddings, embeddings[start:stop].T).T

        for offset in range(stop - start):
            hits = cupy.where(sims[offset] > thr)[0]
            matched_ids = df.iloc[cupy.asnumpy(hits)].posting_id.values
            predictions.append(matched_ids)

    return predictions


def evaluate_by_cst(df, embeddings, verbose=False, thr_start=0, thr_end=1, thr_step=0.1):
    """Sweep similarity thresholds and report the one with the best mean F1.

    For each threshold in ``np.arange(thr_start, thr_end, thr_step)`` every row
    is matched with all rows whose dot-product similarity is strictly greater
    than the threshold. The matches are scored against ``df['matches']`` with
    ``f1_score``.

    Side effect: ``df`` is modified in place. Its ``pred_matches`` and ``f1``
    columns are overwritten on every threshold, so after the call they hold the
    values of the last threshold evaluated.

    Args:
        df: pandas DataFrame with ``posting_id`` and ``matches`` columns.
        embeddings: CuPy 2-D array of row embeddings, positionally aligned with ``df``.
        verbose: When true, print the mean F1 of every threshold.
        thr_start: First threshold (inclusive).
        thr_end: End of the threshold range (exclusive).
        thr_step: Spacing between thresholds.

    Returns:
        Tuple ``(best_score, best_threshold)``; on ties the first threshold wins.
    """
    CHUNK = 1024 * 4
    n_rows = len(embeddings)
    chunk_starts = range(0, n_rows, CHUNK)

    thresholds = np.arange(thr_start, thr_end, thr_step)
    scores = []
    for threshold in thresholds:
        predictions = []

        for start in chunk_starts:
            stop = min(start + CHUNK, n_rows)

            # Rows: the queries of this chunk; columns: every embedding.
            sims = cupy.matmul(embeddings, embeddings[start:stop].T).T

            for offset in range(stop - start):
                hits = cupy.where(sims[offset] > threshold)[0]
                matched = df['posting_id'].iloc[cupy.asnumpy(hits)].values
                predictions.append(' '.join(matched))

        df['pred_matches'] = predictions
        df['f1'] = f1_score(df['matches'], df['pred_matches'])
        mean_f1 = df['f1'].mean()
        scores.append(mean_f1)
        if verbose:
            print(f'Our f1 score for threshold {np.round(threshold, 3)} is {mean_f1}')

    results = pd.DataFrame({'thresholds': thresholds, 'scores': scores})

    # Keep every row that reaches the maximum, then take the first one.
    top_rows = results[results['scores'] == results['scores'].max()]
    best_threshold = top_rows['thresholds'].values[0]
    best_score = top_rows['scores'].values[0]
    print(f'Our best score is {best_score} and has a threshold {best_threshold}')

    return best_score, best_threshold


def evaluate_by_knn(df, embeddings, verbose=False, thr_start=0, thr_end=1, thr_step=0.1,
                    n_neighbors=50):
    """Sweep cosine-distance thresholds over a kNN graph and report the best mean F1.

    A ``NearestNeighbors(metric='cosine')`` model is fitted on ``embeddings``.
    For each threshold in ``np.arange(thr_start, thr_end, thr_step)`` every row
    is matched with those of its ``n_neighbors`` nearest neighbours whose
    distance is strictly smaller than the threshold. The matches are scored
    against ``df['matches']`` with ``f1_score``.

    The neighbour search does not depend on the threshold, so it is run once
    per chunk up front and reused by every threshold, instead of being repeated
    inside the threshold loop.

    Side effect: ``df`` is modified in place. Its ``pred_matches`` and ``f1``
    columns are overwritten on every threshold, so after the call they hold the
    values of the last threshold evaluated.

    Args:
        df: pandas DataFrame with ``posting_id`` and ``matches`` columns.
        embeddings: 2-D array of row embeddings, positionally aligned with ``df``.
        verbose: When true, print the mean F1 of every threshold.
        thr_start: First threshold (inclusive).
        thr_end: End of the threshold range (exclusive).
        thr_step: Spacing between thresholds.
        n_neighbors: Number of neighbours retrieved per row (default 50, the
            value that used to be hard-coded).

    Returns:
        Tuple ``(best_score, best_threshold)``; on ties the first threshold wins.
    """
    model = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
    model.fit(embeddings)

    CHUNK = 1024 * 4
    n_rows = len(embeddings)

    # Query the index once per chunk. Results are moved to host NumPy arrays so
    # they can be used to index pandas objects whatever array type cuML returns.
    neighbor_chunks = []
    for start in range(0, n_rows, CHUNK):
        stop = min(start + CHUNK, n_rows)
        distances, indices = model.kneighbors(embeddings[start:stop, :])
        neighbor_chunks.append((cupy.asnumpy(distances), cupy.asnumpy(indices)))

    scores = list()
    thresholds = np.arange(thr_start, thr_end, thr_step)
    for threshold in thresholds:
        predictions = []

        for distances, indices in neighbor_chunks:
            for k in range(len(distances)):
                idx = np.where(distances[k, ] < threshold)[0]
                ids = indices[k, ][idx]
                posting_ids = ' '.join(df['posting_id'].iloc[ids].values)
                predictions.append(posting_ids)

        df['pred_matches'] = predictions
        df['f1'] = f1_score(df['matches'], df['pred_matches'])
        score = df['f1'].mean()
        scores.append(score)
        if verbose:
            print(f'Our f1 score for threshold {np.round(threshold, 3)} is {score}')

    thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})

    # Keep every row that reaches the maximum, then take the first one.
    max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
    best_threshold = max_score['thresholds'].values[0]
    best_score = max_score['scores'].values[0]
    print(f'Our best score is {best_score} and has a threshold {best_threshold}')

    return best_score, best_threshold


def pca(embeddings, n_comp: int, white: bool):
    """Project ``embeddings`` onto their first ``n_comp`` principal components.

    Args:
        embeddings: 2-D array of row embeddings to reduce.
        n_comp: Number of principal components to keep.
        white: Whether to whiten the projected components. This flag used to be
            accepted but silently ignored; it is now forwarded to ``PCA`` as
            ``whiten``.

    Returns:
        The transformed embeddings returned by ``PCA.fit_transform``.
    """
    model = PCA(n_components=n_comp, whiten=white)
    reduced = model.fit_transform(embeddings)

    # Only the projected data is needed by the caller, so release the fitted model.
    del model
    gc.collect()

    return reduced


def database_augment(embeddings, n_aug):
    """Replace every embedding by a weighted sum of its ``n_aug`` most similar rows.

    Similarity is the dot product between rows (this equals cosine similarity
    only if the rows are L2-normalised, which is assumed here and not checked).
    The weights are ``np.logspace(0, -1.5, n_aug)``, so the most similar row
    gets weight 1 and later neighbours get geometrically smaller weights. The
    weights are not normalised. Each combined row is cast to float16.

    Args:
        embeddings: CuPy 2-D array with one embedding per row.
        n_aug: Number of top-similarity rows combined for each output row.

    Returns:
        CuPy array stacking the combined float16 rows, in input row order.
    """
    weights = cupy.array(np.logspace(0, -1.5, n_aug))

    CHUNK = 1024 * 4
    n_rows = embeddings.shape[0]

    augmented_rows = []
    # One progress tick per chunk of query rows.
    for start in tqdm(range(0, n_rows, CHUNK)):
        stop = min(start + CHUNK, n_rows)

        # Rows: the queries of this chunk; columns: every embedding.
        sims = cupy.matmul(embeddings, embeddings[start:stop].T).T

        for offset in range(stop - start):
            # Negate so that an ascending argsort yields the most similar rows first.
            top_idx = (-sims[offset]).argsort()[:n_aug]
            neighbors = embeddings[cupy.asnumpy(top_idx)]
            blended = cupy.dot(weights, neighbors).astype(np.float16)
            augmented_rows.append(blended)

    # Release the last chunk's temporaries before stacking the result.
    del sims, neighbors
    gc.collect()
    return cupy.vstack(augmented_rows)

In [10]:
# Binary TF-IDF title embeddings with the vocabulary capped at 25000 terms.
embeddings = get_embeddings(df_cu, 25000, True)

text embeddings shape (34250, 24939)


In [12]:
# Verbose similarity-threshold sweep with the default range (0.0 to 0.9, step 0.1).
# Note: this call overwrites df['pred_matches'] and df['f1'] in place.
evaluate_by_cst(df, embeddings, True)

Our f1 score for threshold 0.0 is 0.02170390994672997
Our f1 score for threshold 0.1 is 0.05344222111667425
Our f1 score for threshold 0.2 is 0.24263556975533523
Our f1 score for threshold 0.3 is 0.47216092903638296
Our f1 score for threshold 0.4 is 0.6088003645474411
Our f1 score for threshold 0.5 is 0.6584799980283037
Our f1 score for threshold 0.6 is 0.6508860684997824
Our f1 score for threshold 0.7 is 0.6139718474362906
Our f1 score for threshold 0.8 is 0.5674725762816865
Our f1 score for threshold 0.9 is 0.519655663267215
Our best score is 0.6584799980283037 and has a threshold 0.5


(0.6584799980283037, 0.5)

In [8]:
# Same binary TF-IDF embeddings, but with the vocabulary capped at 15000 terms.
# Rebinds `embeddings`, so later cells see whichever of the two cells ran last.
embeddings = get_embeddings(df_cu, 15000, True)

text embeddings shape (34250, 15000)


In [9]:
# Repeat the default verbose similarity-threshold sweep on the current `embeddings`.
evaluate_by_cst(df, embeddings, True)

Our f1 score for threshold 0.0 is 0.021563968554424412
Our f1 score for threshold 0.1 is 0.04680624182418409
Our f1 score for threshold 0.2 is 0.20693914051700124
Our f1 score for threshold 0.3 is 0.43043017949641477
Our f1 score for threshold 0.4 is 0.5840247777642689
Our f1 score for threshold 0.5 is 0.6548821701124466
Our f1 score for threshold 0.6 is 0.6608917181718009
Our f1 score for threshold 0.7 is 0.6307863367045677
Our f1 score for threshold 0.8 is 0.582342900802781
Our f1 score for threshold 0.9 is 0.5283465337900001
Our best score is 0.6608917181718009 and has a threshold 0.6000000000000001


(0.6608917181718009, 0.6000000000000001)

In [18]:
# NOTE(review): `dba_embeddings` is not defined in this cell because the line
# below is commented out. The cell relies on a value left in the kernel by
# another cell, so it is not reproducible under Restart & Run All. Confirm which
# n_aug produced the saved output.
# dba_embeddings = database_augment(embeddings, n_aug=4)
# Thresholds above 1 are swept here (1.1 to 1.275, step 0.025). Presumably this is
# because the augmented rows are un-normalised weighted sums, so their dot
# products can exceed 1.
evaluate_by_cst(df, dba_embeddings, True, 1.1, 1.3, 0.025)

Our f1 score for threshold 1.1 is 0.6740430718960063
Our f1 score for threshold 1.125 is 0.6756116106723131
Our f1 score for threshold 1.15 is 0.6765921954282385
Our f1 score for threshold 1.175 is 0.677230146038827
Our f1 score for threshold 1.2 is 0.6769511398421453
Our f1 score for threshold 1.225 is 0.675626948155604
Our f1 score for threshold 1.25 is 0.6742350342124448
Our f1 score for threshold 1.275 is 0.6716638576679795
Our best score is 0.677230146038827 and has a threshold 1.1749999999999998


(0.677230146038827, 1.1749999999999998)

In [14]:
# Verbose kNN cosine-distance sweep with the default range (0.0 to 0.9, step 0.1).
# NOTE(review): the saved output below ends in a NameError for `thresholds`.
# The evaluate_by_knn definition in this notebook does define `thresholds`, so
# that output was presumably recorded against an older definition. Re-run to refresh.
evaluate_by_knn(df, embeddings, True)

Our f1 score for threshold 0.0 is 0.22589668337160274
Our f1 score for threshold 0.1 is 0.519655663267215
Our f1 score for threshold 0.2 is 0.5674725762816865
Our f1 score for threshold 0.3 is 0.6139718474362906
Our f1 score for threshold 0.4 is 0.6508645672914471
Our f1 score for threshold 0.5 is 0.6584638058535407
Our f1 score for threshold 0.6 is 0.6089996426107022
Our f1 score for threshold 0.7 is 0.4740206364669603
Our f1 score for threshold 0.8 is 0.2653058228238367
Our f1 score for threshold 0.9 is 0.1690893093777188


NameError: name 'thresholds' is not defined

In [16]:
# Finer kNN sweep between 0.4 and 0.575 in steps of 0.025.
# NOTE(review): the saved output below also ends in a stale-looking NameError;
# re-run against the current evaluate_by_knn definition.
evaluate_by_knn(df, embeddings, True, 0.4, 0.6, 0.025)

Our f1 score for threshold 0.4 is 0.6508645672914471
Our f1 score for threshold 0.425 is 0.6567669084635231
Our f1 score for threshold 0.45 is 0.6603468323297726
Our f1 score for threshold 0.475 is 0.6606102465716948
Our f1 score for threshold 0.5 is 0.6584638058535407
Our f1 score for threshold 0.525 is 0.6527621992402718
Our f1 score for threshold 0.55 is 0.6423353043402702
Our f1 score for threshold 0.575 is 0.6284139827312539


NameError: name 'thresholds' is not defined

In [18]:
# Compare neighbourhood sizes for database augmentation: rebuild the augmented
# embeddings for each n_aug and run a verbose kNN sweep over thresholds
# 0.0 to 0.5 (step 0.1). After the loop `dba_embeddings` holds the n_aug=5 result.
for n_aug in [2, 3, 4, 5]:
    dba_embeddings = database_augment(embeddings, n_aug=n_aug)
    evaluate_by_knn(df, dba_embeddings, True, 0, 0.6, 0.1)

  0%|          | 0/9 [00:00<?, ?it/s]

Our f1 score for threshold 0.0 is 0.13721999416614566
Our f1 score for threshold 0.1 is 0.5240900275190714
Our f1 score for threshold 0.2 is 0.5755170537607047
Our f1 score for threshold 0.3 is 0.6226671969984707
Our f1 score for threshold 0.4 is 0.6596576305309454
Our f1 score for threshold 0.5 is 0.6627428546438634
Our best score is 0.6627428546438634 and has a threshold 0.5


  0%|          | 0/9 [00:00<?, ?it/s]

Our f1 score for threshold 0.0 is 0.1361987150703364
Our f1 score for threshold 0.1 is 0.5510136463419336
Our f1 score for threshold 0.2 is 0.6188443355035681
Our f1 score for threshold 0.3 is 0.6653831061019478
Our f1 score for threshold 0.4 is 0.6821859477683
Our f1 score for threshold 0.5 is 0.6588236620025603
Our best score is 0.6821859477683 and has a threshold 0.4


  0%|          | 0/9 [00:00<?, ?it/s]

Our f1 score for threshold 0.0 is 0.1283022208579989
Our f1 score for threshold 0.1 is 0.5912237146485838
Our f1 score for threshold 0.2 is 0.6585533879221791
Our f1 score for threshold 0.3 is 0.682850215600553
Our f1 score for threshold 0.4 is 0.6751296518599637
Our f1 score for threshold 0.5 is 0.630763817126077
Our best score is 0.682850215600553 and has a threshold 0.30000000000000004


  0%|          | 0/9 [00:00<?, ?it/s]

Our f1 score for threshold 0.0 is 0.12879473917238793
Our f1 score for threshold 0.1 is 0.6223800072039957
Our f1 score for threshold 0.2 is 0.6762265881673074
Our f1 score for threshold 0.3 is 0.6813847072979412
Our f1 score for threshold 0.4 is 0.6540300199510073
Our f1 score for threshold 0.5 is 0.5963890233219248
Our best score is 0.6813847072979412 and has a threshold 0.30000000000000004


In [19]:
# Rebuild the augmented embeddings with n_aug=4, then refine the kNN threshold
# between 0.2 and 0.375 in steps of 0.025.
dba_embeddings = database_augment(embeddings, n_aug=4)
evaluate_by_knn(df, dba_embeddings, True, 0.2, 0.4, 0.025)

  0%|          | 0/9 [00:00<?, ?it/s]

Our f1 score for threshold 0.2 is 0.6585533879221791
Our f1 score for threshold 0.225 is 0.6671140237260946
Our f1 score for threshold 0.25 is 0.6747279729783485
Our f1 score for threshold 0.275 is 0.6795177137129732
Our f1 score for threshold 0.3 is 0.682850215600553
Our f1 score for threshold 0.325 is 0.6845803454287234
Our f1 score for threshold 0.35 is 0.6835199923413185
Our f1 score for threshold 0.375 is 0.6796461835753745
Our best score is 0.6845803454287234 and has a threshold 0.32499999999999996


(0.6845803454287234, 0.32499999999999996)

  0%|          | 0/9 [00:00<?, ?it/s]

In [None]:
# Default-range kNN sweep on whatever `dba_embeddings` currently holds in the kernel.
# NOTE(review): verbose defaults to False, yet the saved output shows per-threshold
# lines. That output was presumably produced by a different version of this cell
# or of the function; re-run.
evaluate_by_knn(df, dba_embeddings)

Our f1 score for threshold 0.0 is 0.13982649415880108
Our f1 score for threshold 0.1 is 0.5262600991919504
Our f1 score for threshold 0.2 is 0.5795658546877683
Our f1 score for threshold 0.3 is 0.6263165665425965
Our f1 score for threshold 0.4 is 0.6525038806744203
Our f1 score for threshold 0.5 is 0.6386572202883374
Our f1 score for threshold 0.6 is 0.5615881241284257


text embeddings shape (34250, 24939)


In [12]:
# Default-range kNN sweep on `embeddings.get()`. Presumably `embeddings` is a
# CuPy array here, in which case `.get()` returns a host NumPy copy. TODO confirm.
evaluate_by_knn(df, embeddings.get())

Our f1 score for threshold 0.0 is 0.22647574858644065
Our f1 score for threshold 0.1 is 0.5218659686112015
Our f1 score for threshold 0.2 is 0.5711856965616262
Our f1 score for threshold 0.3 is 0.6179961493705737
Our f1 score for threshold 0.4 is 0.6453769226657332
Our f1 score for threshold 0.5 is 0.6366735871210853
Our f1 score for threshold 0.6 is 0.5645503903360405
Our f1 score for threshold 0.7 is 0.4200496406578969
Our f1 score for threshold 0.8 is 0.23924797366928147
Our f1 score for threshold 0.9 is 0.1675909071233767


In [31]:
# NOTE(review): `reduced_embeddings` is not defined in any cell visible in this
# notebook (presumably it is the output of pca(...)). This cell depends on hidden
# kernel state and will fail on a fresh kernel.
# NOTE(review): `.get()` presumably yields a host NumPy array, while
# database_augment uses cupy.matmul on its input. Confirm this combination still works.
for n_aug in [2, 3, 4]:
    emb = database_augment(reduced_embeddings.get(), n_aug)
    evaluate_by_knn(df, emb)

Our f1 score for threshold 0.0 is 0.1748018195626768
Our f1 score for threshold 0.1 is 0.5546234806194882
Our f1 score for threshold 0.2 is 0.610764417071245
Our f1 score for threshold 0.3 is 0.6364847301378191
Our f1 score for threshold 0.4 is 0.6188215230696176
Our f1 score for threshold 0.5 is 0.5495850365348961
Our f1 score for threshold 0.6 is 0.4279197346084608
Our f1 score for threshold 0.7 is 0.28269289712171003
Our f1 score for threshold 0.8 is 0.18201920554358691
Our f1 score for threshold 0.9 is 0.165951774064816
Our f1 score for threshold 0.0 is 0.17429207040444483
Our f1 score for threshold 0.1 is 0.5852107090801242
Our f1 score for threshold 0.2 is 0.6379684915971255
Our f1 score for threshold 0.3 is 0.6424564199489113
Our f1 score for threshold 0.4 is 0.606578511766547
Our f1 score for threshold 0.5 is 0.5267126581609337
Our f1 score for threshold 0.6 is 0.41040511578503797
Our f1 score for threshold 0.7 is 0.27895245390284384
Our f1 score for threshold 0.8 is 0.18490291

In [19]:
# Manual threshold sweep (0.2, 0.4, 0.6, 0.8) using get_neighbors directly.
# NOTE(review): `reduced_embeddings` is not defined in any visible cell, so this
# relies on hidden kernel state.
for thr in np.arange(0.2, 1, 0.2):
    predictions = get_neighbors(df, reduced_embeddings, thr)
    # get_neighbors returns one array of ids per row; join each into the
    # space-separated string format that f1_score expects.
    df['pred_matches'] = list(map(lambda x: ' '.join(x), predictions))
    df['f1'] = f1_score(df['matches'], df['pred_matches'])
    print(f"threshold={np.round(thr, 3)}: {df['f1'].mean()}")

  0%|          | 0/9 [00:00<?, ?it/s]

threshold=0.2: 0.19789823267595805


  0%|          | 0/9 [00:00<?, ?it/s]

threshold=0.4: 0.5356746950309523


  0%|          | 0/9 [00:00<?, ?it/s]

threshold=0.6: 0.5579984684456785


  0%|          | 0/9 [00:00<?, ?it/s]

threshold=0.8: 0.31206072376184635
