In [1]:
import numpy as np
import pandas as pd
import gc
import os
import sys
import yaml
import re
import random
import math
import matplotlib.pyplot as plt
import warnings
import pickle
from tqdm.notebook import tqdm
from typing import *
from pathlib import Path
from dataclasses import dataclass, field, asdict
from shutil import copyfile
from cuml.neighbors import NearestNeighbors
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml import PCA
from cuml.neighbors import NearestNeighbors
warnings.simplefilter('ignore')

In [2]:
# Anchor every path on the resolved working directory; the fold data lives in a
# sibling `data` folder addressed relative to it.
base_dir = Path().resolve()
data_dir = base_dir.joinpath('../data/tfrecord-skf')

In [3]:
# Function to get our f1 score
def f1_score(y_true, y_pred):
    """Row-wise F1 between two series of whitespace-separated id strings.

    Each entry of both series is split on whitespace and treated as a set.
    For every row the score is ``2 * |true & pred| / (|true| + |pred|)``.

    Parameters
    ----------
    y_true, y_pred : pandas.Series of str
        Ground-truth and predicted matches, aligned row by row.

    Returns
    -------
    numpy.ndarray
        One F1 value per row (not the mean).
    """
    def to_id_set(text):
        return set(text.split())

    true_sets = y_true.apply(to_id_set)
    pred_sets = y_pred.apply(to_id_set)

    # Size of the overlap for each (truth, prediction) pair.
    overlap = np.array([len(truth & guess) for truth, guess in zip(true_sets, pred_sets)])

    pred_sizes = pred_sets.apply(len).values
    true_sizes = true_sets.apply(len).values
    return 2 * overlap / (pred_sizes + true_sizes)

In [4]:
# Read the fold-annotated training table from `data_dir` into pandas.
df = pd.read_csv(data_dir / 'train_folds.csv')
# Build a cuDF copy of the same table; it is what `get_embeddings` receives below.
df_cu = cudf.DataFrame(df)
# NOTE(review): the rendered preview shows an 'Unnamed: 0' column — presumably an
# index that was saved when the CSV was written; confirm whether `index_col=0`
# should be passed to `read_csv`.
df_cu.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches,f1,fold
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,0,train_129225211 train_2278313361,0.666667,1
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",1,train_3386243561 train_3423213080,0.666667,2
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2,train_2288590299 train_3803689425,0.666667,5
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,3,train_2406599165 train_3342059966,0.666667,7
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,4,train_3369186413 train_921438619,0.666667,9


In [28]:
def get_embeddings(df_cu, max_features: int, binary: bool = False):
    """Fit a TF-IDF vectoriser on the titles and return the dense embedding matrix.

    Parameters
    ----------
    df_cu : DataFrame exposing a ``title`` column
        The titles to vectorise (the notebook passes a cuDF DataFrame).
    max_features : int
        Vocabulary size cap forwarded to ``TfidfVectorizer``.
    binary : bool, default False
        Forwarded to ``TfidfVectorizer``; when True term counts are binarised
        before the TF-IDF weighting.

    Returns
    -------
    The result of ``fit_transform(...).toarray()``: one row per title.
    """
    # Forward the caller's settings; previously both arguments were ignored in
    # favour of hard-coded values (binary=False, max_features=25000).
    model = TfidfVectorizer(stop_words='english', binary=binary, max_features=max_features)
    embeddings = model.fit_transform(df_cu.title).toarray()
    print('text embeddings shape', embeddings.shape)

    # Drop the fitted vectoriser right away; only the matrix is needed afterwards.
    del model
    gc.collect()

    return embeddings


def get_neighbors(df, embeddings, thr):
    """Collect, for every row, the posting ids whose similarity exceeds ``thr``.

    Similarity is the plain dot product between embedding rows, computed for
    4096 query rows at a time so the full N x N matrix is never materialised.
    NOTE(review): a dot product equals cosine similarity only when the rows are
    unit-norm — confirm that the embeddings passed in are normalised.

    Parameters
    ----------
    df : DataFrame with a ``posting_id`` column, row-aligned with ``embeddings``.
    embeddings : 2-D array (one row per posting) accepted by ``cupy.matmul``.
    thr : float
        Strict lower bound on the similarity for a posting to count as a match.

    Returns
    -------
    list
        One entry per row of ``df``, in order: the ``posting_id`` values of all
        rows whose similarity to that row is greater than ``thr``.
    """
    CHUNK = 1024 * 4
    total_rows = len(df)

    # Ceiling division: a trailing partial chunk still needs its own pass.
    n_chunks, leftover = divmod(total_rows, CHUNK)
    if leftover != 0:
        n_chunks += 1

    predictions = []
    for chunk_no in tqdm(range(n_chunks)):
        start = chunk_no * CHUNK
        stop = min(start + CHUNK, total_rows)

        # Rows of `sims` are the queries in this chunk, columns are all postings.
        sims = cupy.matmul(embeddings, embeddings[start:stop].T).T

        for row in range(stop - start):
            hits = cupy.where(sims[row,] > thr)[0]
            matched_ids = df.iloc[cupy.asnumpy(hits)].posting_id.values
            predictions.append(matched_ids)

    return predictions


def evaluate_by_knn(df, embeddings, n_neighbors: int = 50, thresholds=None):
    """Score k-NN match predictions over a sweep of cosine-distance thresholds.

    A ``NearestNeighbors`` model (cosine metric) is fitted on ``embeddings`` and
    queried with the same rows. For each threshold, a row's prediction is the
    space-joined ``posting_id`` of every retrieved neighbour whose distance is
    strictly below the threshold; predictions are scored against
    ``df['matches']`` with ``f1_score`` and the mean is printed.

    Side effect: ``df['pred_matches']`` and ``df['f1']`` are overwritten on every
    threshold, so after the call they hold the values for the last threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``posting_id`` and ``matches`` and be row-aligned with
        ``embeddings``.
    embeddings : 2-D array, one row per posting.
    n_neighbors : int, default 50
        Number of neighbours retrieved per row; clamped to the number of rows
        so small inputs can be evaluated.
    thresholds : iterable of float, optional
        Distance cut-offs to evaluate. Defaults to ``np.arange(0, 1, 0.1)``.

    Returns
    -------
    list of (float, float)
        ``(threshold, mean_f1)`` pairs in evaluation order.
    """
    if thresholds is None:
        thresholds = np.arange(0, 1, 0.1)

    n_rows = embeddings.shape[0]
    # A query cannot return more neighbours than there are indexed rows.
    k = min(n_neighbors, n_rows)

    model = NearestNeighbors(n_neighbors=k, metric='cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Loop-invariant: extract the id column once instead of once per row per threshold.
    all_posting_ids = df['posting_id'].values

    scores = []
    for threshold in thresholds:
        predictions = []
        for row in range(n_rows):
            keep = np.where(distances[row] < threshold)[0]
            neighbour_rows = indices[row][keep]
            predictions.append(' '.join(all_posting_ids[neighbour_rows]))
        df['pred_matches'] = predictions
        df['f1'] = f1_score(df['matches'], df['pred_matches'])
        score = df['f1'].mean()
        print(f'Our f1 score for threshold {np.round(threshold, 2)} is {score}')
        scores.append((float(threshold), float(score)))

    return scores


def pca(embeddings, n_comp: int, white: bool):
    """Project ``embeddings`` onto ``n_comp`` principal components.

    Parameters
    ----------
    embeddings : 2-D array accepted by ``PCA.fit_transform``.
    n_comp : int
        Number of components to keep.
    white : bool
        Forwarded as ``whiten`` to ``PCA``.

    Returns
    -------
    The output of ``PCA.fit_transform(embeddings)``.
    """
    # `white` used to be accepted but never reached the model, so whitening
    # could not be switched on by the caller.
    model = PCA(n_components=n_comp, whiten=white)
    reduced = model.fit_transform(embeddings)

    # The fitted model is not needed once the projection has been computed.
    del model
    gc.collect()

    return reduced


def database_augment(embeddings, n_aug):
    """Replace every embedding by a weighted sum of its ``n_aug`` nearest neighbours.

    Neighbours are retrieved with a cosine-metric ``NearestNeighbors`` model
    fitted on ``embeddings`` itself. They are combined with weights
    ``np.logspace(0, -1.5, n_aug)``: 1.0 for the first neighbour returned,
    decaying to 10**-1.5 for the last.

    Parameters
    ----------
    embeddings : 2-D array, one row per item.
    n_aug : int
        Number of neighbours blended into each output row.

    Returns
    -------
    numpy.ndarray
        The stacked blended rows, one per input row.
    """
    weights = np.logspace(0, -1.5, n_aug)

    knn = NearestNeighbors(n_neighbors=n_aug, metric='cosine')
    knn.fit(embeddings)
    distances, indices = knn.kneighbors(embeddings)

    # Blend one row at a time so only an (n_aug, dim) slice is gathered at once.
    blended_rows = [
        np.dot(weights, embeddings[indices[row, ]])
        for row in range(embeddings.shape[0])
    ]
    dba_embeddings = np.vstack(blended_rows)

    del knn, distances, indices
    gc.collect()
    return dba_embeddings

In [6]:
# TF-IDF title embeddings: vocabulary capped at 25000 terms, non-binary term counts.
embeddings = get_embeddings(df_cu, max_features=25000, binary=False)

text embeddings shape (34250, 24939)


In [31]:
# NOTE(review): `reduced_embeddings` is not assigned in any cell visible in this
# notebook — presumably the output of `pca(...)` from a cell that was removed or
# run out of order. It must be defined above for a fresh-kernel run to succeed.

# The host copy does not depend on `n_aug`, so take it once instead of once per
# iteration (assumes `.get()` is cupy's device-to-host copy — TODO confirm).
reduced_embeddings_host = reduced_embeddings.get()

# Evaluate database-side augmentation with 2, 3 and 4 blended neighbours.
for n_aug in [2, 3, 4]:
    emb = database_augment(reduced_embeddings_host, n_aug)
    evaluate_by_knn(df, emb)

Our f1 score for threshold 0.0 is 0.1748018195626768
Our f1 score for threshold 0.1 is 0.5546234806194882
Our f1 score for threshold 0.2 is 0.610764417071245
Our f1 score for threshold 0.3 is 0.6364847301378191
Our f1 score for threshold 0.4 is 0.6188215230696176
Our f1 score for threshold 0.5 is 0.5495850365348961
Our f1 score for threshold 0.6 is 0.4279197346084608
Our f1 score for threshold 0.7 is 0.28269289712171003
Our f1 score for threshold 0.8 is 0.18201920554358691
Our f1 score for threshold 0.9 is 0.165951774064816
Our f1 score for threshold 0.0 is 0.17429207040444483
Our f1 score for threshold 0.1 is 0.5852107090801242
Our f1 score for threshold 0.2 is 0.6379684915971255
Our f1 score for threshold 0.3 is 0.6424564199489113
Our f1 score for threshold 0.4 is 0.606578511766547
Our f1 score for threshold 0.5 is 0.5267126581609337
Our f1 score for threshold 0.6 is 0.41040511578503797
Our f1 score for threshold 0.7 is 0.27895245390284384
Our f1 score for threshold 0.8 is 0.18490291

In [19]:
# Sweep the similarity cut-off used by `get_neighbors` and report the mean F1 for each.
# NOTE(review): `reduced_embeddings` is not assigned in any cell visible in this
# notebook; confirm where it is defined before relying on a fresh-kernel run.
for thr in np.arange(0.2, 1, 0.2):
    predictions = get_neighbors(df, reduced_embeddings, thr)
    # Each prediction is a collection of posting ids; store it as one space-separated string.
    df['pred_matches'] = [' '.join(ids) for ids in predictions]
    df['f1'] = f1_score(df['matches'], df['pred_matches'])
    print(f"threshold={np.round(thr, 3)}: {df['f1'].mean()}")

  0%|          | 0/9 [00:00<?, ?it/s]

threshold=0.2: 0.19789823267595805


  0%|          | 0/9 [00:00<?, ?it/s]

threshold=0.4: 0.5356746950309523


  0%|          | 0/9 [00:00<?, ?it/s]

threshold=0.6: 0.5579984684456785


  0%|          | 0/9 [00:00<?, ?it/s]

threshold=0.8: 0.31206072376184635
