In [1]:
import numpy as np
import pandas as pd
import gc
import os
import sys
import yaml
import re
import random
import math
import matplotlib.pyplot as plt
import warnings
import pickle
from tqdm.notebook import tqdm
from typing import *
from pathlib import Path
from dataclasses import dataclass, field, asdict
from shutil import copyfile
from cuml.neighbors import NearestNeighbors
warnings.simplefilter('ignore')

In [12]:
@dataclass
class Config:
    """Settings shared by the cells of this notebook.

    Every field has a default, so ``Config()`` gives the baseline setup.
    Individual values can be overridden afterwards with :meth:`update`, and
    the whole configuration can be written to disk with :meth:`to_yaml`.
    """

    # Output location and GPU selection
    outdir: str = "../results/efficientnet-tpu"
    device: str = "cuda:2"
    device_id: int = 2

    # Input data and split settings
    datadir: str = '../data/tfrecord-gkf'
    image_dir: str = '../data/shopee-product-matching/train_images'
    seed: int = 42
    n_splits: int = 1
    tf_expt: int = -1
    # default_factory gives each instance its own list object
    image_size: List[int] = field(default_factory=lambda: [512, 512])

    # Training config
    batch_size: int = 20
    epochs: int = 100
    patience: int = 5
    lr: float = 0.00001
    emb_len: int = 2048

    def update(self, param_dict: Dict) -> "Config":
        """Overwrite attributes in place from ``param_dict`` and return ``self``.

        Keys are applied in iteration order. The first key that is not an
        existing attribute raises ``ValueError``; keys processed before it
        have already been applied at that point.
        """
        for key in param_dict:
            if hasattr(self, key):
                setattr(self, key, param_dict[key])
                continue
            raise ValueError(f"[ERROR] Unexpected key for flag = {key}")
        return self

    def to_yaml(self, filepath: str, width: int = 120):
        """Write all fields as YAML to ``filepath`` (``width`` is passed to ``yaml.dump``)."""
        with open(filepath, 'w') as handle:
            yaml.dump(asdict(self), handle, width=width)

In [13]:
def seed_everything(seed):
    """Apply ``seed`` to Python's ``random``, NumPy, ``PYTHONHASHSEED`` and TensorFlow.

    The name ``tf`` is resolved when the function is called, so TensorFlow
    must already be imported in the notebook namespace by then.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)

In [14]:
# Instantiate the default experiment configuration.
config = Config()
# Restrict the visible GPUs to `config.device_id`. This happens before the
# TensorFlow import below -- presumably so TF only sees that device (TODO confirm).
os.environ["CUDA_VISIBLE_DEVICES"] = str(config.device_id)

# Resolve the current working directory and put its parent on sys.path so the
# `src` package imported below can be found.
base_dir = Path().resolve()
sys.path.append(os.path.abspath(base_dir / '../'))

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_addons as tfa
import efficientnet.tfkeras as efn

# NOTE(review): star imports. Names used later in this notebook that are not
# defined in it (e.g. `decode_image`, `build_efficientnet_model`) presumably
# come from these modules -- verify against `src`.
from src.tokenization import *
from src.preprocess import *
from src.image import *
from src.model import *

In [15]:
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of whitespace-separated id strings.

    Each string is split into a set of ids; per row the score is
    ``2 * |true & pred| / (|true| + |pred|)``.

    Args:
        y_true: pandas Series of ground-truth id strings.
        y_pred: pandas Series of predicted id strings, paired with ``y_true`` by position.

    Returns:
        NumPy array with one F1 value per row.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))
    n_common = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    n_pred = pred_sets.apply(len).values
    n_true = true_sets.apply(len).values
    return 2 * n_common / (n_pred + n_true)


def precision_score(y_true, y_pred):
    """Row-wise precision between two Series of whitespace-separated id strings.

    Per row: ``|true & pred| / |pred|`` after splitting both strings into sets.

    Returns:
        NumPy array with one precision value per row.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))
    n_common = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    n_pred = pred_sets.apply(len).values
    return n_common / n_pred


def recall_score(y_true, y_pred):
    """Row-wise recall between two Series of whitespace-separated id strings.

    Per row: ``|true & pred| / |true|`` after splitting both strings into sets.

    Returns:
        NumPy array with one recall value per row.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))
    n_common = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    n_true = true_sets.apply(len).values
    return n_common / n_true

In [16]:
def read_image(image, image_size):
    """Read the file at path ``image`` and decode it with ``decode_image``.

    Args:
        image: Path of the image file, passed to ``tf.io.read_file``.
        image_size: Forwarded unchanged to ``decode_image``.

    Returns:
        Whatever ``decode_image`` produces for the raw file contents.
    """
    raw_bytes = tf.io.read_file(image)
    return decode_image(raw_bytes, image_size)


def get_dataset(image, config):
    """Build a batched ``tf.data`` pipeline that reads and decodes images.

    Args:
        image: Sequence of image file paths; each element is passed to ``read_image``.
        config: Object providing ``image_size``, ``batch_size`` and ``tf_expt``.
            ``tf_expt`` is used both as ``num_parallel_calls`` for the map and
            as the prefetch buffer size (the Config default of -1 presumably
            corresponds to tf.data's AUTOTUNE -- TODO confirm).

    Returns:
        The mapped, batched and prefetched dataset.
    """
    def _load(path):
        return read_image(path, config.image_size)

    pipeline = tf.data.Dataset.from_tensor_slices(image)
    pipeline = pipeline.map(_load, num_parallel_calls=config.tf_expt)
    return pipeline.batch(config.batch_size).prefetch(config.tf_expt)

In [17]:
# Load the training table from the configured data directory; other cells
# read its `fold`, `image`, `posting_id`, `matches` and `label_group` columns.
train = pd.read_csv(base_dir / config.datadir / 'train_folds.csv')
# Number of distinct label groups; passed as `n_classes` when the model is rebuilt.
n_classes = train['label_group'].nunique()
# Bare expression so the notebook displays the value.
n_classes

11014

In [18]:
def get_embeddings(df: pd.DataFrame, weight_path: Path, en_type: str):
    """Compute image embeddings for every row of ``df`` from saved model weights.

    The model is rebuilt with ``build_efficientnet_model``, the weights at
    ``weight_path`` are loaded, and a sub-model ending at the fourth-from-last
    layer is used for prediction (presumably the embedding layer -- TODO
    confirm against ``build_efficientnet_model``).

    Args:
        df: Frame with an ``image`` column of file names relative to ``config.image_dir``.
        weight_path: Path of the weights file to load.
        en_type: Model variant identifier forwarded to ``build_efficientnet_model``.

    Returns:
        The per-chunk prediction outputs concatenated along the first axis.
    """
    image_paths = [str(base_dir / config.image_dir / name) for name in df['image']]

    network = build_efficientnet_model(
        n_classes=n_classes,
        image_size=config.image_size,
        lr=config.lr,
        en_type=en_type,
        train=False,
        emb_len=config.emb_len
    )
    network.load_weights(str(weight_path))
    # Keep only the part of the graph up to layers[-4]; inputs are the first
    # three entries of the original model's input (assumes `input` is a list).
    network = tf.keras.models.Model(inputs=network.input[0:3], outputs=network.layers[-4].output)

    # Predict in slices of 500 images, building a fresh dataset per slice.
    chunk = 500
    embeds = []
    for start in tqdm(range(0, len(df), chunk)):
        image_dataset = get_dataset(image_paths[start:start + chunk], config)
        embeds.append(network.predict(image_dataset))

    # Drop the model and reset the Keras session before returning.
    del network
    gc.collect()
    tf.keras.backend.clear_session()
    return np.concatenate(embeds)

In [32]:
def get_valid_df(fold: int):
    """Return a copy of the ``train`` rows that belong to validation split ``fold``.

    Fold ids ``0 .. (number of unique fold values - 1)`` are assigned to the
    split ``fold_id % config.n_splits``; rows whose ``fold`` value maps to
    ``fold`` are selected.
    """
    n_folds = train.fold.unique().shape[0]
    valid_folds = []
    for fold_id in range(n_folds):
        if fold_id % config.n_splits == fold:
            valid_folds.append(fold_id)

    return train.query(f'fold in {valid_folds}').copy()

In [22]:
def get_embeddings_list(epoch: int, en_type: str):
    """Return one embedding matrix per fold for the given checkpoint epoch.

    For every fold ``i`` in ``range(config.n_splits)`` the embeddings of the
    whole ``train`` frame are read from ``embeddings_epoch{epoch}.pkl`` in that
    fold's output directory if the file exists. Otherwise they are computed
    with ``get_embeddings`` from ``epoch{epoch}.h5`` and written to that
    pickle so later calls hit the cache.

    Args:
        epoch: Checkpoint epoch; zero-padded to two digits in the file names.
        en_type: Model variant suffix used in the directory name (e.g. ``'B0'``).

    Returns:
        List with one embeddings object per fold, in fold order.
    """
    epoch = format(epoch, '02')
    embeddings_list = list()
    for i in range(config.n_splits):
        outdir = base_dir / config.outdir / f'EfficientNet{en_type}_GKF_seed{config.seed}_fold{i}_emb{config.emb_len}'
        weight_path = outdir / f'epoch{epoch}.h5'
        emb_outpath = outdir / f'embeddings_epoch{epoch}.pkl'

        if os.path.exists(emb_outpath):
            print('get embeddings from the cache')
            # NOTE(review): unpickling can execute arbitrary code; only load
            # cache files that were produced by this notebook.
            with open(str(emb_outpath), 'rb') as cache_file:
                embeddings = pickle.load(cache_file)
        else:
            # get_embeddings returns a single array, so bind it directly;
            # unpacking it into two names fails on a cache miss.
            embeddings = get_embeddings(
                df=train,
                weight_path=weight_path,
                en_type=en_type
            )
            with open(str(emb_outpath), 'wb') as cache_file:
                pickle.dump(embeddings, cache_file)
        embeddings_list.append(embeddings)

    return embeddings_list

In [85]:
def get_matches(df: pd.DataFrame, distance: np.ndarray, indice: np.ndarray, thr: float) -> List[str]:
    """Return the ``posting_id`` values of neighbours closer than ``thr``.

    Args:
        df: Frame with a ``posting_id`` column; neighbour indices are row positions in it.
        distance: Distances to the neighbours of one query row.
        indice: Row positions of those neighbours, aligned with ``distance``.
        thr: Strict upper bound on the distance for a neighbour to count as a match.

    Returns:
        List of matching posting ids, in neighbour order.
    """
    within_thr = np.nonzero(distance < thr)[0]
    neighbor_rows = indice[within_thr]
    posting_ids = df['posting_id'].iloc[neighbor_rows]
    return posting_ids.values.tolist()
    
    
def search_thresholds(df: pd.DataFrame, embeddings: np.ndarray, fold: int, knn: int = 50, metric='euclidean',
                      thr_start: int = 3, thr_end: int = 5, thr_interval: float = 0.1, verbose=False):
    """Grid-search the distance threshold that maximises mean F1 on a validation fold.

    A nearest-neighbour index is fitted on all ``embeddings``; the validation
    rows from ``get_valid_df(fold)`` are used as queries. For each threshold
    in ``np.arange(thr_start, thr_end, thr_interval)`` the neighbours closer
    than the threshold are taken as predicted matches and scored against the
    ``matches`` column with ``f1_score``.

    Args:
        df: Frame whose ``posting_id`` column is indexed by neighbour position.
        embeddings: One embedding row per row of ``df``.
        fold: Validation split passed to ``get_valid_df``.
        knn: Number of neighbours retrieved per query.
        metric: Distance metric for the neighbour search.
        thr_start, thr_end, thr_interval: Threshold grid (end exclusive, as in ``np.arange``).
        verbose: When true, print the score of every threshold.

    Returns:
        ``(best_score, best_threshold)``, taken from the first grid row that
        attains the maximum score.
    """
    # KNN index over every embedding
    neighbor_index = NearestNeighbors(n_neighbors=knn, metric=metric)
    neighbor_index.fit(embeddings)

    valid_df = get_valid_df(fold=fold)
    # The frame's index labels are used as row positions into `embeddings`
    # (assumes a default 0..N-1 index on the training frame -- TODO confirm).
    queries = embeddings[valid_df.index, :]
    distances, indices = neighbor_index.kneighbors(queries)

    # grid search over thresholds
    thresholds = list(np.arange(thr_start, thr_end, thr_interval))
    scores = []
    for threshold in thresholds:
        valid_df['pred_matches'] = [
            ' '.join(get_matches(df=df, distance=distances[row, ], indice=indices[row, ], thr=threshold))
            for row in range(queries.shape[0])
        ]
        valid_df['f1'] = f1_score(valid_df['matches'], valid_df['pred_matches'])
        score = valid_df['f1'].mean()
        if verbose:
            print(f'Our f1 score for threshold {np.round(threshold, 2)} is {score}')
        scores.append(score)

    thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
    is_best = thresholds_scores['scores'] == thresholds_scores['scores'].max()
    top_rows = thresholds_scores[is_best]
    best_threshold = top_rows['thresholds'].values[0]
    best_score = top_rows['scores'].values[0]
    print(f'Our best score is {best_score} and has a threshold {best_threshold}')

    return best_score, best_threshold


def database_augment(embeddings, n_aug):
    """Replace each embedding by a weighted sum of its ``n_aug`` nearest neighbours.

    Neighbours are found with cosine distance among ``embeddings`` itself.
    The weights are ``np.logspace(0, -1.5, n_aug)``, i.e. they decay from 1
    to ``10 ** -1.5`` across the neighbour positions returned by
    ``kneighbors`` (presumably nearest first -- TODO confirm). The combined
    vectors are not re-normalised.

    Args:
        embeddings: 2-D array with one embedding per row.
        n_aug: Number of neighbours blended per row.

    Returns:
        Array of augmented embeddings stacked row by row in input order.
    """
    weights = np.logspace(0, -1.5, n_aug)
    model = NearestNeighbors(n_neighbors=n_aug, metric='cosine')
    model.fit(embeddings)

    # Query the index in slices of CHUNK rows.
    CHUNK = 1024*4
    n_rows = len(embeddings)
    dba_embeddings = []
    for a in tqdm(range(0, n_rows, CHUNK)):
        b = min(a + CHUNK, n_rows)

        # COSINE SIMILARITY DISTANCE
        distances, indices = model.kneighbors(embeddings[a:b, :])

        for k in range(b - a):
            # (n_aug,) weights dotted with the (n_aug, dim) neighbour rows
            dba_embeddings.append(np.dot(weights, embeddings[indices[k, ]]))

    del model, distances, indices
    gc.collect()
    return np.vstack(dba_embeddings)

In [29]:
# Collect, for each model variant, the embeddings of the epoch-20 checkpoint.
embs = list()
for entype in ['B0', 'B3', 'B5']:
    # get_embeddings_list returns one entry per fold; [0] keeps the first one.
    embs_bx = get_embeddings_list(epoch=20, en_type=entype)[0]
    embs.append(embs_bx)

get embeddings from the cache
get embeddings from the cache
get embeddings from the cache


In [36]:
# Baseline: per-model threshold search on the raw embeddings
# (fold 0, 50 neighbours, cosine metric, thresholds from 0.2 up to 0.5 in steps of 0.05).
for emb in embs:
    search_thresholds(train, emb, 0, 50, 'cosine', 0.2, 0.5, 0.05)

Our best score is 0.7481825997918536 and has a threshold 0.39999999999999997
Our best score is 0.7507596436161528 and has a threshold 0.35
Our best score is 0.7488550273098631 and has a threshold 0.35


In [41]:
# Sweep the number of neighbours blended by database_augment (2..5) and rerun
# the threshold search on each model's augmented embeddings.
results = list()
for n_aug in range(2, 6):
    print(n_aug)
    dba_embs = [database_augment(emb, n_aug=n_aug) for emb in embs]
    # Appends one (best_score, best_threshold) tuple per model for this n_aug.
    results += [search_thresholds(train, emb, 0, 50, 'cosine', 0.2, 0.5, 0.05) for emb in dba_embs]

2
Our best score is 0.7535258462780937 and has a threshold 0.35
Our best score is 0.7542562888347566 and has a threshold 0.35
Our best score is 0.7528132094289697 and has a threshold 0.35
3
Our best score is 0.7674997507953082 and has a threshold 0.3
Our best score is 0.7663347251207753 and has a threshold 0.3
Our best score is 0.7655683224935339 and has a threshold 0.3
4
Our best score is 0.7695514976063892 and has a threshold 0.2
Our best score is 0.7701235593847349 and has a threshold 0.2
Our best score is 0.770026601127599 and has a threshold 0.2
5
Our best score is 0.7659990107680901 and has a threshold 0.2
Our best score is 0.7642804654293885 and has a threshold 0.2
Our best score is 0.7637633025248763 and has a threshold 0.2


In [43]:
# Rebuild the augmented embeddings with n_aug=4 and search a finer threshold
# grid (0.1 up to 0.3 in steps of 0.01). `dba_embs` is reused by later cells.
dba_embs = [database_augment(emb, n_aug=4) for emb in embs]
for emb in dba_embs:
    search_thresholds(train, emb, 0, 50, 'cosine', 0.1, 0.3, 0.01)

Our best score is 0.7710984633445934 and has a threshold 0.21999999999999995
Our best score is 0.7701235593847349 and has a threshold 0.19999999999999996
Our best score is 0.7700500150471253 and has a threshold 0.20999999999999996


In [52]:
def get_prediction(embeddings, valid_idx, thr):
    """Predict matching posting ids for selected rows with a fixed threshold.

    A 50-neighbour cosine index is fitted on all ``embeddings``; the rows at
    ``valid_idx`` are used as queries and neighbours closer than ``thr`` are
    looked up in the global ``train`` frame via ``get_matches``.

    Returns:
        One list of posting ids per query row, in query order.
    """
    queries = embeddings[valid_idx, :]
    # KNN
    neighbor_index = NearestNeighbors(n_neighbors=50, metric='cosine')
    neighbor_index.fit(embeddings)
    distances, indices = neighbor_index.kneighbors(queries)

    return [
        get_matches(df=train, distance=distances[row, ], indice=indices[row, ], thr=thr)
        for row in range(queries.shape[0])
    ]

In [54]:
# Validation rows for split 0; one prediction column is added per model.
valid_df = get_valid_df(fold=0)

# Thresholds, augmented embeddings and model names are paired by position.
# The frame's index labels are passed as row positions into each embedding
# matrix (assumes a default 0..N-1 index -- TODO confirm).
for thr, emb, entype in zip([0.22, 0.2, 0.21], dba_embs, ['B0', 'B3', 'B5']):
    valid_df[f'preds_{entype}'] = get_prediction(embeddings=emb, valid_idx=valid_df.index, thr=thr)

In [63]:
def concat_matches_or(row):
    """Union of the three models' predictions for one row.

    Returns the sorted unique ids from ``preds_B0``, ``preds_B3`` and
    ``preds_B5`` joined with single spaces.
    """
    pooled = np.concatenate([row[column] for column in ('preds_B0', 'preds_B3', 'preds_B5')])
    unique_ids = np.unique(pooled)
    return ' '.join(unique_ids)


def concat_matches_and(row):
    """Intersection of the three models' predictions for one row.

    Returns the ids present in all of ``preds_B0``, ``preds_B3`` and
    ``preds_B5`` joined with single spaces (in set iteration order).
    """
    shared = set(row['preds_B0']).intersection(set(row['preds_B3']), set(row['preds_B5']))
    return ' '.join(shared)

# Mean row-wise F1 on the validation rows when the three models' predictions
# are combined by union ("or") and by intersection ("and").
print('or: ', f1_score(valid_df['matches'], valid_df.apply(concat_matches_or, axis=1)).mean())
print('and: ', f1_score(valid_df['matches'], valid_df.apply(concat_matches_and, axis=1)).mean())

or:  0.7686572817155032
and:  0.7686190196705233


In [73]:
# Fuse the raw per-model embeddings in two ways and search thresholds 0.0..0.9:
# first by concatenating the feature columns (hstack), then by the element-wise mean.
search_thresholds(train, np.hstack(embs), 0, 50, 'cosine', 0, 1, 0.1, verbose=True)
search_thresholds(train, np.mean(embs, axis=0), 0, 50, 'cosine', 0, 1, 0.1, verbose=True)

Our f1 score for threshold 0.0 is 0.19493150498865816
Our f1 score for threshold 0.1 is 0.6502414180236382
Our f1 score for threshold 0.2 is 0.7169371778230083
Our f1 score for threshold 0.3 is 0.7467511391998424
Our f1 score for threshold 0.4 is 0.7541854012158323
Our f1 score for threshold 0.5 is 0.7207435421563243
Our f1 score for threshold 0.6 is 0.5853674767476909
Our f1 score for threshold 0.7 is 0.29580146366960736
Our f1 score for threshold 0.8 is 0.16855946599379565
Our f1 score for threshold 0.9 is 0.16831691175633612
Our best score is 0.7541854012158323 and has a threshold 0.4
Our f1 score for threshold 0.0 is 0.21253980169956915
Our f1 score for threshold 0.1 is 0.6501933612599894
Our f1 score for threshold 0.2 is 0.7174988238007374
Our f1 score for threshold 0.3 is 0.7467027604485956
Our f1 score for threshold 0.4 is 0.7540204219176603
Our f1 score for threshold 0.5 is 0.718966418334804
Our f1 score for threshold 0.6 is 0.5791827488991553
Our f1 score for threshold 0.7 is 

(0.7540204219176603, 0.4)

In [75]:
# Concatenate the per-model embeddings first, then apply database augmentation
# with n_aug = 2..5 and search thresholds 0.0..0.9 for each setting.
embs_ens = np.hstack(embs)
results = list()
for n_aug in range(2, 6):
    print(n_aug)
    dba_embs_ens = database_augment(embs_ens, n_aug=n_aug)
    # One (best_score, best_threshold) tuple per n_aug.
    results += [search_thresholds(train, dba_embs_ens, 0, 50, 'cosine', 0, 1, 0.1)]

2
Our best score is 0.7574578052770922 and has a threshold 0.4
3
Our best score is 0.7729341902137608 and has a threshold 0.30000000000000004
4
Our best score is 0.7757328548663609 and has a threshold 0.2
5
Our best score is 0.7754705742133092 and has a threshold 0.2


In [76]:
# Average the per-model embeddings element-wise first, then apply database
# augmentation with n_aug = 2..5 and search thresholds 0.0..0.9 for each setting.
embs_ens = np.mean(embs, axis=0)
results = list()
for n_aug in range(2, 6):
    print(n_aug)
    dba_embs_ens = database_augment(embs_ens, n_aug=n_aug)
    # One (best_score, best_threshold) tuple per n_aug.
    results += [search_thresholds(train, dba_embs_ens, 0, 50, 'cosine', 0, 1, 0.1)]

2
Our best score is 0.7577914755076941 and has a threshold 0.4
3
Our best score is 0.7732484741852182 and has a threshold 0.30000000000000004
4
Our best score is 0.7756342948285152 and has a threshold 0.2
5
Our best score is 0.7745672538166846 and has a threshold 0.2


In [74]:
# Reverse order of operations: fuse the already-augmented per-model embeddings
# (`dba_embs`, as left by the most recent cell that assigned it), by
# concatenation and then by element-wise mean, and search thresholds 0.0..0.9.
search_thresholds(train, np.hstack(dba_embs), 0, 50, 'cosine', 0, 1, 0.1, verbose=True)
search_thresholds(train, np.mean(dba_embs, axis=0), 0, 50, 'cosine', 0, 1, 0.1, verbose=True)

Our f1 score for threshold 0.0 is 0.19528915944629796
Our f1 score for threshold 0.1 is 0.7321883175346043
Our f1 score for threshold 0.2 is 0.7729401129883415
Our f1 score for threshold 0.3 is 0.7730499733199564
Our f1 score for threshold 0.4 is 0.7413945458839016
Our f1 score for threshold 0.5 is 0.6581999194805409
Our f1 score for threshold 0.6 is 0.48438001215985527
Our f1 score for threshold 0.7 is 0.24341859390526474
Our f1 score for threshold 0.8 is 0.16968138855616677
Our f1 score for threshold 0.9 is 0.16958922760181622
Our best score is 0.7730499733199564 and has a threshold 0.30000000000000004
Our f1 score for threshold 0.0 is 0.212138935893935
Our f1 score for threshold 0.1 is 0.732263754640866
Our f1 score for threshold 0.2 is 0.7731480665958421
Our f1 score for threshold 0.3 is 0.7729552706400833
Our f1 score for threshold 0.4 is 0.7407169284205879
Our f1 score for threshold 0.5 is 0.6566666768203135
Our f1 score for threshold 0.6 is 0.4790946683486034
Our f1 score for th

(0.7731480665958421, 0.2)

In [71]:
# Element-wise mean of the augmented per-model embeddings, kept in
# `dba_emb_ens`, followed by a threshold search over 0.0..0.9.
dba_emb_ens = np.mean(dba_embs, axis=0)
search_thresholds(train, dba_emb_ens, 0, 50, 'cosine', 0, 1, 0.1, verbose=True)

Our f1 score for threshold 0.0 is 0.212138935893935
Our f1 score for threshold 0.1 is 0.732263754640866
Our f1 score for threshold 0.2 is 0.7731480665958421
Our f1 score for threshold 0.3 is 0.7729552706400833
Our f1 score for threshold 0.4 is 0.7407169284205879
Our f1 score for threshold 0.5 is 0.6566666768203135
Our f1 score for threshold 0.6 is 0.4790946683486034
Our f1 score for threshold 0.7 is 0.23730646809554073
Our f1 score for threshold 0.8 is 0.16950999423858204
Our f1 score for threshold 0.9 is 0.16946043458977578
Our best score is 0.7731480665958421 and has a threshold 0.2


(0.7731480665958421, 0.2)

In [82]:
# Load a cached embedding matrix (presumably BERT-based, judging by the path)
# and add it to the list of per-model embeddings.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
# NOTE: `embs` is extended in place, so re-running this cell appends the matrix again.
with open(base_dir / '../results/bert-gkf/Bert_seed123_encodelen70_emb2048-gkf/embeddings_epoch15.pkl', 'rb') as bert_file:
    # The context manager closes the file handle once loading is done.
    bert_emb = pickle.load(bert_file)
embs.append(bert_emb)

In [83]:
# Threshold search on the last entry of `embs` alone (the most recently
# appended embedding matrix), thresholds from 0.2 up to 0.5 in steps of 0.05.
search_thresholds(train, embs[-1], 0, 50, 'cosine', 0.2, 0.5, 0.05)

Our best score is 0.8633810383660473 and has a threshold 0.3


(0.8633810383660473, 0.3)

In [88]:
# Concatenate every matrix currently in `embs` (its contents depend on which
# cells have appended to it), then sweep database augmentation with
# n_aug = 2..5 and search thresholds 0.0..0.9 for each setting.
embs_ens = np.hstack(embs)
for n_aug in range(2, 6):
    print(n_aug)
    dba_embs_ens = database_augment(embs_ens, n_aug=n_aug)
    search_thresholds(train, dba_embs_ens, 0, 50, 'cosine', 0, 1, 0.1)

2


  0%|          | 0/9 [00:00<?, ?it/s]

Our best score is 0.8413435221569675 and has a threshold 0.4
3


  0%|          | 0/9 [00:00<?, ?it/s]

Our best score is 0.8687072799195111 and has a threshold 0.4
4


  0%|          | 0/9 [00:00<?, ?it/s]

Our best score is 0.8729258070653391 and has a threshold 0.30000000000000004
5


  0%|          | 0/9 [00:00<?, ?it/s]

Our best score is 0.8689642330462329 and has a threshold 0.2
