In [1]:
import numpy as np
import pandas as pd
import gc
import os
import sys
import yaml
import re
import random
import math
import matplotlib.pyplot as plt
import warnings
import pickle
from tqdm.notebook import tqdm
from typing import *
from pathlib import Path
from dataclasses import dataclass, field, asdict
from shutil import copyfile
import cudf
import cuml
import cupy
from cuml.neighbors import NearestNeighbors
from cuml.feature_extraction.text import TfidfVectorizer
warnings.simplefilter('ignore')

In [2]:
@dataclass
class Config:
    """Settings shared by the cells of this notebook.

    Values can be overridden in bulk with `update` and written to disk with
    `to_yaml`.
    """
    # Where results (weights, cached embeddings) live and which GPU to use.
    outdir: str = "../results/efficientnet-tpu"
    device: str = "cuda:2"
    device_id: int = 2

    # Input data locations, fold setup and image geometry.
    datadir: str = '../data/tfrecord-gkf'
    image_dir: str = '../data/shopee-product-matching/train_images'
    seed: int = 42
    n_splits: int = 1
    # Used as `num_parallel_calls` and as the prefetch size in `get_dataset`.
    tf_expt: int = -1
    image_size: List[int] = field(default_factory=lambda: [512, 512])

    # Training config
    batch_size: int = 20
    epochs: int = 100
    patience: int = 5
    lr: float = 0.00001
    emb_len: int = 2048

    def update(self, param_dict: Dict) -> "Config":
        """Overwrite fields in place from `param_dict` and return `self`.

        Only declared dataclass fields are accepted, so a key such as
        ``"to_yaml"`` can no longer replace a method by accident. Every key is
        validated before any value is assigned, which keeps the object
        unchanged when the call fails.

        Args:
            param_dict: mapping of field name -> new value.

        Returns:
            This instance, to allow chaining.

        Raises:
            ValueError: if a key is not a declared field of the dataclass.
        """
        # Local import: the notebook's import cell does not bring `fields` in.
        from dataclasses import fields

        known_fields = {f.name for f in fields(self)}
        for key in param_dict:
            if key not in known_fields:
                raise ValueError(f"[ERROR] Unexpected key for flag = {key}")

        for key, value in param_dict.items():
            setattr(self, key, value)
        return self

    def to_yaml(self, filepath: str, width: int = 120):
        """Dump all fields (via `asdict`) as YAML to `filepath`.

        Args:
            filepath: destination file, opened in write mode (overwritten).
            width: forwarded to `yaml.dump` as its `width` argument.
        """
        with open(filepath, 'w') as f:
            yaml.dump(asdict(self), f, width=width)

In [3]:
def seed_everything(seed):
    """Apply `seed` to every random source used by this notebook.

    Exports it as the ``PYTHONHASHSEED`` environment variable and seeds
    Python's `random`, NumPy's global generator and TensorFlow.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    # `tf` is imported in a later cell; the name is resolved when this runs.
    tf.random.set_seed(seed)

In [6]:
# Default configuration used by every cell below.
config = Config()
# Must be set before TensorFlow is imported further down in this cell.
# NOTE(review): presumably restricts TF to the single GPU `device_id` - confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = str(config.device_id)

# Directory the notebook runs from; its parent is appended to sys.path so the
# `src.*` imports below can be resolved.
base_dir = Path().resolve()
sys.path.append(os.path.abspath(base_dir / '../'))

# Deep-learning stack, imported only after CUDA_VISIBLE_DEVICES is set above.
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_addons as tfa
import efficientnet.tfkeras as efn

# Project helpers. NOTE(review): the star imports hide where names used later
# (e.g. decode_image, build_efficientnet_model, build_resnext_model) come
# from - presumably these modules; verify.
from src.tokenization import *
from src.preprocess import *
from src.image import *
from src.model import *
from src.diffussion import *

In [7]:
# Function to get our f1 score
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of whitespace-separated id strings.

    Each string is split into a set of ids; for every row the score is
    ``2 * |true & pred| / (|true| + |pred|)``.

    Returns:
        NumPy array with one F1 value per row.
    """
    true_sets = y_true.map(lambda ids: set(ids.split()))
    pred_sets = y_pred.map(lambda ids: set(ids.split()))

    n_true = true_sets.map(len).values
    n_pred = pred_sets.map(len).values
    n_common = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])

    return 2 * n_common / (n_pred + n_true)


def precision_score(y_true, y_pred):
    """Row-wise precision between two Series of whitespace-separated ids.

    For every row: ``|true & pred| / |pred|`` after splitting both strings
    into sets of ids.

    Returns:
        NumPy array with one precision value per row.
    """
    def _as_id_set(text):
        return set(text.split())

    truth = y_true.apply(_as_id_set)
    predicted = y_pred.apply(_as_id_set)

    overlap = np.array([len(pair[0] & pair[1]) for pair in zip(truth, predicted)])
    predicted_sizes = predicted.apply(len).values
    return overlap / predicted_sizes


def recall_score(y_true, y_pred):
    """Row-wise recall between two Series of whitespace-separated ids.

    For every row: ``|true & pred| / |true|`` after splitting both strings
    into sets of ids.

    Returns:
        NumPy array with one recall value per row.
    """
    expected = y_true.apply(lambda s: set(s.split()))
    found = y_pred.apply(lambda s: set(s.split()))

    hits = []
    for want, got in zip(expected, found):
        hits.append(len(want & got))

    expected_sizes = expected.apply(len).values
    return np.array(hits) / expected_sizes

In [8]:
# Function to read our test image and return image
def read_image(image, image_size):
    """Read the file at path `image` and decode it with `decode_image`.

    `image_size` is forwarded unchanged to `decode_image`.
    """
    raw_bytes = tf.io.read_file(image)
    return decode_image(raw_bytes, image_size)


# Function to get our dataset that read images
def get_dataset(image, config):
    """Build a batched, prefetched `tf.data.Dataset` of decoded images.

    Args:
        image: sequence of image file paths.
        config: object providing `image_size`, `batch_size` and `tf_expt`.
            `tf_expt` is used both as `num_parallel_calls` and as the
            prefetch size (presumably -1 means autotune - confirm).
    """
    def _load(path):
        return read_image(path, config.image_size)

    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(_load, num_parallel_calls=config.tf_expt)
        .batch(config.batch_size)
        .prefetch(config.tf_expt)
    )

In [9]:
# Fold-annotated training table, on the host and as a cuDF copy.
train_csv_path = base_dir / config.datadir / 'train_folds.csv'
train = pd.read_csv(train_csv_path)
train_cu = cudf.DataFrame(train)

# Number of distinct label groups among the fold-0 rows.
fold0_rows = train.query('fold == 0')
n_classes = fold0_rows['label_group'].nunique()
n_classes

2203

In [10]:
def get_embeddings(df: pd.DataFrame, weight_path: Path, en_type: str):
    """Compute image embeddings for every row of `df` with a trained model.

    NOTE(review): `get_embeddings` is defined a second time in a later cell
    (next to `get_embeddings_list`); once that cell runs it replaces this
    definition. The two are kept behaviourally aligned; one of them should
    eventually be deleted.

    Args:
        df: frame with an `image` column holding file names relative to
            `config.image_dir`.
        weight_path: weights file passed to `model.load_weights`.
        en_type: EfficientNet variant forwarded to
            `build_efficientnet_model`. A falsy value selects
            `build_resnext_model`, matching how `get_embeddings_list`
            chooses the weight directory.

    Returns:
        The per-chunk `model.predict` outputs concatenated along axis 0, in
        the row order of `df`.
    """
    image_paths = [str(base_dir / config.image_dir / filename) for filename in df['image']]

    if not en_type:
        model = build_resnext_model(
            n_classes=n_classes,
            image_size=config.image_size,
            lr=config.lr,
            train=False,
            emb_len=config.emb_len
        )
    else:
        model = build_efficientnet_model(
            n_classes=n_classes,
            image_size=config.image_size,
            lr=config.lr,
            en_type=en_type,
            train=False,
            emb_len=config.emb_len
        )

    model.load_weights(str(weight_path))
    # Re-wire the network so it outputs the activations of its 4th-from-last
    # layer. NOTE(review): presumably that is the embedding layer and the
    # input slice drops training-only inputs - confirm against src.model.
    model = tf.keras.models.Model(inputs=model.input[0:3], outputs=model.layers[-4].output)

    # Predict in chunks of 500 images; integer offsets replace the former
    # float `np.arange` indices that had to be cast back with int().
    chunk = 500
    embeds = []
    for start in tqdm(range(0, len(image_paths), chunk)):
        image_dataset = get_dataset(image_paths[start:start + chunk], config)
        embeds.append(model.predict(image_dataset))

    # Release the model before the next one is built.
    del model
    gc.collect()
    tf.keras.backend.clear_session()
    return np.concatenate(embeds)

In [11]:
def get_valid_df(fold: int):
    """Return a copy of the `train` rows belonging to validation split `fold`.

    A fold id ``i`` (0 .. number of distinct `train.fold` values - 1) is
    selected when ``i % config.n_splits == fold``.
    """
    n_folds = train.fold.unique().shape[0]

    valid_folds = []
    for fold_id in range(n_folds):
        if (fold_id % config.n_splits) == fold:
            valid_folds.append(fold_id)

    return train.query(f'fold in {valid_folds}').copy()

In [12]:
def get_embeddings_list(epoch: int, en_type: str):
    """Load (or compute and cache) the `train` embeddings of every split.

    For each split ``i`` in ``range(config.n_splits)`` the run directory is
    derived from `en_type`, the seed, the split and the embedding length. If
    ``embeddings_epoch{epoch}.pkl`` exists there it is unpickled; otherwise
    `get_embeddings` is run on the global `train` frame with
    ``epoch{epoch}.h5`` and the result is pickled to that path.

    Args:
        epoch: checkpoint epoch; zero-padded to two digits in file names.
        en_type: EfficientNet variant (e.g. ``'B0'``); a falsy value selects
            the ``resnext50`` run directory.

    Returns:
        List with one embedding array per split.
    """
    epoch = format(epoch, '02')
    embeddings_list = list()
    for i in range(config.n_splits):
        if en_type:
            run_name = f'EfficientNet{en_type}_GKF_seed{config.seed}_fold{i}_emb{config.emb_len}'
        else:
            run_name = f'resnext50_GKF_seed{config.seed}_fold{i}_emb{config.emb_len}'
        outdir = base_dir / config.outdir / run_name

        weight_path = outdir / f'epoch{epoch}.h5'
        emb_outpath = outdir / f'embeddings_epoch{epoch}.pkl'

        if os.path.exists(emb_outpath):
            print('get embeddings from the cache')
            # NOTE(security): unpickling executes arbitrary code; only load
            # cache files that this notebook wrote itself.
            with open(str(emb_outpath), 'rb') as cache_file:
                embeddings = pickle.load(cache_file)
        else:
            embeddings = get_embeddings(
                df=train,
                weight_path=weight_path,
                en_type=en_type
            )
            # `with` guarantees the cache file is flushed and closed.
            with open(str(emb_outpath), 'wb') as cache_file:
                pickle.dump(embeddings, cache_file)
        embeddings_list.append(embeddings)

    return embeddings_list


def get_embeddings(df: pd.DataFrame, weight_path: Path, en_type: str):
    """Compute image embeddings for every row of `df` with a trained model.

    Args:
        df: frame with an `image` column holding file names relative to
            `config.image_dir`.
        weight_path: weights file passed to `model.load_weights`.
        en_type: EfficientNet variant forwarded to
            `build_efficientnet_model`. Any falsy value (``None`` or ``''``)
            selects `build_resnext_model`, matching the truthiness test
            `get_embeddings_list` uses to pick the weight directory.

    Returns:
        The per-chunk `model.predict` outputs concatenated along axis 0, in
        the row order of `df`.
    """
    image_paths = [str(base_dir / config.image_dir / filename) for filename in df['image']]

    if not en_type:
        model = build_resnext_model(
            n_classes=n_classes,
            image_size=config.image_size,
            lr=config.lr,
            train=False,
            emb_len=config.emb_len
        )
    else:
        model = build_efficientnet_model(
            n_classes=n_classes,
            image_size=config.image_size,
            lr=config.lr,
            en_type=en_type,
            train=False,
            emb_len=config.emb_len
        )

    model.load_weights(str(weight_path))
    # Re-wire the network so it outputs the activations of its 4th-from-last
    # layer. NOTE(review): presumably that is the embedding layer and the
    # input slice drops training-only inputs - confirm against src.model.
    model = tf.keras.models.Model(inputs=model.input[0:3], outputs=model.layers[-4].output)

    # Predict in chunks of 500 images; integer offsets replace the former
    # float `np.arange` indices that had to be cast back with int().
    chunk = 500
    embeds = []
    for start in tqdm(range(0, len(image_paths), chunk)):
        image_dataset = get_dataset(image_paths[start:start + chunk], config)
        embeds.append(model.predict(image_dataset))

    # Release the model before the next one is built.
    del model
    gc.collect()
    tf.keras.backend.clear_session()
    return np.concatenate(embeds)


def get_ifidf_embeddings(df_cu, max_features: int, binary: bool = False):
    """Fit a TF-IDF vectorizer on `df_cu.title` and return the dense matrix.

    Args:
        df_cu: frame exposing a `title` column.
        max_features: forwarded to `TfidfVectorizer`.
        binary: forwarded to `TfidfVectorizer`.

    Returns:
        The result of `fit_transform(...).toarray()`; its shape is printed.
    """
    vectorizer = TfidfVectorizer(
        stop_words='english',
        binary=binary,
        max_features=max_features,
    )
    tfidf_matrix = vectorizer.fit_transform(df_cu.title)
    embeddings = tfidf_matrix.toarray()
    print('text embeddings shape', embeddings.shape)

    # Drop the vectorizer and the intermediate matrix before returning.
    del vectorizer, tfidf_matrix
    gc.collect()

    return embeddings

In [13]:
def get_matches(df: pd.DataFrame, distance: np.ndarray, indice: np.ndarray, thr: float) -> List[str]:
    """Return the `posting_id`s of the neighbours closer than `thr`.

    Args:
        df: frame whose `posting_id` column is looked up by row position.
        distance: distances of one query to its neighbours.
        indice: row positions (into `df`) of those same neighbours.
        thr: exclusive upper bound on the distance.

    Returns:
        Posting ids of the neighbours with ``distance < thr``, in neighbour order.
    """
    close_enough = np.nonzero(distance < thr)[0]
    neighbour_rows = indice[close_enough]
    posting_ids = df['posting_id']
    return posting_ids.iloc[neighbour_rows].values.tolist()
    
    
def search_thresholds(df: pd.DataFrame, embeddings: np.ndarray, fold: int, knn: int = 50, metric='euclidean',
                      thr_start: int = 3, thr_end: int = 5, thr_interval: float = 0.1, verbose=False):
    """Grid-search the distance threshold that maximises mean F1 on one fold.

    A nearest-neighbour index is fitted on all `embeddings`; the rows of the
    validation frame (`get_valid_df(fold)`) are queried against it, and for
    every threshold in ``np.arange(thr_start, thr_end, thr_interval)`` the
    neighbours below the threshold are scored against the `matches` column.

    Args:
        df: frame used to translate neighbour positions into `posting_id`s.
        embeddings: one row per item; indexed with the validation frame's
            index labels, so those labels are assumed to be row positions.
        fold: validation split passed to `get_valid_df`.
        knn: number of neighbours retrieved per query.
        metric: distance metric of the neighbour search.
        thr_start, thr_end, thr_interval: threshold grid.
        verbose: print the score of every threshold.

    Returns:
        ``(best_score, best_threshold)``; the first threshold wins on ties.
    """
    # Fit on everything, query only the validation rows.
    neighbour_index = NearestNeighbors(n_neighbors=knn, metric=metric)
    neighbour_index.fit(embeddings)

    valid_df = get_valid_df(fold=fold)
    valid_embeddings = embeddings[valid_df.index, :]
    distances, indices = neighbour_index.kneighbors(valid_embeddings)
    n_valid = valid_embeddings.shape[0]

    # Score every candidate threshold.
    thresholds = list(np.arange(thr_start, thr_end, thr_interval))
    scores = []
    for threshold in thresholds:
        valid_df['pred_matches'] = [
            ' '.join(get_matches(df=df, distance=distances[row, ], indice=indices[row, ], thr=threshold))
            for row in range(n_valid)
        ]
        valid_df['f1'] = f1_score(valid_df['matches'], valid_df['pred_matches'])
        score = valid_df['f1'].mean()
        if verbose:
            print(f'Our f1 score for threshold {np.round(threshold, 2)} is {score}')
        scores.append(score)

    # Pick the first row that reaches the maximum score.
    thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
    is_best = thresholds_scores['scores'] == thresholds_scores['scores'].max()
    top_rows = thresholds_scores[is_best]
    best_threshold = top_rows['thresholds'].values[0]
    best_score = top_rows['scores'].values[0]
    print(f'Our best score is {best_score} and has a threshold {best_threshold}')

    return best_score, best_threshold


def database_augment(embeddings, n_aug):
    """Replace every embedding by a weighted sum of its `n_aug` nearest neighbours.

    Neighbours are found with a cosine-metric search over `embeddings`
    itself; the k-th neighbour is weighted by ``np.logspace(0, -1.5, n_aug)[k]``
    (from 1 down to 10**-1.5). Queries are issued in chunks of 4096 rows.

    Returns:
        Array with one augmented row per input row, in the input order.
    """
    weights = np.logspace(0, -1.5, n_aug)
    model = NearestNeighbors(n_neighbors=n_aug, metric='cosine')
    model.fit(embeddings)

    CHUNK = 1024*4
    n_rows = len(embeddings)
    n_chunks = n_rows // CHUNK + (1 if n_rows % CHUNK != 0 else 0)

    dba_embeddings = []
    for chunk_no in tqdm(range(n_chunks)):
        start = chunk_no * CHUNK
        stop = min(start + CHUNK, n_rows)

        # Nearest neighbours of the rows in this chunk.
        distances, indices = model.kneighbors(embeddings[start:stop, :])

        dba_embeddings.extend(
            np.dot(weights, embeddings[indices[k, ]])
            for k in range(stop - start)
        )

    del model, distances, indices
    gc.collect()
    return np.vstack(dba_embeddings)


def get_prediction(embeddings, valid_idx, thr, knn: int = 50, metric: str = 'cosine', df=None):
    """Predict, for every validation row, the postings closer than `thr`.

    A nearest-neighbour index is fitted on all `embeddings` and the rows
    selected by `valid_idx` are queried against it.

    Args:
        embeddings: one row per item of `df`.
        valid_idx: row positions of the query items inside `embeddings`.
        thr: exclusive upper bound on the neighbour distance.
        knn: neighbours retrieved per query (default 50, the value that used
            to be hard-coded). Clamped to the number of indexed rows.
        metric: distance metric of the neighbour search (default ``'cosine'``,
            the value that used to be hard-coded).
        df: frame used to translate neighbour positions into `posting_id`s.
            Defaults to the notebook-level `train` frame, as before.

    Returns:
        List with one list of posting ids per query row.
    """
    if df is None:
        df = train

    valid_embeddings = embeddings[valid_idx, :]
    # KNN
    knn_model = NearestNeighbors(n_neighbors=min(knn, len(embeddings)), metric=metric)
    knn_model.fit(embeddings)
    distances, indices = knn_model.kneighbors(valid_embeddings)

    return [
        get_matches(df=df, distance=distances[k, ], indice=indices[k, ], thr=thr)
        for k in range(valid_embeddings.shape[0])
    ]


def get_neighbors_by_cts(df, embeddings, valid_idx, thr):
    """Return, per validation row, the postings whose similarity exceeds `thr`.

    Similarity is the dot product between the query row and every row of
    `embeddings`, computed with CuPy in chunks of 4096 queries.
    NOTE(review): this equals cosine similarity only if the rows are
    L2-normalised - confirm for the vectorizer in use.

    Args:
        df: frame whose `posting_id` column is looked up by row position.
        embeddings: one row per item of `df`.
        valid_idx: row positions of the query items inside `embeddings`.
        thr: exclusive lower bound on the similarity.

    Returns:
        List with one array of posting ids per query row.
    """
    valid_embeddings = embeddings[cupy.asnumpy(valid_idx)]

    CHUNK = 1024*4
    n_valid = len(valid_embeddings)

    predictions = []
    for start in tqdm(range(0, n_valid, CHUNK)):
        stop = min(start + CHUNK, n_valid)

        # Rows: queries of this chunk, columns: all indexed items.
        cts = cupy.matmul(embeddings, valid_embeddings[start:stop].T).T

        for k in range(stop - start):
            hits = cupy.where(cts[k,] > thr)[0]
            predictions.append(df.iloc[cupy.asnumpy(hits)].posting_id.values)

    return predictions


def concat_matches(row):
    """Merge a row's image and text predictions into one id string.

    The `concat_predictions` and `text_predictions` entries are concatenated,
    de-duplicated and sorted by `np.unique`, then joined with single spaces.
    """
    image_ids = row['concat_predictions']
    text_ids = row['text_predictions']
    unique_ids = np.unique(np.concatenate([image_ids, text_ids]))
    return ' '.join(unique_ids)

In [69]:
# Epoch-20 embeddings of the first split for each EfficientNet variant.
embs = [
    get_embeddings_list(epoch=20, en_type=entype)[0]
    for entype in ['B0', 'B3', 'B5']
]

get embeddings from the cache
get embeddings from the cache
get embeddings from the cache


In [70]:
# Place the per-model embeddings side by side: one row per posting, the
# feature columns of all models one after another.
embeddings = np.concatenate(embs, axis=1)
embeddings.shape

(34250, 6144)

In [16]:
# Database-side augmentation (DBA): replace every embedding by the weighted
# sum of its 4 nearest neighbours (see `database_augment`).
dba_embeddings = database_augment(embeddings, n_aug=4)
dba_embeddings.shape

  0%|          | 0/9 [00:00<?, ?it/s]

(34250, 6144)

In [73]:
# knn_model = NearestNeighbors(n_neighbors=QUERYKNN, metric='cosine')
# knn_model.fit(embeddings)
# _, indices = knn_model.kneighbors(embeddings)
# del knn_model
# gc.collect()

# W = np.zeros((embeddings.shape[0], embeddings.shape[0]))
# for i in tqdm(range(embeddings.shape[0])):
#     query = embeddings[i]
#     neighbors = embeddings[indices[i], :]
#     sim = np.dot(neighbors, query)
#     qsim = sim_kernel(sim)
#     W[i, indices[i]] = qsim

# Wn = normalize_connection_graph(W)
# cg_ranks = cg_diffusion(qsim, Wn, alpha)

312

In [105]:
# Binary TF-IDF embeddings of the titles, capped at 25000 vocabulary features.
text_embeddings = get_ifidf_embeddings(train_cu, max_features=25000, binary=True)

text embeddings shape (34250, 24939)


In [106]:
# Evaluation rows: everything outside fold 0. Take an explicit copy because
# later cells add prediction columns to `valid_df`; assigning to a bare
# `query()` result is a chained assignment on a slice of `train`.
valid_df = train.query('fold != 0').copy()
valid_df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches,f1,fold
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",1,train_3386243561 train_3423213080,0.666667,3
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,3,train_2406599165 train_3342059966,0.666667,2
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,4,train_3369186413 train_921438619,0.666667,2
5,train_2464356923,0013e7355ffc5ff8fb1ccad3e42d92fe.jpg,bbd097a7870f4a50,CELANA WANITA (BB 45-84 KG)Harem wanita (bisa...,5,train_2464356923 train_2753295474 train_305884580,0.5,3
6,train_1802986387,00144a49c56599d45354a1c28104c039.jpg,f815c9bb833ab4c8,Jubah anak size 1-12 thn,6,train_1802986387 train_1396161074 train_713073...,0.25,2


In [16]:
# Sweep the cosine-distance threshold of the image-only predictions and
# report the mean F1 on the evaluation rows.
for thr in np.linspace(0.2, 0.3, 5):
    matches_per_row = get_prediction(dba_embeddings, valid_df.index, thr=thr)
    valid_df['pred_matches'] = [' '.join(ids) for ids in matches_per_row]
    score = f1_score(valid_df['matches'], valid_df['pred_matches']).mean()
    print(thr, ':', score)

0.2 : 0.7339109650603931
0.225 : 0.7370929116205313
0.25 : 0.737830192552384
0.275 : 0.7358674239358488
0.3 : 0.7317523425132847


In [24]:
# Joint grid search over the image (concat) and text thresholds.
# scores[i][j] is the mean F1 for concat_thrs[i] combined with text_thrs[j].
scores = list()
# np.arange with a float step accumulates rounding error (values such as
# 0.7000000000000001); rounding keeps the same grid with clean values.
concat_thrs = np.round(np.arange(0.15, 0.25, 0.01), 2)
text_thrs = np.round(np.arange(0.6, 0.8, 0.025), 3)

# Text predictions depend only on the text threshold, so compute them once
# per threshold instead of once per (concat_thr, text_thr) pair.
text_predictions_by_thr = [
    get_neighbors_by_cts(train, text_embeddings, valid_df.index, thr=text_thr)
    for text_thr in text_thrs
]

for concat_thr in concat_thrs:
    # Image predictions depend only on the concat threshold: hoisted out of
    # the inner loop so the neighbour search runs once per concat_thr.
    valid_df['concat_predictions'] = get_prediction(dba_embeddings, valid_df.index, thr=concat_thr)

    scores_ = list()
    for text_predictions in text_predictions_by_thr:
        valid_df['text_predictions'] = text_predictions

        valid_df['pred_matches'] = valid_df.apply(concat_matches, axis=1)
        score = f1_score(valid_df['matches'], valid_df['pred_matches']).mean()
        scores_.append(score)
    scores.append(scores_)

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

In [30]:
# Mean F1 per threshold pair: rows are concat thresholds, columns are text
# thresholds; the colour scale spans the observed minimum and maximum.
score_df = pd.DataFrame(data=scores, index=concat_thrs, columns=text_thrs)
score_lo = score_df.values.min()
score_hi = score_df.values.max()
score_df.style.background_gradient(vmin=score_lo, vmax=score_hi)

Unnamed: 0,0.6,0.625,0.65,0.675,0.7000000000000001,0.7250000000000001,0.7500000000000001,0.7750000000000001,0.8000000000000002
0.15,0.769086,0.770694,0.770708,0.769908,0.76897,0.766524,0.764062,0.760671,0.757852
0.16,0.770214,0.771858,0.772023,0.77149,0.770591,0.76835,0.765959,0.76283,0.760144
0.17,0.770829,0.772539,0.772927,0.772474,0.771736,0.769749,0.76746,0.764506,0.762043
0.18,0.771346,0.773167,0.773723,0.773444,0.77293,0.771122,0.769033,0.766208,0.763905
0.19,0.771159,0.773085,0.773773,0.77361,0.773205,0.77154,0.76951,0.766918,0.764701
0.2,0.770875,0.772904,0.773656,0.773596,0.773314,0.771751,0.769926,0.767496,0.765347
0.21,0.770287,0.772532,0.773458,0.77349,0.773332,0.771913,0.77024,0.767848,0.765801
0.22,0.769704,0.772009,0.772926,0.772984,0.772928,0.771686,0.770066,0.767785,0.765802
0.23,0.76826,0.770578,0.771508,0.771677,0.771673,0.770583,0.769025,0.766873,0.764967
0.24,0.767048,0.769344,0.770383,0.770632,0.770688,0.769646,0.76813,0.766126,0.764273
