In [2]:
import numpy as np
import pandas as pd
import gc
import os
import sys
import yaml
import re
import random
import math
import matplotlib.pyplot as plt
import warnings
import pickle
from tqdm.notebook import tqdm
from typing import *
from pathlib import Path
from dataclasses import dataclass, field, asdict
from shutil import copyfile
from cuml.neighbors import NearestNeighbors
warnings.simplefilter('ignore')

In [3]:
@dataclass
class Config:
    """Run settings for this notebook: paths, device selection and training hyper-parameters.

    Every field has a default; individual values can be overridden with
    `update` and the full set can be written to disk with `to_yaml`.
    """
    outdir: str = "../results/bert"
    device: str = "cuda:2"
    device_id: int = 2

    datadir: str = '../data/tfrecord-skf'
    modeldir: str = '../models/bert/bert_en_uncased_L-24_H-1024_A-16_1'
    seed: int = 123
    n_splits: int = 3

    # Training config
    batch_size: int = 32
    epochs: int = 100
    patience: int = 5
    lr: float = 0.00001

    def update(self, param_dict: Dict) -> "Config":
        """Overwrite existing attributes with the entries of `param_dict`.

        Entries are applied one at a time in iteration order, so keys that
        precede an unknown key have already been applied when the error is
        raised.

        Args:
            param_dict: Mapping of attribute name to new value.

        Returns:
            This same instance, to allow chaining.

        Raises:
            ValueError: If a key does not name an existing attribute.
        """
        for name, new_value in param_dict.items():
            if hasattr(self, name):
                setattr(self, name, new_value)
                continue
            raise ValueError(f"[ERROR] Unexpected key for flag = {name}")
        return self

    def to_yaml(self, filepath: str, width: int = 120):
        """Dump all fields to `filepath` as YAML.

        Args:
            filepath: Destination file; opened in text write mode.
            width: Forwarded to `yaml.dump` as its `width` argument.
        """
        payload = asdict(self)
        with open(filepath, 'w') as stream:
            yaml.dump(payload, stream, width=width)

In [4]:
def seed_everything(seed):
    """Seed every random number source used in this notebook with `seed`.

    Covers the `PYTHONHASHSEED` environment variable, Python's `random`
    module, NumPy's global generator and TensorFlow's global seed.
    `tf` is looked up when the function is called, so TensorFlow must
    already be imported at that point.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

In [5]:
# Instantiate the run configuration with its default values.
config = Config()
# Export CUDA_VISIBLE_DEVICES from the configured device id. This happens
# before TensorFlow is imported further down in this cell.
os.environ["CUDA_VISIBLE_DEVICES"] = str(config.device_id)

# `base_dir` is the resolved current working directory; its parent is added to
# the import path so that the `src` package below can be found.
base_dir = Path().resolve()
sys.path.append(os.path.abspath(base_dir / '../'))

# Project-local helpers. NOTE(review): names used later in this notebook such
# as `FullTokenizer`, `bert_encode` and `build_bert_model` are not defined here
# and presumably come from these wildcard imports -- confirm against `src`.
from src.tokenization import *
from src.preprocess import *
from src.text import *
from src.model import *

import tensorflow as tf
import tensorflow_hub as hub

In [6]:
# Load the training table (including its `fold` column) from `config.datadir`,
# resolved relative to `base_dir`. `train` is used as a module-level global by
# later cells.
train = pd.read_csv(base_dir / config.datadir / 'train_folds.csv')
# Number of distinct `label_group` values; passed as `n_classes` when the model
# is rebuilt for embedding extraction.
n_classes = train['label_group'].nunique()
train.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches,f1,fold
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,0,train_129225211 train_2278313361,0.666667,1
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",1,train_3386243561 train_3423213080,0.666667,2
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2,train_2288590299 train_3803689425,0.666667,5
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,3,train_2406599165 train_3342059966,0.666667,7
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,4,train_3369186413 train_921438619,0.666667,9


In [7]:
def get_valid_df(fold: int):
    """Return a copy of the rows of the global `train` used to validate `fold`.

    Candidate fold ids are 0 .. (number of distinct `train.fold` values - 1).
    An id is selected when its remainder modulo `config.n_splits` equals
    `fold`. The selected ids are printed before the rows are returned.

    Args:
        fold: Remainder class to select.

    Returns:
        A copied DataFrame holding the rows of `train` whose `fold` value is
        one of the selected ids.
    """
    n_fold_ids = train.fold.unique().shape[0]

    valid_folds = []
    for i in range(n_fold_ids):
        if (i % config.n_splits) == fold:
            valid_folds.append(i)
    print(f'folds : {valid_folds}')

    return train.query(f'fold in {valid_folds}').copy()

In [8]:
# Function to get our f1 score
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of whitespace-separated id strings.

    Each string is split on whitespace and turned into a set, so repeated
    ids count once. For every row the score is
    ``2 * |true & pred| / (|true| + |pred|)``. Rows are paired by position.

    Args:
        y_true: Series of ground-truth id strings.
        y_pred: Series of predicted id strings.

    Returns:
        NumPy array with one F1 value per row. A row where both sets are
        empty divides zero by zero.
    """
    true_sets = y_true.apply(lambda text: set(text.split()))
    pred_sets = y_pred.apply(lambda text: set(text.split()))

    # Size of the overlap for each positionally paired row.
    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])

    n_pred = pred_sets.apply(len).values
    n_true = true_sets.apply(len).values
    return 2 * overlap / (n_pred + n_true)

In [9]:
def get_embeddings(bert_model, df: pd.DataFrame, x):
    """Run `bert_model.predict` over `x` in chunks of 500 rows and stack the results.

    Args:
        bert_model: Object exposing `predict(inputs, batch_size=...)`.
        df: Only its length is used, to decide how many chunks to process.
        x: Indexable whose first three items are sliced per chunk and passed
            to the model together as a tuple.

    Returns:
        The per-chunk predictions concatenated with `np.concatenate`.
    """
    chunk = 500

    outputs = []
    # One iteration per chunk start offset: 0, 500, 1000, ...
    for lo in tqdm(range(0, len(df), chunk)):
        hi = lo + chunk
        model_inputs = (x[0][lo:hi], x[1][lo:hi], x[2][lo:hi])
        outputs.append(bert_model.predict(model_inputs, batch_size=config.batch_size))

    return np.concatenate(outputs)

In [10]:
def get_embeddings_list(max_len: int, epoch: int):
    """Load, or compute and cache, one embedding matrix per fold model.

    For every fold in `range(config.n_splits)` the embeddings of all titles in
    the global `train` are read from a pickle cache inside that fold's output
    directory when it exists. Otherwise the fold's weights for the requested
    epoch are loaded, the embeddings are computed with `get_embeddings`, and
    the result is written to the cache.

    Args:
        max_len: Encoding length; part of the output directory name and
            passed to `build_bert_model` / `bert_encode`.
        epoch: Epoch number of the checkpoint; zero-padded to two digits in
            the weight and cache file names.

    Returns:
        List with one embeddings object per fold, in fold order.
    """
    epoch = format(epoch, '02')

    embeddings_list = list()
    for fold in range(config.n_splits):
        outdir = base_dir / config.outdir / f'Bert_seed{config.seed}_encodelen{max_len}_fold-{fold}'
        weight_path = outdir / f"epoch{epoch}.h5"
        emb_path = outdir / f'embeddings_epoch{epoch}.pkl'

        if os.path.exists(emb_path):
            # NOTE(review): pickle can execute arbitrary code while loading;
            # this assumes the cache file was written by this notebook.
            # The context manager guarantees the handle is closed.
            with open(emb_path, 'rb') as cache_file:
                embeddings = pickle.load(cache_file)
        else:
            bert_layer = hub.KerasLayer(str(base_dir / config.modeldir), trainable=True)
            vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
            do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
            tokenizer = FullTokenizer(vocab_file, do_lower_case)

            bert_model = build_bert_model(bert_layer, n_classes=n_classes, lr=config.lr, max_len=max_len, train=False)
            bert_model.load_weights(str(weight_path))
            # Re-wire the model: keep the first three inputs and expose the
            # output of the fourth layer from the end as the embedding.
            bert_model = tf.keras.models.Model(inputs=bert_model.input[0:3], outputs=bert_model.layers[-4].output)

            x = bert_encode(train['title'].values, tokenizer, max_len=max_len)
            embeddings = get_embeddings(bert_model, train, x)

            # Closing the handle flushes the cache to disk before moving on.
            with open(emb_path, 'wb') as cache_file:
                pickle.dump(embeddings, cache_file)

            # Release the model before the next fold is built.
            del bert_model
            gc.collect()
            tf.keras.backend.clear_session()

        embeddings_list.append(embeddings)

    return embeddings_list

In [11]:
def search_thresholds(df: pd.DataFrame, embeddings: np.ndarray, 
                      fold: int, knn: int = 50, thr_start: int = 15, thr_end: int = 30, thr_interval: float = 1):
    """Grid-search the neighbour distance threshold that maximises mean F1 on one fold.

    A nearest-neighbour index is fit on all `embeddings`. The validation rows
    for `fold` are obtained from `get_valid_df`, which reads the global
    `train`. For each threshold in `np.arange(thr_start, thr_end, thr_interval)`
    every validation row is predicted to match the postings of its neighbours
    whose distance is strictly below the threshold. The mean row-wise F1
    against the `matches` column is printed and recorded.

    Args:
        df: Frame whose `posting_id` column is looked up by the positional
            neighbour indices returned by the index.
        embeddings: Matrix the index is fit on. Validation rows are selected
            from it with `valid_df.index` (assumes the index labels are
            positional row numbers -- TODO confirm for non-default indexes).
        fold: Fold identifier forwarded to `get_valid_df`.
        knn: Number of neighbours retrieved per validation row.
        thr_start: First threshold tried (inclusive).
        thr_end: End of the threshold range (exclusive).
        thr_interval: Step between thresholds.

    Returns:
        Tuple `(best_score, best_threshold)`, taken from the first threshold
        that reaches the maximum mean F1.
    """
    # KNN
    knn_model = NearestNeighbors(n_neighbors=knn)
    knn_model.fit(embeddings)
    
    valid_df = get_valid_df(fold=fold)
    valid_embeddings = embeddings[valid_df.index, :]
    distances, indices = knn_model.kneighbors(valid_embeddings)

    # Loop-invariant: extract the id column once instead of once per
    # validation row and threshold.
    all_posting_ids = df['posting_id'].values
    n_valid = valid_embeddings.shape[0]
    
    # grid search
    thresholds = list(np.arange(thr_start, thr_end, thr_interval))
    scores = []
    for threshold in thresholds:
        predictions = []
        for k in range(n_valid):
            # Neighbours of row k that are closer than the threshold.
            idx = np.where(distances[k, ] < threshold)[0]
            ids = indices[k, idx]
            predictions.append(' '.join(all_posting_ids[ids]))
        valid_df['pred_matches'] = predictions
        valid_df['f1'] = f1_score(valid_df['matches'], valid_df['pred_matches'])
        score = valid_df['f1'].mean()
        print(f'Our f1 score for threshold {np.round(threshold, 2)} is {score}')
        scores.append(score)
    thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
    
    # Keep the rows that reach the maximum score and report the first of them.
    max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
    best_threshold = max_score['thresholds'].values[0]
    best_score = max_score['scores'].values[0]
    print(f'Our best score is {best_score} and has a threshold {best_threshold}')
    
    return best_score, best_threshold

In [13]:
# Threshold search for the checkpoints stored under max_len = 70, epoch 30.
max_len = 70
epoch = 30

embeddings_list = get_embeddings_list(max_len, epoch)

# One (best_score, best_threshold) tuple per fold.
scores_70 = []
for fold in range(config.n_splits):
    scores_70.append(search_thresholds(train, embeddings_list[fold], fold))

folds : [0, 3, 6, 9, 12]
Our f1 score for threshold 15 is 0.5437239792090968
Our f1 score for threshold 16 is 0.5574592822228386
Our f1 score for threshold 17 is 0.5703745726292365
Our f1 score for threshold 18 is 0.5836655096401759
Our f1 score for threshold 19 is 0.59803712564954
Our f1 score for threshold 20 is 0.6119735423127574
Our f1 score for threshold 21 is 0.6261632013732564
Our f1 score for threshold 22 is 0.6395757026972946
Our f1 score for threshold 23 is 0.6522745976488339
Our f1 score for threshold 24 is 0.6653260752705655
Our f1 score for threshold 25 is 0.6779798171865244
Our f1 score for threshold 26 is 0.6854569221835097
Our f1 score for threshold 27 is 0.6803010289094904
Our f1 score for threshold 28 is 0.6509130705367717
Our f1 score for threshold 29 is 0.5206568965472763
Our best score is 0.6854569221835097 and has a threshold 26
folds : [1, 4, 7, 10, 13]
Our f1 score for threshold 15 is 0.5611012990536544
Our f1 score for threshold 16 is 0.5761758363864841
Our f1 

In [12]:
# Threshold search for the checkpoints stored under max_len = 150, epoch 30.
max_len = 150
epoch = 30

embeddings_list = get_embeddings_list(max_len, epoch)

# One (best_score, best_threshold) tuple per fold.
scores_150 = []
for fold in range(config.n_splits):
    scores_150.append(search_thresholds(train, embeddings_list[fold], fold))

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

folds : [0, 3, 6, 9, 12]
Our f1 score for threshold 15 is 0.548931038606319
Our f1 score for threshold 16 is 0.563556133470404
Our f1 score for threshold 17 is 0.5779660579851023
Our f1 score for threshold 18 is 0.5934171981845502
Our f1 score for threshold 19 is 0.6100888219553022
Our f1 score for threshold 20 is 0.6265287000721871
Our f1 score for threshold 21 is 0.6433676968895442
Our f1 score for threshold 22 is 0.6607938623740311
Our f1 score for threshold 23 is 0.6763915395107548
Our f1 score for threshold 24 is 0.6883470683532816
Our f1 score for threshold 25 is 0.697880292045753
Our f1 score for threshold 26 is 0.7027718941718689
Our f1 score for threshold 27 is 0.6902701549652426
Our f1 score for threshold 28 is 0.6449550975073862
Our f1 score for threshold 29 is 0.4773899774249712
Our best score is 0.7027718941718689 and has a threshold 26
folds : [1, 4, 7, 10, 13]
Our f1 score for threshold 15 is 0.5452689287391425
Our f1 score for threshold 16 is 0.5587158632918175
Our f1 s