In [1]:
import numpy as np
import pandas as pd
import gc
import os
import sys
import yaml
import re
import random
import math
import matplotlib.pyplot as plt
import warnings
import pickle
from tqdm.notebook import tqdm
from typing import *
from pathlib import Path
from dataclasses import dataclass, field, asdict
from shutil import copyfile
warnings.simplefilter('ignore')

In [2]:
@dataclass
class Config:
    """Run configuration for this notebook: paths, device selection and training settings.

    All fields carry defaults; use `update` to override a subset from a dict
    and `to_yaml` to write the current values to a file.
    """

    # Output / device selection
    outdir: str = "../results/monitor"
    device: str = "cuda:2"
    device_id: int = 2

    # Input locations (joined onto the notebook's base directory by the cells that use them)
    datadir: str = '../data/shopee-product-matching'
    modeldir: str = '../models/bert/bert_en_uncased_L-24_H-1024_A-16_1'
    seed: int = 123
    n_splits: int = 5

    # Training config
    batch_size: int = 32
    epochs: int = 100
    patience: int = 5
    lr: float = 0.00001

    def update(self, param_dict: Dict) -> "Config":
        """Overwrite attributes with the values in `param_dict`.

        Every key is validated before anything is assigned, so a failed
        update leaves the instance exactly as it was.

        Args:
            param_dict: Mapping of attribute name to new value.

        Returns:
            This instance, to allow chaining.

        Raises:
            ValueError: If a key does not name an existing attribute.
        """
        # Validate first: assigning while iterating would leave the config
        # half-updated when a later key turns out to be invalid.
        for key in param_dict:
            if not hasattr(self, key):
                raise ValueError(f"[ERROR] Unexpected key for flag = {key}")
        for key, value in param_dict.items():
            setattr(self, key, value)
        return self

    def to_yaml(self, filepath: str, width: int = 120):
        """Write all fields to `filepath` as YAML.

        Args:
            filepath: Destination file; it is opened in write mode.
            width: Passed through to `yaml.dump` as its `width` argument.
        """
        with open(filepath, 'w') as f:
            yaml.dump(asdict(self), f, width=width)

In [3]:
def seed_everything(seed):
    """Seed the `random`, NumPy and TensorFlow generators and export PYTHONHASHSEED.

    Uses the module-level name `tf`, which this notebook imports in a later
    cell, so that import must have run before this function is called.
    """
    hash_seed = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = hash_seed
    tf.random.set_seed(seed)

In [4]:
# Build the default configuration and expose only the configured GPU id.
# The environment variable is set before TensorFlow is imported further down
# (presumably so that it takes effect when TensorFlow enumerates devices).
config = Config()
os.environ["CUDA_VISIBLE_DEVICES"] = str(config.device_id)

# Add the parent of the working directory to the import path so the
# project-local `src` package can be imported below.
base_dir = Path().resolve()
sys.path.append(os.path.abspath(base_dir / '../'))

# Project helpers are star-imported; names used later in the notebook that are
# not defined here (e.g. FullTokenizer, bert_encode, build_bert_model,
# prepare_skf_dataset) presumably come from these modules.
from src.tokenization import *
from src.preprocess import *
from src.text import *
from src.model import *

# Imported only after CUDA_VISIBLE_DEVICES has been set above.
import tensorflow as tf
import tensorflow_hub as hub

In [5]:
# Load the training table and pass it through the project's fold-preparation helper.
# NOTE(review): the displayed output has a `fold` column, presumably added by
# `prepare_skf_dataset` — confirm against src.preprocess.
train = pd.read_csv(base_dir / config.datadir / 'train.csv')
train = prepare_skf_dataset(df=train, n_splits=config.n_splits, seed=config.seed)
# Number of distinct label groups; used as `n_classes` when the model is built.
n_classes = train['label_group'].nunique()
train.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches,fold
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,666,train_129225211 train_2278313361,1
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",7572,train_3386243561 train_3423213080,2
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,6172,train_2288590299 train_3803689425,4
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,10509,train_2406599165 train_3342059966,2
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,9425,train_3369186413 train_921438619,4


In [6]:
# Function to get our f1 score
def f1_score(y_true, y_pred):
    """Row-wise F1 between two aligned collections of whitespace-separated id strings.

    Each string is split on whitespace into a set of ids; the score for a row
    is ``2 * |true & pred| / (|true| + |pred|)``.

    Args:
        y_true: Iterable of strings (e.g. a pandas Series) with the expected ids.
        y_pred: Iterable of strings aligned with `y_true` with the predicted ids.

    Returns:
        A float NumPy array with one score per row. A row where both sets are
        empty scores 0.0 instead of dividing zero by zero and producing NaN.
    """
    row_scores = []
    for true_text, pred_text in zip(y_true, y_pred):
        true_ids = set(true_text.split())
        pred_ids = set(pred_text.split())
        total = len(true_ids) + len(pred_ids)
        # Guard the 0/0 case: NaN here would be silently skipped by a later
        # `.mean()` and distort the aggregate score.
        row_scores.append(2 * len(true_ids & pred_ids) / total if total else 0.0)
    return np.array(row_scores, dtype=float)

In [7]:
# `fold` is used in the next cell to pick the checkpoint directory.
fold = 0
# Fold-based validation split, currently disabled in favour of the full table:
# _, valid_df = train.query('fold != @fold'), train.query('fold == @fold')
# No copy is made: `valid_df` and `train` are the same object, so columns
# added to `valid_df` later also appear on `train`.
valid_df = train

# Load the BERT module from disk and build the tokenizer from the vocabulary
# file and lower-casing flag stored on the loaded object.
bert_layer = hub.KerasLayer(str(base_dir / config.modeldir), trainable=True)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

# Encode the titles; later cells index `x_val[0]`, `x_val[1]` and `x_val[2]`.
# NOTE(review): these are presumably token ids, masks and segment ids — confirm in src.
x_val = bert_encode(valid_df['title'].values, tokenizer, max_len=70)
# Label input, currently disabled:
# y_val = valid_df['label_group'].values
# x_val = (x_val[0], x_val[1], x_val[2], y_val)

# `max_len` here matches the value passed to `bert_encode` above.
bert_model = build_bert_model(bert_layer, n_classes=n_classes, lr=config.lr, max_len=70, train=False)

In [8]:
# Checkpoint to restore, identified by fold, seed and epoch.
epoch = '04'
weights_path = base_dir / config.outdir / f'fold-{fold}' / f'Bert_{config.seed}_epoch{epoch}.h5'
bert_model.load_weights(str(weights_path))

# Rebind `bert_model` to a model over the first three inputs whose output is
# the fourth-from-last layer of the restored network.
# NOTE(review): which layer `layers[-4]` is depends on `build_bert_model` — confirm.
# Re-running this cell operates on the already-truncated model.
bert_model = tf.keras.models.Model(
    inputs=bert_model.input[0:3],
    outputs=bert_model.layers[-4].output,
)

In [9]:
# Number of rows sent to `predict` per call.
chunk = 5000
iterator = np.arange(np.ceil(len(valid_df) / chunk))

# Run the model chunk by chunk and collect each chunk's output in order.
embeds = []
for j in tqdm(iterator):
    # `iterator` holds floats, so the row bounds are cast back to int.
    a, b = int(j * chunk), int((j + 1) * chunk)
    text_chunk = tuple(x_val[part][a:b] for part in range(3))
    text_embeddings = bert_model.predict(text_chunk, batch_size=config.batch_size)
    embeds.append(text_embeddings)

# The model is no longer needed once all chunks are collected: drop the
# reference, run the garbage collector and clear the Keras session.
del bert_model
gc.collect()
tf.keras.backend.clear_session()

  0%|          | 0/7 [00:00<?, ?it/s]

In [10]:
# Join the per-chunk outputs collected in `embeds` into a single array,
# replacing the per-chunk value left in `text_embeddings` by the loop above.
text_embeddings = np.concatenate(embeds)

In [11]:
from cuml.neighbors import NearestNeighbors

In [12]:
# Number of neighbours requested per row; later cells keep only the
# neighbours whose distance is below a threshold.
KNN = 50

# Fit on the embeddings and query the same embeddings.
# NOTE(review): each row's own posting is presumably among its neighbours — confirm.
knn_model = NearestNeighbors(n_neighbors=KNN)
knn_model.fit(text_embeddings)
distances, indices = knn_model.kneighbors(text_embeddings)

In [13]:
# Sweep the distance threshold and record the mean row-wise F1 for each value.
thresholds = list(np.arange(15, 35, 1))
scores = []
# Loop-invariant: fetch the posting ids as an array once instead of
# re-indexing the DataFrame for every row of every threshold.
posting_id_values = valid_df['posting_id'].values
n_rows = text_embeddings.shape[0]
for threshold in thresholds:
    # For each row, keep the neighbours closer than `threshold` and join
    # their posting ids into one space-separated string.
    predictions = [
        ' '.join(posting_id_values[indices[k, distances[k] < threshold]])
        for k in range(n_rows)
    ]
    # These columns are overwritten on every pass, so after the loop they
    # hold the values for the last threshold tried.
    valid_df['pred_matches'] = predictions
    valid_df['f1'] = f1_score(valid_df['matches'], valid_df['pred_matches'])
    score = valid_df['f1'].mean()
    print(f'Our f1 score for threshold {threshold} is {score}')
    scores.append(score)
thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})

Our f1 score for threshold 15 is 0.5904284634407867
Our f1 score for threshold 16 is 0.6078010907004866
Our f1 score for threshold 17 is 0.6262444618870211
Our f1 score for threshold 18 is 0.6433745287012458
Our f1 score for threshold 19 is 0.6584849712437543
Our f1 score for threshold 20 is 0.6720204720232597
Our f1 score for threshold 21 is 0.681051074085284
Our f1 score for threshold 22 is 0.6819029387826657
Our f1 score for threshold 23 is 0.6703116962604577
Our f1 score for threshold 24 is 0.6412431469730054
Our f1 score for threshold 25 is 0.5858921261923475
Our f1 score for threshold 26 is 0.49727332480622916
Our f1 score for threshold 27 is 0.37089468430853184
Our f1 score for threshold 28 is 0.23346265831060334
Our f1 score for threshold 29 is 0.16819325287907133
Our f1 score for threshold 30 is 0.16604049304791893
Our f1 score for threshold 31 is 0.16604049304791893
Our f1 score for threshold 32 is 0.16604049304791893
Our f1 score for threshold 33 is 0.16604049304791893
Our f

In [16]:
# NOTE(review): this cell repeats the threshold sweep of cell In [13] verbatim
# and overwrites `thresholds`, `scores` and `thresholds_scores`; its results
# depend on whatever `text_embeddings`, `distances` and `indices` currently hold.
thresholds = list(np.arange(15, 35, 1))
scores = []
# Loop-invariant: fetch the posting ids as an array once instead of
# re-indexing the DataFrame for every row of every threshold.
posting_id_values = valid_df['posting_id'].values
n_rows = text_embeddings.shape[0]
for threshold in thresholds:
    # For each row, keep the neighbours closer than `threshold` and join
    # their posting ids into one space-separated string.
    predictions = [
        ' '.join(posting_id_values[indices[k, distances[k] < threshold]])
        for k in range(n_rows)
    ]
    # These columns are overwritten on every pass, so after the loop they
    # hold the values for the last threshold tried.
    valid_df['pred_matches'] = predictions
    valid_df['f1'] = f1_score(valid_df['matches'], valid_df['pred_matches'])
    score = valid_df['f1'].mean()
    print(f'Our f1 score for threshold {threshold} is {score}')
    scores.append(score)
thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})

Our f1 score for threshold 15 is 0.4690892731019354
Our f1 score for threshold 16 is 0.47021160222525515
Our f1 score for threshold 17 is 0.47149105556399534
Our f1 score for threshold 18 is 0.4727997612684424
Our f1 score for threshold 19 is 0.47412651321752763
Our f1 score for threshold 20 is 0.47548494696068766
Our f1 score for threshold 21 is 0.4757822644130587
Our f1 score for threshold 22 is 0.475896577844836
Our f1 score for threshold 23 is 0.47542861102884915
Our f1 score for threshold 24 is 0.4725793451545964
Our f1 score for threshold 25 is 0.4669689519704598
Our f1 score for threshold 26 is 0.4566863675669318
Our f1 score for threshold 27 is 0.4367305224294404
Our f1 score for threshold 28 is 0.3967416351062135
Our f1 score for threshold 29 is 0.30082482295270874
Our f1 score for threshold 30 is 0.08783728392872799
Our f1 score for threshold 31 is 0.05006093453515851
Our f1 score for threshold 32 is 0.05006093453515851
Our f1 score for threshold 33 is 0.05006093453515851
Our

In [45]:
# Pick the sweep row(s) with the highest score; on ties the first row wins.
top_score = thresholds_scores['scores'].max()
max_score = thresholds_scores.loc[thresholds_scores['scores'].eq(top_score)]
best_threshold = max_score['thresholds'].iloc[0]
best_score = max_score['scores'].iloc[0]
print(f'Our best score is {best_score} and has a threshold {best_threshold}')

Our best score is 0.8895570193870124 and has a threshold 23


In [55]:
# Use threshold
# NOTE(review): the cut-off is hard-coded here rather than taken from
# `best_threshold` computed earlier — confirm that this is intended.
text_threshold = 18.0
# Loop-invariant: fetch the posting ids as an array once instead of
# re-indexing the DataFrame for every row.
posting_id_values = valid_df['posting_id'].values
# One array of posting ids per row: the neighbours closer than the cut-off.
predictions = [
    posting_id_values[indices[k, distances[k] < text_threshold]]
    for k in range(text_embeddings.shape[0])
]

  0%|          | 0/34250 [00:00<?, ?it/s]

In [59]:
# Store the per-row arrays of matched posting ids on the training table.
# `predictions` was built from `valid_df`, which is bound to this same frame.
train['text_prediction'] = predictions

In [60]:
# Display the table, now including the prediction columns added above.
train

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches,fold,pred_matches,f1,test_prediction,text_prediction
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,666,train_129225211 train_2278313361,1,train_129225211 train_2278313361 train_1306907...,0.076923,"[train_129225211, train_2278313361]","[train_129225211, train_2278313361]"
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",7572,train_3386243561 train_3423213080,2,train_3386243561 train_3423213080 train_414821...,0.076923,"[train_3386243561, train_3423213080]","[train_3386243561, train_3423213080]"
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,6172,train_2288590299 train_3803689425,4,train_2288590299 train_3803689425 train_980670...,0.076923,[train_2288590299],[train_2288590299]
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,10509,train_2406599165 train_3342059966,2,train_2406599165 train_1744956981 train_150810...,0.076923,"[train_2406599165, train_1744956981, train_150...","[train_2406599165, train_1744956981, train_150..."
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,9425,train_3369186413 train_921438619,4,train_3369186413 train_921438619 train_3944799...,0.076923,"[train_3369186413, train_921438619]","[train_3369186413, train_921438619]"
...,...,...,...,...,...,...,...,...,...,...,...
34245,train_4028265689,fff1c07ceefc2c970a7964cfb81981c5.jpg,e3cd72389f248f21,Masker Bahan Kain Spunbond Non Woven 75 gsm 3 ...,9735,train_2829161572 train_4028265689,1,train_4028265689 train_2829161572 train_323845...,0.076923,"[train_4028265689, train_2829161572]","[train_4028265689, train_2829161572]"
34246,train_769054909,fff401691371bdcb382a0d9075dfea6a.jpg,be86851f72e2853c,MamyPoko Pants Royal Soft - S 70 - Popok Celana,7038,train_1463059254 train_769054909,1,train_1463059254 train_769054909 train_2530102...,0.076923,"[train_1463059254, train_769054909]","[train_1463059254, train_769054909]"
34247,train_614977732,fff421b78fa7284284724baf249f522e.jpg,ad27f0d08c0fcbf0,KHANZAACC Robot RE101S 1.2mm Subwoofer Bass Me...,10537,train_4126022211 train_3926241003 train_232545...,4,train_614977732 train_512157627 train_9568348 ...,0.305085,"[train_614977732, train_512157627, train_95683...","[train_614977732, train_512157627, train_95683..."
34248,train_3630949769,fff51b87916dbfb6d0f8faa01bee67b8.jpg,e3b13bd1d896c05c,"Kaldu NON MSG HALAL Mama Kamu Ayam Kampung , S...",4242,train_3419392575 train_1431563868 train_363094...,3,train_3630949769 train_3419392575 train_143156...,0.113208,"[train_3630949769, train_3419392575]","[train_3630949769, train_3419392575]"
