In [1]:
import numpy as np
import pandas as pd
import gc
import sys
import os
import yaml
import re
import random
import math
import matplotlib.pyplot as plt
import warnings
import pickle
from typing import *
from pathlib import Path
from dataclasses import dataclass, field, asdict
from shutil import copyfile
from sklearn.model_selection import train_test_split
warnings.simplefilter('ignore')

In [2]:
@dataclass
class Config:
    """Settings for one training run of this notebook.

    Defaults are declared as dataclass fields; individual values can be
    overridden with :meth:`update` and the final state persisted with
    :meth:`to_yaml`.
    """

    # Directory (relative to the notebook) where results are written
    outdir: str = "../results/bert-gkf"
    device: str = "cuda:1"
    # GPU index exported through CUDA_VISIBLE_DEVICES by the setup cell
    device_id: int = 1

    # Directory holding the input CSV files
    datadir: str = '../data/tfrecord-skf'
    # Directory of the saved BERT module loaded with tensorflow_hub
    modeldir: str = '../models/bert/bert_en_uncased_L-24_H-1024_A-16_1'
    seed: int = 123
    n_splits: int = 3
    
    # Training config
    batch_size: int = 32
    epochs: int = 25
    patience: int = 5
    lr: float = 0.00001
    # Maximum token length used when encoding titles
    encode_len: int = 70
    emb_len: int = 2048

    def update(self, param_dict: Dict) -> "Config":
        """Overwrite fields with the values in ``param_dict``.

        Every key is validated against the declared dataclass fields before
        anything is assigned, so a rejected update leaves the instance
        untouched and method names such as ``to_yaml`` cannot be overwritten.

        Args:
            param_dict: Mapping of field name to new value.

        Returns:
            This instance, to allow ``Config().update({...})`` chaining.

        Raises:
            ValueError: If a key is not a declared field.
        """
        from dataclasses import fields

        known = {f.name for f in fields(self)}
        # Validate first: `hasattr` would also accept methods and dunder attributes.
        for key in param_dict:
            if key not in known:
                raise ValueError(f"[ERROR] Unexpected key for flag = {key}")
        for key, value in param_dict.items():
            setattr(self, key, value)
        return self
    
    def to_yaml(self, filepath: Union[str, Path], width: int = 120):
        """Dump all fields to a YAML file.

        Args:
            filepath: Destination file; missing parent directories are created.
            width: Line width forwarded to ``yaml.dump``.
        """
        # The results directory may not exist yet on a fresh checkout.
        Path(filepath).parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, 'w') as f:
            yaml.dump(asdict(self), f, width=width)

In [3]:
def seed_everything(seed):
    """Apply one seed to every random number source this notebook uses.

    Seeds Python's ``random`` module, NumPy's global generator and TensorFlow,
    and stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Seed value passed unchanged to each generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)

In [4]:
# Directory the notebook runs from; its parent is added to sys.path so the `src` package imports below resolve.
base_dir = Path().resolve()
sys.path.append(os.path.abspath(base_dir / '../'))

# Per-run overrides applied on top of the Config defaults
config_dict = {
#     'epochs': 1,
}

config = Config().update(config_dict)

# Create the results directory before writing into it: on a fresh checkout it
# does not exist yet and opening config.yaml would raise FileNotFoundError.
os.makedirs(str(base_dir / config.outdir), exist_ok=True)
config.to_yaml(base_dir / config.outdir / 'config.yaml')

# Restrict the visible GPUs before TensorFlow is imported below.
os.environ["CUDA_VISIBLE_DEVICES"] = str(config.device_id)


from src.tokenization import *
from src.preprocess import *
from src.text import *
from src.model import *

import tensorflow as tf
import tensorflow_hub as hub

In [5]:
# train = pd.read_csv(base_dir / config.datadir / 'train.csv')
# train = prepare_dataset(df=train, n_splits=config.n_splits, seed=config.seed)

train = pd.read_csv(base_dir / config.datadir / 'train_folds.csv')

# Split on unique label groups rather than rows, so every label_group ends up
# entirely in fold 0 (70% of groups, used for training) or fold 1 (30%).
train_labels, test_labels = train_test_split(train.label_group.unique(), test_size=0.3, random_state=config.seed)
train.loc[train.label_group.isin(train_labels), 'fold'] = 0
train.loc[train.label_group.isin(test_labels), 'fold'] = 1
train.to_csv(base_dir / config.outdir / 'train_fold.csv', index=False)

# Take an explicit copy of the filtered rows: the column assignment below would
# otherwise write into a slice of the full frame (SettingWithCopyWarning, which
# the global warnings filter hides).
train = train.query('fold == 0').copy()

# Re-index the remaining label groups to contiguous integers 0..n_classes-1
labelconvmap = {label: i for i, label in enumerate(sorted(train.label_group.unique()))}
train['label_group'] = train.label_group.map(labelconvmap)

n_classes = train['label_group'].nunique()
train.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches,f1,fold
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,0,train_129225211 train_2278313361,0.666667,0
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",1,train_3386243561 train_3423213080,0.666667,0
5,train_2464356923,0013e7355ffc5ff8fb1ccad3e42d92fe.jpg,bbd097a7870f4a50,CELANA WANITA (BB 45-84 KG)Harem wanita (bisa...,2,train_2464356923 train_2753295474 train_305884580,0.5,0
8,train_86570404,0019a3c6755a194cb2e2c12bfc63972e.jpg,ea9af4f483249972,"[LOGU] Tempelan kulkas magnet angka, tempelan ...",3,train_86570404 train_2837452969 train_77364776,0.5,0
9,train_831680791,001be52b2beec40ddc1d2d7fc7a68f08.jpg,e1ce953d1a70618f,BIG SALE SEPATU PANTOFEL KULIT KEREN KERJA KAN...,4,train_831680791 train_3031035861,0.666667,0


In [6]:
def get_lr_callback(lr_start=0.00001, lr_max=0.0001, lr_min=0.00001,
                    lr_ramp_ep=5, lr_sus_ep=0, lr_decay=0.8):
    """Build a Keras callback that sets the learning rate per epoch.

    The schedule has three phases, indexed by the zero-based epoch:
    a linear ramp from ``lr_start`` to ``lr_max`` over ``lr_ramp_ep`` epochs,
    an optional plateau at ``lr_max`` for ``lr_sus_ep`` epochs, then an
    exponential decay towards ``lr_min`` with factor ``lr_decay`` per epoch.

    All parameters default to the values that were previously hard-coded, so
    ``get_lr_callback()`` produces the same schedule as before.

    Args:
        lr_start: Learning rate at epoch 0.
        lr_max: Peak learning rate reached at the end of the ramp.
        lr_min: Value the decay phase approaches.
        lr_ramp_ep: Number of ramp-up epochs.
        lr_sus_ep: Number of epochs held at ``lr_max`` after the ramp.
        lr_decay: Multiplicative decay applied per epoch after the plateau.

    Returns:
        A ``tf.keras.callbacks.LearningRateScheduler`` wrapping the schedule.
    """

    def lrfn(epoch):
        if epoch < lr_ramp_ep:
            # Linear interpolation between lr_start and lr_max
            lr = (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start
        elif epoch < lr_ramp_ep + lr_sus_ep:
            lr = lr_max
        else:
            # Exponential decay of the excess over lr_min
            lr = (lr_max - lr_min) * lr_decay**(epoch - lr_ramp_ep - lr_sus_ep) + lr_min
        return lr

    lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose = True)
    return lr_callback

In [7]:
seed_everything(config.seed)

# Checkpoints and the training history of this run go into a directory named
# after the seed, encode length and embedding length.
outdir = base_dir / config.outdir / f'Bert_seed{config.seed}_encodelen{config.encode_len}_emb{config.emb_len}-gkf'
os.makedirs(str(outdir), exist_ok=True)

# Load the trainable BERT layer and build the tokenizer from the vocabulary and
# casing flag stored with it.
bert_layer = hub.KerasLayer(str(base_dir / config.modeldir), trainable=True)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

x_train = bert_encode(train['title'].values, tokenizer, max_len=config.encode_len)
y_train = train['label_group'].values

# The labels are appended to the three encoded arrays as a fourth model input.
# NOTE(review): presumably the head built by build_bert_model consumes the
# labels during training - confirm in src.model.
x_train = (x_train[0], x_train[1], x_train[2], y_train)

bert_model = build_bert_model(bert_layer, n_classes=n_classes, lr=config.lr, max_len=config.encode_len, emb_len=config.emb_len)

# No validation data is passed to fit(), so checkpoints are selected on the
# training loss: weights are saved whenever it improves.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    str(outdir / 'epoch{epoch:02d}.h5'),
    monitor = 'loss', 
    verbose = 1, 
    save_best_only = True,
    save_weights_only = True, 
    mode = 'min'
)


history = bert_model.fit(
    x_train, y_train,
    epochs = config.epochs, 
    callbacks = [checkpoint, get_lr_callback()],
    batch_size = config.batch_size,
    verbose = 1
)

# Use a context manager so the file handle is flushed and closed deterministically.
with open(str(outdir / 'history.pkl'), 'wb') as f:
    pickle.dump(history.history, f)

# Release the model and data before any further work in this kernel.
del bert_model, bert_layer, train, x_train, y_train
gc.collect()
tf.keras.backend.clear_session()


Epoch 00001: LearningRateScheduler reducing learning rate to 1e-05.
Epoch 1/25
Epoch 00001: loss improved from inf to 23.20206, saving model to /home/yamaguchi-milkcocholate/Shopee/notebooks/../results/bert-gkf/Bert_seed123_encodelen70_emb2048-gkf/epoch01.h5

Epoch 00002: LearningRateScheduler reducing learning rate to 2.8000000000000003e-05.
Epoch 2/25
Epoch 00002: loss improved from 23.20206 to 21.52355, saving model to /home/yamaguchi-milkcocholate/Shopee/notebooks/../results/bert-gkf/Bert_seed123_encodelen70_emb2048-gkf/epoch02.h5

Epoch 00003: LearningRateScheduler reducing learning rate to 4.6e-05.
Epoch 3/25
Epoch 00003: loss improved from 21.52355 to 18.58431, saving model to /home/yamaguchi-milkcocholate/Shopee/notebooks/../results/bert-gkf/Bert_seed123_encodelen70_emb2048-gkf/epoch03.h5

Epoch 00004: LearningRateScheduler reducing learning rate to 6.4e-05.
Epoch 4/25
Epoch 00004: loss improved from 18.58431 to 15.50438, saving model to /home/yamaguchi-milkcocholate/Shopee/no