In [1]:
import numpy as np
import pandas as pd
import gc
import sys
import os
import yaml
import re
import random
import math
import matplotlib.pyplot as plt
import warnings
import pickle
from typing import *
from pathlib import Path
from dataclasses import dataclass, field, asdict
from shutil import copyfile
warnings.simplefilter('ignore')

In [3]:
@dataclass
class Config:
    """Run-level settings for this notebook.

    Every field has a default; selected fields can be overridden in bulk with
    :meth:`update`, and the effective values can be written to disk with
    :meth:`to_yaml` so a run can be traced back to its settings.
    """

    # Output directory, joined onto the notebook's base directory by the caller.
    outdir: str = "../results"
    device: str = "cuda:0"
    # GPU index exported through CUDA_VISIBLE_DEVICES by the setup cell.
    device_id: int = 0

    # Dataset and pretrained-model locations, also joined onto the base directory.
    datadir: str = '../data/shopee-product-matching'
    modeldir: str = '../models/bert/bert_en_uncased_L-24_H-1024_A-16_1'
    seed: int = 123
    n_splits: int = 5

    # Training config
    batch_size: int = 32
    epochs: int = 100
    patience: int = 5
    lr: float = 0.00001

    def update(self, param_dict: Dict) -> "Config":
        """Overwrite fields in place with the values from ``param_dict``.

        Args:
            param_dict: Mapping of field name to new value. An empty mapping
                leaves the defaults untouched.

        Returns:
            This same instance, so the call can be chained onto ``Config()``.

        Raises:
            ValueError: If a key is not one of the declared dataclass fields.
        """
        for key, value in param_dict.items():
            # Check against the declared fields rather than `hasattr`: the latter
            # also accepts method names ('update', 'to_yaml'), which would let a
            # typo silently replace a method with a value.
            if key not in self.__dataclass_fields__:
                raise ValueError(f"[ERROR] Unexpected key for flag = {key}")
            setattr(self, key, value)
        return self

    def to_yaml(self, filepath: Union[str, Path], width: int = 120):
        """Dump all fields to a YAML file.

        Args:
            filepath: Destination file (string or ``Path``). Missing parent
                directories are created.
            width: Line width forwarded to ``yaml.dump``.
        """
        target = Path(filepath)
        # The config is typically written before any other output exists, so
        # make sure the destination directory is there.
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, 'w') as f:
            yaml.dump(asdict(self), f, width=width)

In [4]:
def seed_everything(seed):
    """Seed every random number source this notebook relies on.

    Sets ``PYTHONHASHSEED`` in the environment and seeds Python's ``random``,
    NumPy's global generator and TensorFlow's global generator with ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The three library seeders share the same one-argument call shape.
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)

In [2]:
# Resolve the notebook's working directory first: every path below is built
# from it, so it has to exist before the config is written.
base_dir = Path().resolve()

# Per-run overrides for the defaults declared on Config.
config_dict = {
#     'epochs': 1,
}

config = Config().update(config_dict)

# Snapshot the effective settings next to the results. The output directory may
# not exist yet on a fresh checkout, so create it before writing.
os.makedirs(str(base_dir / config.outdir), exist_ok=True)
config.to_yaml(base_dir / config.outdir / 'config.yaml')

# Set before TensorFlow is imported further down in this cell.
os.environ["CUDA_VISIBLE_DEVICES"] = str(config.device_id)

# Make the project's `src` package importable from the notebook directory.
sys.path.append(os.path.abspath(base_dir / '../'))

from src.tokenization import *
from src.preprocess import *
from src.text import *
from src.model import *

import tensorflow as tf
import tensorflow_hub as hub

In [6]:
# Read the raw training table and hand it straight to `prepare_dataset`
# (the displayed frame carries a `fold` column, presumably assigned there — confirm in src.preprocess).
train_csv = base_dir / config.datadir / 'train.csv'
train = prepare_dataset(
    df=pd.read_csv(train_csv),
    n_splits=config.n_splits,
    seed=config.seed,
)
# Number of distinct label groups; passed to the model builder in the training loop.
n_classes = train['label_group'].nunique()
train.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches,fold
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,666,train_129225211 train_2278313361,1
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",7572,train_3386243561 train_3423213080,2
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,6172,train_2288590299 train_3803689425,4
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,10509,train_2406599165 train_3342059966,2
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,9425,train_3369186413 train_921438619,4


In [None]:
# Token sequence length; the encoder calls and the model builder must agree on it.
MAX_LEN = 70

# Train one model per fold, checkpointing the best weights and the training history.
for fold in range(config.n_splits):
    # Fold 0 is deliberately skipped by this cell.
    if fold == 0:
        continue
    
    # Re-seed at the start of every fold so each one starts from the same RNG state.
    seed_everything(config.seed)

    outdir = base_dir / config.outdir / f'fold-{fold}'
    os.makedirs(str(outdir), exist_ok=True)
    
    # Hold out the current fold for validation; train on all the others.
    train_df, valid_df = train.query('fold != @fold'), train.query('fold == @fold')
    
    # The hub layer is loaded anew inside the loop, so each fold starts from the
    # weights stored at config.modeldir. The tokenizer settings are read from
    # the same saved model.
    bert_layer = hub.KerasLayer(str(base_dir / config.modeldir), trainable=True)
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = FullTokenizer(vocab_file, do_lower_case)
    
    x_train = bert_encode(train_df['title'].values, tokenizer, max_len=MAX_LEN)
    x_val = bert_encode(valid_df['title'].values, tokenizer, max_len=MAX_LEN)
    y_train = train_df['label_group'].values
    y_val = valid_df['label_group'].values
    
    # The labels are appended as a fourth model input in addition to being the
    # fit target — presumably the head built by `build_bert_model` consumes
    # them (e.g. a margin-based layer); confirm in src.model.
    x_train = (x_train[0], x_train[1], x_train[2], y_train)
    x_val = (x_val[0], x_val[1], x_val[2], y_val)
    
    bert_model = build_bert_model(bert_layer, n_classes=n_classes, lr=config.lr, max_len=MAX_LEN)
    
    # Keep only the weights of the epoch with the lowest validation loss.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        str(outdir / f'Bert_{config.seed}.h5'),
        monitor = 'val_loss', 
        verbose = 1, 
        save_best_only = True,
        save_weights_only = True, 
        mode = 'min'
    )
    # NOTE(review): `config.patience` is not consumed by any callback in this
    # cell, so training always runs for `config.epochs` epochs — confirm intent.
#     lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
#         monitor='val_loss', 
#         factor=0.1, 
#         patience=5, 
#         verbose=1, 
#         mode='auto', 
#         min_delta=0.0, 
#         cooldown=0, 
#         min_lr=0
#     )

    
    history = bert_model.fit(
        x_train, y_train,
        validation_data = (x_val, y_val),
        epochs = config.epochs, 
#         callbacks = [checkpoint, lr_scheduler],
        callbacks = [checkpoint],
        batch_size = config.batch_size,
        verbose = 1
    )
    
    # Use a context manager so the history file is flushed and closed even if
    # pickling fails.
    with open(str(outdir / 'history.pkl'), 'wb') as history_file:
        pickle.dump(history.history, history_file)
    
    # Release this fold's model and data before the next fold allocates its own.
    del bert_model, bert_layer, train_df, valid_df, x_train, x_val, y_train, y_val
    gc.collect()
    tf.keras.backend.clear_session()

Epoch 1/100
Epoch 00001: val_loss improved from inf to 23.58607, saving model to /home/yamaguchi-milkcocholate/Shopee/notebooks/../results/fold-1/Bert_123.h5
Epoch 2/100
Epoch 00002: val_loss improved from 23.58607 to 22.61378, saving model to /home/yamaguchi-milkcocholate/Shopee/notebooks/../results/fold-1/Bert_123.h5
Epoch 3/100
Epoch 00003: val_loss improved from 22.61378 to 21.51455, saving model to /home/yamaguchi-milkcocholate/Shopee/notebooks/../results/fold-1/Bert_123.h5
Epoch 4/100
Epoch 00004: val_loss improved from 21.51455 to 20.56214, saving model to /home/yamaguchi-milkcocholate/Shopee/notebooks/../results/fold-1/Bert_123.h5
Epoch 5/100
Epoch 00005: val_loss improved from 20.56214 to 19.69246, saving model to /home/yamaguchi-milkcocholate/Shopee/notebooks/../results/fold-1/Bert_123.h5
Epoch 6/100
Epoch 00006: val_loss improved from 19.69246 to 18.92444, saving model to /home/yamaguchi-milkcocholate/Shopee/notebooks/../results/fold-1/Bert_123.h5
Epoch 7/100
Epoch 00007: va