In [None]:
import os
from time import perf_counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer
from transformers import BertTokenizer, TFBertModel


## Example

In [None]:
# Load the MathBERT tokenizer and the TensorFlow encoder from the
# `tbs17/MathBERT` checkpoint. `from_pt=True` asks transformers to build the
# TF model from PyTorch weights.
tokenizer = BertTokenizer.from_pretrained(
    'tbs17/MathBERT',
    output_hidden_states=True,
)
encoder = TFBertModel.from_pretrained(
    "tbs17/MathBERT",
    from_pt=True,
)

In [3]:
# Sample problem statement used to inspect what the tokenizer returns.
example_text = "Determine all functions f : mathbb R to mathbb R satisfying the following two conditions: (a) f(x + y) + f(x - y) = 2f(x)f(y) for all x, y in mathbb R, and (b) lim_{x to infty} f(x) = 0."

# Pad/truncate to 512 tokens and ask for TensorFlow tensors.
encoded_inputs = tokenizer(
    example_text,
    padding="max_length",
    truncation=True,
    max_length=512,
    return_tensors="tf",
)

# Peek at the first 12 positions of each returned tensor, then at the full shape.
print(f"output keys                     -> {list(encoded_inputs.keys())}")
print(f"input_word_ids _ input_ids      -> {encoded_inputs['input_ids'][0, :12]}")
print(f"input_mask _ attention_mask     -> {encoded_inputs['attention_mask'][0, :12]}")
print(f"input_type_ids _ token_type_ids -> {encoded_inputs['token_type_ids'][0, :12]}")
print(f"shape: {encoded_inputs['attention_mask'].shape}\n")

# The encoder itself is not run in this cell; only the tokenizer output is shown.

output keys                     -> ['input_ids', 'token_type_ids', 'attention_mask']
input_word_ids _ input_ids      -> [  101  5646  2035  4972  1042  1024  8785 10322  1054  2000  8785 10322]
input_mask _ attention_mask     -> [1 1 1 1 1 1 1 1 1 1 1 1]
input_type_ids _ token_type_ids -> [0 0 0 0 0 0 0 0 0 0 0 0]
shape: (1, 512)



In [4]:
# Tour of the tokenizer API on a short mixed-case sentence.
example_text = 'hElLO an bisa the aku'
encoded_inputs = tokenizer(example_text, return_tensors="tf", padding="max_length", max_length=12, truncation=True)

# Derive tokens and ids from the tokenizer instead of hard-coding them, so the
# round-trip demonstrations below stay correct if the text or checkpoint changes.
example_tokens = tokenizer.tokenize(example_text)  # word pieces only
example_ids = tokenizer.encode(example_text)       # ids wrapped in [CLS] ... [SEP]
wrapped_tokens = [tokenizer.cls_token, *example_tokens, tokenizer.sep_token]

print(f"INPUT TEXT               --> '{example_text}'")
print(f"tokenize                 --> {example_tokens}")
print(f"decode                   --> {tokenizer.decode(encoded_inputs['input_ids'][0])}")
print(f"encode                   --> {example_ids}")
print(f"convert_ids_to_tokens    --> {tokenizer.convert_ids_to_tokens(example_ids)}")
print(f"convert_tokens_to_ids    --> {tokenizer.convert_tokens_to_ids(wrapped_tokens)}")
print(f"convert_tokens_to_string --> {tokenizer.convert_tokens_to_string(example_tokens)}")

# Special tokens and their vocabulary ids.
print()
print(f"cls_token : {tokenizer.cls_token}  - cls_token_id: {tokenizer.cls_token_id}")
print(f"mask_token: {tokenizer.mask_token} - mask_token_id: {tokenizer.mask_token_id}")
print(f"pad_token : {tokenizer.pad_token}  - pad_token_id: {tokenizer.pad_token_id} - pad_token_type_id: {tokenizer.pad_token_type_id}")
print(f"unk_token : {tokenizer.unk_token}  - unk_token_id: {tokenizer.unk_token_id}")
print(f"sep_token : {tokenizer.sep_token}  - sep_token_id: {tokenizer.sep_token_id}")
print()
print(f"all_special_ids             --> {tokenizer.all_special_ids}")
print(f"all_special_tokens          --> {tokenizer.all_special_tokens}")
print(f"all_special_tokens_extended --> {tokenizer.all_special_tokens_extended}")

# General tokenizer metadata.
print()
print(f"name_or_path              --> {tokenizer.name_or_path}")
print(f"vocab_size                --> {tokenizer.vocab_size}")
print(f"model_max_length          --> {tokenizer.model_max_length}")
print(f"model_input_names         --> {tokenizer.model_input_names}")
# The first four ids of the encoded example (previously a hard-coded list).
print(f"prepare_for_model         --> {tokenizer.prepare_for_model(example_ids[:4])}")

print(f"SPECIAL_TOKENS_ATTRIBUTES --> {tokenizer.SPECIAL_TOKENS_ATTRIBUTES}")


INPUT TEXT               --> 'hElLO an bisa the aku'
tokenize                 --> ['hello', 'an', 'bis', '##a', 'the', 'ak', '##u']
decode                   --> [CLS] hello an bisa the aku [SEP] [PAD] [PAD] [PAD]
encode                   --> [101, 7592, 2019, 20377, 2050, 1996, 17712, 2226, 102]
convert_ids_to_tokens    --> ['[CLS]', 'hello', 'an', 'bis', '##a', 'the', 'ak', '##u', '[SEP]']
convert_tokens_to_ids    --> [101, 7592, 2019, 20377, 2050, 1996, 17712, 2226, 102]
convert_tokens_to_string --> hello an bisa the aku

cls_token : [CLS]  - cls_token_id: 101
mask_token: [MASK] - mask_token_id: 103
pad_token : [PAD]  - pad_token_id: 0 - pad_token_type_id: 0
unk_token : [UNK]  - unk_token_id: 100
sep_token : [SEP]  - sep_token_id: 102

all_special_ids             --> [100, 102, 0, 101, 103]
all_special_tokens          --> ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
all_special_tokens_extended --> ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

name_or_path              --> tbs17/M

## Prepare & Load Dataset

In [2]:
class DataMaster:
    """Read the train/val/test CSV splits and expose them as tokenized ``tf.data`` pipelines.

    Each split is expected at ``<path>/<split>.csv`` and must provide a
    ``post_canonical`` text column and a ``score`` target column.
    """

    def __init__(self, path: str, max_length=512):
        """Remember the data directory and load the MathBERT tokenizer.

        Args:
            path: Directory holding ``train.csv``, ``val.csv`` and ``test.csv``
                (with or without a trailing path separator).
            max_length: Number of tokens every text is padded/truncated to.
        """
        self.path = path
        # NOTE(review): `output_hidden_states` looks like a model option rather
        # than a tokenizer one; kept as-is, confirm before removing.
        self.tokenizer = BertTokenizer.from_pretrained('tbs17/MathBERT', output_hidden_states=True)
        self.max_length = max_length

    def tokenize_text(self, kalimat: "str | list[str]"):
        """Tokenize one text or a list of texts into fixed-length TensorFlow tensors.

        Args:
            kalimat: A single string, or a list of strings to encode as a batch.

        Returns:
            The tokenizer's encoding, padded/truncated to ``self.max_length``.
        """
        return self.tokenizer(
            kalimat,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors='tf',
        )

    def _load_split(self, split_name):
        """Return ``(dataset, n_rows)`` for the CSV named ``<split_name>.csv``."""
        # os.path.join tolerates `path` with or without a trailing separator.
        frame = pd.read_csv(os.path.join(self.path, f"{split_name}.csv"))

        texts = list(frame['post_canonical'].values)
        scores = frame['score'].values

        # Hand tf.data a plain dict so each element is a
        # {'input_ids': ..., 'token_type_ids': ..., 'attention_mask': ...} mapping.
        features = dict(self.tokenize_text(texts))
        dataset = tf.data.Dataset.from_tensor_slices((features, scores))
        return dataset, len(frame)

    def load_tokenized_train_val_test_ds(self, batch_size, seed, AUTOTUNE=tf.data.AUTOTUNE):
        """Build batched, prefetched datasets for the three splits.

        Args:
            batch_size: Number of examples per batch for every split.
            seed: Seed for shuffling the training split.
            AUTOTUNE: Prefetch buffer size passed to ``Dataset.prefetch``.

        Returns:
            ``(train_ds, val_ds, test_ds)``; only the training split is
            shuffled (over all of its rows).
        """
        train_ds, n_train = self._load_split("train")
        val_ds, _ = self._load_split("val")
        test_ds, _ = self._load_split("test")

        # The shuffle buffer spans the whole split; max() keeps the buffer
        # size valid even for an empty CSV.
        train_ds = (
            train_ds
            .shuffle(buffer_size=max(n_train, 1), seed=seed)
            .batch(batch_size)
            .prefetch(buffer_size=AUTOTUNE)
        )
        val_ds = val_ds.batch(batch_size).prefetch(buffer_size=AUTOTUNE)
        test_ds = test_ds.batch(batch_size).prefetch(buffer_size=AUTOTUNE)

        return train_ds, val_ds, test_ds

# Input-pipeline configuration.
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 16
seed = 42

# Seed NumPy and TensorFlow globally as well: `seed` would otherwise only
# control the shuffle order, while TensorFlow ops that draw random numbers
# (e.g. layer initialisers, dropout) would start from an arbitrary state.
np.random.seed(seed)
tf.random.set_seed(seed)

# Tokenize the three CSV splits and wrap them in batched tf.data pipelines.
datamaster = DataMaster("../data/regression/imo/")
train_ds, val_ds, test_ds = datamaster.load_tokenized_train_val_test_ds(
    batch_size=batch_size,
    seed=seed,
    AUTOTUNE=AUTOTUNE,
)


## Modeling

In [3]:
class MathBertRegressorModel:
    """Frozen MathBERT encoder followed by a small dense regression head.

    Typical use: construct, call :meth:`compile_model` with the datasets and
    schedule settings, then :meth:`train`. :meth:`evaluate_test` and
    :meth:`plot_training_history_over_time` report the results.
    """

    def __init__(self, max_length=512) -> None:
        """Load the pretrained encoder and build the (uncompiled) Keras model.

        Args:
            max_length: Fixed token length of every input sequence; the model's
                input layers are declared with this length.
        """
        self.max_length = max_length
        self.encoder = TFBertModel.from_pretrained("tbs17/MathBERT", from_pt=True)

        self.model = self.build_model()

        # Filled in by compile_model().
        self.train_ds = None
        self.validation_data = None
        self.epochs = None

        # Filled in by train(): the `history` dict of the Keras History object.
        self.history = None

    def compile_model(self, train_ds, validation_data, epochs, learning_rate, optimizer_type='adamw', loss=tf.keras.losses.mean_squared_error, metrics=['mae']):
        """Register the datasets and compile the model with a warm-up schedule.

        Args:
            train_ds: Batched training dataset with a known, finite cardinality.
            validation_data: Dataset evaluated after every epoch.
            epochs: Number of training epochs.
            learning_rate: Initial learning rate given to the optimizer.
            optimizer_type: Optimizer name understood by
                ``optimization.create_optimizer``.
            loss: Loss passed to ``Model.compile``.
            metrics: Metrics passed to ``Model.compile``.

        Raises:
            ValueError: If the number of batches in ``train_ds`` is unknown or
                infinite, since the schedule length cannot be computed then.
        """
        # cardinality() reports negative sentinels for unknown/infinite datasets.
        steps_per_epoch = int(tf.data.experimental.cardinality(train_ds).numpy())
        if steps_per_epoch < 0:
            raise ValueError(
                "train_ds must have a known, finite number of batches so the "
                "learning-rate schedule can be sized."
            )

        self.train_ds = train_ds
        self.validation_data = validation_data
        self.epochs = epochs

        num_train_steps = steps_per_epoch * self.epochs
        optimizer = optimization.create_optimizer(
            init_lr=learning_rate,
            num_train_steps=num_train_steps,
            # The first 10% of all steps are used as warm-up.
            num_warmup_steps=int(0.1 * num_train_steps),
            optimizer_type=optimizer_type,
        )
        self.model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    def build_model(self):
        """Assemble the Keras model: encoder -> first-token vector -> dense head.

        Returns:
            An uncompiled ``tf.keras.Model`` taking ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` (each of length
            ``self.max_length``) and producing one linear output per example.
        """
        # The Input names equal the tokenizer's output keys. Keras matches
        # dict-valued inputs to Input layers by name; with non-matching names
        # the tensors are not guaranteed to reach the intended encoder argument.
        input_ids_layer = tf.keras.Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask_layer = tf.keras.Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')
        token_type_ids_layer = tf.keras.Input(shape=(self.max_length,), dtype=tf.int32, name='token_type_ids')
        inputs = [input_ids_layer, attention_mask_layer, token_type_ids_layer]

        # Only the regression head is trained; the encoder weights stay fixed.
        self.encoder.trainable = False
        encoded_text = self.encoder(
            input_ids=input_ids_layer,
            attention_mask=attention_mask_layer,
            token_type_ids=token_type_ids_layer)
        # Keep the hidden state at sequence position 0 as the text representation.
        first_token_state = encoded_text['last_hidden_state'][:, 0, :]

        net = first_token_state
        for units in (512, 256, 128):
            net = tf.keras.layers.Dropout(0.1)(net)
            net = tf.keras.layers.Dense(units, activation='relu')(net)
        net = tf.keras.layers.Dropout(0.1)(net)
        net = tf.keras.layers.Dense(1, activation='linear')(net)

        return tf.keras.Model(inputs=inputs, outputs=net)

    def train(self):
        """Fit the model on the datasets registered by :meth:`compile_model`.

        Returns:
            The Keras ``History`` object; its ``history`` dict is also stored
            on ``self.history``.

        Raises:
            RuntimeError: If :meth:`compile_model` has not been called yet.
        """
        if self.train_ds is None or self.epochs is None:
            raise RuntimeError("compile_model() must be called before train().")

        print("Start training..")
        start = perf_counter()
        history = self.model.fit(
            self.train_ds,
            validation_data=self.validation_data,
            epochs=self.epochs
        )
        end = perf_counter()
        print(f"\nTotal training time: {end-start:.2f}s")

        self.history = history.history
        return history

    def evaluate_test(self, test_ds):
        """Evaluate the model on ``test_ds``.

        Returns:
            A dict with ``'loss'`` plus one entry per compiled metric
            (``'mae'`` with the default metrics).
        """
        # return_dict avoids assuming exactly one metric was compiled.
        return dict(self.model.evaluate(test_ds, return_dict=True))

    def plot_training_history_over_time(self, figsize=(14, 12)):
        """Plot training and validation loss and MAE per epoch in two stacked panels.

        Args:
            figsize: Size of the whole figure, forwarded to ``plt.subplots``.

        Raises:
            RuntimeError: If there is no recorded history (``train`` not run).
        """
        if not self.history:
            raise RuntimeError("No training history to plot; call train() first.")

        print(self.history.keys())
        print("Training history over time")

        epochs = range(1, len(self.history['loss']) + 1)
        # (history key, y-axis label) for each panel, top to bottom.
        panels = (('loss', 'Loss'), ('mae', 'MAE'))

        figure, axes = plt.subplots(len(panels), 1, figsize=figsize, layout="constrained")
        for ax, (key, ylabel) in zip(axes, panels):
            ax.plot(epochs, self.history[key], 'r', label=f'Training {key}')
            # Each validation series lives under its own `val_<key>` entry;
            # it is skipped when no validation data was recorded.
            val_key = f'val_{key}'
            if val_key in self.history:
                ax.plot(epochs, self.history[val_key], 'b', label=f'Validation {key}')
            ax.set_title(f'Training and validation {key}')
            ax.set_xlabel('Epochs')
            ax.set_ylabel(ylabel)
            ax.legend()

# Training hyper-parameters.

# Objective and reported metric.
loss = tf.keras.losses.mean_squared_error
metrics = ['mae']

# Optimizer and schedule settings.
optimizer_type = 'adamw'
learning_rate = 1e-6
epochs = 40


## Train

In [None]:
# Build the model and compile it with the configured hyper-parameters.
mathbert_regressor_model = MathBertRegressorModel()
mathbert_regressor_model.compile_model(
    train_ds,
    val_ds,
    # NOTE(review): this overrides the configured `epochs` with a 2-epoch run,
    # presumably a quick smoke test; switch to `epochs=epochs` for a full run.
    epochs=2,
    # `learning_rate` was previously passed twice (a literal 1e-4 and the
    # configured value), which is a SyntaxError; the configured value is kept
    # to match the other arguments.
    learning_rate=learning_rate,
    optimizer_type=optimizer_type,
    loss=loss,
    metrics=metrics,
)


In [None]:
# Fit the compiled model on the datasets registered by compile_model(); the
# returned Keras History object is this cell's output.
mathbert_regressor_model.train()


In [None]:
# Draw the per-epoch curves from the history stored by train().
mathbert_regressor_model.plot_training_history_over_time()
