In [1]:
import numpy as np
import pandas as pd
from time import perf_counter

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import matplotlib.pyplot as plt
from official.nlp import optimization  # to create AdamW optimizer

from transformers import BertTokenizer, TFBertModel


kalimat = "An n times n matrix whose entries come from the set S = {1, 2, ldots , 2n - 1 } is called a silver matrix if, for each i = 1, 2, ldots , n, the i-th row and the i-th column together contain all elements of S. Show that: (a) there is no silver matrix for n = 1997; (b) silver matrices exist for infinitely many values of n."
print(len(kalimat))
print(len(kalimat.split(" ")))

322
71


In [7]:
tokenizer = BertTokenizer.from_pretrained('tbs17/MathBERT', output_hidden_states=True)
model = TFBertModel.from_pretrained("tbs17/MathBERT", from_pt=True)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'bert.embeddings.position_ids', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [3]:
def tokenize_text(text):
    return tokenizer.encode(text, padding='max_length', max_length=512, truncation=True, return_tensors='tf')

res = tokenize_text(kalimat)
# res

In [None]:
# model(res['input_ids'], attention_mask=res['attention_mask'])
model(
    input_ids=tf.keras.Input(shape=(512,), dtype=tf.int32, name=''), 
    attention_mask=tf.keras.Input(shape=(512,), dtype=tf.int32, name=''),
    token_type_ids=tf.keras.Input(shape=(512,), dtype=tf.int32, name='')
)
# model_ = hub.KerasLayer(model)
model(res)


In [3]:
class MathBertRegressorModel:
    def __init__(self) -> None:
        self.tokenizer = BertTokenizer.from_pretrained('tbs17/MathBERT', output_hidden_states=True)
        self.encoder = TFBertModel.from_pretrained("tbs17/MathBERT", from_pt=True)
        
        self.model = self.build_model()
        
        self.train_ds = None
        self.validation_data = None
        self.epochs = None
        
        self.history = None

    def tokenize_text(self):
        """
        TODO: 
        Buat func untuk single text tokenize
        """
        ...
        
    def tokenize_train_ds(self):
        ...

    def compile_model(self, train_ds, val_ds, epochs, learning_rate, optimizer_type='adamw', loss=tf.keras.losses.mean_squared_error, metrics=['mae']):
        self.train_ds = train_ds
        self.validation_data = val_ds
        self.epochs = epochs
        
        steps_per_epoch = tf.data.experimental.cardinality(self.train_ds).numpy()
        num_train_steps = steps_per_epoch * self.epochs
        optimizer = optimization.create_optimizer(
            init_lr=learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=int(0.1*num_train_steps),
            optimizer_type=optimizer_type
        )
        self.model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    def build_model(self):
        input_ids_layer = tf.keras.Input(shape=(512,), dtype=tf.int32, name='input_ids_layer')
        attention_mask_layer = tf.keras.Input(shape=(512,), dtype=tf.int32, name='attention_mask_layer')
        token_type_ids_layer = tf.keras.Input(shape=(512,), dtype=tf.int32, name='token_type_ids_layer')
        inputs = [input_ids_layer, attention_mask_layer, token_type_ids_layer]
        
        self.encoder.trainable = False
        encoded_text = self.encoder(
            input_ids=input_ids_layer, 
            attention_mask=attention_mask_layer,
            token_type_ids=token_type_ids_layer)
        last_hidden_state = encoded_text['last_hidden_state'][:, 0, :]
        
        net = tf.keras.layers.Dropout(0.1)(last_hidden_state)
        net = tf.keras.layers.Dense(512, activation='relu')(net)
        net = tf.keras.layers.Dropout(0.1)(net)
        net = tf.keras.layers.Dense(256, activation='relu')(net)
        net = tf.keras.layers.Dropout(0.1)(net)
        net = tf.keras.layers.Dense(128, activation='relu')(net)
        net = tf.keras.layers.Dropout(0.1)(net)
        net = tf.keras.layers.Dense(1, activation='linear')(net)
    
        return tf.keras.Model(inputs=inputs, outputs=net)
    
    def train(self):
        print("Start training..")
        start = perf_counter()
        history = self.model.fit(
            self.train_ds, 
            validation_data=self.validation_data, 
            epochs=self.epochs
        )
        end = perf_counter()
        print(f"\nTotal training time: {end-start:.2f}s")

        self.history = history
        return history
    
    def evaluate_test(self, test_ds):
        loss, mae = self.model.evaluate(test_ds)
        return {'loss': loss, 'mae': mae}
        
    def plot_training_history_over_time(self, figsize=(14, 12)):
        print(self.history.keys())
        print("Training history over time")
        
        mae = self.history['mae']
        val_acc = self.history['val_mae']
        loss = self.history['loss']
        val_mae = self.history['val_mae']
        epochs = range(1, len(mae) + 1)
        
        figure, ax = plt.subplots(2, 1, figsize=figsize, layout="constrained")
        # loss
        plt.subplot(2, 1, 1)
        plt.plot(epochs, loss, 'r', label='Training loss')
        plt.plot(epochs, val_mae, 'b', label='Validation loss')
        plt.title('Training and validation loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()

        # mae
        plt.subplot(2, 1, 2)
        plt.plot(epochs, mae, 'r', label='Training mae')
        plt.plot(epochs, val_acc, 'b', label='Validation mae')
        plt.title('Training and validation mae')
        plt.xlabel('Epochs')
        plt.ylabel('MAE')
        plt.legend()
        
    
tes_model = MathBertRegressorModel()


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'bert.embeddings.position_ids', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [47]:
def regression_model(text):
    encoded_text = tokenizer(text, padding='max_length', max_length=512, truncation=True ,return_tensors='tf')
    print(encoded_text)
    outputs = model(encoded_text)
    print(outputs.keys())
    print(outputs['last_hidden_state'])

    last_hidden_state = outputs['last_hidden_state'][:, 0, :]  # Get CLS token representation
    net = tf.keras.layers.Dropout(0.1)(last_hidden_state)
    net = tf.keras.layers.Dense(512, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(256, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(128, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(1, activation='linear')(net)

    return net

regression_model(kalimat)


{'input_ids': <tf.Tensor: shape=(1, 512), dtype=int32, numpy=
array([[  101,  2019,  1050,  2335,  1050,  8185,  3005, 10445,  2272,
         2013,  1996,  2275,  1055,  1027,  1063,  1015,  1010,  1016,
         1010, 25510, 12868,  1010,  1016,  2078,  1011,  1015,  1065,
         2003,  2170,  1037,  3165,  8185,  2065,  1010,  2005,  2169,
         1045,  1027,  1015,  1010,  1016,  1010, 25510, 12868,  1010,
         1050,  1010,  1996,  1045,  1011, 16215,  5216,  1998,  1996,
         1045,  1011, 16215,  5930,  2362,  5383,  2035,  3787,  1997,
         1055,  1012,  2265,  2008,  1024,  1006,  1037,  1007,  2045,
         2003,  2053,  3165,  8185,  2005,  1050,  1027,  2722,  1025,
         1006,  1038,  1007,  3165, 21520,  4839,  2005, 25773,  2116,
         5300,  1997,  1050,  1012,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0, 

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.11337584]], dtype=float32)>

In [17]:
tfhub_handle_preprocess = "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-cased-preprocess/versions/3"
tfhub_handle_encoder    = "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-cased-l-12-h-768-a-12/versions/4"

tokenizer_TFHUB = hub.load(tfhub_handle_preprocess)
model_TFHUB = hub.load(tfhub_handle_encoder)


In [21]:
['sequence_output', 'encoder_outputs', 'default', 'pooled_output']

outputtt = model_TFHUB(tokenizer_TFHUB([kalimat]))
outputtt['encoder_outputs']


[<tf.Tensor: shape=(1, 128, 768), dtype=float32, numpy=
 array([[[ 0.3765292 ,  0.0570243 , -0.06860661, ...,  0.03646583,
           0.03290379, -0.15170978],
         [-0.6161929 , -0.12232574,  0.26684394, ..., -0.25346434,
           0.35446113,  0.20755616],
         [-0.8727162 ,  0.6887318 ,  0.2333233 , ...,  0.11457242,
           1.0981388 ,  0.30038944],
         ...,
         [-0.4910985 ,  0.03118425,  0.2841043 , ..., -0.13263746,
          -0.03974351,  0.1775467 ],
         [-0.6594098 , -0.17344199,  0.14063303, ..., -0.1556083 ,
           0.09107862,  0.41679722],
         [-0.45056465,  0.22565478,  0.21684526, ..., -0.46307218,
           0.12560755,  0.3367008 ]]], dtype=float32)>,
 <tf.Tensor: shape=(1, 128, 768), dtype=float32, numpy=
 array([[[ 6.0117072e-01,  4.3541983e-02, -7.3847249e-02, ...,
          -1.9527021e-01, -1.3143281e-03, -4.4605013e-02],
         [ 1.4845307e-01, -2.0472026e-01,  2.7761897e-01, ...,
          -6.4247358e-01,  6.2928402e-01,  2.0

# ----

In [2]:
def build_model():
    # Step 1: Define input layer
    input_ids_KERAS_INPUT = tf.keras.Input(shape=(), dtype=tf.int32, name='input_ids')
    attention_mask_KERAS_INPUT = tf.keras.Input(shape=(), dtype=tf.int32, name='attention_mask')

    # BERT encoding
    outputs = model(input_ids_KERAS_INPUT, attention_mask=attention_mask_KERAS_INPUT)
    pooled_output = outputs.last_hidden_state[:, 0, :]  # Use CLS token
    # # Step 2: Tokenize batches of text inputs
    # bert_preprocess = hub.load("https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-cased-preprocess/versions/3")
    # tokenize = hub.KerasLayer(bert_preprocess.tokenize)
    # tokenized_input = tokenize(text_input)
    
    # # Step 3: Pack input sequences for the Transformer encoder
    # bert_pack_inputs = hub.KerasLayer(
    #     bert_preprocess.bert_pack_inputs,
    #     arguments=dict(seq_length=512))
    # encoder_inputs = bert_pack_inputs([tokenized_input])
    
    # Load BERT encoder
    encoder = hub.KerasLayer("https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-cased-l-12-h-768-a-12/versions/4", trainable=True, name='BERT_encoder')
    # Pass encoder inputs through BERT encoder
    outputs = encoder(encoder_inputs)

    # Define classifier layers
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(512, activation='relu')(net)  # Additional dense layer
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(256, activation='relu')(net)  # Additional dense layer
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(128, activation='relu')(net)  # Additional dense layer
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(1, activation='linear', name='regressor')(net)
    
    return tf.keras.Model(text_input, net)


build_model()

<keras.engine.functional.Functional at 0x2180a8203d0>

## Example

In [3]:
example_text = "Determine all functions f : mathbb R to mathbb R satisfying the following two conditions: (a) f(x + y) + f(x - y) = 2f(x)f(y) for all x, y in mathbb R, and (b) lim_{x to infty} f(x) = 0."
encoded_inputs = tokenizer(example_text, return_tensors="tf", padding="max_length", max_length=512, truncation=True)

print(f"output keys                     -> {list(encoded_inputs.keys())}")
print(f"input_word_ids _ input_ids      -> {encoded_inputs['input_ids'][0, :12]}")
print(f"input_mask _ attention_mask     -> {encoded_inputs['attention_mask'][0, :12]}")
print(f"input_type_ids _ token_type_ids -> {encoded_inputs['token_type_ids'][0, :12]}")
print(f"shape: {encoded_inputs['attention_mask'].shape}\n")

# output = model(encoded_inputs)
# output

output keys                     -> ['input_ids', 'token_type_ids', 'attention_mask']
input_word_ids _ input_ids      -> [  101  5646  2035  4972  1042  1024  8785 10322  1054  2000  8785 10322]
input_mask _ attention_mask     -> [1 1 1 1 1 1 1 1 1 1 1 1]
input_type_ids _ token_type_ids -> [0 0 0 0 0 0 0 0 0 0 0 0]
shape: (1, 512)



In [4]:
example_text = 'hElLO an bisa the aku'
encoded_inputs = tokenizer(example_text, return_tensors="tf", padding="max_length", max_length=12, truncation=True)

print(f"INPUT TEXT               --> '{example_text}'")
print(f"tokenize                 --> {tokenizer.tokenize(example_text)}")
print(f"decode                   --> {tokenizer.decode(encoded_inputs['input_ids'][0])}")
print(f"encode                   --> {tokenizer.encode(example_text)}")
print(f"convert_ids_to_tokens    --> {tokenizer.convert_ids_to_tokens([101, 7592, 2019, 20377, 2050, 1996, 17712, 2226, 102])}")
print(f"convert_tokens_to_ids    --> {tokenizer.convert_tokens_to_ids(['[CLS]', 'hello', 'an', 'bis', '##a', 'the', 'ak', '##u', '[SEP]'])}")
print(f"convert_tokens_to_string --> {tokenizer.convert_tokens_to_string(['hello', 'an', 'bis', '##a', 'the', 'ak', '##u'])}")

print()
print(f"cls_token : {tokenizer.cls_token}  - cls_token_id: {tokenizer.cls_token_id}")
print(f"mask_token: {tokenizer.mask_token} - mask_token_id: {tokenizer.mask_token_id}")
print(f"pad_token : {tokenizer.pad_token}  - pad_token_id: {tokenizer.pad_token_id} - pad_token_type_id: {tokenizer.pad_token_type_id}")
print(f"unk_token : {tokenizer.unk_token}  - unk_token_id: {tokenizer.unk_token_id}")
print(f"sep_token : {tokenizer.sep_token}  - sep_token_id: {tokenizer.sep_token_id}")
print()
print(f"all_special_ids             --> {tokenizer.all_special_ids}")
print(f"all_special_tokens          --> {tokenizer.all_special_tokens}")
print(f"all_special_tokens_extended --> {tokenizer.all_special_tokens_extended}")
print()
print(f"name_or_path              --> {tokenizer.name_or_path}")
print(f"vocab_size                --> {tokenizer.vocab_size}")
print(f"model_max_length          --> {tokenizer.model_max_length}")
print(f"model_input_names         --> {tokenizer.model_input_names}")
print(f"prepare_for_model         --> {tokenizer.prepare_for_model([101, 7592, 2019, 20377])}")

print(f"SPECIAL_TOKENS_ATTRIBUTES --> {tokenizer.SPECIAL_TOKENS_ATTRIBUTES}")


INPUT TEXT               --> 'hElLO an bisa the aku'
tokenize                 --> ['hello', 'an', 'bis', '##a', 'the', 'ak', '##u']
decode                   --> [CLS] hello an bisa the aku [SEP] [PAD] [PAD] [PAD]
encode                   --> [101, 7592, 2019, 20377, 2050, 1996, 17712, 2226, 102]
convert_ids_to_tokens    --> ['[CLS]', 'hello', 'an', 'bis', '##a', 'the', 'ak', '##u', '[SEP]']
convert_tokens_to_ids    --> [101, 7592, 2019, 20377, 2050, 1996, 17712, 2226, 102]
convert_tokens_to_string --> hello an bisa the aku

cls_token : [CLS]  - cls_token_id: 101
mask_token: [MASK] - mask_token_id: 103
pad_token : [PAD]  - pad_token_id: 0 - pad_token_type_id: 0
unk_token : [UNK]  - unk_token_id: 100
sep_token : [SEP]  - sep_token_id: 102

all_special_ids             --> [100, 102, 0, 101, 103]
all_special_tokens          --> ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
all_special_tokens_extended --> ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

name_or_path              --> tbs17/M

## Prepare & Load Dataset

In [2]:
AUTOTUNE = tf.data.AUTOTUNE  # Output: -1 --> <class 'int'>
batch_size = 16
seed = 42


from utils.load_dataset_for_regression_model import load_and_prepare_dataset

train_ds, val_ds, test_ds = load_and_prepare_dataset("../data/regression/imo/", seed=seed, batch_size=batch_size, AUTOTUNE=AUTOTUNE)


## Modeling

In [3]:
class MathBertRegressorModel:
    def __init__(self, seq_length=512):
        self.tokenizer_MathBERT = BertTokenizer.from_pretrained('tbs17/MathBERT', output_hidden_states=True)
        # self.model_MathBERT = TFBertModel.from_pretrained("tbs17/MathBERT", from_pt=True)

        self.seq_length = seq_length
        self.model = self.__build_model()
        self.history = None
        
        self.train_ds = None
        self.validation_data = None
        self.epochs = None
        self.learning_rate = None
        self.optimizer_type = None
        self.loss = None
        self.metrics = None
        self.optimizer = None
        self.__is_compiled = False
        
        self.__is_trained = False

    def __build_model(self):
        # Step 1: Define text input layer
        text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
        
        # Step 2: Tokenize and prepare input sequences
        def tokenize_text(text: str):
            inputs = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.seq_length, return_tensors="tf")
            return {
                'input_ids': inputs['input_ids'],
                'attention_mask': inputs['attention_mask']
            }
            
        tokenized_input = tf.keras.layers.Lambda(tokenize_text, output_shape=(self.seq_length,), dtype='int32')(text_input)

        # Step 3: Load BERT model
        bert_model = TFBertModel.from_pretrained('tbs17/MathBERT', from_pt=True)

        # Step 4: Pass inputs through BERT model
        bert_output = bert_model(tokenized_input)

        # Additional dense layers for regression
        pooled_output = bert_output.pooler_output
        net = tf.keras.layers.Dense(512, activation='relu')(pooled_output)  # Additional dense layer
        net = tf.keras.layers.Dropout(0.1)(net)
        net = tf.keras.layers.Dense(256, activation='relu')(net)  # Additional dense layer
        net = tf.keras.layers.Dropout(0.1)(net)
        net = tf.keras.layers.Dense(128, activation='relu')(net)  # Additional dense layer
        net = tf.keras.layers.Dropout(0.1)(net)
        net = tf.keras.layers.Dense(1, activation='linear', name='regression_output')(net)

        return tf.keras.Model(text_input, net)
    
    def compile_model(
        self, 
        train_ds, 
        validation_data,
        epochs, 
        learning_rate, 
        optimizer_type='adamw', 
        loss=tf.keras.losses.mean_squared_error,
        metrics=['mae']):
        
        self.train_ds = train_ds
        self.validation_data = validation_data
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.optimizer_type = optimizer_type
        self.loss = loss
        self.metrics = metrics
        
        steps_per_epoch = tf.data.experimental.cardinality(self.train_ds).numpy()
        num_train_steps = steps_per_epoch * self.epochs
        optimizer = optimization.create_optimizer(
            init_lr=self.learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=int(0.1*num_train_steps),
            optimizer_type=self.optimizer_type
        )
        self.optimizer= optimizer
        
        # Comple model
        self.model.compile(optimizer=self.optimizer, loss=self.loss, metrics=self.metrics)
        self.__is_compiled = True
    
    def train(self):
        if self.__is_compiled is False: raise Warning("Model is not compiled yet")
        
        start = perf_counter()
        history = self.model.fit(
            self.train_ds, 
            validation_data=self.validation_data, 
            epochs=self.epochs
            )
        end = perf_counter()
        print(f"\nTotal training time: {end-start:.2f}s")
        
        self.history = history.history
        self.__is_trained = True
        return history
    
    def evaluate_test(self, test_ds):
        if self.__is_trained is False: raise Warning("Model is not trained yet")

        loss, mae = self.model.evaluate(test_ds)

        return {'loss': loss, 'mae': mae}
    
    def predict(self, texts):
        # Tokenize input texts
        input_ids = []
        attention_masks = []
        
        for text in texts:
            encoded_dict = self.BERTtokenizer.encode_plus(
                text, 
                add_special_tokens=True, 
                max_length=self.seq_length, 
                padding='max_length', 
                return_attention_mask=True, 
                return_tensors='tf', 
                truncation=True
            )
            input_ids.append(encoded_dict['input_ids'])
            attention_masks.append(encoded_dict['attention_mask'])
        
        input_ids = tf.concat(input_ids, axis=0)
        attention_masks = tf.concat(attention_masks, axis=0)
        
        # Make predictions
        predictions = self.model.predict([input_ids, attention_masks])
        return predictions
    

## Train

In [4]:
epochs = 40
learning_rate = 1e-6
optimizer_type = 'adamw'
loss = tf.keras.losses.mean_squared_error
metrics = ['mae']


In [5]:
math_bert = MathBertRegressorModel()
math_bert.compile_model(
    train_ds=train_ds,
    validation_data=val_ds,
    epochs=epochs, 
    learning_rate=learning_rate, 
    optimizer_type=optimizer_type, 
    loss=loss, 
    metrics=metrics
)

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/441M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'bert.embeddings.position_ids', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [6]:
math_bert.train()


Epoch 1/40


ValueError: in user code:

    File "c:\Users\IBDA\.conda\envs\victor_aops_mathbert\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\IBDA\.conda\envs\victor_aops_mathbert\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\IBDA\.conda\envs\victor_aops_mathbert\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\IBDA\.conda\envs\victor_aops_mathbert\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\IBDA\.conda\envs\victor_aops_mathbert\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\IBDA\.conda\envs\victor_aops_mathbert\lib\site-packages\keras\engine\input_spec.py", line 216, in assert_input_compatibility
        raise ValueError(

    ValueError: Layer "model" expects 2 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>]


In [None]:
math_bert.plot_training_history_over_time()


    def plot_training_history_over_time(self, figsize=(14, 12)):
        if self.history is None: raise Warning("Nothing to plot because model is not trained yet")
        
        print(self.history.keys())
        print("Training history over time")
        
        mae = self.history['mae']
        val_acc = self.history['val_mae']
        loss = self.history['loss']
        val_mae = self.history['val_mae']
        epochs = range(1, len(mae) + 1)
        
        figure, ax = plt.subplots(2, 1, figsize=figsize, layout="constrained")

        plt.subplot(2, 1, 1)
        plt.plot(epochs, loss, 'r', label='Training loss')  # r is for "solid red line"
        plt.plot(epochs, val_mae, 'b', label='Validation loss')  # b is for "solid blue line"
        plt.title('Training and validation loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()

        plt.subplot(2, 1, 2)
        plt.plot(epochs, mae, 'r', label='Training mae')
        plt.plot(epochs, val_acc, 'b', label='Validation mae')
        plt.title('Training and validation mae')
        plt.xlabel('Epochs')
        plt.ylabel('MAE')
        plt.legend()