In [1]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import losses
import tensorflow as tf

from modeling.preprocess.get_training_data import compile_training_data
from modeling.models.custom_models import MyCustomModel, MyCustomModel1

In [2]:
# Hyperparameters shared by the input pipeline and the models - User customizable
label_seq_length = 20
n_vocab = 100_000

# Keyword arguments forwarded to compile_training_data - User customizable
pipeline_hparams = dict(
    training_data_folder="datasets/test_training data",
    fred_series_id="GDPC1",
    series_seq_length=40,
    label_seq_length=label_seq_length,
    n_vocab=n_vocab,
    num_threads=8,
    batch_size=1,
)

In [3]:
# Build the input pipelines from pipeline_hparams. The call returns three values:
# the training data, the test data, and a third value bound to n_examples
# (presumably the number of examples - confirm against compile_training_data).
(train_data, test_data, n_examples) = compile_training_data(**pipeline_hparams)

In [4]:
# Display the structure (shapes and dtypes) of one training element as the cell output.
train_data.element_spec

((TensorSpec(shape=(None,), dtype=tf.string, name=None),
  TensorSpec(shape=(None, 40), dtype=tf.int64, name=None)),
 TensorSpec(shape=(None, 20), dtype=tf.int64, name=None))

## Try one model

In [5]:
from tensorflow.keras.metrics import MeanSquaredError

In [8]:
# Sanity-check the Keras MeanSquaredError metric on a hand-computable example:
# every prediction is off by 0.1, so the expected result is ~0.01.
y_true = tf.constant([0.0, 0.5, 1.0, 1.5])
y_pred = tf.constant([0.1, 0.4, 1.1, 1.6])

mse = MeanSquaredError()
mse.update_state(y_true, y_pred)
mse.result()

<tf.Tensor: shape=(), dtype=float32, numpy=0.010000002>

In [None]:
# Set model hyperparameters - User customizable
hparams = {
    "decoder_stack_height": 1,
    "d_model": 12, # must be divisible by h_model
    "h_model": 2,
    "decoder_dropout_rate": 0.1,
    "n_decoder_vocab": n_vocab,
    "label_seq_length": label_seq_length,
    "encoder_max_seq_len": 512
}

# Enforce the divisibility constraint noted on d_model up front, so a bad
# configuration fails with a clear message before the model is constructed.
assert hparams["d_model"] % hparams["h_model"] == 0, (
    f"d_model ({hparams['d_model']}) must be divisible by "
    f"h_model ({hparams['h_model']})"
)

# Build the model, train it for a single epoch, then evaluate on test_data.
my_model = MyCustomModel(**hparams)
my_model.compile(
    optimizer=Adam(),
    # from_logits=True makes the loss treat the model outputs as unnormalized
    # scores - assumes MyCustomModel applies no final softmax; TODO confirm.
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)
my_model.fit(train_data, epochs=1, verbose=2)

# Print the metric names first so the values printed after them can be read in order.
output = my_model.evaluate(test_data)
print(my_model.metrics_names)
print(output)

## Try another model

In [4]:
# Set model hyperparameters - User customizable
hparams1 = {
    "decoder_stack_height": 1,
    "d_values": 12, # must be divisible by h_model
    "d_keys": 12,
    "h_model": 2,
    "decoder_dropout_rate": 0.1,
    "n_decoder_vocab": n_vocab,
    "label_seq_length": label_seq_length,
    "encoder_max_seq_len": 512
}

# Enforce the divisibility constraint noted on d_values up front, so a bad
# configuration fails with a clear message before the model is constructed.
# NOTE(review): d_keys presumably has the same constraint - confirm against
# MyCustomModel1 before adding a check for it.
assert hparams1["d_values"] % hparams1["h_model"] == 0, (
    f"d_values ({hparams1['d_values']}) must be divisible by "
    f"h_model ({hparams1['h_model']})"
)

# Build the model, train it for a single epoch, then evaluate on test_data.
# This variant is trained with a mean-squared-error loss and reports MSE as its metric.
my_model1 = MyCustomModel1(**hparams1)
my_model1.compile(
    optimizer=Adam(),
    loss=losses.MeanSquaredError(),
    metrics=['mse'],
)
my_model1.fit(train_data, epochs=1, verbose=2)

# Print the metric names first so the values printed after them can be read in order.
output = my_model1.evaluate(test_data)
print(my_model1.metrics_names)
print(output)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|███████████████████████████████████████████| 20/20 [00:01<00:00, 15.80it/s]

7/7 - 53s - loss: 159524976.0000 - mse: 159524976.0000 - 53s/epoch - 8s/step


100%|███████████████████████████████████████████| 20/20 [00:00<00:00, 21.11it/s]


['loss', 'mse']
[159741856.0, 159741856.0]
