# Setup

## Imports

In [1]:
import os
import re

# TF_CPP_MIN_LOG_LEVEL only silences TensorFlow's native (C++) log output if it
# is already set when TensorFlow is first imported, so set it before the import.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import numpy as np
import tensorflow as tf
import tensorflow_text as tfText
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm
from transformers import TFBertModel, BertConfig


## Initialize notebook parameters

In [2]:
# Input pipeline parameters:
training_data_folder = "datasets/test_training data"  # every sub-directory of this folder is loaded as one training example
fred_series_id = "GDPC1"  # sub-folder name and file-name prefix of the series/label CSVs inside each example folder
num_threads = 8 # Parallelism for the tf.data.Dataset map and batch operations
batch_size = 2

# General parameters
n_vocab = 100000  # the IntegerLookup vocabularies are the integers 0 .. n_vocab-1
label_seq_length = 20  # static length set on every label sequence; also the number of decoding steps
series_seq_length = 40  # static length set on every (pre-bill) series sequence

# Model hyperparameters
dropout_rate = 0.1 # passed to the decoder, which uses it for cross-attention and feed-forward dropout
num_heads = 2 # for Attention blocks
decoder_stack_height = 1  # number of stacked decoder layers
dv1 = 12 # Model width: passed to the model as d_model (embedding size and attention key dimension). Must be divisible by num_heads

# Create input pipeline

## Load training data from directory

In [28]:
def get_training_data(path, fred_series_id, return_filepaths):
    """Locate (or read) the three files that make up one training example.

    The example folder must be named ``<anything>_<prefix>``; the text after
    the last underscore is the stem of the bill-text file inside the folder.

    Args:
        path: Path of the training-example folder.
        fred_series_id: Name of the sub-folder that holds the series/label
            CSVs, and the prefix of those CSV file names.
        return_filepaths: If truthy, return the file paths; otherwise return
            the file contents as read by ``tf.io.read_file``.

    Returns:
        A list ``[bill_text, series, label]`` of paths or of file contents.

    Raises:
        ValueError: If the folder name does not have the ``<anything>_<prefix>``
            form (previously this surfaced as an AttributeError on ``None``).
    """
    data_folder = os.path.basename(path)
    match = re.match(r".+_(.+)", data_folder)
    if match is None:
        raise ValueError(
            "training example folder name %r does not match '<name>_<prefix>'" % data_folder
        )
    prefix = match.group(1)

    file_paths = [
        os.path.join(path, prefix + ".txt"),
        os.path.join(path, fred_series_id, fred_series_id + "_series.csv"),
        os.path.join(path, fred_series_id, fred_series_id + "_label.csv"),
    ]

    if return_filepaths:
        return file_paths
    # A distinct loop name is used so the `path` argument is not shadowed.
    return [tf.io.read_file(file_path) for file_path in file_paths]

# Get folders that are valid training examples.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the order of the examples (and of the datasets built from them) reproducible.
training_example_folders = []
for folder in sorted(os.listdir(training_data_folder)):
    path = os.path.join(training_data_folder, folder)
    if os.path.isdir(path):
        training_example_folders.append(path)

# Get feature and label filepaths from valid training example folders.
# The three lists are index-aligned: entry i of each list belongs to example i.
data_filepaths = {"billtext": [],
                  "series": [],
                  "label": []}
for training_example in training_example_folders:
    x1, x2, y = get_training_data(training_example, fred_series_id, return_filepaths=True)
    data_filepaths["billtext"].append(x1)
    data_filepaths["series"].append(x2)
    data_filepaths["label"].append(y)


## Create Tensorflow input pipeline

### Create functions for dataset transformations

In [29]:
def get_train_test_split(test_split, dataset, **kwargs):
    """Split a dataset into batched train and test datasets.

    The dataset is shuffled once, then every ``round(100 / test_split)``-th
    element goes to the test set and all other elements go to the training
    set, so the realised test fraction is only approximately ``test_split``
    percent.

    Args:
        test_split: Approximate percentage of elements for the test set;
            must satisfy ``0 < test_split <= 100``.
        dataset: Dataset object providing ``shuffle``, ``enumerate``,
            ``filter``, ``map`` and ``batch``.
        **kwargs: Optional ``batch_size`` (default 32) and
            ``num_parallel_calls`` (default 8), both forwarded to ``batch``.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple of batched datasets.

    Raises:
        ValueError: If ``test_split`` is outside ``(0, 100]``.
    """
    if not 0 < test_split <= 100:
        raise ValueError("test_split must be a percentage in the range (0, 100]")

    # Each option falls back to its own default independently; previously
    # num_parallel_calls was ignored unless batch_size was passed as well.
    batch_size = kwargs.get('batch_size', 32)
    num_parallel_calls = kwargs.get('num_parallel_calls', 8)

    # reshuffle_each_iteration=False is intended to give both passes below the
    # same element order, so that the two subsets do not overlap.
    shuffled = dataset.shuffle(1000, reshuffle_each_iteration=False)
    divisor = round(100/test_split)

    def is_test(index, element):
        return index % divisor == 0

    def is_train(index, element):
        return index % divisor != 0

    def drop_index(index, element):
        return element

    def build_subset(predicate):
        return shuffled.enumerate() \
                       .filter(predicate) \
                       .map(drop_index) \
                       .batch(batch_size=batch_size,
                              num_parallel_calls=num_parallel_calls)

    return build_subset(is_train), build_subset(is_test)

# This is where we pull out the actual data we want
def load_data(path, is_csv=False):
    """Load one file of a training example.

    Args:
        path: File path. In the CSV branch ``path.numpy()`` is decoded, so it
            must be an eager string tensor there; in the text branch it is
            handed to ``tf.io.read_file`` unchanged.
        is_csv: If truthy, parse the file as a comma-separated table with one
            header row and return its fourth column; otherwise return the raw
            file contents.

    Returns:
        For CSV files, an int32 tensor holding the fourth column rounded to
        whole numbers; for other files, the result of ``tf.io.read_file``.
    """
    if is_csv:
        decoded_path = bytes.decode(path.numpy())
        # ndmin=2 keeps the table two-dimensional even when the file has a
        # single data row; without it the column slice below raises IndexError.
        table = np.loadtxt(decoded_path, dtype=str, delimiter=",", skiprows=1, ndmin=2)
        # reduce to int for homogeneity across fred series
        relevant_series = table[:, 3].astype(float).round().astype('int32')
        return tf.constant(relevant_series)
    return tf.io.read_file(path)

def set_shape(item, shape):
    """Assign a static shape to ``item`` in place and hand the item back.

    Args:
        item: Object exposing a ``set_shape`` method.
        shape: The shape to assign; must be a ``list``.

    Returns:
        The same ``item`` object, after ``item.set_shape(shape)`` was called.

    Raises:
        ValueError: If ``shape`` is not a list.
    """
    if isinstance(shape, list):
        item.set_shape(shape)
        return item
    raise ValueError("shape must be a List")


### Create the pipeline

In [30]:
# Hark! A pipeline!
# The path datasets go into a new dict instead of overwriting `data_filepaths`,
# so this cell can be re-run on its own without first rebuilding the path lists.
filepath_datasets = {key: tf.data.Dataset.from_tensor_slices(filepaths)
                     for key, filepaths in data_filepaths.items()}

# Do dataset mappings
my_tokenizer = layers.IntegerLookup(vocabulary=np.arange(n_vocab, dtype='int32'))
seq_length = {"series": series_seq_length, "label": label_seq_length}
data = {}
for key in filepath_datasets:
    if key == "billtext":
        # Process billtext data: read each file as one scalar string
        data[key] = filepath_datasets[key] \
        .map(lambda x: tf.py_function(load_data, [x, False], tf.string),
             num_parallel_calls=num_threads) \
        .map(lambda item: set_shape(item, []),
             num_parallel_calls=num_threads)
    else:
        # Process label and series data: load the CSV column, restore the
        # static shape that tf.py_function drops, then map values to token ids
        data[key] = filepath_datasets[key] \
        .map(lambda x: tf.py_function(load_data, [x, True], tf.int32),
             num_parallel_calls=num_threads) \
        .map(lambda item: set_shape(item, [seq_length[key]]),
             num_parallel_calls=num_threads) \
        .map(my_tokenizer.call, num_parallel_calls=num_threads)

# Zip into ((billtext, series), label) elements. The whole structure is passed
# as a single nested tuple, which Dataset.zip accepts on every TF 2.x release
# (passing the datasets as separate positional arguments needs a newer release).
training_data = tf.data.Dataset.zip(((data["billtext"], data["series"]), data["label"]))
train_data, test_data = get_train_test_split(20,
                                             training_data,
                                             batch_size=batch_size,
                                             num_parallel_calls=num_threads)

## Test input pipeline (run if needed)

### Element shapes

Let's see what our Dataset elements look like...

In [31]:
# Display the nested structure, shapes and dtypes of the training dataset's elements.
train_data.element_spec

((TensorSpec(shape=(None,), dtype=tf.string, name=None),
  TensorSpec(shape=(None, 40), dtype=tf.int64, name=None)),
 TensorSpec(shape=(None, 20), dtype=tf.int64, name=None))

In [32]:
# Display the nested structure, shapes and dtypes of the test dataset's elements.
test_data.element_spec

((TensorSpec(shape=(None,), dtype=tf.string, name=None),
  TensorSpec(shape=(None, 40), dtype=tf.int64, name=None)),
 TensorSpec(shape=(None, 20), dtype=tf.int64, name=None))

### Element values

Let's see the data in each element (warning: the two cells below have large outputs)

In [None]:
# Print the first batch of the training dataset: ((bill texts, series), labels).
for item in train_data.take(1):
    print(item)

In [None]:
# Print the first batch of the test dataset: ((bill texts, series), labels).
for item in test_data.take(1):
    print(item)

So far we've created two batched tf.data.Dataset objects, `train_data` and `test_data`, whose elements each have two components. The first component is a tuple (bill_text, pre-bill_series) containing the text of the legislative bill at index 0 and the pre-bill FRED series data at index 1. These are the features. The second component contains the label, which is the post-bill FRED series data.

# Create our model

## Create ancillary layers

In [34]:
class PositionalEncoder(layers.Layer):
    """Sinusoidal positional encoding computed with NumPy.

    For position ``pos`` and feature column ``c`` the value is
    ``sin(pos / 10000**(2i/d))`` when ``c`` is even and
    ``cos(pos / 10000**(2i/d))`` when ``c`` is odd, where ``2i`` is ``c``
    rounded down to an even number and ``d`` is ``output_dim``.
    """
    def __init__(self, output_dim):
        # output_dim: dimensionality of positional encoding vector
        super().__init__()
        self.output_dim = output_dim

    def call(self, inputs):
        """Return a NumPy array of shape ``(inputs.shape[1], output_dim)``.

        Only the static size of axis 1 of ``inputs`` is read; the values of
        ``inputs`` are not used.
        """
        seq_len = inputs.shape[1]
        dim = self.output_dim

        # Column vector of positions; broadcasting against the per-column
        # arrays below yields the full (seq_len, dim) grid.
        positions = np.expand_dims(np.arange(seq_len), -1)
        # Columns 2i and 2i+1 share the exponent 2i/dim. For odd dim the
        # repeat produces one entry too many, which the slice removes.
        paired_index = np.repeat(np.arange(0, dim, 2), 2)[:dim]
        angles = positions / np.power(10000.0, paired_index / dim)

        even_columns = np.arange(dim) % 2 == 0
        odd_columns = np.logical_not(even_columns)
        return even_columns * np.sin(angles) + odd_columns * np.cos(angles)

class BaseAttention(layers.Layer):
    """Shared parts of the attention sub-layers defined below.

    Holds a MultiHeadAttention layer together with the Add and
    LayerNormalization layers; it defines no ``call`` of its own, subclasses
    provide it.
    """
    def __init__(self, num_heads, key_dim, value_dim, dropout_rate):
        # num_heads, key_dim and value_dim are forwarded positionally to
        # layers.MultiHeadAttention; dropout_rate becomes its `dropout` argument.
        super().__init__()
        self.add = layers.Add()
        self.norm = layers.LayerNormalization()
        self.mha = layers.MultiHeadAttention(num_heads, key_dim, value_dim, dropout=dropout_rate)
    # Builds the wrapped attention layer when this layer is built.
    # NOTE(review): Keras normally hands build() an input *shape*, so `inputs`
    # is presumably a shape here rather than a tensor - confirm.
    # NOTE(review): the same value is used for query, value and key, which
    # assumes a cross-attention context has the same shape signature as the
    # query - confirm. `_build_from_signature` is a private Keras method.
    def build(self, inputs):
        # super().build()
        self.mha._build_from_signature(query=inputs, value=inputs, key=inputs)

class SimpleSelfAttention(BaseAttention):
    """Self-attention followed by a residual connection and layer normalisation."""
    def call(self, inputs):
        # Query, key and value all come from the same sequence.
        attended = self.mha(query=inputs, key=inputs, value=inputs)
        residual = self.add([attended, inputs])
        return self.norm(residual)

class SimpleCrossAttention(BaseAttention):
    """Cross-attention followed by a residual connection and layer normalisation."""
    def call(self, inputs, context):
        # Queries come from `inputs`; keys and values come from `context`.
        attended = self.mha(query=inputs, key=context, value=context)
        residual = self.add([attended, inputs])
        return self.norm(residual)

class SimpleCausalSelfAttention(BaseAttention):
    """Causally masked self-attention with residual connection and layer normalisation."""
    def call(self, inputs):
        # use_causal_mask=True asks the attention layer to apply a causal mask.
        attended = self.mha(query=inputs,
                            key=inputs,
                            value=inputs,
                            use_causal_mask=True)
        residual = self.add([attended, inputs])
        return self.norm(residual)

class FeedForwardNN(layers.Layer):
    """Feed-forward block: Dense(relu) -> Dense -> Dropout -> residual add -> LayerNorm.

    Args:
        output_dim: Number of units of both Dense layers. Because the block
            output is added to its input, the last dimension of the input must
            also be ``output_dim``.
        ff_dropout_rate: Rate of the Dropout layer applied before the
            residual connection.
    """
    def __init__(self, output_dim, ff_dropout_rate):
        super().__init__()
        self.relu = layers.Dense(units=output_dim, activation='relu')
        self.linear = layers.Dense(units=output_dim)
        self.dropout = layers.Dropout(ff_dropout_rate)
        self.add = layers.Add()
        self.norm = layers.LayerNormalization()

    def call(self, inputs):
        x = self.relu(inputs)
        x = self.linear(x)
        # The dropout layer was created in __init__ but never applied, which
        # made ff_dropout_rate a no-op.
        x = self.dropout(x)
        x = self.add([x, inputs])
        x = self.norm(x)
        return x

class MyCustomDecoderLayer(layers.Layer):
    """One decoder layer: masked self-attention, cross-attention, feed-forward.

    Args:
        output_dim: Unit count handed to the feed-forward block.
        sa_num_heads, sa_key_dim, sa_value_dim: Settings of the masked
            self-attention block (its dropout is fixed at 0.0).
        ca_num_heads, ca_key_dim, ca_value_dim, ca_dropout_rate: Settings of
            the cross-attention block.
        ff_dropout_rate: Dropout rate handed to the feed-forward block.
    """
    def __init__(self,
                 output_dim,
                 sa_num_heads,
                 ca_num_heads,
                 sa_key_dim,
                 ca_key_dim,
                 sa_value_dim,
                 ca_value_dim,
                 ca_dropout_rate,
                 ff_dropout_rate):
        super().__init__()
        # masked self attention, no dropout
        self.msa = SimpleCausalSelfAttention(num_heads=sa_num_heads,
                                             key_dim=sa_key_dim,
                                             value_dim=sa_value_dim,
                                             dropout_rate=0.0)
        # cross attention
        self.ca = SimpleCrossAttention(num_heads=ca_num_heads,
                                       key_dim=ca_key_dim,
                                       value_dim=ca_value_dim,
                                       dropout_rate=ca_dropout_rate)
        self.ffn = FeedForwardNN(output_dim=output_dim,
                                 ff_dropout_rate=ff_dropout_rate)

    def call(self, inputs, context):
        self_attended = self.msa(inputs)
        cross_attended = self.ca(self_attended, context)
        return self.ffn(cross_attended)

## Create MyCustomDecoder

In [35]:
class MyCustomDecoder(layers.Layer):
    """Stack of ``stack_height`` MyCustomDecoderLayer blocks applied in sequence.

    Args:
        stack_height: Number of decoder layers.
        d_model: Model width; used as the feed-forward output size and as the
            key dimension of both attention blocks in every layer.
        h_model: Number of attention heads; the value dimension of each
            attention block is ``d_model // h_model``.
        dropout_rate: Dropout rate for the cross-attention and feed-forward
            blocks of every layer.
    """
    def __init__(self, stack_height, d_model, h_model, dropout_rate):
        super().__init__()
        self.stack_height = stack_height
        value_dim = d_model // h_model
        # The feed-forward width is d_model. It used to be read from the
        # notebook global `dv1`, which only worked because the notebook passes
        # that same value as d_model.
        self.decoder_layers = [MyCustomDecoderLayer(d_model,
                                                    h_model,
                                                    h_model,
                                                    d_model,
                                                    d_model,
                                                    value_dim,
                                                    value_dim,
                                                    dropout_rate,
                                                    dropout_rate)
                               for _ in range(stack_height)]

    def call(self, input, context):
        x = input
        for decoder_layer in self.decoder_layers:
            x = decoder_layer(x, context)
        return x

## Create MyCustomModel

In [36]:
class MyBertTokenizer(layers.Layer):
    """Turns a batch of strings into padded BERT model inputs.

    The text is lower-cased, tokenized with the vocabulary file
    "datasets/Bert_Vocabulary.txt", wrapped in START_TOKEN / END_TOKEN and
    padded with zeros to the longest sequence in the batch. No truncation is
    applied, so long texts produce long sequences.
    """
    # Ids placed before / after each sequence - presumably the [CLS] and [SEP]
    # entries of the vocabulary file; verify against that file.
    START_TOKEN = 101
    END_TOKEN = 102
    def __init__(self):
        super().__init__()
        self.tokenizer = tfText.BertTokenizer("datasets/Bert_Vocabulary.txt")
    def call(self, inputs):
        """Return a dict with int32 'input_ids', 'token_type_ids' and 'attention_mask'."""
        # tokenize() yields [batch, words, wordpieces]; merge the last two axes
        # to get one ragged row of wordpiece ids per input string.
        tokenized = self.tokenizer.tokenize(tf.strings.lower(inputs)).merge_dims(-2, -1)
        processed_segments, segment_ids = tfText.combine_segments([tokenized],
                                            MyBertTokenizer.START_TOKEN,
                                            MyBertTokenizer.END_TOKEN)
        # 1 for real tokens, 0 for the padding that to_tensor() adds below.
        # Without this mask the model would attend to the padding positions
        # whenever the sequences of a batch differ in length.
        attention_mask = tf.sequence_mask(processed_segments.row_lengths(), dtype=tf.int32)
        processed_segments = tf.cast(processed_segments.to_tensor(), dtype=tf.int32)
        segment_ids = tf.cast(segment_ids.to_tensor(), dtype=tf.int32)
        return {'input_ids': processed_segments,
                'token_type_ids': segment_ids,
                'attention_mask': attention_mask}

class BertEncoder(layers.Layer):
    """Encodes raw bill text into one context vector per example.

    The text is tokenized, run through a pretrained BERT model, and the pooled
    output is projected to ``projection_dim`` by a Dense layer.

    Args:
        projection_dim: Size of the projected context vector.
    """
    def __init__(self, projection_dim):
        super().__init__()
        # configuration = BertConfig(max_position_embeddings=max_seq_length) # awaiting HF team response on Github issue
        self.tokenizer = MyBertTokenizer()
        # TFBertModel needs a BERT checkpoint: call() relies on token_type_ids
        # and pooler_output, which a DistilBERT checkpoint does not provide, and
        # DistilBERT weights do not map onto the BERT architecture.
        self.bert = TFBertModel.from_pretrained("bert-base-uncased")
        self.broadcaster = layers.Dense(units=projection_dim)

    def call(self, input):
        """Return the projected pooled output with a length-1 axis inserted at position 1."""
        x = self.tokenizer(input)
        x = self.bert(**x)
        # The pooled output is a single vector per example; expand_dims adds a
        # sequence axis of length 1 so it can serve as cross-attention context.
        x = tf.expand_dims(self.broadcaster(x.pooler_output), 1)
        return x

class MyCustomModel(keras.Model):
    """BERT text encoder plus a custom decoder that extends a numeric series.

    Args:
        decoder_stack_height: Number of decoder layers.
        d_model: Embedding / model width.
        h_model: Number of attention heads in the decoder.
        decoder_dropout_rate: Dropout rate handed to the decoder.
        n_decoder_vocab: Size of the integer vocabulary; the embedding table
            and the output layer have ``n_decoder_vocab + 1`` entries.
        label_seq_length: Number of steps to generate in ``call``.
    """
    def __init__(self, decoder_stack_height, d_model, h_model, decoder_dropout_rate, n_decoder_vocab, label_seq_length):
        super().__init__()
        # Kept on the instance so call() uses the constructor argument; it used
        # to read the notebook global of the same name instead.
        self.label_seq_length = label_seq_length
        self.encoder = BertEncoder(d_model)
        self.decoder = MyCustomDecoder(decoder_stack_height,
                                       d_model,
                                       h_model,
                                       decoder_dropout_rate)
        # Not used in call(): the input pipeline already maps values to ids.
        self.tokenizer = layers.IntegerLookup(vocabulary=np.arange(n_decoder_vocab, dtype='int32'))
        self.embedding = layers.Embedding(n_decoder_vocab+1, d_model)
        self.positional_encoding = PositionalEncoder(d_model)
        self.output_layer = layers.Dense(units=n_decoder_vocab+1)

    def call(self, input):
        """Return logits of shape (batch, label_seq_length, n_decoder_vocab + 1).

        Args:
            input: Tuple ``(bill_text, prebill_series)``; ``prebill_series``
                holds token ids.
        """
        # Input is a tuple
        bill_text, prebill_series = input
        context = self.encoder(bill_text)

        # Input is a list of ids (tokenized); embed each id.
        # Output shape: [batch, Ty, d_model]
        dec_inp = self.embedding(prebill_series)
        # Not efficient: the whole sequence is re-decoded on every step
        # (nothing is cached between steps).
        for _ in range(self.label_seq_length):
            # Add positional information to the current sequence of embeddings
            x = dec_inp + self.positional_encoding(dec_inp)
            x = self.decoder(x, context)
            # The decoder output at the last position becomes the next input embedding
            new_token_emb = tf.expand_dims(x[:,-1,:], 1)
            dec_inp = tf.concat([dec_inp, new_token_emb],1)

        # Keep only the generated positions and project them to vocabulary logits
        x = dec_inp[:,-self.label_seq_length:,:]
        x = self.output_layer(x)

        return x


In [37]:
# Instantiate the model from the notebook parameters; keyword arguments make
# the mapping from parameter to constructor argument explicit.
my_model = MyCustomModel(
    decoder_stack_height=decoder_stack_height,
    d_model=dv1,
    h_model=num_heads,
    decoder_dropout_rate=dropout_rate,
    n_decoder_vocab=n_vocab,
    label_seq_length=label_seq_length,
)
# The model outputs raw logits and the labels are integer ids, hence the
# sparse categorical cross-entropy with from_logits=True.
my_model.compile(
    optimizer=Adam(),
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
# Train the model on the training split for two epochs.
my_model.fit(train_data, epochs=2, verbose=2)

In [None]:
# Evaluate on the test split, then print the metric names followed by the
# values returned by evaluate().
output = my_model.evaluate(test_data)
print(my_model.metrics_names)
print(output)