## Run imports and set variables

In [3]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text
import datetime
import keras

import pandas as pd
import numpy as np

In [4]:
# TF Hub handle of the BERT encoder. The handle name indicates a "small BERT"
# variant (L-2 / H-128 / A-2, uncased English), version 2.
bert_model_path = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2'
# TF Hub handle of the text preprocessing model for uncased English BERT, version 3.
bert_preprocessing_path = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

## Import data
Load the AG News subset from TensorFlow Datasets (`tfds`): the first 5% of the train split and the full test split.

In [45]:
# Share of the AG News train split to load, in percent. Named here so the
# subset size is visible and tunable instead of being buried in the split string.
TRAIN_PERCENT = 5

# Load (text, label) pairs. `as_supervised=True` yields 2-tuples instead of
# feature dicts; `batch_size=1` makes every dataset element a batch of one
# example; `shuffle_files=False` keeps the file reading order fixed.
train_data, test_data = tfds.load(
    name='ag_news_subset',
    split=(f'train[:{TRAIN_PERCENT}%]', 'test'),
    shuffle_files=False,
    as_supervised=True,
    batch_size=1,
)

## Import BERT model and preprocessing handler

In [6]:
# Preprocessing layer: turns raw strings into the input tensors the encoder expects.
bert_preprocessing = hub.KerasLayer(
    bert_preprocessing_path,
    name='preprocessing',
)

# Encoder layer, loaded with trainable=False so its pretrained weights stay fixed.
bert = hub.KerasLayer(
    bert_model_path,
    name='BERT',
    trainable=False,
)

2021-12-13 16:09:03.068166: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


## Build the model
We create a function to define and compile the NN with the pretrained BERT model.

In [5]:
def build_model():
    """Build and compile a model that maps raw strings to BERT's pooled output.

    Relies on the notebook-level `bert_preprocessing` and `bert` layers.

    Returns:
        A compiled `tf.keras.Model` taking a scalar string input named
        'description' and returning the encoder's 'pooled_output'.
    """
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='description')

    # Preprocess, encode, and keep only the pooled output
    # (the one corresponding to the [CLS] token).
    pooled = bert(bert_preprocessing(raw_text))['pooled_output']

    sentence_encoder = tf.keras.Model(inputs=raw_text, outputs=pooled)
    sentence_encoder.compile(
        loss='SparseCategoricalCrossentropy',
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
        optimizer='Adam',
    )
    return sentence_encoder


model = build_model()

In [46]:
def build_token_level_model():
    """Build and compile a model that maps raw strings to per-token BERT embeddings.

    Relies on the notebook-level `bert_preprocessing` and `bert` layers.

    Returns:
        A compiled `tf.keras.Model` taking a scalar string input named
        'description' and returning the encoder's 'sequence_output'.
    """
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='description')

    # Preprocess, encode, and keep the embedding of every token.
    encoded = bert(bert_preprocessing(raw_text))
    token_embeddings = encoded['sequence_output']

    token_encoder = tf.keras.Model(inputs=raw_text, outputs=token_embeddings)
    token_encoder.compile(
        loss='SparseCategoricalCrossentropy',
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
        optimizer='Adam',
    )
    return token_encoder


# NOTE: rebinds `model`; from here on it refers to the token-level model.
model = build_token_level_model()

In [47]:
# numpy is already imported as `np` in the notebook's import cell, so it is not
# re-imported here.
# Run the current `model` over the training subset. Assumes `model` is the
# token-level model built in the previous cell, so `X` holds one embedding per
# token position for every example — TODO confirm after a Restart & Run All.
X = model.predict(train_data)

In [51]:
# NOTE(review), re the original TODO: the mean below runs over every token
# position of each sequence, so any [PAD] positions added by the preprocessing
# layer are part of the average. Excluding them would need the preprocessing
# layer's `input_mask`, which this model does not return — confirm whether that
# matters for the downstream use of these vectors.
# Assumes X is (n_examples, n_tokens, hidden); averaging over axis 1 leaves one
# vector per example.
averaged_embeddings = X.mean(axis=1)

# Second pass over train_data to collect the labels. The dataset was loaded with
# shuffle_files=False; this assumes it yields examples in the same order that
# predict() saw them — TODO confirm.
labels = np.concatenate([label.numpy() for _, label in train_data])
y = labels[np.newaxis, :]  # kept as a single (1, n_examples) row, as before

# Embedding columns first, label as the last column.
data = np.column_stack([averaged_embeddings, labels])

In [52]:
# Wrap the combined matrix in a DataFrame and write it to CSV without the row index.
df = pd.DataFrame(data=data)
df.to_csv(path_or_buf='../data/vec_token_data.csv', index=False)

In [14]:
# Embed the class names with the pooled-output model and save one row per name.
# By this point `model` has been rebound to the token-level model, whose 3-D
# output cannot be passed to pd.DataFrame, so the pooled model is built
# explicitly instead of relying on whichever cell last assigned `model`.
pooled_model = build_model()
label_embeddings = pooled_model.predict(['World', 'Sports', 'Business', 'Science and Technology'])
label_df = pd.DataFrame(label_embeddings)
label_df.to_csv('../data/label_embeddings.csv', index = False)

In [50]:
# Embed the class names with the current `model` and keep only the vector at
# position 0 of each sequence (presumably the [CLS] position — confirm against
# the preprocessing layer). The rows are kept as a list of 1-D arrays.
label_embeddings = model.predict(['World', 'Sports', 'Business', 'Science and Technology'])
first_token_embeddings = list(label_embeddings[:, 0])
label_df = pd.DataFrame(first_token_embeddings)
label_df.to_csv(path_or_buf='../data/label_token_embeddings.csv', index=False)