# Vectorize articles and categories by using BERT CLS tokens

In [8]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text
import pandas as pd
import numpy as np

# TF Hub handles for the encoder and its matching text preprocessing model.
# The encoder handle name indicates a small uncased English BERT
# (L-2 / H-128 / A-2).
bert_model_path = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2'
bert_preprocessing_path = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

# Encoder layer, frozen (trainable=False) so its weights are never updated.
bert = hub.KerasLayer(
    bert_model_path,
    trainable=False,
    name='BERT',
)

# Preprocessing layer that turns raw strings into the encoder's inputs.
bert_preprocessing = hub.KerasLayer(
    bert_preprocessing_path,
    name='preprocessing',
)

In [9]:
# Percentage of the training split (taken from its start) that gets loaded.
percent_train_data = 10

# Both splits are requested, but only the training slice is kept below.
requested_splits = (f'train[:{percent_train_data}%]', 'test')

train_data, _ = tfds.load(
    name='ag_news_subset',
    split=requested_splits,
    # Keep file order unshuffled; the dataset is iterated more than once
    # further down (predict, then label collection).
    shuffle_files=False,
    # Elements are unpacked as (text, label) pairs further down.
    as_supervised=True,
    batch_size=1,
)

In [10]:
def build_model():
    """Build a Keras model that maps raw strings to BERT pooled embeddings.

    The model has a single scalar-string input named 'description'. The text
    is passed through the module-level ``bert_preprocessing`` layer and then
    the module-level ``bert`` encoder; the model's output is the encoder's
    ``pooled_output`` entry.

    Returns:
        A ``tf.keras.Model`` compiled with the Adam optimizer, sparse
        categorical cross-entropy loss and a sparse categorical accuracy
        metric.
    """
    description = tf.keras.layers.Input(shape=(), dtype=tf.string, name='description')

    # Preprocess the raw text, then run the encoder on the result.
    bert_outputs = bert(bert_preprocessing(description))

    # Keep only the pooled, per-sequence output (used here as the [CLS]
    # representation); the other encoder outputs are dropped.
    sentence_embedding = bert_outputs['pooled_output']

    embedding_model = tf.keras.Model(inputs=description, outputs=sentence_embedding)
    embedding_model.compile(
        optimizer='Adam',
        loss='SparseCategoricalCrossentropy',
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )
    return embedding_model

# Single model instance, reused below to embed both the articles and the
# category names.
model = build_model()

### Retrieve CLS vectors and save articles

In [4]:
# Embed every article in the loaded training slice.
article_embeddings = model.predict(train_data)

# Second pass over the dataset to collect the labels. Each batch's label is
# stacked along axis 1, so the labels end up laid out along a single row.
labels = np.stack([np.asarray(label) for _, label in train_data], axis=1)

# Transpose the labels into a column and attach it after the embedding columns.
article_data = np.concatenate((article_embeddings, labels.T), axis=1)

In [11]:
# Persist the article matrix (embedding columns followed by the label column).
# The DataFrame is built without explicit column names and the row index is
# not written to the file.
article_df = pd.DataFrame(article_data)
# Plain string literal: the path has no placeholders, so no f-string is needed.
article_df.to_csv('../../data/articles_cls_token.csv', index=False)

### Retrieve CLS vectors and save categories

In [6]:
# Category names embedded with the same model as the articles.
# NOTE(review): presumably listed in the dataset's label-id order; confirm
# against the dataset's label names before relying on row positions.
category_names = ['World', 'Sports', 'Business', 'Science and Technology']
category_embeddings = model.predict(category_names)

In [12]:
# Write the category embeddings to CSV, one row per category name, without
# the row index.
categories_csv_path = '../../data/categories_cls_token.csv'
category_df = pd.DataFrame(category_embeddings)
category_df.to_csv(categories_csv_path, index=False)