# Vectorize articles and categories by averaging all BERT tokens

In [11]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text
import pandas as pd
import numpy as np

# TF Hub handles for the BERT encoder and the text preprocessing model.
bert_model_path = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2'
bert_preprocessing_path = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

# Wrap both handles as Keras layers. The encoder is frozen (trainable=False):
# in this notebook it is only used to produce embeddings.
bert = hub.KerasLayer(bert_model_path, name='BERT', trainable=False)
bert_preprocessing = hub.KerasLayer(bert_preprocessing_path, name='preprocessing')

In [12]:
# Percentage of the AG News training split to embed.
percent_train_data = 10

# Only the training slice is used in this notebook, so only that split is
# requested. as_supervised=True yields (text, label) tuples, and batch_size=1
# gives every element a leading batch dimension of 1.
# NOTE(review): the article cell iterates train_data twice (embeddings, then
# labels) and so relies on a stable element order; shuffle_files=False is
# presumably what guarantees that -- confirm before enabling shuffling.
train_data = tfds.load(
    name='ag_news_subset',
    split=f'train[:{percent_train_data}%]',
    shuffle_files=False,
    as_supervised=True,
    batch_size=1,
)

In [13]:
def build_model():
    """Build a Keras model mapping raw strings to per-token BERT embeddings.

    The model takes a scalar string input named 'description', feeds it through
    the module-level ``bert_preprocessing`` and ``bert`` layers, and outputs the
    encoder's ``'sequence_output'`` entry.

    Returns:
        The compiled ``tf.keras.Model``.
    """
    description = tf.keras.layers.Input(shape=(), name='description', dtype=tf.string)
    bert_outputs = bert(bert_preprocessing(description))

    # Keep one embedding per token position instead of a single pooled vector.
    # NOTE(review): this presumably contains padded positions too, so any plain
    # average taken over it downstream includes padding -- confirm intended.
    token_embeddings = bert_outputs['sequence_output']

    embedder = tf.keras.Model(description, token_embeddings)
    # NOTE(review): the visible code only calls predict() on this model, so the
    # loss and metric configured here look unused; kept unchanged.
    embedder.compile(
        loss='SparseCategoricalCrossentropy',
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
        optimizer='Adam',
    )
    return embedder

# Single shared model instance; every predict() call in this notebook uses it.
model = build_model()

### Vectorize, average and save articles

In [4]:
# Per-token embeddings for every article, reduced to one vector per article by
# taking the mean over the token axis.
article_embeddings = model.predict(train_data)
avg_article_embeddings = np.array(
    [token_vectors.mean(axis=0) for token_vectors in article_embeddings]
)

# Second pass over the dataset to gather the labels into a single row, which is
# then transposed into a column and attached as the last column of the data.
labels = np.concatenate([[target] for _, target in train_data], axis=1)
article_data = np.concatenate((avg_article_embeddings, labels.T), axis=1)

In [14]:
# Persist one row per article: the averaged embedding columns followed by the
# label column. The path is a plain string (no placeholders), matching the
# other to_csv cells in this notebook.
article_df = pd.DataFrame(article_data)
article_df.to_csv('../data/articles_avg_token.csv', index=False)

# Vectorize, average and save categories

### Categories as category names

In [9]:
# Embed the four category names themselves, then average each name's token
# embeddings into a single vector.
category_embeddings = model.predict(
    ['World', 'Sports', 'Business', 'Science and Technology']
)
avg_category_embeddings = np.array(
    [token_vectors.mean(axis=0) for token_vectors in category_embeddings]
)

In [15]:
# One row per category, in the order the names were passed to predict().
category_df = pd.DataFrame(data=avg_category_embeddings)
category_df.to_csv(path_or_buf='../data/categories_avg_token.csv', index=False)

### Categories as vectors with synonyms

In [19]:
# WordNet supplies the synonyms used to expand the category names below.
import nltk
nltk.download('wordnet')  # makes sure the WordNet corpus is available locally
from nltk.corpus import wordnet
 
def get_synonym_strings(words):
    """Return one WordNet synonym string per input word.

    For each word, every lemma name of every synset returned by
    ``wordnet.synsets(word)`` is written out followed by ``'. '``, in lookup
    order. Names are not de-duplicated, and a word without synsets yields an
    empty string.

    Args:
        words: Iterable of words to look up.

    Returns:
        A list of strings, one per entry of ``words`` and in the same order.
    """
    return [
        ''.join(
            lemma.name() + '. '
            for synset in wordnet.synsets(term)
            for lemma in synset.lemmas()
        )
        for term in words
    ]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/viktorenzell/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [26]:
# Expand each category keyword into its WordNet synonym string, embed those
# strings, and average each one's token embeddings into a single vector.
synonym_strings = get_synonym_strings(
    ['World', 'Sports', 'Business', 'Technology']
)
category_syn_embeddings = model.predict(synonym_strings)
avg_category_syn_embeddings = np.array(
    [token_vectors.mean(axis=0) for token_vectors in category_syn_embeddings]
)

In [21]:
# One row per category keyword, in the order passed to get_synonym_strings().
category_syn_df = pd.DataFrame(data=avg_category_syn_embeddings)
category_syn_df.to_csv(path_or_buf='../data/categories_avg_synonym.csv', index=False)