# Vectorize articles and categories by averaging all BERT tokens

In [221]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text
import pandas as pd
import numpy as np

# TF Hub handles for the BERT encoder and its text preprocessing model.
bert_model_path = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2'
bert_preprocessing_path = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

# Load both as Keras layers once; the encoder is frozen (trainable=False).
bert = hub.KerasLayer(
    bert_model_path,
    name='BERT',
    trainable=False,
)
bert_preprocessing = hub.KerasLayer(
    bert_preprocessing_path,
    name='preprocessing',
)

In [222]:
# The batch size is set to the full number of loaded samples so the whole
# training slice arrives as one batch (needed for preprocessing to work).
percent_train_data = 10
# NOTE(review): 1200 is presumably the number of examples in 1% of the
# train split -- confirm against the dataset size.
num_training_samples = 1200 * percent_train_data

# Only the train slice is kept; the 'test' split is loaded and discarded.
train_data, _ = tfds.load(
    name='ag_news_subset',
    split=(f'train[:{percent_train_data}%]', 'test'),
    batch_size=num_training_samples,
    as_supervised=True,
    shuffle_files=False,
)

In [223]:
def build_model():
    """Assemble a Keras model that maps raw strings to BERT token embeddings.

    The graph is: string input -> ``bert_preprocessing`` -> ``bert``. The
    model output is the encoder's ``'sequence_output'`` entry, i.e. one
    embedding per token rather than a pooled sentence vector.

    Returns:
        The compiled ``tf.keras.Model``.
    """
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='description')

    # Tokenize/pad the text, then run the encoder on the result.
    bert_outputs = bert(bert_preprocessing(raw_text))
    token_embeddings = bert_outputs['sequence_output']

    embedder = tf.keras.Model(inputs=raw_text, outputs=token_embeddings)

    # NOTE(review): the cells visible here only ever call predict(), so these
    # training settings look unused -- confirm before relying on them.
    embedder.compile(
        loss='SparseCategoricalCrossentropy',
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
        optimizer='Adam',
    )
    return embedder

# Build the token-embedding model once; later cells reuse it via model.predict(...).
model = build_model()

In [231]:
def mean_pooling(predictions, sentences):
    """Average each text's WordPiece embeddings, ignoring special tokens.

    Args:
        predictions: Per-token embeddings, indexed as ``predictions[i]`` for
            the i-th text (assumed shape ``(num_texts, seq_len, hidden)`` --
            TODO confirm against the model output).
        sentences: The texts that produced ``predictions``, in the same order.
            They are passed through ``bert_preprocessing`` again to recover
            the token ids.

    Returns:
        ``np.ndarray`` with one averaged embedding per text.

    NOTE(review): a text with no WordPiece tokens would average an empty
    slice (presumably yielding NaN) -- confirm whether that can occur.
    """
    # Token ids are only needed to count how many real tokens each text has.
    token_id_rows = bert_preprocessing(sentences)['input_word_ids'].numpy()

    pooled_rows = []
    for row, token_ids in enumerate(token_id_rows):
        # [PAD] positions have id 0, so count_nonzero() skips them; subtract
        # 2 more to leave out the [CLS] and [SEP] tokens.
        word_piece_count = np.count_nonzero(token_ids) - 2

        # Position 0 holds [CLS]; the WordPiece tokens follow directly after.
        first, stop = 1, 1 + word_piece_count
        word_piece_embeddings = predictions[row][first:stop]
        pooled_rows.append(np.average(word_piece_embeddings, axis=0))

    return np.array(pooled_rows)

import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet
 
def get_synonym_strings(word_lists):
    """Collect the distinct WordNet lemma names for each group of words.

    Args:
        word_lists: Iterable of word groups. Every word in a group is looked
            up with ``wordnet.synsets`` and the lemmas of all its synsets are
            pooled for that group.

    Returns:
        A list with one entry per group. Despite the function name, each
        entry is a *list* of unique lemma names (underscores replaced by
        spaces) in first-seen order, not a single joined string.
    """
    all_synonym_strings = []
    for word_list in word_lists:
        # A dict gives O(1) duplicate checks while preserving insertion order.
        unique_lemmas = {}
        for word in word_list:
            for synset in wordnet.synsets(word):
                for lemma in synset.lemmas():
                    unique_lemmas.setdefault(lemma.name().replace('_', ' '))

        all_synonym_strings.append(list(unique_lemmas))
    return all_synonym_strings

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/viktorenzell/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Vectorize, average and save articles

In [225]:
# Per-token embeddings for every article in the training slice.
predictions = model.predict(train_data)

# Pull the raw texts back out of the dataset.
# NOTE(review): this assumes train_data yields exactly one (text, label)
# batch, so taking element [0] of the text column covers every article.
sentences = np.array(list(train_data))[:, 0, :][0]

avg_article_embeddings = mean_pooling(predictions, sentences)

# Gather the labels as a single row, then transpose them into a column and
# attach it to the right of the averaged embeddings.
labels = np.concatenate(
    [[label] for _, label in train_data],
    axis=1,
)
article_data = np.concatenate((avg_article_embeddings, labels.T), axis=1)

In [226]:
# Write the article matrix (averaged embeddings plus the label column) to CSV
# without the DataFrame index.
article_df = pd.DataFrame(article_data)
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
article_df.to_csv('../../data/articles_avg_token.csv', index=False)

# Vectorize, average and save categories

### Categories as category names

In [227]:
# Category names embedded as plain text, one string per category.
category_labels = [
    'Politics',
    'Sports',
    'Business',
    'Science and Technology',
]

# Per-token embeddings first, then one averaged vector per category name.
category_embeddings = model.predict(category_labels)
avg_category_embeddings = mean_pooling(category_embeddings, category_labels)

In [228]:
# One row per category name; the DataFrame index is left out of the CSV.
category_df = pd.DataFrame(avg_category_embeddings)
category_df.to_csv(
    '../../data/categories_avg_token.csv',
    index=False,
)

### Categories as vectors with synonyms

In [240]:
# One group of seed words per category; each group expands to its WordNet synonyms.
synonym_strings = get_synonym_strings([
    ['Politics'],
    ['Sports'],
    ['Business'],
    ['Science', 'Technology'],
])

# Show each synonym list followed by a blank line.
for s in synonym_strings:
    print(s, end='\n\n')

# Embed every synonym on its own, mean-pool its tokens, then average those
# vectors into a single embedding per category.
avg_category_syn_embeddings = []
for category_synonyms in synonym_strings:
    synonym_token_embeddings = model.predict(category_synonyms)
    mean_poolings = mean_pooling(synonym_token_embeddings, category_synonyms)
    category_vector = np.average(mean_poolings, axis=0)
    avg_category_syn_embeddings.append(category_vector)

['politics', 'political relation', 'political science', 'government', 'political sympathies']

['sport', 'athletics', 'summercater', 'sportsman', 'sportswoman', 'mutant', 'mutation', 'variation', 'fun', 'play', 'feature', 'boast', 'frolic', 'lark', 'rollick', 'skylark', 'disport', 'cavort', 'gambol', 'frisk', 'romp', 'run around', 'lark about']

['business', 'concern', 'business concern', 'business organization', 'business organisation', 'commercial enterprise', 'business enterprise', 'occupation', 'job', 'line of work', 'line', 'business sector', 'clientele', 'patronage', 'stage business', 'byplay']

['science', 'scientific discipline', 'skill', 'technology', 'engineering', 'engineering science', 'applied science']



In [241]:
# One row per category (synonym-averaged embedding); index omitted from the CSV.
category_syn_df = pd.DataFrame(avg_category_syn_embeddings)
category_syn_df.to_csv(
    '../../data/categories_avg_synonym.csv',
    index=False,
)