## Hyperparameter tuning of the embedding model

In [1]:
# UNCOMMENT THIS CELL IF RUNNING IN COLAB
# Colab setup: fetch a fresh copy of the project repository and install the
# `headlines-project` package from that checkout.
COLAB = True
# Remove any earlier clone so the `git clone` below starts from a clean directory.
!rm -r ./virality-in-headliness
!git clone https://github.com/leonleo997/virality-in-headliness
# Uninstall any installed copy first, then install the package from the fresh clone.
!pip uninstall headlines-project -y
!pip install ./virality-in-headliness
# Despite the name, this is a directory: the dataset file name is appended to it later.
DATASET_FILE = './virality-in-headliness/datasets'

In [2]:
# # UNCOMMENT THIS CELL IF RUNNING DIRECTLY FROM THIS NOTEBOOK
# COLAB = False
# DATASET_FILE = '../datasets'
# !pip uninstall headlines-project -y
# !pip install ../

Found existing installation: headlines-project 0.0.5
Uninstalling headlines-project-0.0.5:
  Successfully uninstalled headlines-project-0.0.5
Processing /home/sebastian/Data_Science_Projects/virality-in-headliness
Building wheels for collected packages: headlines-project
  Building wheel for headlines-project (setup.py) ... [?25ldone
[?25h  Created wheel for headlines-project: filename=headlines_project-0.0.5-py3-none-any.whl size=12370 sha256=a71a02df553328a707975ddeaccfd079dd1ad13e4aa886a7c85caed80267a34c
  Stored in directory: /home/sebastian/.cache/pip/wheels/e2/24/0b/340bc509f1ec151908912726a4a40570bb7dd853685bf12092
Successfully built headlines-project
Installing collected packages: headlines-project
Successfully installed headlines-project-0.0.5


In [3]:
import headlines_project 
from headlines_project.models import TransformerEncoder
from headlines_project.data import create_data_pipeline
from headlines_project.utils import plot_attention_weights

print(headlines_project.__path__)

import tensorflow as tf
from tensorboard.plugins.hparams import api as hp
from bpemb import BPEmb

_NamespacePath(['/home/sebastian/anaconda3/envs/headlines/lib/python3.7/site-packages/headlines_project'])


# Set hyperparameters

In [4]:
# Name under which each trial's score is reported to TensorBoard.
METRIC_ACCURACY = "accuracy"

# Search space: transformer encoder architecture.
HP_NUM_HEADS = hp.HParam("num_heads", hp.Discrete([1, 2, 5]))
HP_NUM_LAYERS = hp.HParam("num_layers", hp.Discrete([1, 2, 4]))
HP_DFF = hp.HParam("dff", hp.Discrete([128, 256, 512]))

# Search space: pre-trained embeddings (1/0 is cast to bool when a trial reads it).
HP_FINETUNE_EMBEDDINGS = hp.HParam("finetune_embeddings", hp.Discrete([1, 0]))
HP_VOCAB_SIZE = hp.HParam("vocab_size", hp.Discrete([10000, 25000]))
HP_EMBEDDING_DIM = hp.HParam("embedding_dim", hp.Discrete([100, 200, 300]))

# Register the hyperparameters and the metric once, at the root of the log
# directory that the individual trial runs are written under.
with tf.summary.create_file_writer("logs/hparam_tuning").as_default():
    hp.hparams_config(
        hparams=[
            HP_NUM_HEADS,
            HP_NUM_LAYERS,
            HP_DFF,
            HP_FINETUNE_EMBEDDINGS,
            HP_VOCAB_SIZE,
            HP_EMBEDDING_DIM,
        ],
        metrics=[hp.Metric(METRIC_ACCURACY, display_name="Accuracy")],
    )

# Load serialized dataset

In [None]:
import pickle as pkl

# Path of the serialized dataset bundle inside the DATASET_FILE directory.
data_path = f"{DATASET_FILE}/dataset_dict_features.pickle"
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
with open(data_path,"rb") as f:
    data = pkl.load(f)
    
# The bundle is read by key: "df" and "packages_ids" are the two entries used
# by the rest of the notebook (`df` is presumably a DataFrame -- confirm).
df = data["df"]
packages_ids= data["packages_ids"]
# Last expression of the cell, so the notebook displays the available keys.
packages_ids.keys()

# Prepare training and testing functions

In [6]:
# Number of examples per batch for each dataset split.
batch_sizes = dict(train=2048, val=512, test=512)

# Whole batches per pass over each split (any remainder is dropped by the
# integer division).
# NOTE(review): the train count is doubled before dividing; presumably the
# data pipeline yields two examples per training package -- confirm against
# create_data_pipeline.
train_steps = 2 * len(packages_ids["train"]) // batch_sizes["train"]
val_steps = len(packages_ids["val"]) // batch_sizes["val"]
test_steps = len(packages_ids["test"]) // batch_sizes["test"]

# Length of the token sequences fed to the model inputs and the data pipeline.
MAX_LENGTH = 45

# Empty container; nothing in this cell fills it.
histories = dict()

def prepate_embeddings(vocab_size, embedding_dim, trainable):
    """Load English BPEmb vectors and wrap them in a Keras embedding layer.

    Args:
        vocab_size: BPEmb vocabulary size (passed as ``vs``).
        embedding_dim: BPEmb vector dimensionality (passed as ``dim``).
        trainable: whether the embedding weights may be updated during training.

    Returns:
        A ``(bpemb_model, embedding_layer)`` tuple; the layer is initialised
        with the model's vectors.
    """
    subword_model = BPEmb(lang="en", vs=vocab_size, dim=embedding_dim)
    pretrained_vectors = subword_model.vectors

    # Layer dimensions are read from the loaded matrix rather than from the
    # requested sizes, so they always match the initial weights.
    layer_config = dict(
        input_dim=pretrained_vectors.shape[0],
        output_dim=pretrained_vectors.shape[1],
        input_length=MAX_LENGTH,
        embeddings_initializer=tf.keras.initializers.Constant(pretrained_vectors),
        trainable=trainable,
    )
    pretrained_layer = tf.keras.layers.Embedding(**layer_config)

    return subword_model, pretrained_layer
    
class TransfomerEmbedder(tf.keras.models.Model):
    """Transformer encoder followed by global max pooling.

    Args:
        d_model: accepted but not used; the encoder's ``d_model`` is taken
            from ``embedding_layer.output_dim`` instead.
        num_heads: number of attention heads passed to the encoder.
        dff: ``dff`` value passed to the encoder.
        embedding_layer: embedding layer handed to the encoder.
        use_causal_attention: accepted but not used.
        num_layers: number of encoder layers.
        dropout_rate: accepted but not used (it is not forwarded to the encoder).
    """

    def __init__(self, d_model, num_heads, dff, embedding_layer,
                 use_causal_attention=False, num_layers=1, dropout_rate=0.2):
        super().__init__()
        self.num_heads = num_heads
        self.num_layers = num_layers

        encoder_config = dict(
            num_layers=num_layers,
            d_model=embedding_layer.output_dim,
            num_heads=num_heads,
            dff=dff,
            embedding_layer=embedding_layer,
        )
        self.transformer_encoder = TransformerEncoder(**encoder_config)
        self.global_max_pool_1d = tf.keras.layers.GlobalMaxPooling1D(name="max_pooling")

    def call(self, input_tokens):
        """Encode ``input_tokens`` and return ``(pooled_output, attention_weights)``."""
        encoded, attention_weights = self.transformer_encoder(input_tokens)
        pooled = self.global_max_pool_1d(encoded)
        return pooled, attention_weights


def create_classifier(transformer_embedder, dropout_rate=0.2):
    """Build a two-input binary classifier on top of a shared embedder.

    Both int32 inputs of length ``MAX_LENGTH`` go through the same
    ``transformer_embedder`` instance; the two pooled outputs are
    concatenated, passed through dropout and mapped to a single sigmoid unit.

    Args:
        transformer_embedder: callable returning ``(embedding, attention_weights)``.
        dropout_rate: rate of the dropout applied before the output layer.

    Returns:
        A Keras model whose inputs are keyed "input_tokens_h1" and
        "input_tokens_h2".
    """
    input_names = ("input_tokens_h1", "input_tokens_h2")

    token_inputs = {}
    for input_name in input_names:
        token_inputs[input_name] = tf.keras.layers.Input(
            shape=(MAX_LENGTH,), dtype=tf.int32, name=input_name
        )

    # Same embedder for both inputs; the attention weights are not needed here.
    pooled_pair = []
    for input_name in input_names:
        pooled, _ = transformer_embedder(token_inputs[input_name])
        pooled_pair.append(pooled)

    merged = tf.keras.layers.Concatenate()(pooled_pair)
    regularized = tf.keras.layers.Dropout(dropout_rate)(merged)
    probability = tf.keras.layers.Dense(1, activation="sigmoid", name="class")(regularized)

    return tf.keras.models.Model(inputs=token_inputs, outputs=probability, name="model")

def train_test_model(hp):
    '''
    Train one classifier for a hyperparameter combination and score it on the test data.

    Args:
        hp: mapping from the module-level ``HP_*`` ``HParam`` objects to the
            value chosen for this trial. Inside this function the name
            shadows the imported ``hp`` module.

    Returns:
        The accuracy reported by ``classifier.evaluate`` on the test data.

    NOTE(review): the returned value is the test-set accuracy; if it is used to
    pick hyperparameters, the test set stops being an unbiased estimate.
    Consider reporting validation accuracy for tuning.
    '''
    num_heads = hp[HP_NUM_HEADS]
    num_layers = hp[HP_NUM_LAYERS]
    dff = hp[HP_DFF]
    # Stored as 1/0 in the search space; the embedding layer gets a real bool.
    finetune_embeddings = bool(hp[HP_FINETUNE_EMBEDDINGS])
    vocab_size = hp[HP_VOCAB_SIZE]
    embedding_dim = hp[HP_EMBEDDING_DIM]

    bpemb_en, embedding_layer = prepate_embeddings(vocab_size, embedding_dim, finetune_embeddings)

    transformer_embedder = TransfomerEmbedder(embedding_dim, num_heads, dff, embedding_layer,
                                              num_layers=num_layers, dropout_rate=0.2)
    classifier = create_classifier(transformer_embedder)

    # The data is tokenised with the same BPEmb model that provides the
    # embedding vectors, so it has to be rebuilt for every trial.
    train_data, val_data, test_data = create_data_pipeline(df, packages_ids, batch_sizes, MAX_LENGTH, encoder_fn = bpemb_en.encode_ids,
                         classification = True, enforced = True)

    classifier.compile(optimizer="adam", loss="bce", metrics=["accuracy", tf.keras.metrics.AUC()])

    epochs = 100
    early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True,
                                                      monitor="val_accuracy", mode="max")
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that appended a stray "None" to the output).
    classifier.summary()

    classifier.fit(
        train_data,
        steps_per_epoch = train_steps,
        validation_data = val_data,
        validation_steps= val_steps,
        epochs= epochs,
        verbose=1,
        use_multiprocessing=True,
        callbacks=[early_stopping]
    )

    # evaluate() returns the loss followed by the compiled metrics, in order.
    loss, accuracy, auc = classifier.evaluate(test_data, steps=test_steps)
    return accuracy

In [7]:
def run(run_dir, hparams):
    """Execute one tuning trial and log it under ``run_dir``.

    Args:
        run_dir: directory the trial's summary files are written to.
        hparams: mapping of ``HParam`` objects to the values used in this trial.
    """
    trial_writer = tf.summary.create_file_writer(run_dir)
    with trial_writer.as_default():
        # Record the hyperparameter values first, then the resulting score.
        hp.hparams(hparams)
        trial_accuracy = train_test_model(hparams)
        tf.summary.scalar(METRIC_ACCURACY, trial_accuracy, step=1)

# Run!

In [7]:
# Load the TensorBoard notebook extension and open it on the tuning log directory.
%load_ext tensorboard
%tensorboard --logdir logs/hparam_tuning

In [8]:
if __name__ == "__main__":
    import itertools

    # Exhaustive grid search: one trial per combination of hyperparameter values.
    #
    # The HP_* objects defined in the hyperparameter cell are reused here
    # instead of being declared a second time, so the grid cannot drift from
    # the search space registered with hp.hparams_config.
    search_space = [
        HP_NUM_HEADS,
        HP_NUM_LAYERS,
        HP_DFF,
        HP_FINETUNE_EMBEDDINGS,
        HP_VOCAB_SIZE,
        HP_EMBEDDING_DIM,
    ]

    session_num = 0

    # itertools.product varies the last hyperparameter fastest, which is the
    # same visiting order as nesting one loop per hyperparameter.
    for combination in itertools.product(*(h.domain.values for h in search_space)):
        hparams = dict(zip(search_space, combination))
        run_name = "run-%d" % session_num
        print('--- Starting trial: %s' % run_name)
        print({h.name: hparams[h] for h in hparams})
        run('logs/hparam_tuning/' + run_name, hparams)
        session_num += 1

--- Starting trial: run-0
{'num_heads': 1, 'num_layers': 1, 'dff': 128, 'finetune_embeddings': 0, 'vocab_size': 10000, 'embedding_dim': 100}
--- Starting trial: run-1
{'num_heads': 1, 'num_layers': 1, 'dff': 128, 'finetune_embeddings': 0, 'vocab_size': 10000, 'embedding_dim': 200}
downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.d200.w2v.bin.tar.gz


100%|██████████| 7492341/7492341 [00:02<00:00, 2834655.55B/s]




KeyboardInterrupt: 