In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy.random as npr
import random

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from keras.optimizers import Adam
from keras_nlp.layers import PositionEmbedding

In [3]:
# One seed shared by every random number generator the notebook relies on.
seed = 428

# The three generators are independent of each other, so each is seeded
# separately: Python's stdlib RNG, TensorFlow, and NumPy's legacy global RNG.
random.seed(seed)
tf.random.set_seed(seed)
np.random.seed(seed)

In [4]:
def bert_module(query, key, value, embed_dim, num_head, i):
    """Apply one Transformer block: multi-head attention plus feed-forward net.

    Each of the two sub-layers is followed by a residual (skip) connection and
    a layer normalisation with ``epsilon=1e-6``. The attention layer is called
    with ``use_causal_mask=True``.

    Args:
        query: Tensor used as the attention query; it is also the tensor the
            attention output is added to in the first skip connection.
        key: Tensor used as the attention key.
        value: Tensor used as the attention value.
        embed_dim: Width of the block output; the per-head size is
            ``embed_dim // num_head``.
        num_head: Number of attention heads.
        i: Index of the block, used only to build the layer names.

    Returns:
        The normalised output of the feed-forward sub-layer.
    """
    # Multi headed self-attention.
    # Keras' MultiHeadAttention is called as (query, value, key): key and value
    # are passed by keyword so they cannot end up in each other's slot.
    attention_output = layers.MultiHeadAttention(
        num_heads=num_head,
        key_dim=embed_dim // num_head,
        name="encoder_{}/multiheadattention".format(i)
    )(query, value=value, key=key, use_causal_mask=True)

    # Add & Normalize
    attention_output = layers.Add()([query, attention_output])  # Skip Connection
    attention_output = layers.LayerNormalization(epsilon=1e-6)(attention_output)

    # Feedforward network: expand to twice the width with ReLU, project back.
    ff_net = keras.models.Sequential([
        layers.Dense(2 * embed_dim, activation='relu', name="encoder_{}/ffn_dense_1".format(i)),
        layers.Dense(embed_dim, name="encoder_{}/ffn_dense_2".format(i)),
    ])

    # Apply Feedforward network
    ffn_output = ff_net(attention_output)

    # Add & Normalize
    ffn_output = layers.Add()([attention_output, ffn_output])  # Skip Connection
    ffn_output = layers.LayerNormalization(epsilon=1e-6)(ffn_output)

    return ffn_output

In [5]:
def get_sinusoidal_embeddings(sequence_length, embedding_dim):
    position_enc = np.array([
        [pos / np.power(10000, 2. * i / embedding_dim) for i in range(embedding_dim)]
        if pos != 0 else np.zeros(embedding_dim)
        for pos in range(sequence_length)
    ])
    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1
    return tf.cast(position_enc, dtype=tf.float32)

In [7]:
#### K = number of countries = number of capitals
#### S = number of noise words
#### L = sentence length
#### embed_dim = dimension of embeddings
#### n_sentences = number of training sentences

def train_model(K, S, L, embed_dim, n_sentences, num_head=2, num_layers=5):
    """Train a small causal Transformer on synthetic country/capital sentences.

    Every training sentence has the form
    ``country_p, <L - 2 distinct noise words>, capital_p`` where the pair
    index ``p`` is drawn at random. The network receives the whole sentence
    and is trained to output, at each of the first ``L - 1`` positions, the
    token found at the next position.

    Args:
        K: Number of countries (and of capitals).
        S: Number of noise words.
        L: Sentence length, including the country and the capital.
        embed_dim: Dimension of the token embeddings and of every block.
        n_sentences: Number of sentences to generate; half of them are held
            out for validation (``validation_split=0.5``).
        num_head: Number of attention heads in each block.
        num_layers: Number of stacked ``bert_module`` blocks.

    Returns:
        A tuple ``(sentences, vocab_map, mlm_model)``: the generated sentences
        as lists of words, the word -> integer id mapping, and the trained
        Keras model.

    Raises:
        ValueError: If the requested sizes cannot produce a training set.
    """
    # Fail early with a clear message instead of deep inside NumPy/Keras.
    if K < 1:
        raise ValueError("K must be at least 1, got {}".format(K))
    if L < 2:
        raise ValueError("L must be at least 2 (country + capital), got {}".format(L))
    if L - 2 > S:
        raise ValueError(
            "L - 2 = {} distinct noise words are needed per sentence but only "
            "S = {} exist".format(L - 2, S))
    if n_sentences < 2:
        raise ValueError(
            "n_sentences must be at least 2 (half is used for validation), "
            "got {}".format(n_sentences))

    countries = ['country_' + str(i) for i in range(K)]
    capitals = ['capital_' + str(i) for i in range(K)]
    randoms = ['random_' + str(i) for i in range(S)]

    # Ids follow the order: countries, then capitals, then noise words.
    vocabs = countries + capitals + randoms
    vocab_map = {word: index for index, word in enumerate(vocabs)}

    sentences = []
    sentences_number = []

    for _ in range(n_sentences):
        pair = (np.random.choice(np.arange(K), 1, replace = False))[0]
        noise_words = list(np.random.choice(randoms, L - 2, replace = False))
        sentence = [countries[pair]] + noise_words + [capitals[pair]]

        sentences.append(sentence)
        sentences_number.append([vocab_map[word] for word in sentence])

    x_train = np.array(sentences_number)
    n_cat = len(vocab_map)
    # Next-token targets: the label for position t is the token at t + 1.
    y_train = x_train[:, 1:]

    callback = keras.callbacks.EarlyStopping(monitor = 'val_loss', patience = 5, restore_best_weights = True)
    inputs = layers.Input((x_train.shape[1],), dtype=tf.int64)
    word_embeddings = layers.Embedding(n_cat, embed_dim, name="word_embedding")(inputs)
    sinusoidal_embeddings = get_sinusoidal_embeddings(x_train.shape[1], embed_dim)
    encoder_output = word_embeddings + sinusoidal_embeddings

    for i in range(num_layers):
        encoder_output = bert_module(encoder_output, encoder_output, encoder_output, embed_dim, num_head, i)

    # Drop the last position: it has no following token to predict.
    encoder_output = keras.layers.Lambda(lambda x: x[:,:-1,:], name='slice')(encoder_output)
    mlm_output = layers.Dense(n_cat, name="mlm_cls", activation="softmax", use_bias=False)(encoder_output)
    mlm_model = keras.Model(inputs = inputs, outputs = mlm_output)
    adam = Adam()
    mlm_model.compile(loss='sparse_categorical_crossentropy', optimizer=adam)

    mlm_model.fit(x_train, y_train,
                  validation_split = 0.5, callbacks = [callback],
                  epochs=500, batch_size=128, verbose=0)

    return sentences, vocab_map, mlm_model

In [8]:
def get_acc_prob(K, S, L, embed_dim, n_sentences, n_samples):
    """Train a model and score it on sentences made only of country/capital pairs.

    Each evaluation sentence is ``country_a, capital_a, country_b, capital_b,
    ...`` for ``L // 2`` distinct countries drawn at random. The model is
    scored on its prediction for the final token, i.e. the capital of the
    last country in the sentence.

    Args:
        K: Number of countries (and of capitals).
        S: Number of noise words used in the training sentences.
        L: Sentence length; must be even so the evaluation sentences have the
            same length as the training sentences the model was built for.
        embed_dim: Embedding dimension passed to ``train_model``.
        n_sentences: Number of training sentences passed to ``train_model``.
        n_samples: Number of evaluation sentences to generate.

    Returns:
        A tuple ``(sentences, current_model, vocab_map, (accuracy, prob))``
        where ``sentences``, ``current_model`` and ``vocab_map`` come from
        ``train_model``, ``accuracy`` is the fraction of evaluation sentences
        whose most likely final token is the correct capital, and ``prob`` is
        the mean probability assigned to that capital. Both are NaN when
        ``n_samples`` is 0.

    Raises:
        ValueError: If ``L`` is odd or more pairs are needed than countries
            exist.
    """
    pairs_per_sentence = L // 2
    # Validate before training so a bad configuration fails immediately
    # rather than after the expensive fit.
    if L % 2 != 0:
        raise ValueError(
            "L must be even to build country/capital evaluation sentences, "
            "got {}".format(L))
    if pairs_per_sentence > K:
        raise ValueError(
            "L // 2 = {} distinct countries are needed per sentence but only "
            "K = {} exist".format(pairs_per_sentence, K))

    sentences, vocab_map, current_model = train_model(K, S, L, embed_dim, n_sentences)

    eval_ids = []
    target_ids = []
    for _ in range(n_samples):
        random_countries = np.random.choice(np.arange(K), pairs_per_sentence, replace = False)
        sentence_ids = []
        for random_country in random_countries:
            sentence_ids.append(vocab_map['country_' + str(random_country)])
            sentence_ids.append(vocab_map['capital_' + str(random_country)])
        eval_ids.append(sentence_ids)
        # The target is the capital of the last country, i.e. the last token.
        target_ids.append(sentence_ids[-1])

    if not eval_ids:
        return sentences, current_model, vocab_map, (np.nan, np.nan)

    eval_ids = np.array(eval_ids)
    target_ids = np.array(target_ids)

    # One batched forward pass instead of rebuilding a backend function and
    # running the network once per sentence.
    predictions = current_model.predict(eval_ids, verbose=0)
    # The last output position holds the distribution over the final token.
    last_step = predictions[:, -1, :]

    hits = np.argmax(last_step, axis=1) == target_ids
    target_probs = last_step[np.arange(len(target_ids)), target_ids]

    return sentences, current_model, vocab_map, (np.mean(hits), np.mean(target_probs))

In [9]:
K = 20 # number of countries (and of capitals)
L = 6 # sentence length
S = 20 # number of noise words
embed_dim = 10 # embedding dimension of the Transformer
n_sentences = 50000 # number of sentences in the training set
n_samples = 1000 # number of evaluation sentences per trained model

In [11]:
# Number of independently trained models to average over. Kept in one place so
# the loop count and the divisors of the running means cannot drift apart.
n_runs = 10

# Running means of the accuracy and of the target-capital probability.
accs = 0
probs = 0

for _ in range(n_runs):
    sentences, mlm_model, vocab_map, acc_c \
        = get_acc_prob(K, S, L, embed_dim, n_sentences, n_samples)

    # (accuracy, mean probability) of this run
    print(acc_c)

    accs += acc_c[0]/n_runs
    probs += acc_c[1]/n_runs

# Averages over all runs
print((accs, probs))

(0.0, 3.334043e-05)
(0.0, 3.1926855e-05)
(0.0, 3.7160396e-05)
(0.0, 1.6826812e-05)
(0.0, 1.5725662e-05)
(0.0, 2.473471e-05)
(0.0, 3.9954966e-06)
(0.0, 0.00040489435)
(0.0, 2.2451863e-05)
(0.0, 0.00079280045)
(0.0, 0.0001383857027576596)


In [12]:
K = 20 # number of countries (and of capitals)
L = 6 # sentence length
S = 20 # number of noise words
embed_dim = 100 # embedding dimension of the Transformer
n_sentences = 50000 # number of sentences in the training set
n_samples = 1000 # number of evaluation sentences per trained model

In [13]:
# Number of independently trained models to average over. Kept in one place so
# the loop count and the divisors of the running means cannot drift apart.
n_runs = 10

# Running means of the accuracy and of the target-capital probability.
accs = 0
probs = 0

for _ in range(n_runs):
    sentences, mlm_model, vocab_map, acc_c \
        = get_acc_prob(K, S, L, embed_dim, n_sentences, n_samples)

    # (accuracy, mean probability) of this run
    print(acc_c)

    accs += acc_c[0]/n_runs
    probs += acc_c[1]/n_runs

# Averages over all runs
print((accs, probs))

(0.0, 9.323947e-06)
(0.0, 1.3394154e-05)
(0.0, 1.7631817e-05)
(0.0, 1.5488922e-05)
(0.0, 1.2705495e-05)
(0.0, 1.609711e-05)
(0.0, 9.61809e-06)
(0.0, 1.8950805e-05)
(0.0, 1.5426936e-05)
(0.0, 1.7783279e-05)
(0.0, 1.4642055521107976e-05)
