In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy.random as npr
import random

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from keras.optimizers import Adam
from keras_nlp.layers import PositionEmbedding

In [3]:
# One seed shared by every random number generator this notebook relies on.
seed = 428

# Seed Python's `random` module, NumPy's global RNG and TensorFlow's global RNG.
# The three generators are independent, so the order of these calls is irrelevant.
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

In [4]:
def bert_module(query, key, value, embed_dim, num_head, i):
    """Apply one Transformer block with causally masked multi-head attention.

    The block is: multi-head attention -> residual add + layer normalization ->
    two-layer feed-forward network -> residual add + layer normalization.

    Args:
        query: Tensor used as the attention query and as the input of the
            first skip connection.
        key: Tensor used as the attention key.
        value: Tensor used as the attention value.
        embed_dim: Width of the block. Each head gets `embed_dim // num_head`
            dimensions, the feed-forward hidden layer is `2 * embed_dim` wide
            and its output layer is `embed_dim` wide.
        num_head: Number of attention heads.
        i: Index of the block; used only to build the layer names.

    Returns:
        The output tensor of the second layer normalization.
    """

    # Multi headed attention.
    # Keras declares the call signature as (query, value, key=None, ...), so
    # key and value are passed by keyword: passing them positionally in
    # (query, key, value) order would silently swap the two.
    attention_output = layers.MultiHeadAttention(
        num_heads=num_head,
        key_dim=embed_dim // num_head,
        name="encoder_{}/multiheadattention".format(i)
    )(query, value=value, key=key, use_causal_mask=True)

    # Add & Normalize
    attention_output = layers.Add()([query, attention_output])  # Skip Connection
    attention_output = layers.LayerNormalization(epsilon=1e-6)(attention_output)

    # Feedforward network
    ff_net = keras.models.Sequential([
        layers.Dense(2 * embed_dim, activation='relu', name="encoder_{}/ffn_dense_1".format(i)),
        layers.Dense(embed_dim, name="encoder_{}/ffn_dense_2".format(i)),
    ])

    # Apply Feedforward network
    ffn_output = ff_net(attention_output)

    # Add & Normalize
    ffn_output = layers.Add()([attention_output, ffn_output])  # Skip Connection
    ffn_output = layers.LayerNormalization(epsilon=1e-6)(ffn_output)

    return ffn_output

In [5]:
def get_sinusoidal_embeddings(sequence_length, embedding_dim):
    """Build a fixed sinusoidal position-encoding table.

    For position `pos >= 1` and pair index `k`:
        table[pos, 2k]     = sin(pos / 10000 ** (2k / embedding_dim))
        table[pos, 2k + 1] = cos(pos / 10000 ** (2k / embedding_dim))
    Row 0 is left as all zeros.

    Args:
        sequence_length: Number of positions (rows) in the table.
        embedding_dim: Number of columns in the table.

    Returns:
        A float32 tensor of shape (sequence_length, embedding_dim).
    """
    positions = np.arange(sequence_length, dtype=np.float64)[:, np.newaxis]

    # Columns 2k and 2k+1 must share one frequency, hence the `// 2`:
    # using the raw column index would give every column its own frequency.
    exponents = 2.0 * (np.arange(embedding_dim) // 2) / embedding_dim
    angles = positions / np.power(10000.0, exponents)

    # Start from zeros so that row 0 stays all-zero, and so that an empty
    # table (sequence_length == 0) is returned instead of raising.
    position_enc = np.zeros((sequence_length, embedding_dim), dtype=np.float64)
    position_enc[1:, 0::2] = np.sin(angles[1:, 0::2])  # dim 2k
    position_enc[1:, 1::2] = np.cos(angles[1:, 1::2])  # dim 2k+1
    return tf.cast(position_enc, dtype=tf.float32)

In [6]:
def insert_element_randomly(my_list, element):
    """Return a new list equal to `my_list` with `element` inserted at a random index.

    The index is drawn with `random.randint(0, len(my_list) - 1)`, so the
    element always lands in front of one of the existing items and is never
    placed after the last one. A list with fewer than two items gets the
    element at index 0. `my_list` itself is not modified.
    """
    position = random.randint(0, len(my_list) - 1) if len(my_list) > 1 else 0

    head, tail = my_list[:position], my_list[position:]
    return head + [element] + tail

In [7]:
N = 20  # number of regular tokens, named "word_<k>"
M = 20  # number of filler tokens, named "random_<k>"

# Full vocabulary: the N regular tokens first, then the M filler tokens.
vocabs = ['word_' + str(k) for k in range(N)] + ['random_' + str(k) for k in range(M)]

# Token string -> integer id, where the id is the token's position in `vocabs`.
vocab_map = {token: token_id for token_id, token in enumerate(vocabs)}

In [8]:
def _make_copy_task_data(n_samples):
    """Generate token-id arrays for the train and test sentences.

    Every ordered pair of distinct "word_*" tokens is assigned to the train or
    the test split with probability 0.5 each. Per sample, one train sentence
    is built as [a, b, a, c, d, c] from two train pairs with one "random_*"
    token inserted by `insert_element_randomly`, and one test sentence is
    built as [a, b, a, c, d, c, random] from two test pairs.

    Returns:
        (x_train, x_test): two integer arrays with one 7-token row per sample.
    """
    words = [token for token in vocabs if 'word_' in token]
    fillers = ['random_' + str(k) for k in range(M)]  # built once, not per sample

    pairs = [(first, second) for first in words for second in words if first != second]

    indicator = np.random.choice([0, 1], size=len(pairs), p=[0.5, 0.5])
    pairs_train = [pair for pair, flag in zip(pairs, indicator) if flag == 1]
    pairs_test = [pair for pair, flag in zip(pairs, indicator) if flag == 0]

    x_train = []
    x_test = []

    for _ in range(n_samples):

        random_word = random.choice(fillers)

        (a, b), (c, d) = random.sample(pairs_train, 2)
        sentence = insert_element_randomly([a, b, a, c, d, c], random_word)
        x_train.append([vocab_map[token] for token in sentence])

        # The same filler word is reused for the test sentence of this sample.
        (a, b), (c, d) = random.sample(pairs_test, 2)
        x_test.append([vocab_map[token] for token in (a, b, a, c, d, c, random_word)])

    return np.array(x_train), np.array(x_test)


def _build_next_token_model(seq_len, embed_dim, num_head, num_layers):
    """Build and compile the next-token prediction model for `seq_len`-token inputs."""
    inputs = layers.Input((seq_len,), dtype=tf.int64)
    word_embeddings = layers.Embedding(N + M, embed_dim, name="word_embedding")(inputs)
    position_embeddings = PositionEmbedding(sequence_length=seq_len)(word_embeddings)
    encoder_output = word_embeddings + position_embeddings

    for i in range(num_layers):
        encoder_output = bert_module(encoder_output, encoder_output, encoder_output,
                                     embed_dim, num_head, i)

    # Drop the last position: the labels are the inputs shifted by one, so the
    # final position has no next token to predict.
    encoder_output = keras.layers.Lambda(lambda x: x[:, :-1, :], name='slice')(encoder_output)
    mlm_output = layers.Dense(N + M, name="mlm_cls", activation="softmax")(encoder_output)

    mlm_model = keras.Model(inputs=inputs, outputs=mlm_output)
    mlm_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
    return mlm_model


def get_accuracy_prob(embed_dim, n_samples=50000, n_eval=1000, num_head=2, num_layers=1):
    """Train a next-token model on train-pair sentences and score it on test-pair sentences.

    The model is trained to predict token t+1 from tokens 0..t. It is scored
    on test sentences [a, b, a, c, d, c, random], whose word pairs were never
    used for training, at the output position that predicts the second `c`.

    Args:
        embed_dim: Embedding / model width.
        n_samples: Number of train sentences and of test sentences generated.
        n_eval: Number of test sentences scored (capped at `n_samples`).
        num_head: Number of attention heads per block.
        num_layers: Number of stacked `bert_module` blocks.

    Returns:
        (accuracy, mean_prob): fraction of scored sentences whose most likely
        predicted token is the correct one, and the mean probability the model
        assigns to the correct token.
    """
    # This function is called repeatedly; drop the Keras global state left by
    # models from earlier calls so memory does not grow with every call.
    K.clear_session()

    x_train, x_test = _make_copy_task_data(n_samples)

    perm = np.random.permutation(len(x_train))
    x_train = x_train[perm]
    y_train = x_train[:, 1:]  # labels are the inputs shifted left by one token

    mlm_model = _build_next_token_model(x_train.shape[1], embed_dim, num_head, num_layers)

    callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,
                                             restore_best_weights=True)
    mlm_model.fit(x_train, y_train,
                  validation_split=0.5, callbacks=[callback],
                  epochs=2000, batch_size=5000,
                  verbose=0)

    eval_size = min(n_eval, x_test.shape[0])
    eval_rows = np.random.choice(x_test.shape[0], size=eval_size, replace=False)
    x_eval = x_test[eval_rows]

    # One batched forward pass instead of building a backend function and
    # running it separately for every sentence.
    predictions = mlm_model.predict(x_eval, verbose=0)

    # Output position -2 (index 4 of 6) is the prediction for input token 5,
    # which is also input position -2 of the 7-token sentence: the second `c`.
    target_dist = predictions[:, -2, :]
    targets = x_eval[:, -2]

    hits = target_dist.argmax(axis=1) == targets
    target_prob = target_dist[np.arange(len(targets)), targets]

    return (np.mean(hits), np.mean(target_prob))

In [9]:
# Repeat the train/evaluate experiment with embed_dim = 10 and average the results.
n_runs = 10
run_results = []

for _ in range(n_runs):
    acc, prob = get_accuracy_prob(10)
    print((acc, prob))
    run_results.append((acc, prob))

# Each run contributes its 1/n_runs share; the sums start from 0 and add the
# shares in run order.
accs = sum(run_acc / n_runs for run_acc, _ in run_results)
probs = sum(run_prob / n_runs for _, run_prob in run_results)

print((accs, probs))

2024-05-08 14:08:00.198751: W tensorflow/tsl/platform/default/dso_loader.cc:66] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2024-05-08 14:08:00.198811: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2024-05-08 14:08:00.198835: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (gl3172.arc-ts.umich.edu): /proc/driver/nvidia/version does not exist
2024-05-08 14:08:00.199109: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


(0.0, 0.00023924194)
(0.0, 0.00023125067)
(0.0, 0.0004355542)
(0.0, 0.00016954009)
(0.0, 3.725001e-05)
(0.0, 0.0004747167)
(0.0, 0.00018435648)
(0.0, 0.00022450146)
(0.0, 4.0201863e-05)
(0.0, 0.0002761143)
(0.0, 0.00023127277127059644)


In [10]:
# Repeat the train/evaluate experiment with embed_dim = 100 and average the results.
n_runs = 10
run_results = []

for _ in range(n_runs):
    acc, prob = get_accuracy_prob(100)
    print((acc, prob))
    run_results.append((acc, prob))

# Each run contributes its 1/n_runs share; the sums start from 0 and add the
# shares in run order.
accs = sum(run_acc / n_runs for run_acc, _ in run_results)
probs = sum(run_prob / n_runs for _, run_prob in run_results)

print((accs, probs))

(0.0, 0.00019878842)
(0.0, 0.0002890869)
(0.0, 0.0004500305)
(0.0, 0.00038685606)
(0.0, 0.00043059213)
(0.0, 0.0003880611)
(0.0, 0.00027912564)
(0.0, 0.00026863546)
(0.0, 0.00028535546)
(0.0, 0.00029668515)
(0.0, 0.00032732168037910017)
