In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy.random as npr
import random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from keras.optimizers import Adam
from keras_nlp.layers import PositionEmbedding

In [32]:
# One seed shared by every random number generator used below: Python's
# `random`, NumPy's legacy global state (`np.random` / `npr`) and TensorFlow.
seed = 428

random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

In [33]:
def get_masked_input_and_labels(encoded_texts, n_cat):
    """Build leave-one-out training examples from encoded sentences.

    For every sentence and every position in it, one example is produced:
    the sentence with that position removed (the word is deleted, not
    replaced by a mask token) and the removed word id as the label.

    Parameters
    ----------
    encoded_texts : iterable of 1-D integer sequences
        One sequence of word ids per sentence.
    n_cat : int
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    (contexts, labels) : tuple of np.ndarray
        ``contexts[j]`` is a sentence with one word removed and ``labels[j]``
        is a length-1 array holding the removed word id. Examples are ordered
        sentence by sentence, position by position.
    """
    contexts = []
    labels = []

    for text in encoded_texts:
        tokens = np.asarray(text)
        for position, token in enumerate(tokens):
            # np.delete returns a new array, so `tokens` itself is never modified
            contexts.append(np.delete(tokens, position))
            labels.append(np.array([token]))

    return np.array(contexts), np.array(labels)

In [35]:
def _numbered_words(prefix, count):
    """Return the words ``prefix + '0'`` ... ``prefix + str(count - 1)``."""
    return [prefix + str(i) for i in range(count)]


def _generate_sentences(K, M, S, L, q1, q2, n_sentences):
    """Sample the synthetic corpus and return ``(sentences, vocab_map)``."""
    countries = _numbered_words('country_', K)
    capitals = _numbered_words('capital_', K)
    universities = _numbered_words('university_', K)
    mascots = _numbered_words('mascot_', K)
    random_capitals = _numbered_words('random_capital_', M)
    random_mascots = _numbered_words('random_mascot_', M)
    randoms = _numbered_words('random_', S)

    vocabs = countries + capitals + universities + mascots + random_capitals + random_mascots + randoms
    vocab_map = {word: idx for idx, word in enumerate(vocabs)}

    # Per topic: (first word of a pair, second word of a pair, filler pool).
    # Repeating a list inside the pool weights it: on-topic fillers appear 4x,
    # shared fillers 2x and off-topic fillers 1x. The pools do not depend on the
    # sentence, so they are built once instead of once per sentence.
    capital_topic = (countries, capitals,
                     np.array(4 * random_capitals + 2 * randoms + 1 * random_mascots))
    mascot_topic = (universities, mascots,
                    np.array(1 * random_capitals + 2 * randoms + 4 * random_mascots))

    q0 = 1 - q1 - q2
    sentences = []

    for _ in range(n_sentences):
        # The order of the random draws (topic, pair count, pairs, fillers) is
        # kept fixed so that a given global NumPy seed yields the same corpus.
        topic_draw = npr.uniform()
        count_draw = npr.uniform()

        if count_draw <= q0:
            n_pairs = 0
        elif count_draw <= q0 + q1:
            n_pairs = 1
        else:
            n_pairs = 2

        if topic_draw <= 0.5:  ### country - capital
            firsts, seconds, filler_pool = capital_topic
        else:  ### university - mascot
            firsts, seconds, filler_pool = mascot_topic

        sentence = []
        for pair in np.random.choice(np.arange(K), n_pairs, replace = False):
            sentence.append(firsts[pair])
            sentence.append(seconds[pair])

        # Fill up to length L. Sampling is without replacement from the pool,
        # but the pool holds duplicates, so a filler word may still repeat.
        sentence += list(np.random.choice(filler_pool, L - 2 * n_pairs, replace = False))
        sentences.append(sentence)

    return sentences, vocab_map


def train_model(K, M, S, L, q1, q2, embed_dim, n_sentences):
    """Generate a synthetic two-topic corpus and fit a bag-of-words model on it.

    Each sentence belongs to one of two topics (country/capital or
    university/mascot, chosen with probability 0.5 each), contains 0, 1 or 2
    matching pairs with probabilities ``1 - q1 - q2``, ``q1`` and ``q2``, and
    is padded to length ``L`` with filler words. Every word of every sentence
    is then removed in turn and the model is trained to predict the removed
    word from the remaining ``L - 1`` words.

    The model is Embedding -> GlobalAveragePooling1D -> Dense(softmax, no
    bias), so it does not depend on word order within a sentence.

    Parameters
    ----------
    K : int
        Number of countries = number of capitals = number of universities
        = number of mascots.
    M : int
        Number of filler words only used by each topic.
    S : int
        Number of filler words used by both topics.
    L : int
        Sentence length.
    q1, q2 : float
        Probability of a sentence having 1 or 2 pairs.
    embed_dim : int
        Dimension of the word embeddings.
    n_sentences : int
        Number of training sentences.

    Returns
    -------
    (sentences, vocab_map, mlm_model)
        The generated sentences (lists of words), the word -> integer id
        mapping, and the fitted Keras model.

    Raises
    ------
    ValueError
        If ``q1`` or ``q2`` is negative or ``q1 + q2`` exceeds 1. NumPy also
        raises ``ValueError`` when more pairs or fillers are requested than
        are available.
    """
    if q1 < 0 or q2 < 0 or q1 + q2 > 1 + 1e-9:
        raise ValueError(
            'q1 and q2 must be non-negative and sum to at most 1, got q1=%r, q2=%r' % (q1, q2))

    sentences, vocab_map = _generate_sentences(K, M, S, L, q1, q2, n_sentences)
    sentences_number = [[vocab_map[word] for word in sentence] for sentence in sentences]

    x_train = np.array(sentences_number)
    n_cat = len(vocab_map)
    x_masked_train, y_masked_labels_train = get_masked_input_and_labels(x_train, n_cat)

    callback = keras.callbacks.EarlyStopping(monitor = 'val_loss', patience = 5, restore_best_weights = True)
    inputs = layers.Input((x_masked_train.shape[1],), dtype=tf.int64)
    word_embeddings = layers.Embedding(n_cat, embed_dim, name="word_embedding")(inputs)
    encoder_output = layers.GlobalAveragePooling1D()(word_embeddings)
    mlm_output = layers.Dense(n_cat, name="mlm_cls", activation="softmax", use_bias=False)(encoder_output)
    mlm_model = keras.Model(inputs = inputs, outputs = mlm_output)

    # Take the optimizer from the same `keras` namespace that builds the model;
    # mixing a standalone-`keras` optimizer with a `tensorflow.keras` model can
    # fail when the two packages are not the same version.
    mlm_model.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam())

    # Half of the examples are held out for validation; early stopping watches
    # val_loss and restores the best weights.
    mlm_model.fit(x_masked_train, y_masked_labels_train,
                  validation_split = 0.5, callbacks = [callback],
                  epochs=500, batch_size=128, verbose=0)

    return sentences, vocab_map, mlm_model

In [50]:
def _evaluate_pair_completion(predict_fn, vocab_map, first_prefix, second_prefix, K, L, n_samples):
    """Score how well the model completes the last pair of pair-only sentences.

    Each test sentence consists of ``L // 2`` distinct pairs
    (``first_prefix + i``, ``second_prefix + i``) with the final second word
    removed; that removed word is the prediction target.

    Returns ``(accuracy, mean_probability)``: the fraction of sentences whose
    most probable word is the target, and the mean probability assigned to
    the target.
    """
    hits = []
    target_probs = []

    for _ in range(n_samples):
        chosen = np.random.choice(np.arange(K), L // 2, replace = False)

        sentence = []
        for idx in chosen:
            sentence.append(first_prefix + str(idx))
            sentence.append(second_prefix + str(idx))
        sentence = sentence[:-1]  # drop the word the model has to predict

        sentence_number = [vocab_map[word] for word in sentence]
        output = predict_fn(np.array(sentence_number).reshape(1, len(sentence_number)))

        actual = vocab_map[second_prefix + str(chosen[-1])]
        hits.append(1 if np.argmax(output[0]) == actual else 0)
        target_probs.append(output[0][actual])

    return np.mean(hits), np.mean(target_probs)


def get_acc_prob(K, M, S, L, q1, q2, embed_dim, n_sentences, n_samples):
    """Train a model and measure pair-completion accuracy for both topics.

    A model is trained with ``train_model(K, M, S, L, q1, q2, embed_dim,
    n_sentences)``. It is then evaluated on ``n_samples`` sentences per topic
    that contain only pairs (``L // 2`` of them) with the last word removed.

    Returns
    -------
    (sentences, current_model, vocab_map, capital_scores, mascot_scores)
        ``sentences``, ``current_model`` and ``vocab_map`` come from
        ``train_model``; each ``*_scores`` is ``(accuracy, mean probability of
        the correct word)`` for the country/capital and university/mascot
        topics respectively.

    Raises
    ------
    ValueError
        If ``L // 2`` is not between 1 and ``K``, i.e. the evaluation
        sentences cannot be built. This is checked before the (expensive)
        training step.
    """
    n_eval_pairs = L // 2
    if not 1 <= n_eval_pairs <= K:
        raise ValueError(
            'evaluation needs 1 <= L // 2 <= K distinct pairs, got L=%r, K=%r' % (L, K))

    sentences, vocab_map, current_model = train_model(K, M, S, L, q1, q2, embed_dim, n_sentences)

    # Build the inference function once; it does not change between samples.
    predict_fn = keras.backend.function(inputs = current_model.layers[0].input,
                                        outputs = current_model.layers[-1].output)

    # Capitals are evaluated before mascots so the sequence of random draws is
    # the same as when the two loops were written out separately.
    capital_scores = _evaluate_pair_completion(
        predict_fn, vocab_map, 'country_', 'capital_', K, L, n_samples)
    mascot_scores = _evaluate_pair_completion(
        predict_fn, vocab_map, 'university_', 'mascot_', K, L, n_samples)

    return sentences, current_model, vocab_map, capital_scores, mascot_scores

In [51]:
# Vocabulary sizes.
# NOTE: `K` rebinds the name used for the `tensorflow.keras.backend as K`
# import alias at the top of the notebook; from here on `K` is this integer.
K = 10 # number of countries (= capitals = universities = mascots)
M = 20 # number of filler words only used by each topic
S = 20 # number of filler words used by both topics

# Corpus.
L = 8 # sentence length
n_sentences = 50000 # number of sentences in the training set

# Model and evaluation.
embed_dim = 100 # CBOW embedding dimension
n_samples = 1000 # number of evaluation sentences per topic

In [52]:
# Experiment: every training sentence contains exactly one pair.
q0 = 0 # probability of having 0 pairs
q1 = 1 # probability of having 1 pair
q2 = 0 # probability of having 2 pairs

# q0 is not passed to get_acc_prob (only q1 and q2 are), so make sure the three
# values written above are consistent with each other.
assert abs(q0 + q1 + q2 - 1) < 1e-9, "q0 + q1 + q2 must equal 1"

n_runs = 10 # independent train/evaluate repetitions to average over

# Running averages over the runs: *_c = capitals, *_d = mascots.
accs_c = 0
probs_c = 0
accs_d = 0
probs_d = 0

for _ in range(n_runs):
    sentences, mlm_model, vocab_map, acc_c, acc_d \
        = get_acc_prob(K, M, S, L, q1, q2, embed_dim, n_sentences, n_samples)

    # per-run (accuracy, mean probability of the correct word)
    print(acc_c)
    print(acc_d)

    accs_c += acc_c[0]/n_runs
    probs_c += acc_c[1]/n_runs
    accs_d += acc_d[0]/n_runs
    probs_d += acc_d[1]/n_runs

print((accs_c, probs_c))
print((accs_d, probs_d))

(0.0, 1.154062e-21)
(0.0, 1.5228837e-21)
(0.0, 1.6053485e-23)
(0.0, 2.7045765e-23)
(0.0, 1.2031249e-21)
(0.0, 7.8614647e-22)
(0.0, 1.836189e-19)
(0.0, 1.8794081e-19)
(0.0, 1.3488517e-18)
(0.0, 1.8111322e-18)
(0.0, 4.572774e-19)
(0.0, 6.0283274e-19)
(0.0, 5.0528913e-20)
(0.0, 1.3067675e-19)
(0.0, 4.4711148e-21)
(0.0, 1.5663081e-21)
(0.0, 7.579977e-19)
(0.0, 6.1955756e-19)
(0.0, 8.985452e-20)
(0.0, 2.6620153e-19)
(0.0, 2.8949734619786257e-19)
(0.0, 3.6222439917697333e-19)


In [53]:
# Experiment: every training sentence contains exactly two pairs.
q0 = 0 # probability of having 0 pairs
q1 = 0 # probability of having 1 pair
q2 = 1 # probability of having 2 pairs

# q0 is not passed to get_acc_prob (only q1 and q2 are), so make sure the three
# values written above are consistent with each other.
assert abs(q0 + q1 + q2 - 1) < 1e-9, "q0 + q1 + q2 must equal 1"

n_runs = 10 # independent train/evaluate repetitions to average over

# Running averages over the runs: *_c = capitals, *_d = mascots.
accs_c = 0
probs_c = 0
accs_d = 0
probs_d = 0

for _ in range(n_runs):
    sentences, mlm_model, vocab_map, acc_c, acc_d \
        = get_acc_prob(K, M, S, L, q1, q2, embed_dim, n_sentences, n_samples)

    # per-run (accuracy, mean probability of the correct word)
    print(acc_c)
    print(acc_d)

    accs_c += acc_c[0]/n_runs
    probs_c += acc_c[1]/n_runs
    accs_d += acc_d[0]/n_runs
    probs_d += acc_d[1]/n_runs

print((accs_c, probs_c))
print((accs_d, probs_d))

(0.0, 0.0005164545)
(0.0, 0.0005576749)
(0.0, 0.00010011451)
(0.0, 4.280828e-05)
(0.0, 0.0007729619)
(0.0, 0.0005721296)
(0.0, 5.185382e-05)
(0.0, 8.634732e-05)
(0.0, 0.0001318318)
(0.0, 0.00014507488)
(0.0, 0.0006744081)
(0.0, 0.00072805147)
(0.0, 0.00024266353)
(0.0, 0.00025882182)
(0.0, 0.0004167617)
(0.0, 0.0005755397)
(0.0, 0.00025204572)
(0.0, 0.00024926328)
(0.0, 0.00070766866)
(0.0, 0.0010008372)
(0.0, 0.0003866764236590825)
(0.0, 0.0004216548477415927)


In [54]:
# Experiment: training sentences contain zero or one pair, equally likely.
q0 = 1/2 # probability of having 0 pairs
q1 = 1/2 # probability of having 1 pair
q2 = 0 # probability of having 2 pairs

# q0 is not passed to get_acc_prob (only q1 and q2 are), so make sure the three
# values written above are consistent with each other.
assert abs(q0 + q1 + q2 - 1) < 1e-9, "q0 + q1 + q2 must equal 1"

n_runs = 10 # independent train/evaluate repetitions to average over

# Running averages over the runs: *_c = capitals, *_d = mascots.
accs_c = 0
probs_c = 0
accs_d = 0
probs_d = 0

for _ in range(n_runs):
    sentences, mlm_model, vocab_map, acc_c, acc_d \
        = get_acc_prob(K, M, S, L, q1, q2, embed_dim, n_sentences, n_samples)

    # per-run (accuracy, mean probability of the correct word)
    print(acc_c)
    print(acc_d)

    accs_c += acc_c[0]/n_runs
    probs_c += acc_c[1]/n_runs
    accs_d += acc_d[0]/n_runs
    probs_d += acc_d[1]/n_runs

print((accs_c, probs_c))
print((accs_d, probs_d))

(0.984, 0.5869939)
(0.933, 0.605797)
(0.936, 0.5250668)
(0.841, 0.39346707)
(0.962, 0.53291523)
(0.933, 0.4464029)
(0.867, 0.3985117)
(0.972, 0.5339934)
(0.924, 0.5126887)
(0.993, 0.5726846)
(0.954, 0.5663357)
(0.944, 0.52554965)
(0.972, 0.55343086)
(0.992, 0.57759655)
(0.966, 0.5291147)
(0.942, 0.5192274)
(0.997, 0.60069335)
(0.994, 0.58791685)
(0.945, 0.49200365)
(0.979, 0.5829629)
(0.9507000000000001, 0.5297754555940628)
(0.9522999999999999, 0.534559828042984)


In [55]:
# Experiment: training sentences contain zero or two pairs, equally likely.
q0 = 1/2 # probability of having 0 pairs
q1 = 0 # probability of having 1 pair
q2 = 1/2 # probability of having 2 pairs

# q0 is not passed to get_acc_prob (only q1 and q2 are), so make sure the three
# values written above are consistent with each other.
assert abs(q0 + q1 + q2 - 1) < 1e-9, "q0 + q1 + q2 must equal 1"

n_runs = 10 # independent train/evaluate repetitions to average over

# Running averages over the runs: *_c = capitals, *_d = mascots.
accs_c = 0
probs_c = 0
accs_d = 0
probs_d = 0

for _ in range(n_runs):
    sentences, mlm_model, vocab_map, acc_c, acc_d \
        = get_acc_prob(K, M, S, L, q1, q2, embed_dim, n_sentences, n_samples)

    # per-run (accuracy, mean probability of the correct word)
    print(acc_c)
    print(acc_d)

    accs_c += acc_c[0]/n_runs
    probs_c += acc_c[1]/n_runs
    accs_d += acc_d[0]/n_runs
    probs_d += acc_d[1]/n_runs

print((accs_c, probs_c))
print((accs_d, probs_d))

(1.0, 0.99937916)
(1.0, 0.99932396)
(1.0, 0.9996558)
(1.0, 0.99964494)
(1.0, 0.9993679)
(1.0, 0.99938)
(1.0, 0.99984086)
(1.0, 0.9998229)
(1.0, 0.99985826)
(1.0, 0.99982053)
(1.0, 0.9992331)
(1.0, 0.9992539)
(1.0, 0.99937195)
(1.0, 0.999287)
(1.0, 0.9996326)
(1.0, 0.99965894)
(1.0, 0.99962205)
(1.0, 0.99967843)
(1.0, 0.99964976)
(1.0, 0.99965054)
(0.9999999999999999, 0.9995611429214478)
(0.9999999999999999, 0.9995521187782288)


In [56]:
# Experiment: training sentences contain one or two pairs, equally likely.
q0 = 0 # probability of having 0 pairs
q1 = 1/2 # probability of having 1 pair
q2 = 1/2 # probability of having 2 pairs

# q0 is not passed to get_acc_prob (only q1 and q2 are), so make sure the three
# values written above are consistent with each other.
assert abs(q0 + q1 + q2 - 1) < 1e-9, "q0 + q1 + q2 must equal 1"

n_runs = 10 # independent train/evaluate repetitions to average over

# Running averages over the runs: *_c = capitals, *_d = mascots.
accs_c = 0
probs_c = 0
accs_d = 0
probs_d = 0

for _ in range(n_runs):
    sentences, mlm_model, vocab_map, acc_c, acc_d \
        = get_acc_prob(K, M, S, L, q1, q2, embed_dim, n_sentences, n_samples)

    # per-run (accuracy, mean probability of the correct word)
    print(acc_c)
    print(acc_d)

    accs_c += acc_c[0]/n_runs
    probs_c += acc_c[1]/n_runs
    accs_d += acc_d[0]/n_runs
    probs_d += acc_d[1]/n_runs

print((accs_c, probs_c))
print((accs_d, probs_d))

(1.0, 0.9943809)
(1.0, 0.9929509)
(1.0, 0.9942)
(1.0, 0.9915437)
(1.0, 0.99607325)
(1.0, 0.99680614)
(1.0, 0.9934083)
(1.0, 0.9938324)
(1.0, 0.9942946)
(1.0, 0.9928545)
(1.0, 0.9953002)
(1.0, 0.9967576)
(1.0, 0.9888664)
(1.0, 0.98785675)
(1.0, 0.99288917)
(1.0, 0.9929553)
(1.0, 0.9941575)
(1.0, 0.99408484)
(1.0, 0.9931708)
(1.0, 0.9944836)
(0.9999999999999999, 0.9936741054058075)
(0.9999999999999999, 0.9934125781059265)


In [38]:
# Experiment: zero, one and two pairs are all equally likely.
q0 = 1/3 # probability of having 0 pairs
q1 = 1/3 # probability of having 1 pair
q2 = 1/3 # probability of having 2 pairs

# q0 is not passed to get_acc_prob (only q1 and q2 are), so make sure the three
# values written above are consistent with each other.
assert abs(q0 + q1 + q2 - 1) < 1e-9, "q0 + q1 + q2 must equal 1"

n_runs = 10 # independent train/evaluate repetitions to average over

# Running averages over the runs: *_c = capitals, *_d = mascots.
accs_c = 0
probs_c = 0
accs_d = 0
probs_d = 0

for _ in range(n_runs):
    sentences, mlm_model, vocab_map, acc_c, acc_d \
        = get_acc_prob(K, M, S, L, q1, q2, embed_dim, n_sentences, n_samples)

    # per-run (accuracy, mean probability of the correct word)
    print(acc_c)
    print(acc_d)

    accs_c += acc_c[0]/n_runs
    probs_c += acc_c[1]/n_runs
    accs_d += acc_d[0]/n_runs
    probs_d += acc_d[1]/n_runs

print((accs_c, probs_c))
print((accs_d, probs_d))

(1.0, 0.99928176)
(1.0, 0.9992274)
(1.0, 0.99957615)
(1.0, 0.9996253)
(1.0, 0.9991695)
(1.0, 0.99918544)
(1.0, 0.9984966)
(1.0, 0.9984711)
(1.0, 0.99848205)
(1.0, 0.9985639)
(1.0, 0.99900866)
(1.0, 0.99916685)
(1.0, 0.99837416)
(1.0, 0.99819994)
(1.0, 0.99964756)
(1.0, 0.99962205)
(1.0, 0.99832726)
(1.0, 0.9984043)
(1.0, 0.99917585)
(1.0, 0.9992494)
(0.9999999999999999, 0.9989539563655854)
(0.9999999999999999, 0.9989715695381165)
