<a href="https://colab.research.google.com/github/ua-datalab/QNLP/blob/megh_dev/OOV_MRPC_paraphrase_task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import os

# Pin CUDA_VISIBLE_DEVICES to "-1" for this process.
# NOTE(review): presumably this hides all GPUs so the notebook runs on CPU only — confirm.
os.environ.update({"CUDA_VISIBLE_DEVICES": "-1"})

In [None]:
from psutil import virtual_memory

# Convert the `total` field reported by psutil from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

# Runtimes below 20 GB are reported as not being high-RAM.
print('Not using a high-RAM runtime' if ram_gb < 20 else 'You are using a high-RAM runtime!')

In [None]:
from lambeq.text2diagram.tree_reader import BobcatParser
import lambeq

# Create the Bobcat parser instance; `parser` is what later cells call to turn sentences into diagrams.
# NOTE(review): an identical cell appears again further down the notebook — one copy is probably redundant.
parser= BobcatParser()


In [8]:

# Dimensionality constant: it selects the `cc.en.{MAXPARAMS}.bin` fastText file in the
# commented-out loader and is referenced when sizing the OOV network.
MAXPARAMS = 108

In [None]:
import numpy as np

from tensorflow import keras
from tensorflow.keras import layers

import matplotlib.pyplot as plt


In [9]:
def loss(y_hat, y, eps=1e-9):
    """Cross-entropy between predicted probabilities and one-hot labels, averaged over samples.

    Args:
        y_hat: array-like of predicted probabilities, same shape as ``y``.
        y: array-like of one-hot labels (one row per sample).
        eps: lower bound applied to ``y_hat`` before taking the logarithm.

    Returns:
        ``-sum(y * log(y_hat)) / len(y)`` as a scalar.
    """
    # A prediction of exactly 0 would give log(0) = -inf, and 0 * -inf = NaN for the
    # non-target class; flooring at eps keeps the loss finite without changing it
    # for any probability >= eps.
    y_hat = np.maximum(np.asarray(y_hat), eps)
    return -np.sum(y * np.log(y_hat)) / len(y)


def acc(y_hat, y):
    """Fraction of samples whose rounded prediction matches the one-hot label.

    Matches are counted element-wise over both entries of every row, hence the
    final division by 2 (each sample would otherwise be counted twice).
    """
    return np.sum(np.round(y_hat) == y) / len(y) / 2


eval_metrics = {"acc": acc}

def generate_initial_parameterisation(train_circuits, test_circuits, embedding_model, qnlp_model):
    """Initialise the QNLP model's weights from word embeddings and print vocabulary statistics.

    Every free symbol is named ``<word>_<index>``; the part before the last
    underscore identifies the (typed) word, the part after it is the position of
    the parameter within that word. The text before the first ``__`` of the word
    is the key looked up in ``embedding_model``.

    Args:
        train_circuits: circuits exposing ``free_symbols`` (training split).
        test_circuits: circuits exposing ``free_symbols`` (test split).
        embedding_model: object indexable by word, returning an indexable vector.
        qnlp_model: model exposing ``symbols``; its ``weights`` attribute is overwritten.

    Returns:
        ``(train_vocab_embeddings, test_vocab_embeddings, max_word_param_length)``
        where the two dicts map each word to its embedding and the last item is
        the largest parameter index seen in either split, plus one.
    """
    def names_of(circuits):
        # Flatten all free-symbol names, keeping the circuits' iteration order.
        return [symb.name for d in circuits for symb in d.free_symbols]

    def highest_index(names):
        return max(int(name.rsplit('_', 1)[1]) for name in names)

    def embed(vocab):
        lookup = {}
        for wrd in vocab:
            lookup[wrd] = embedding_model[wrd.split('__')[0]]
        return lookup

    train_names = names_of(train_circuits)
    test_names = names_of(test_circuits)

    # The same surface word may occur with several types; each typed variant is its own entry.
    train_vocab = {name.rsplit('_', 1)[0] for name in train_names}
    test_vocab = {name.rsplit('_', 1)[0] for name in test_names}

    print(len(test_vocab.union(train_vocab)), len(train_vocab), len(test_vocab))
    print(f'OOV word count: {len(test_vocab - train_vocab)} / {len(test_vocab)}')

    test_symbol_names = set(test_names)
    n_oov_symbs = len(test_symbol_names - set(train_names))
    print(f'OOV symbol count: {n_oov_symbs} / {len(test_symbol_names)}')

    max_word_param_length = max(highest_index(train_names), highest_index(test_names)) + 1
    print(f'Max params/word: {max_word_param_length}')

    train_vocab_embeddings = embed(train_vocab)
    test_vocab_embeddings = embed(test_vocab)

    # One weight per model symbol: the embedding component at that symbol's index.
    split_names = (sym.name.rsplit('_', 1) for sym in qnlp_model.symbols)
    qnlp_model.weights = np.array(
        [train_vocab_embeddings[wrd][int(idx)] for wrd, idx in split_names]
    )

    return train_vocab_embeddings, test_vocab_embeddings, max_word_param_length


def generate_OOV_parameterising_model(trained_qnlp_model, train_vocab_embeddings, max_word_param_length):
    """Fit a small dense network mapping word embeddings to trained circuit parameters.

    For every training word the target is a vector of length
    ``max_word_param_length`` holding the trained weight of each of the word's
    symbols at that symbol's index (positions without a symbol stay 0).

    Args:
        trained_qnlp_model: model exposing parallel ``symbols`` and ``weights``.
        train_vocab_embeddings: dict mapping each training word to its embedding.
        max_word_param_length: length of the per-word target vector.

    Returns:
        The fitted Keras model.

    Raises:
        ValueError: if ``train_vocab_embeddings`` is empty.
    """
    trained_params_raw = dict(zip(trained_qnlp_model.symbols, trained_qnlp_model.weights))
    trained_param_vectors = {wrd: np.zeros(max_word_param_length) for wrd in train_vocab_embeddings}

    for symbol, train_val in trained_params_raw.items():
        wrd, idx = symbol.name.rsplit('_', 1)
        trained_param_vectors[wrd][int(idx)] = train_val

    # Fix one word order so that rows of X and Y line up.
    wrds_in_order = list(train_vocab_embeddings.keys())
    if not wrds_in_order:
        raise ValueError('Cannot train the OOV model: the training vocabulary is empty.')

    NN_train_X = np.array([train_vocab_embeddings[wrd] for wrd in wrds_in_order])
    NN_train_Y = np.array([trained_param_vectors[wrd] for wrd in wrds_in_order])

    print(NN_train_X[0][:5])
    print(NN_train_Y[0][:5])

    OOV_NN_model = keras.Sequential([
      layers.Dense(int((max_word_param_length + MAXPARAMS) / 2), activation='tanh'),
      layers.Dense(max_word_param_length, activation='tanh'),
    ])

    OOV_NN_model.compile(loss='mean_absolute_error', optimizer=keras.optimizers.Adam(0.001))

    # The input width must equal the embedding dimension. Take it from the data
    # rather than from MAXPARAMS so that swapping the embedding model (whose
    # vectors may have a different width) does not break `fit`.
    embedding_dim = NN_train_X.shape[1]
    OOV_NN_model.build(input_shape=(None, embedding_dim))

    hist = OOV_NN_model.fit(NN_train_X, NN_train_Y, validation_split=0.2, verbose=0, epochs=120)

    print(f'OOV NN model final epoch loss: {(hist.history["loss"][-1], hist.history["val_loss"][-1])}')

    plt.plot(hist.history['loss'], label='loss')
    plt.plot(hist.history['val_loss'], label='val_loss')
    plt.xlabel('Epoch')
    plt.ylabel('Error')
    plt.legend()
    plt.show()

    return OOV_NN_model


def evaluate_test_set(pred_model, test_circuits, test_labels, trained_params, test_vocab_embeddings, max_word_param_length, OOV_strategy='random', OOV_model=None):
    """Score the test circuits after filling in parameters for out-of-vocabulary words.

    Words present in ``trained_params`` always use their trained parameter
    vector. Any other word gets a vector chosen by ``OOV_strategy``:

    * ``'model'``  - prediction of ``OOV_model`` for the word's embedding;
    * ``'embed'``  - the embedding itself;
    * ``'zeros'``  - a zero vector of length ``max_word_param_length``;
    * anything else - uniform random values in [-1, 1).

    Args:
        pred_model: model exposing ``symbols``, a writable ``weights`` and
            ``get_diagram_output``.
        test_circuits: circuits to evaluate.
        test_labels: labels passed to ``loss`` / ``acc``.
        trained_params: dict word -> trained parameter vector.
        test_vocab_embeddings: dict word -> embedding for every test word.
        max_word_param_length: length of generated fallback vectors.
        OOV_strategy: see above.
        OOV_model: predictor used by the ``'model'`` strategy.

    Returns:
        ``(loss, accuracy)`` on the test set.

    Raises:
        ValueError: if ``OOV_strategy='model'`` needs a prediction but
            ``OOV_model`` is ``None``.
    """
    pred_parameter_map = {}

    if OOV_strategy == 'model':
        # Only genuinely unseen words need a network prediction; run them as a
        # single batch instead of one predict() call per test word.
        oov_words = [wrd for wrd in test_vocab_embeddings if wrd not in trained_params]
        if oov_words:
            if OOV_model is None:
                raise ValueError("OOV_strategy='model' requires an OOV_model.")
            oov_inputs = np.array([test_vocab_embeddings[wrd] for wrd in oov_words])
            oov_outputs = OOV_model.predict(oov_inputs, verbose=0)
            pred_parameter_map.update(zip(oov_words, oov_outputs))
        for wrd in test_vocab_embeddings:
            if wrd in trained_params:
                pred_parameter_map[wrd] = trained_params[wrd]
    else:
        for wrd, embedding in test_vocab_embeddings.items():
            if OOV_strategy == 'embed':
                fallback = embedding
            elif OOV_strategy == 'zeros':
                fallback = np.zeros(max_word_param_length)
            else:
                # Drawn for every word (even known ones) so the random stream
                # consumed per call is the same as it has always been.
                fallback = 2 * np.random.rand(max_word_param_length) - 1
            pred_parameter_map[wrd] = trained_params.get(wrd, fallback)

    pred_weight_vector = []

    for sym in pred_model.symbols:
        wrd, idx = sym.name.rsplit('_', 1)
        pred_weight_vector.append(pred_parameter_map[wrd][int(idx)])

    # Assigned as an ndarray, matching how the training model's weights are set.
    pred_model.weights = np.array(pred_weight_vector)

    preds = pred_model.get_diagram_output(test_circuits)

    return loss(preds, test_labels), acc(preds, test_labels)


def trained_params_from_model(trained_qnlp_model, train_embeddings, max_word_param_length):
    """Collect the trained weights of a model into one parameter vector per word.

    Args:
        trained_qnlp_model: model exposing parallel ``symbols`` and ``weights``;
            each symbol is named ``<word>_<index>``.
        train_embeddings: mapping whose keys are the training words.
        max_word_param_length: length of every returned vector.

    Returns:
        dict mapping each word to a NumPy vector in which position ``index``
        holds the trained weight of symbol ``<word>_<index>`` (0 elsewhere).
    """
    weight_by_symbol = dict(zip(trained_qnlp_model.symbols, trained_qnlp_model.weights))

    per_word_vectors = {}
    for wrd in train_embeddings:
        per_word_vectors[wrd] = np.zeros(max_word_param_length)

    for symbol in weight_by_symbol:
        wrd, idx = symbol.name.rsplit('_', 1)
        per_word_vectors[wrd][int(idx)] = weight_by_symbol[symbol]

    return per_word_vectors

In [18]:
#check if my data files are in the same format as original files
import string

# Translation table that deletes every ASCII punctuation character; built once and reused.
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)


def _read_labelled_sentences(path):
    """Read a file of ``<label><two spaces><sentence>`` lines.

    Blank lines are skipped. Returns ``(sentences, labels)`` with integer labels.
    Raises ``ValueError`` (with the line number) for a line lacking the separator.
    """
    sentences, labels = [], []
    with open(path, encoding='utf-8-sig') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue  # tolerate empty / trailing blank lines
            procd_line = line.split('  ')
            if len(procd_line) < 2:
                raise ValueError(f'{path}:{line_no}: expected "<label>  <sentence>", got {line!r}')
            sentences.append(procd_line[1])
            labels.append(int(procd_line[0]))
    return sentences, labels


def _filter_and_encode(sentences, labels, max_len):
    """Keep sentences of at most ``max_len`` space-separated tokens.

    Returns ``(clean_sentences, one_hot_labels)``: punctuation is stripped from
    the kept sentences and each label ``k`` becomes a 2-element one-hot list.
    The length test is applied before punctuation is removed.
    """
    kept_sentences, one_hot_labels = [], []
    for label, s in zip(labels, sentences):
        if len(s.split(' ')) <= max_len:
            kept_sentences.append(s.translate(_STRIP_PUNCTUATION))
            this_y = [0, 0]
            this_y[label] = 1
            one_hot_labels.append(this_y)
    return kept_sentences, one_hot_labels


train_X, train_y = _read_labelled_sentences("./uspantan_train.txt")
test_X, test_y = _read_labelled_sentences("./uspantan_test.txt")


MAXLEN = 10


filt_train_X, filt_train_y = _filter_and_encode(train_X, train_y, MAXLEN)
filt_test_X, filt_test_y = _filter_and_encode(test_X, test_y, MAXLEN)

# Number of sentences that survived the length filter in each split.
ctr_train = len(filt_train_X)
ctr_test = len(filt_test_X)

print(ctr_train, ctr_test)

14 2


In [None]:
# !pip install fasttext
# import fasttext as ft

# # We don't need this model; we just need to find where the embeddings are used and substitute the Spanish embeddings there.
# embedding_model = ft.load_model(f'./dataset/cc.en.{MAXPARAMS}.bin')

In [27]:
!wget https://zenodo.org/record/3234051/files/embeddings-l-model.bin?download=1

--2024-06-21 23:21:10--  https://zenodo.org/record/3234051/files/embeddings-l-model.bin?download=1
Resolving zenodo.org (zenodo.org)... 188.184.103.159, 188.184.98.238, 188.185.79.172, ...
Connecting to zenodo.org (zenodo.org)|188.184.103.159|:443... connected.
HTTP request sent, awaiting response... 301 MOVED PERMANENTLY
Location: /records/3234051/files/embeddings-l-model.bin [following]
--2024-06-21 23:21:11--  https://zenodo.org/records/3234051/files/embeddings-l-model.bin
Reusing existing connection to zenodo.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 5576446827 (5.2G) [application/octet-stream]
Saving to: ‘embeddings-l-model.bin?download=1’


2024-06-21 23:26:19 (17.3 MB/s) - ‘embeddings-l-model.bin?download=1’ saved [5576446827/5576446827]



In [34]:
# !pip install fasttext
import fasttext as ft
# Load the pre-trained fastText binary; later cells index `embedding_model[word]` to obtain word vectors.
# NOTE(review): absolute Colab-style path — the file must exist under exactly this name;
# confirm that the download step produces it.
embedding_model = ft.load_model('/content/embeddings-l-model.bin')

In [28]:
# !pip install lambeq
from lambeq.text2diagram.tree_reader import BobcatParser
import lambeq

# Create the Bobcat parser used by the following cell to parse the filtered sentences.
# NOTE(review): this repeats an identical cell near the top of the notebook — keep only one.
parser= BobcatParser()

Downloading model: 0.0%|          |0.000/1.533GB [00:00<?]

Evaluating checksum: 0.0%|          |0.000/1.533GB [00:00<?]

Extracting model...


In [29]:
# Parse the length-filtered, punctuation-free sentences into diagrams.
# NOTE(review): with suppress_exceptions=False a failed parse presumably raises instead of
# yielding None, which would make the later `is not None` filters no-ops — confirm in the lambeq docs.
train_diags = parser.sentences2diagrams(filt_train_X, suppress_exceptions=False)

test_diags = parser.sentences2diagrams(filt_test_X, suppress_exceptions=False)


Tagging sentences:   0%|          | 0/4 [00:00<?, ?it/s]

Parsing tagged sentences:   0%|          | 0/14 [00:00<?, ?it/s]

Parse trees to diagrams:   0%|          | 0/14 [00:00<?, ?it/s]

Tagging sentences:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing tagged sentences:   0%|          | 0/2 [00:00<?, ?it/s]

Parse trees to diagrams:   0%|          | 0/2 [00:00<?, ?it/s]

In [32]:
from collections import Counter

# Khatri et al. build one circuit from a pair of phrases and drop pairs whose two
# phrases do not parse to the same type. Sentences are not joined here, so that
# pairing step is not needed; the original lines are kept for reference:
# joint_diagrams_train = [d1 @ d2.r if d1.cod == d2.cod else None for d1 in train_diags]
# joint_diagrams_test = [d1 @ d2.r if d1.cod == d2.cod else None for d1 in test_diags]

# Drop sentences that produced no diagram, keeping the labels aligned with the survivors.
train_diags_raw = list(filter(lambda diag: diag is not None, train_diags))
train_y = np.array([label for diag, label in zip(train_diags, filt_train_y) if diag is not None])

test_diags_raw = list(filter(lambda diag: diag is not None, test_diags))
test_y = np.array([label for diag, label in zip(test_diags, filt_test_y) if diag is not None])

# Summarise split sizes and the class balance (labels are one-hot rows).
print("FINAL DATASET SIZE:")
print("-----------------------------------")
print(f"Training: {len(train_diags_raw)} {Counter(map(tuple, train_y))}")
print(f"Testing : {len(test_diags_raw)} {Counter(map(tuple, test_y))}")

FINAL DATASET SIZE:
-----------------------------------
Training: 14 Counter({(1, 0): 9, (0, 1): 5})
Testing : 2 Counter({(1, 0): 1, (0, 1): 1})


In [40]:
from tqdm import tqdm
from lambeq import Rewriter, RemoveCupsRewriter

# Rewriter applied to every parsed diagram before it is normalised.
rewriter = RemoveCupsRewriter()
# rewriter = Rewriter(['prepositional_phrase', 'determiner', 'coordination', 'connector', 'prepositional_phrase'])

# From here on train_X / test_X hold rewritten diagrams rather than raw sentences.
train_X = []
test_X = []

for d in tqdm(train_diags):
    if d is None:
        print("found d is null")
        continue
    train_X.append(rewriter(d).normal_form())

for d in tqdm(test_diags):
    if d is None:
        continue
    test_X.append(rewriter(d).normal_form())

100%|██████████| 14/14 [00:00<00:00, 76.64it/s]
100%|██████████| 2/2 [00:00<00:00, 73.49it/s]


In [41]:
# from discopy.quantum.gates import CX, Rx, H, Bra, Id

# equality_comparator = (CX >> (H @ Rx(0.5)) >> (Bra(0) @ Id(1)))
# equality_comparator.draw()

In [46]:
from lambeq import AtomicType, IQPAnsatz, Sim14Ansatz, Sim15Ansatz
from lambeq import TketModel, NumpyModel, QuantumTrainer, SPSAOptimizer, Dataset
import time
import json

# Default seed for run_experiment.
SEED = 0
# Number of training epochs given to the QuantumTrainer; also scales the SPSA 'A' hyperparameter.
EPOCHS = 1000
# Batch size of the training Dataset.
BATCH_SIZE = 30

# Atomic grammatical types; used as the keys of the ansatz's type -> qubit-count mapping.
N = AtomicType.NOUN
S = AtomicType.SENTENCE
P = AtomicType.PREPOSITIONAL_PHRASE


def run_experiment(nlayers=1, seed=SEED, n_random_trials=1000):
    """Train the QNLP model once and compare strategies for out-of-vocabulary words.

    Steps: build circuits with a Sim15 ansatz, initialise the model weights from
    word embeddings, train with SPSA, fit the embedding -> parameter network, and
    evaluate the test set with NN-predicted, embedding, zero and random
    parameters for unseen words.

    NOTE(review): the recorded run of this notebook aborts during training with
    "ValueError: All input arrays must have the same shape." A likely cause is
    that the diagrams in train_X / test_X do not all have the same output type,
    so their circuits return arrays of different shapes — verify the codomains.

    Args:
        nlayers: number of ansatz layers.
        seed: seed passed to the trainer and to ``np.random.seed``.
        n_random_trials: how many times the random-parameter baseline is evaluated.

    Returns:
        dict with keys ``'TRAIN'``, ``'NN'``, ``'EMBED'``, ``'ZERO'`` (each a
        ``(loss, acc)`` pair) and ``'RAND'`` (a pair of lists, one entry per trial).
    """
    print(f'RUNNING WITH {nlayers} layers')
    ansatz = Sim15Ansatz({N: 1, S: 1, P:1}, n_layers=nlayers, n_single_qubit_params=3)

    train_circs = [ansatz(d) for d in train_X]
    test_circs = [ansatz(d) for d in test_X]

    lmbq_model = NumpyModel.from_diagrams(train_circs, use_jit=True)

    trainer = QuantumTrainer(
        lmbq_model,
        loss_function=loss,
        epochs=EPOCHS,
        optimizer=SPSAOptimizer,
        optim_hyperparams={'a': 0.05, 'c': 0.06, 'A':0.01*EPOCHS},
        evaluate_functions=eval_metrics,
        evaluate_on_train=True,
        verbose = 'text',
        seed=seed
    )

    train_dataset = Dataset(
                train_circs,
                train_y,
                batch_size=BATCH_SIZE)

    np.random.seed(seed)

    # Overwrites the model's weights with embedding-derived values before training.
    train_embeddings, test_embeddings, max_w_param_length = generate_initial_parameterisation(train_circs, test_circs, embedding_model, lmbq_model)

    print('BEGINNING QNLP MODEL TRAINING')
    trainer.fit(train_dataset)

    train_preds = lmbq_model.get_diagram_output(train_circs)
    train_loss = loss(train_preds, train_y)
    train_acc = acc(train_preds, train_y)
    print(f'TRAIN STATS: {train_loss, train_acc}')

    print('BEGINNING DNN MODEL TRAINING')
    NN_model = generate_OOV_parameterising_model(lmbq_model, train_embeddings, max_w_param_length)

    # A separate model over the test circuits; its weights are reassigned by every evaluation.
    prediction_model = NumpyModel.from_diagrams(test_circs, use_jit=True)

    trained_wts = trained_params_from_model(lmbq_model, train_embeddings, max_w_param_length)

    def evaluate(strategy, oov_model=None):
        # All evaluations share every argument except the OOV strategy (and its model).
        return evaluate_test_set(prediction_model,
                                 test_circs,
                                 test_y,
                                 trained_wts,
                                 test_embeddings,
                                 max_w_param_length,
                                 OOV_strategy=strategy,
                                 OOV_model=oov_model)

    print('Evaluating SMART MODEL')
    smart_loss, smart_acc = evaluate('model', NN_model)

    print('Evaluating EMBED model')
    embed_loss, embed_acc = evaluate('embed')

    print('Evaluating ZEROS model')
    zero_loss, zero_acc = evaluate('zeros')

    rand_losses = []
    rand_accs = []

    print('Evaluating RAND MODEL')
    for _ in range(n_random_trials):
        rl, ra = evaluate('random')
        rand_losses.append(rl)
        rand_accs.append(ra)

    res =  {'TRAIN': (train_loss, train_acc),
            'NN': (smart_loss, smart_acc),
            'EMBED': (embed_loss, embed_acc),
            'RAND': (rand_losses, rand_accs),
            'ZERO': (zero_loss, zero_acc)
           }
    print(f'ZERO: {res["ZERO"]}')
    print(f'EMBED: {res["EMBED"]}')
    print(f'NN: {res["NN"]}')

    return res


In [47]:
import tensorflow as tf

# Results keyed by seed; each value lists the run_experiment outputs for 3, 2 and 1 layers (in that order).
compr_results = {}

tf_seeds = [0, 1, 2]

for tf_seed in tf_seeds:
    # TensorFlow is seeded once per seed, before the three layer configurations run.
    tf.random.set_seed(tf_seed)
    this_seed_results = [run_experiment(nl, tf_seed) for nl in (3, 2, 1)]
    compr_results[tf_seed] = this_seed_results

RUNNING WITH 3 layers
76 65 12
OOV word count: 11 / 12
OOV symbol count: 111 / 123
Max params/word: 30
BEGINNING QNLP MODEL TRAINING


ValueError: All input arrays must have the same shape.

In [None]:
import json
import os

bkup = compr_results

results_path = './results/MSR_OOV_S15.json'
# The output directory is not created anywhere else in the notebook; make sure it exists.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

with open(results_path, 'w') as f:
    # default=float converts NumPy scalar types that json cannot serialise natively.
    json.dump(bkup, f, default=float)