In [1]:
from joblib import Memory

memory = Memory(".cache", verbose=0)

In [2]:
from transformers import BertTokenizer, BertModel
import torch
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

@memory.cache
def get_tokenizer_model():
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    bert_model = BertModel.from_pretrained('bert-base-cased')
    sbert_model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

    return bert_tokenizer, bert_model, sbert_model

bert_tokenizer, bert_model, sbert_model = get_tokenizer_model()

In [3]:
from pathlib import Path
import itertools
from natsort import natsorted

@memory.cache
def read_problem_files(problem_folder, n=None):
    problems = []
    files = itertools.islice(natsorted(Path(problem_folder).glob('problem-*.txt')), n)
    for problem_file in files:
        # number = problem_file.name[len("problem-") : -len(".txt")]
        with open(problem_file, 'r', encoding="utf8") as fh:
            problems.append(fh.readlines())
    return problems

In [4]:
from evaluation.evaluator import read_ground_truth_files

@memory.cache
def cached_read_ground_truth(x):
    return read_ground_truth_files(x)

# ground_truth = cached_read_ground_truth("pan21/train")

In [5]:
def get_embeddings(paragraph):
    # Tokenize the sentence
    tokens = bert_tokenizer.tokenize(paragraph)
    token_ids = bert_tokenizer.convert_tokens_to_ids(tokens)

    # Convert token IDs to tensor
    token_tensor = torch.tensor([token_ids])
    segment_ids = [0] * len(token_ids)
    segment_tensor = torch.tensor([segment_ids])

    # Get BERT model output
    with torch.no_grad():
        outputs = bert_model(token_tensor, segment_tensor)

    # Extract word embeddings from BERT model output
    return outputs[0]

In [6]:
# problems = read_problem_files("pan21/train", n=5)
# problems[0]

In [7]:
#@memory.cache
# def get_max_length(n=None):
#     max_train = max([len(x) for y in read_problem_files("pan21/train", n=n) for x in y])
#     max_val = max([len(x) for y in read_problem_files("pan21/validation", n=n) for x in y])
#     return max([max_train, max_val])

In [8]:
import torch
import numpy as np

max_input_length = 512

def pad_paragraph(paragraph_embedding, desired_length):
    d1, d2, d3 = paragraph_embedding.shape
    # print(f"{paragraph_embedding.shape=}")

    target = torch.zeros(d1, desired_length, d3)
    # print(f"{target.shape=}")
    target[:, :d2, :] = paragraph_embedding

    return target


def get_paragraph_pairs(problem_text):
    # print(problem_text)
    paragraph_embeddings = [pad_paragraph(get_embeddings(para[:512]), max_input_length) for para in problem_text]
    # print(f"{[paras.shape for paras in paragraph_embeddings]=}")
    # print(f"{[x.shape for x in paragraph_embeddings]}")
    pairs = itertools.combinations(paragraph_embeddings, 2)
    return [torch.flatten(torch.stack(pair, dim=2), start_dim=1, end_dim=2) for pair in pairs]


In [9]:
from tqdm.notebook import tqdm

# #@memory.cache
def get_problem_embeddings(problems, verbose=False):
    if verbose:
        return [get_paragraph_pairs(problem_text) for problem_text in tqdm(problems)]
    else:
        return [get_paragraph_pairs(problem_text) for problem_text in problems]

In [10]:
# problems_embed = get_problem_embeddings(problems)
# for i in range(5):
#     print(problems_embed[i][0][0].shape)
# print(f"{problems_embed[0][0].shape=}")

In [11]:
def get_simple_ground_truth(ground_truth, problem_numbers):
    simple_ground_truth = []
    for num in problem_numbers:
        task_3_ground_truth = ground_truth[f"problem-{num}"]["paragraph-authors"]
        simple_ground_truth.append(task_3_ground_truth)
    return simple_ground_truth


# simple_ground_truth = get_simple_ground_truth(ground_truth, range(1, len(problems_embed)))

In [12]:
def get_task_3_ground_truth(simple_ground_truth):
    task_gt = []
    for problem in simple_ground_truth:
        problem_gt = []
        for author1, author2 in itertools.combinations(problem, 2):
            problem_gt.append(int(author1 != author2))
        task_gt.append(problem_gt)
    return task_gt

# task_3_ground_truth = get_task_3_ground_truth(simple_ground_truth)

In [13]:
# TODO: Invert the function get_task_3_ground_truth. Our model will output a bunch of binary labels which need to be converted to the task 3 ground truth format
# Ground truth format (gtf): [1, 2, 2, 2, 2, 3, 2, 2]
# Binary labels for comparisons (bl): [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0]
# Each binary label is the result of comparing two paragraphs. 1 means there was an author change, 0 means there was no author change
# For example, bl[0], is the result of comparing gtf[0]=1 and gtf[1]=2. 1 != 2, therefore bl[0] = 1. bl[1]=1 is the result of gtf[0] == gtf[2] (1 == 2)
# def get_simple_ground_truth_from_task_3(task_3_ground_truth):
#     pass

# simple_ground_truth == get_simple_ground_truth_from_task_3(task_3_ground_truth)

In [14]:
# for s, t in zip(simple_ground_truth, task_3_ground_truth):
#     print(f"{s}")
#     print(f"{t}")

In [15]:
# Add code to write out the embeddings (X) and ground truths (y) so we can train without having to rerun the preprocessing step

In [16]:
import math
from tensorflow.keras.utils import PyDataset

def flatten_problems(problems_list, squeeze=False):
        # [print(f"{pair=}") for problem in problems_list for pair in problem]
        return [pair.squeeze(0) if squeeze else pair for problem in problems_list for pair in problem]

class Pan21PyDataset(PyDataset):

    def __init__(self, x_set, y_set, batch_size, **kwargs):
        super().__init__(**kwargs)
        self.x, self.y = read_problem_files(x_set), read_ground_truth_files(y_set)
        self.batch_size = batch_size

    def __len__(self):
        # Return number of batches.
        return math.ceil(len(self.x) / self.batch_size)

    def get_data(self, low_idx, high_idx):
        # print(f"{len(self.x[low_idx])=} {self.x[low_idx]=}")
        embeddings = get_problem_embeddings(self.x[low_idx:high_idx])
        batch_x = np.array(flatten_problems(embeddings, squeeze=True))
        batch_y = np.array(flatten_problems(get_task_3_ground_truth(get_simple_ground_truth(self.y, range(low_idx+1, high_idx+1)))))
        return batch_x, batch_y

    def __getitem__(self, idx):
        # print(f"{idx=}")
        # Return x, y for batch idx.
        low = idx * self.batch_size
        # Cap upper bound at array length; the last batch may be smaller
        # if the total number of items is not a multiple of batch size.
        high = min(low + self.batch_size, len(self.x))
        batch_x, batch_y = self.get_data(low, high)

        # print(f"{low=} {high=}")
        # print(f"{batch_x[0].shape=}")
        # print(f"{len(batch_x)=}")
        # print(f"{len(batch_y)=}")

        return batch_x, batch_y

batch_size = 32
seed = 42

train_ds = Pan21PyDataset("pan21/train", "pan21/train", batch_size)
val_ds = Pan21PyDataset("pan21/validation", "pan21/validation", batch_size)


In [18]:
# import numpy as np

# def get_data(data_path, num_problems):
#     x = np.array(flatten_problems(get_problem_embeddings(read_problem_files(data_path, n=num_problems), verbose=True), squeeze=True))
#     y = np.array(flatten_problems(get_task_3_ground_truth(get_simple_ground_truth(cached_read_ground_truth(data_path), range(1, num_problems+1)))))
#     return x, y

# num_problems_train, num_problems_val = 500, 150
# # num_problems_train, num_problems_val = None, None
# x_train, y_train = get_data("pan21/train", num_problems_train)
# x_val, y_val = get_data("pan21/validation", num_problems_val)

In [None]:
print(f"Num training examples: {len(x_train)=} {len(y_train)=}")
print(f"Num training examples: {len(x_val)=} {len(y_val)=}")

In [19]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, Input, Flatten
 
# Code implementation of the RNN for sequence labeling
def create_rnn_model(vocab_size, num_labels, embedding_dim, lstm_units):
    model = Sequential()
    # model.add(Embedding(vocab_size, embedding_dim))
    model.add(Input(shape=(max_input_length*2, embedding_dim)))
    # model.add(Input(shape=(embedding_dim,)))
    # model.add(LSTM(lstm_units, return_sequences=True))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Flatten())
    model.add(Dense(num_labels, activation='sigmoid'))
    return model
 
# Example usage of the RNN model
vocab_size = 30000  # Replace with the actual size of the vocabulary
num_labels = 1  # Replace with the actual number of entity labels
embedding_dim = 768
lstm_units = 64
max_sequence_length = 35

# max_input_length = 512*2
 
model = create_rnn_model(vocab_size, num_labels, embedding_dim, lstm_units)
# model.build((1, max_input_length))
model.build()
model.summary()

In [20]:
from tensorflow.keras import optimizers, losses, metrics


model.compile(
    optimizer=optimizers.RMSprop(),  # Optimizer
    # Loss function to minimize
    loss=losses.BinaryCrossentropy(),
    # List of metrics to monitor
    metrics=[metrics.BinaryAccuracy(), metrics.AUC()],
)


In [21]:
# history = model.fit(
#     x_train,
#     y_train,
#     batch_size=50,
#     epochs=3,
#     # We pass some validation for
#     # monitoring validation loss and metrics
#     # at the end of each epoch
#     validation_data=(x_val, y_val),
# )
history = model.fit(
    train_ds,
    epochs=1,
    # We pass some validation for
    # monitoring validation loss and metrics
    # at the end of each epoch
    validation_data=val_ds,
    verbose=1,
)

import datetime

model.save(f"models/baseline_{datetime.datetime.now().strftime('%Y_%m_%d-%I_%M_%S_%p')}.keras")

[1m350/350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8152s[0m 23s/step - binary_accuracy: 0.5598 - loss: 0.7017 - val_binary_accuracy: 0.5454 - val_loss: 0.6820


In [None]:
# Evaluate the model on the test data using `evaluate`
print("Evaluate on test data")
results = model.evaluate(x_test, y_test, batch_size=128)
print("test loss, test acc:", results)

# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`
print("Generate predictions for 3 samples")
predictions = model.predict(x_test[:3])
print("predictions shape:", predictions.shape)
