In [1]:
from joblib import Memory

memory = Memory(".cache", verbose=0)

In [2]:
from transformers import BertTokenizer, BertModel
import torch
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

@memory.cache
def get_tokenizer_model():
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    bert_model = BertModel.from_pretrained('bert-base-cased')
    sbert_model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

    return bert_tokenizer, bert_model, sbert_model

bert_tokenizer, bert_model, sbert_model = get_tokenizer_model()

In [3]:
from pathlib import Path
import itertools
from natsort import natsorted

@memory.cache
def read_problem_files(problem_folder, n=None):
    problems = []
    files = itertools.islice(natsorted(Path(problem_folder).glob('problem-*.txt')), n)
    for problem_file in files:
        # number = problem_file.name[len("problem-") : -len(".txt")]
        with open(problem_file, 'r', encoding="utf8") as fh:
            problems.append(fh.readlines())
    return problems

In [4]:
from evaluation.evaluator import read_ground_truth_files

@memory.cache
def cached_read_ground_truth(x):
    return read_ground_truth_files(x)

# ground_truth = cached_read_ground_truth("pan21/train")

In [5]:
def get_embeddings(paragraph):
    # Tokenize the sentence
    tokens = bert_tokenizer.tokenize(paragraph)
    token_ids = bert_tokenizer.convert_tokens_to_ids(tokens)

    # Convert token IDs to tensor
    token_tensor = torch.tensor([token_ids])
    segment_ids = [0] * len(token_ids)
    segment_tensor = torch.tensor([segment_ids])

    # Get BERT model output
    with torch.no_grad():
        outputs = bert_model(token_tensor, segment_tensor)

    # Extract word embeddings from BERT model output
    return outputs[0]

In [6]:
# problems = read_problem_files("pan21/train", n=5)
# problems[0]

In [7]:
#@memory.cache
# def get_max_length(n=None):
#     max_train = max([len(x) for y in read_problem_files("pan21/train", n=n) for x in y])
#     max_val = max([len(x) for y in read_problem_files("pan21/validation", n=n) for x in y])
#     return max([max_train, max_val])

In [44]:
import torch
import numpy as np

max_input_length = 256

def pad_paragraph(paragraph_embedding, desired_length):
    d1, d2, d3 = paragraph_embedding.shape
    # print(f"{paragraph_embedding.shape=}")

    target = torch.zeros(d1, desired_length, d3)
    # print(f"{target.shape=}")
    target[:, :d2, :] = paragraph_embedding

    return target


def get_paragraph_pairs(problem_text):
    # print(problem_text)
    paragraph_embeddings = [pad_paragraph(get_embeddings(para[:max_input_length]), max_input_length) for para in problem_text]
    # print(f"{[paras.shape for paras in paragraph_embeddings]=}")
    # print(f"{[x.shape for x in paragraph_embeddings]}")
    pairs = itertools.combinations(paragraph_embeddings, 2)
    return [torch.flatten(torch.stack(pair, dim=2), start_dim=1, end_dim=2) for pair in pairs]


In [9]:
from tqdm.notebook import tqdm

# #@memory.cache
def get_problem_embeddings(problems, verbose=False):
    if verbose:
        return [get_paragraph_pairs(problem_text) for problem_text in tqdm(problems)]
    else:
        return [get_paragraph_pairs(problem_text) for problem_text in problems]

In [10]:
# problems_embed = get_problem_embeddings(problems)
# for i in range(5):
#     print(problems_embed[i][0][0].shape)
# print(f"{problems_embed[0][0].shape=}")

In [11]:
def get_simple_ground_truth(ground_truth, problem_numbers):
    simple_ground_truth = []
    for num in problem_numbers:
        task_3_ground_truth = ground_truth[f"problem-{num}"]["paragraph-authors"]
        simple_ground_truth.append(task_3_ground_truth)
    return simple_ground_truth


# simple_ground_truth = get_simple_ground_truth(ground_truth, range(1, len(problems_embed)))

In [12]:
def get_task_3_ground_truth(simple_ground_truth):
    task_gt = []
    for problem in simple_ground_truth:
        problem_gt = []
        for author1, author2 in itertools.combinations(problem, 2):
            problem_gt.append(int(author1 != author2))
        task_gt.append(problem_gt)
    return task_gt

# task_3_ground_truth = get_task_3_ground_truth(simple_ground_truth)

In [13]:
# TODO: Invert the function get_task_3_ground_truth. Our model will output a bunch of binary labels which need to be converted to the task 3 ground truth format
# Ground truth format (gtf): [1, 2, 2, 2, 2, 3, 2, 2]
# Binary labels for comparisons (bl): [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0]
# Each binary label is the result of comparing two paragraphs. 1 means there was an author change, 0 means there was no author change
# For example, bl[0], is the result of comparing gtf[0]=1 and gtf[1]=2. 1 != 2, therefore bl[0] = 1. bl[1]=1 is the result of gtf[0] == gtf[2] (1 == 2)
def get_simple_ground_truth_from_task_3(task_3_ground_truth):
    simple_gt = []
    for problem in task_3_ground_truth:
        # k = n*(n-1)/2
        # n**2 - n - 2k = 0
        coeff = [1, -1, len(problem) * -2]
        roots = np.roots(coeff)
        gt_length = int(roots[roots > 0][0])
        print(gt_length)

        gt = np.zeros(gt_length, dtype=np.uint8)
        gt[0] = 1
        for i in range(1, gt_length):
            # loop for gt[i]
            num_comparisons = i
            pointer = i - 1
            modified_flag = False
            print(f"{i=} {num_comparisons=} {pointer=}")
            for gt_i, j in enumerate(range(gt_length-2, 1, -1)[:num_comparisons]):
                # comparison between gt[gt_i] and gt[i]
                print(f"{gt_i=} {j=} {pointer=} {task_3_ground_truth[pointer]=}")
                bin_label = task_3_ground_truth[pointer]
                if bin_label == 0:
                    print(f"{gt[i]=} {gt[gt_i]=}")
                    gt[i] = gt[gt_i]
                    modified_flag = True
                    break

                pointer += j
            if not modified_flag:
                print(f"No modified")
                gt[i] = np.max(gt) + 1
            print(f"{gt}\n"
        simple_gt.append(gt)
    return simple_gt

# simple_ground_truth == get_simple_ground_truth_from_task_3(task_3_ground_truth)

In [14]:
# for s, t in zip(simple_ground_truth, task_3_ground_truth):
#     print(f"{s}")
#     print(f"{t}")

In [15]:
# Add code to write out the embeddings (X) and ground truths (y) so we can train without having to rerun the preprocessing step

In [61]:
import math
from tensorflow.keras.utils import PyDataset
from pathlib import Path

def flatten_problems(problems_list, squeeze=False):
        # [print(f"{pair=}") for problem in problems_list for pair in problem]
        return [pair.squeeze(0) if squeeze else pair for problem in problems_list for pair in problem]

class Pan21PyDataset(PyDataset):

    def __init__(self, x_set, y_set, batch_size, **kwargs):
        super().__init__(**kwargs)
        self.x, self.y = read_problem_files(x_set), read_ground_truth_files(y_set)
        self.batch_size = batch_size
        self.file_path = Path()

    def __len__(self):
        # Return number of batches.
        return math.ceil(len(self.x) / self.batch_size)

    def get_data(self, low_idx, high_idx):
        # print(f"{len(self.x[low_idx])=} {self.x[low_idx]=}")
        embeddings = get_problem_embeddings(self.x[low_idx:high_idx])
        batch_x = np.array(flatten_problems(embeddings, squeeze=True))
        batch_y = np.array(flatten_problems(get_task_3_ground_truth(get_simple_ground_truth(self.y, range(low_idx+1, high_idx+1)))))
        return batch_x, batch_y

    def to_file(self, file_path, overwrite=False):
        self.file_path = Path(file_path)
        self.file_path.mkdir(parents=True, exist_ok=True)
        for i in tqdm(range(len(self))):
            idx_path = self.file_path / f"{i}.npz"
            if not idx_path.exists() or (idx_path.exists() and overwrite):
                batch_x, batch_y = self.__getitem__(i, force_compute=True)
                np.savez_compressed(idx_path, batch_x=batch_x, batch_y=batch_y)

    def __getitem__(self, idx, force_compute=False):
        idx_path = self.file_path / f"{i}.npz"
        if force_compute or not idx_path.exists():
            # print(f"{idx=}")
            # Return x, y for batch idx.
            low = idx * self.batch_size
            # Cap upper bound at array length; the last batch may be smaller
            # if the total number of items is not a multiple of batch size.
            high = min(low + self.batch_size, len(self.x))
            batch_x, batch_y = self.get_data(low, high)

            # print(f"{low=} {high=}")
            # print(f"{batch_x[0].shape=}")
            # print(f"{len(batch_x)=}")
            # print(f"{len(batch_y)=}")
        else:
            npzfile = np.load(idx_path)
            batch_x = npzfile['batch_x']
            batch_y = npzfile['batch_y']

        return batch_x, batch_y

batch_size = 32
seed = 42



In [87]:
class Pan21FourierDataset(Pan21PyDataset):
    def __init__(self, x_set, y_set, batch_size, num_fourier_features, **kwargs):
        super().__init__(x_set, y_set, batch_size, **kwargs)
        self.num_fourier_features = num_fourier_features

    def __getitem__(self, idx, force_compute=False):
        return self.__getitem__helper(idx, num_fourier_features=self.num_fourier_features, force_compute=force_compute)

    def __getitem__helper(self, idx, num_fourier_features, force_compute=False):
        idx_path = self.file_path / "fourier" / f"{idx}.npz"
        
        if num_fourier_features > 0:
            batch_x, batch_y = super().__getitem__(idx, force_compute)
            new_batch_x = batch_x.copy()
            
            num_features = len(batch_x[0])
            # print(f"{num_features=}")
            # 0:x will be BERT embeddings for paragraph 1
            # x:length/2 will be fourier features for paragraph 1
            num_non_fourier_features = (num_features - num_fourier_features) // 2
            para1_fourier_features_low, para1_fourier_features_high = num_non_fourier_features, num_features // 2
            para2_fourier_features_low, para2_fourier_features_high = num_features // 2 + num_non_fourier_features , num_features

            # print(f"{para1_fourier_features_low=} {para1_fourier_features_high=}")
            # print(f"{para2_fourier_features_low=} {para2_fourier_features_high=}")

            if force_compute or not idx_path.exists():
                for i, x in enumerate(batch_x):
                    para1_end = num_features//2
                    para1_fft = np.real(np.fft.fft(x[:para1_end], axis=0))
                    para2_fft = np.real(np.fft.fft(x[para1_end:], axis=0))

                    new_batch_x[i, para1_fourier_features_low:para1_fourier_features_high] = para1_fft[:num_fourier_features//2]
                    new_batch_x[i, para2_fourier_features_low:para2_fourier_features_high] = para2_fft[:num_fourier_features//2]
            else:
                npzfile = np.load(idx_path)
                fourier_batch_x = npzfile['fourier_batch_x']

                i,j,k = fourier_batch_x.shape

                new_batch_x[:, para1_fourier_features_low:para1_fourier_features_high] = fourier_batch_x[:num_fourier_features//2]
                new_batch_x[:, para2_fourier_features_low:para2_fourier_features_high] = fourier_batch_x[j:j+num_fourier_features//2]

            return new_batch_x, batch_y
        else:
            # No fourier features means it is equivalent to just BERT embeddings
            return super().__getitem__(idx, force_compute)

    def to_file(self, file_path, overwrite=False):
        self.file_path = Path(file_path)
        fourier_file_path = self.file_path / "fourier"

        self.file_path.mkdir(parents=True, exist_ok=True)
        fourier_file_path.mkdir(parents=True, exist_ok=True)

        for i in tqdm(range(len(self))):
            idx_path = fourier_file_path / f"{i}.npz"
            if not idx_path.exists() or (idx_path.exists() and overwrite):
                fourier_batch_x, _ = self.__getitem__helper(i, num_fourier_features=512, force_compute=True)
                np.savez_compressed(idx_path, fourier_batch_x=fourier_batch_x)

In [None]:
class Pan21FourierFilterDataset(Pan21FourierDataset):
    def __init__(self, x_set, y_set, batch_size, num_fourier_features, cutoff_frequencies, **kwargs):
        super().__init__(x_set, y_set, batch_size, num_fourier_features, **kwargs)
        self.cutoff_frequencies = cutoff_frequencies
    
    def __getitem__(self, idx, force_compute=False):
        return self.__getitem__helper(idx, num_fourier_features=self.num_fourier_features, force_compute=force_compute)

In [65]:
Pan21PyDataset("pan21/train", "pan21/train", batch_size).to_file("train_ds")
Pan21PyDataset("pan21/validation", "pan21/validation", batch_size).to_file("val_ds")

  0%|          | 0/350 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

In [88]:
Pan21FourierDataset("pan21/train", "pan21/train", batch_size, num_fourier_features=512).to_file("train_ds")
Pan21FourierDataset("pan21/validation", "pan21/validation", batch_size, num_fourier_features=512).to_file("val_ds")

  0%|          | 0/350 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

In [None]:
# import time

# start_idx = 0
# limit = 5
# # limit = len(train_ds)

# before = time.time()
# for i in range(start_idx, start_idx+limit):
#     batch_x, batch_y = train_ds.__getitem__(i, force_compute=True)
# after = time.time()
# for i in range(start_idx, start_idx+limit):
#     batch_x, batch_y = train_ds.__getitem__(i, force_compute=False)
# after_after = time.time()

# print(f"Compute: {round((after - before)/limit, 2)}s vs File read: {round((after_after - after)/limit, 2)}s")
# # Compute: 8.95s per batch
# # Compute with compression: ~17s
# # Uncompressed read: .7s per batch
# # Compressed read: 1.6s per batch
# # Compressed is 1/10 the size of uncompressed, but takes ~twice as long to precompute and save
# # Compressed 512D Fourier takes ~30s per batch
# # Compressed 512D Fourier is about 500-700MB per batch

Compute: 8.95s vs File read: 1.6s


In [90]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, Input, Flatten
from tensorflow.keras import optimizers, losses, metrics

# Code implementation of the RNN for sequence labeling
def create_rnn_model(num_labels, embedding_dim):
    model = Sequential()
    model.add(Input(shape=(max_input_length*2, embedding_dim)))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Flatten())
    model.add(Dense(num_labels, activation='sigmoid'))

    model.compile(
        optimizer=optimizers.RMSprop(),  # Optimizer
        # Loss function to minimize
        loss=losses.BinaryCrossentropy(),
        # List of metrics to monitor
        metrics=[metrics.BinaryAccuracy(), metrics.AUC()],
    )

    return model
 
num_labels = 1
embedding_dim = 768
 
model = create_rnn_model(num_labels, embedding_dim)
model.summary()

In [91]:
# Does using the frequency domain spectra provide usefule information?
import datetime
from keras.callbacks import CSVLogger

num_fourier_features = [0, 512//4, 512//2, 512]
for num_ff in num_fourier_features:
    model_name = f"models/num_fourier_features/{num_ff}_{datetime.datetime.now().strftime('%Y_%m_%d-%I_%M_%S_%p')}"
    model = create_rnn_model(num_labels, embedding_dim)
    csv_logger = CSVLogger(f'{model_name}.log', separator=',', append=False)


    fourier_train_ds = Pan21FourierDataset("pan21/train", "pan21/train", batch_size, num_fourier_features=num_ff)
    fourier_val_ds = Pan21FourierDataset("pan21/validation", "pan21/validation", batch_size, num_fourier_features=num_ff)

    history = model.fit(
        fourier_train_ds,
        epochs=5,
        validation_data=fourier_val_ds,
        verbose=1,
        callbacks=[csv_logger]
    )

    model.save(f"{model_name}.keras")

Epoch 1/5
[1m350/350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6573s[0m 19s/step - auc_3: 0.5250 - binary_accuracy: 0.5477 - loss: 0.6915 - val_auc_3: 0.6005 - val_binary_accuracy: 0.5629 - val_loss: 0.6828
Epoch 2/5
[1m350/350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4607s[0m 13s/step - auc_3: 0.5695 - binary_accuracy: 0.5656 - loss: 0.6835 - val_auc_3: 0.6059 - val_binary_accuracy: 0.5828 - val_loss: 0.6753
Epoch 3/5
[1m350/350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4322s[0m 12s/step - auc_3: 0.5752 - binary_accuracy: 0.5738 - loss: 0.6787 - val_auc_3: 0.6078 - val_binary_accuracy: 0.5800 - val_loss: 0.6746
Epoch 4/5
[1m350/350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3912s[0m 11s/step - auc_3: 0.5843 - binary_accuracy: 0.5730 - loss: 0.6775 - val_auc_3: 0.6040 - val_binary_accuracy: 0.5788 - val_loss: 0.6762
Epoch 5/5
[1m350/350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3911s[0m 11s/step - auc_3: 0.5929 - binary_accuracy: 0.5821 - loss: 0.6744

In [68]:
# history = model.fit(
#     x_train,
#     y_train,
#     batch_size=50,
#     epochs=3,
#     # We pass some validation for
#     # monitoring validation loss and metrics
#     # at the end of each epoch
#     validation_data=(x_val, y_val),
# )
history = model.fit(
    train_ds,
    epochs=1,
    # We pass some validation for
    # monitoring validation loss and metrics
    # at the end of each epoch
    validation_data=val_ds,
    verbose=1,
)

import datetime

model.save(f"models/baseline_{datetime.datetime.now().strftime('%Y_%m_%d-%I_%M_%S_%p')}.keras")

[1m350/350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1320s[0m 4s/step - auc: 0.8467 - binary_accuracy: 0.7743 - loss: 0.4627 - val_auc: 0.5286 - val_binary_accuracy: 0.5231 - val_loss: 1.1656


In [None]:
# Evaluate the model on the test data using `evaluate`
print("Evaluate on test data")
results = model.evaluate(x_test, y_test, batch_size=128)
print("test loss, test acc:", results)

# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`
print("Generate predictions for 3 samples")
predictions = model.predict(x_test[:3])
print("predictions shape:", predictions.shape)
