In [1]:
import torch
import random

random_seed = 42

# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
import os
os.environ['PYTHONHASHSEED']=str(random_seed)

# 2. Set the `python` built-in pseudo-random generator at a fixed value
import random
random.seed(random_seed)

# 3. Set the `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(random_seed)
 
# Set a random seed for PyTorch (for GPU as well)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', DEVICE)

Using device: cuda


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from torch.utils.data import DataLoader
from rich.progress import track

from binary_dataset import BinaryDataset

batch_size = 32
tokens_per_paragraph = 256  # Fixed number of tokens per paragraph

train_dataset = BinaryDataset("pan21/train", "pan21/train")
val_dataset = BinaryDataset("pan21/validation", "pan21/validation")

# dataloader seems wonky, always returns tuple len 2 with default collate fn
def collate_fn(batch):
    # Separate each tuple into two lists
    x = [item[0] for item in batch]
    y = torch.stack([item[1] for item in batch]).view(len(batch), 1)

    return x, y
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

Output()

Using author file: pan21\train_authors
Using embedding dir: pan21\train_embeddings


Output()

Using author file: pan21\validation_authors
Using embedding dir: pan21\validation_embeddings


In [10]:
import torch
import torch.nn as nn
import cupy as cp
from cupy.fft import fft

class ParagraphLabelingModelWithBERT(nn.Module):
    def __init__(self, tokens_per_paragraph, output_dim, fourier=False, single_layer = True):
        super(ParagraphLabelingModelWithBERT, self).__init__()
        
        self.fourier = fourier
        self.single_layer = single_layer
        self.tokens_per_paragraph = tokens_per_paragraph

        bert_embedding_dimension = 768
        input_size = bert_embedding_dimension*self.tokens_per_paragraph*2
        
        if self.single_layer:
            self.fc = nn.Linear(input_size, output_dim, device=DEVICE)
        else:
            self.fc = nn.Linear(input_size, output_dim*512, device=DEVICE)
            self.fc2 = nn.Linear(output_dim*512, output_dim*256, device=DEVICE)
            self.fc3 = nn.Linear(output_dim*256, output_dim, device=DEVICE)

    def forward(self, paragraph_pair):
        # Tokenize paragraphs
        # print(paragraphs)

        # batches come as lists, individual examples as tuples
        if isinstance(paragraph_pair, tuple) and len(paragraph_pair) == 2:
            paragraph_pair = [paragraph_pair]

        # print(f"{len(paragraph_pair)=} {len(paragraph_pair[0])=}")
        # print(f"{paragraph_pair[0]=}")
        
        if self.fourier:
            paragraph_pair = [
                    (torch.tensor(cp.real(fft(cp.asarray(p1), axis=1)), device=DEVICE), 
                    torch.tensor(cp.real(fft(cp.asarray(p2), axis=1)), device=DEVICE))
                for (p1, p2) in paragraph_pair
                ]

        if self.single_layer:
            x = torch.sigmoid(self.fc(torch.stack([torch.flatten(torch.cat((p[0].squeeze(0), p[1].squeeze(0)), dim=0)) for p in paragraph_pair])))
        else:
            x = torch.sigmoid(self.fc3(self.fc2(self.fc(torch.stack([torch.flatten(torch.cat((p[0].squeeze(0), p[1].squeeze(0)), dim=0)) for p in paragraph_pair])))))
        
        # print(f"{x.shape=}")
        return x

In [5]:
import datetime
from rich.progress import Progress
from pathlib import Path

# Model parameters
bert_model_name = 'bert-base-uncased'
output_dim = 1    # Number of classes
# lr_exps = range(4, 8)
lr_exps = [6]
# fourier = True
num_epochs = 5

for fourier in (False, True):
    for lr_exponent in lr_exps:
        lr = 1/10**lr_exponent
        model = ParagraphLabelingModelWithBERT(tokens_per_paragraph, output_dim, fourier=fourier)

        criterion = nn.BCELoss()
        # learning rate is important, .001 led to always outputting 1
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        with Progress() as pb:
            total_task = pb.add_task('Total', total=num_epochs)

            for epoch in range(num_epochs):
                epoch_task = pb.add_task(f'Epoch {epoch}', total=len(dataloader))

                model.train()
                for inputs, targets in dataloader:
                    # print(f"{type(inputs)=} {len(inputs)=}")
                    # print(f"{type(inputs[0])=}")
                    # print(f"{targets.shape=}")
                    
                    optimizer.zero_grad()

                    # Forward pass
                    outputs = model(inputs)
                    # print(f"{outputs.shape=}")

                    # Compute the loss
                    loss = criterion(outputs, targets)

                    # Backward pass and optimization
                    loss.backward()
                    optimizer.step()

                    pb.update(epoch_task, advance=1)

                # TODO: add metadata to the model files
                metadata = {
                    'epoch': epoch,
                    'loss': loss.item(),
                    'optimizer_state': 'Adam',
                    'lr': lr,
                    'fourier': fourier,
                    'tokens_per_paragraph': tokens_per_paragraph,
                }
                state_dict = model.state_dict()
                state_dict['metadata'] = metadata

                # print(f"{model.state_dict()=}")

                print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
                pb.update(total_task, advance=1)
                torch.save(model.state_dict(), Path(f"{datetime.datetime.now().strftime('%Y_%m_%d-%I_%M_%S_%p')}_e{epoch}.torch"))


Output()

Output()

In [12]:
# best vanilla
# eval_dict = torch.load("checkpoints/2024_07_28-02_09_54_AM_e9.torch")
# best fourier
# eval_dict = torch.load("checkpoints/2024_07_28-02_56_01_AM_e9.torch")
from pathlib import Path

models = [
    (Path("checkpoints/2024_07_28-02_09_54_AM_e9.torch"), True),
    (Path("checkpoints/2024_07_28-02_56_01_AM_e9.torch"), True),
    (Path("2024_07_30-09_47_22_AM_e4.torch"), False),
    (Path("2024_07_30-12_35_29_PM_e4.torch"), False),
]
models = []

for model_path, single_layer in models:
    output_dim = 1
    eval_dict = torch.load(model_path)
    model = ParagraphLabelingModelWithBERT(tokens_per_paragraph, output_dim, single_layer=single_layer)
    model.load_state_dict(eval_dict)
    model.eval()
    models.append(model)

# val_loss = 0
# criterion = nn.BCELoss()

# for input, target in val_dataloader:
#     with torch.no_grad():
#         val_outputs = model(input)
        
#         val_loss += criterion(val_outputs.view(-1, output_dim), target).item()

# print(f'Validation Loss: {val_loss/len(val_dataloader):.4f}')

from prediction import build_prediction_dict
prediction_dicts = build_prediction_dict(models, val_dataset)

In [9]:
from prediction import compute_scores, build_json_predictions
for model_path, prediction_dict in zip(model_paths, prediction_dicts):
    dict_of_jsons_result = build_json_predictions(prediction_dict)

    from evaluation.evaluator import read_ground_truth_files
    truth = read_ground_truth_files("pan21/validation")

    task1_result, task2_result, task3_result = compute_scores(truth, dict_of_jsons_result)

    print(
        f'Model: {model_path.stem}\n' +
        f'\tTask 1 Score: {task1_result}\n'+
        f'\tTask 2 Score: {task2_result}\n'+
        f'\tTask 3 Score: {task3_result}\n'
    )


Model: 2024_07_30-09_47_22_AM_e4
	Task 1 Score: 0.5732774498808929
	Task 2 Score: 0.5067699680655273
	Task 3 Score: 0.30072446896832133

Model: 2024_07_30-12_35_29_PM_e4
	Task 1 Score: 0.5625573335457014
	Task 2 Score: 0.5006473615060574
	Task 3 Score: 0.3003207435158149

