In [1]:
import os
import random

import numpy as np
import torch

# Single seed shared by every pseudo-random generator used in this notebook.
random_seed = 42

# Export PYTHONHASHSEED for reproducible hashing.
# NOTE: Python reads this variable at interpreter start-up, so setting it here
# only affects processes launched afterwards, not the running kernel.
os.environ['PYTHONHASHSEED'] = str(random_seed)

# Seed Python's built-in generator, NumPy's legacy global generator and
# PyTorch (CPU, plus every visible GPU when CUDA is present).
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

# Device used by the rest of the notebook: GPU when available, otherwise CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', DEVICE)

Using device: cuda


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from torch.utils.data import DataLoader
import rich

from binary_dataset import BinaryDataset

batch_size = 32  # mini-batch size used by the training DataLoader
tokens_per_paragraph = 256  # Fixed number of tokens per paragraph

# Build the training and validation datasets. The same directory is passed for
# both BinaryDataset arguments.
# NOTE(review): presumably (data dir, ground-truth dir) — confirm against binary_dataset.py.
train_dataset = BinaryDataset("pan21/train", "pan21/train")
val_dataset = BinaryDataset("pan21/validation", "pan21/validation")

# The DataLoader seems wonky: with the default collate_fn it always returns a tuple of length 2.
def collate_fn(batch):
    """Collate a list of ``(input, label)`` samples into ``(inputs, labels)``.

    Args:
        batch: sequence of samples; element 0 of each sample is the model
            input, element 1 is a label tensor.

    Returns:
        A tuple ``(inputs, labels)`` where ``inputs`` is a plain Python list of
        the per-sample inputs (left un-stacked) and ``labels`` is the label
        tensors stacked along a new leading dimension.
    """
    inputs = []
    for sample in batch:
        inputs.append(sample[0])

    labels = []
    for sample in batch:
        labels.append(sample[1])

    return inputs, torch.stack(labels)
# Training loader: fixed sample order (shuffle=False); drop_last=True discards a
# trailing batch smaller than batch_size.
# NOTE(review): training batches are not shuffled — confirm this is intentional.
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, drop_last=True, collate_fn=collate_fn)
# Validation loader: batches of 5 samples in fixed order, no batches dropped.
val_dataloader = DataLoader(val_dataset, batch_size=5, shuffle=False, collate_fn=collate_fn)


Output()

Using author file: pan21\train_authors
Using embedding dir: pan21\train_embeddings


Output()

Using author file: pan21\validation_authors
Using embedding dir: pan21\validation_embeddings


In [4]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel

class ParagraphLabelingModelWithBERT(nn.Module):
    """Single linear layer + sigmoid over a pair of paragraph embeddings.

    The two paragraph tensors of each pair are concatenated, flattened and fed
    to one ``nn.Linear`` layer; a sigmoid maps the result into (0, 1).

    Args:
        bert_model_name: kept for interface compatibility; not used by the
            model itself. NOTE(review): embeddings appear to be precomputed by
            the dataset — confirm.
        tokens_per_paragraph: number of token embeddings per paragraph.
        output_dim: number of output units of the linear layer.
        embedding_dim: width of one token embedding (768 by default).
        device: device on which to create the layer. ``None`` selects CUDA
            when it is available and CPU otherwise.
    """

    def __init__(self, bert_model_name, tokens_per_paragraph, output_dim,
                 embedding_dim=768, device=None):
        super().__init__()
        self.tokens_per_paragraph = tokens_per_paragraph

        # Resolve the device locally instead of relying on a notebook global,
        # so the class also works on a fresh kernel / when imported elsewhere.
        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Two paragraphs per example, each tokens_per_paragraph x embedding_dim.
        input_size = embedding_dim * self.tokens_per_paragraph * 2
        self.fc = nn.Linear(input_size, output_dim, device=device)

    def forward(self, paragraph_pair):
        """Score one pair or a batch of pairs.

        Args:
            paragraph_pair: either a single ``(first, second)`` tuple or a list
                of such pairs. Each element is a tensor whose leading size-1
                dimension (if present) is squeezed away.
                # assumes each tensor is (1, tokens_per_paragraph, embedding_dim) — TODO confirm

        Returns:
            Tensor of shape ``(batch, output_dim)`` with values in (0, 1).
        """
        # batches come as lists, individual examples as tuples
        if isinstance(paragraph_pair, tuple) and len(paragraph_pair) == 2:
            paragraph_pair = [paragraph_pair]

        flattened_pairs = [
            torch.flatten(torch.cat((pair[0].squeeze(0), pair[1].squeeze(0)), dim=0))
            for pair in paragraph_pair
        ]
        # Move to the layer's device; this is a no-op when it is already there.
        x = torch.stack(flattened_pairs).to(self.fc.weight.device)

        return torch.sigmoid(self.fc(x))

In [22]:
import datetime
from rich.progress import Progress
from pathlib import Path

# Model parameters
bert_model_name = 'bert-base-uncased'
output_dim = 1    # Number of classes

# Loop-invariant training settings, defined once for the whole sweep.
num_epochs = 5
# The model applies the sigmoid itself, so BCELoss receives probabilities.
criterion = nn.BCELoss()

# Sweep the learning rate over 1e-4, 1e-5, 1e-6 and 1e-7, training a fresh
# model for each value. After the loop, `model` is the last one trained.
for lr_exponent in range(4, 8):
    lr = 1/10**lr_exponent
    model = ParagraphLabelingModelWithBERT(bert_model_name, tokens_per_paragraph, output_dim)

    # learning rate is important, .001 led to always outputting 1
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    with Progress() as pb:
        total_task = pb.add_task('Total', total=num_epochs)

        for epoch in range(num_epochs):
            epoch_task = pb.add_task(f'Epoch {epoch}', total=len(dataloader))

            model.train()
            running_loss = 0.0
            num_batches = 0
            for inputs, targets in dataloader:
                optimizer.zero_grad()

                # Forward pass
                outputs = model(inputs)

                # Compute the loss
                loss = criterion(outputs, targets)

                # Backward pass and optimization
                loss.backward()
                optimizer.step()

                running_loss += loss.item()
                num_batches += 1
                pb.update(epoch_task, advance=1)

            # Report the mean over the epoch rather than the last mini-batch,
            # which is noisy; NaN marks an epoch that saw no batches.
            mean_loss = running_loss / num_batches if num_batches else float('nan')
            print(f'lr=1e-{lr_exponent} Epoch [{epoch+1}/{num_epochs}], Mean loss: {mean_loss:.4f}')
            pb.update(total_task, advance=1)

            # Include the learning rate in the file name so checkpoints from
            # different sweep settings can be told apart.
            timestamp = datetime.datetime.now().strftime('%Y_%m_%d-%I_%M_%S_%p')
            torch.save(model.state_dict(), Path(f"{timestamp}_lr1e-{lr_exponent}_e{epoch}.torch"))


Output()

Output()

Output()

In [15]:
# Take the first 10 validation examples (fewer if the dataset is smaller) for a
# quick sanity check, instead of raising IndexError on a short dataset.
small_val_set = [val_dataset[i] for i in range(min(10, len(val_dataset)))]

In [21]:
# Evaluate a saved checkpoint on the small validation subset.
# To evaluate on the full validation set instead, build the subset with:
#   small_val_set = [v for v in val_dataset]
# NOTE: `model`, `criterion` and `output_dim` come from the training cell, so
# that cell must have been run first.
val_input = [v[0] for v in small_val_set]
val_Y = torch.stack([v[1] for v in small_val_set])

# map_location lets a checkpoint saved on a GPU load on a CPU-only machine;
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict needs.
eval_dict = torch.load("2024_07_17-09_42_24_PM_e4.torch", map_location=DEVICE, weights_only=True)
model.load_state_dict(eval_dict)
model.eval()

with torch.no_grad():
    val_outputs = model(val_input)
    print(f"{val_outputs=}")
    print(f"{val_Y=}")
    val_loss = criterion(val_outputs.view(-1, output_dim), val_Y)

    print(f'Validation Loss: {val_loss.item():.4f}')

val_outputs=tensor([[0.4663],
        [0.3878],
        [0.3255],
        [0.4528],
        [0.4400],
        [0.3745],
        [0.5065],
        [0.2962],
        [0.4191],
        [0.3408]], device='cuda:0')
val_Y=tensor([[0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.]], device='cuda:0')
Validation Loss: 0.7233


In [8]:
def predictions_to_json(predictions, gt, threshold=0.5):
    """Convert per-paragraph score rows into a solution dictionary.

    Each row of ``predictions`` is reduced to the index of its largest score.
    The resulting labels are truncated to the number of paragraphs in the
    ground truth *before* anything else is derived, so that ``changes`` has
    exactly one entry per pair of consecutive real paragraphs and ``authors``
    ignores any rows beyond the real paragraphs.

    Args:
        predictions: iterable of score rows; each row is a list/tuple or any
            object exposing ``tolist()``.
        gt: mapping whose ``"paragraph-authors"`` entry has one element per
            real paragraph.
        threshold: accepted for interface compatibility; currently unused.

    Returns:
        dict with keys ``authors``, ``structure``, ``site``, ``multi-author``,
        ``changes`` and ``paragraph-authors``.
    """
    true_num_paragraphs = len(gt["paragraph-authors"])

    paragraph_authors = []
    for scores in predictions:
        # Accept rows exposing tolist() as well as plain sequences.
        scores = scores.tolist() if hasattr(scores, "tolist") else list(scores)
        paragraph_authors.append(scores.index(max(scores)))

    # Drop rows beyond the real paragraphs before deriving anything from them.
    paragraph_authors = paragraph_authors[:true_num_paragraphs]

    # NOTE(review): labels are 0-based argmax indices; confirm that the largest
    # index is the intended author count.
    highest_label = max(paragraph_authors, default=0)

    # One flag per adjacent paragraph pair: len(paragraph_authors) - 1 entries.
    changes = [
        int(current != following)
        for current, following in zip(paragraph_authors, paragraph_authors[1:])
    ]

    # Construct the JSON object
    return {
        "authors": highest_label,
        "structure": [999],  # Placeholder or specific requirement
        "site": "googole.com",
        "multi-author": highest_label > 1,
        "changes": changes,
        "paragraph-authors": paragraph_authors,
    }

In [14]:
# Score the model's predictions on the validation set with the task evaluator.
from evaluation.evaluator import compute_score_single_predictions, compute_score_multiple_predictions
# Run the model on every validation batch and convert each batch's output.
# NOTE(review): despite its name, this is a list, not a dict.
# NOTE(review): `y` is the stacked label tensor produced by collate_fn, while
# predictions_to_json indexes its second argument with "paragraph-authors" —
# these look mismatched; confirm what should be passed as ground truth.
# NOTE(review): inference runs without torch.no_grad() here.
dict_of_jsons_result = [predictions_to_json(model(x), y) for x, y in val_dataloader]

from evaluation.evaluator import read_ground_truth_files
# Load the reference solutions for the validation split.
truth = read_ground_truth_files("pan21/validation")

# One score per task, each computed from a different key of the solutions.
task1_result = compute_score_single_predictions(truth, dict_of_jsons_result, 'multi-author')
task2_result = compute_score_multiple_predictions(truth, dict_of_jsons_result, 'changes', labels=[0, 1])
task3_result = compute_score_multiple_predictions(truth, dict_of_jsons_result, 'paragraph-authors', labels=[1, 2, 3, 4])

# Print the three task scores, one per line.
print(
    # f'Model: {model_path.stem}\n' +
    f'\tTask 1 Score: {task1_result}\n'+
    f'\tTask 2 Score: {task2_result}\n'+
    f'\tTask 3 Score: {task3_result}\n'
)


KeyboardInterrupt: 