In [2]:
import torch
import random

random_seed = 42

# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
import os
os.environ['PYTHONHASHSEED']=str(random_seed)

# 2. Set the `python` built-in pseudo-random generator at a fixed value
import random
random.seed(random_seed)

# 3. Set the `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(random_seed)
 
# Set a random seed for PyTorch (for GPU as well)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', DEVICE)

Using device: cuda


In [3]:
from pathlib import Path
import pan21_functions as p21

In [4]:
import importlib; importlib.reload(p21)

<module 'pan21_functions' from 'c:\\Users\\thoma\\Documents\\croatia\\masters\\semester2\\text_analysis\\tar_project\\pan21_functions.py'>

In [24]:
from torch.utils.data import DataLoader, Dataset
import itertools
from natsort import natsorted
from tqdm import tqdm

import pan21_functions as p21
from evaluation.evaluator import read_ground_truth_files

class ParagraphDataset(Dataset):
    def __init__(self, x_path, y_path, paragraphs_per_doc=10):
        self.x = []
        self.y = []
        self.paragraphs_per_doc = paragraphs_per_doc

        num_large_problems = 0
        
        files = natsorted(Path(x_path).glob('problem-*.txt'))
        gt = read_ground_truth_files(y_path)
        for problem_num, problem_file in tqdm(enumerate(files), desc="Loading problem and ground truth files"):
            # print(f"{problem_file=}")
            # number = problem_file.name[len("problem-") : -len(".txt")]
            solutions = gt[f"problem-{problem_num+1}"]["paragraph-authors"]
            with open(problem_file, 'r', encoding="utf8") as fh:
                paragraphs = fh.readlines()

            if len(paragraphs) < self.paragraphs_per_doc:
                pad_num = self.paragraphs_per_doc - len(paragraphs)
                paragraphs += [""]*pad_num
                solutions += [0]*pad_num
            else:
                paragraphs = paragraphs[:self.paragraphs_per_doc]
                solutions = solutions[:self.paragraphs_per_doc]
                num_large_problems += 1

            # print(f"{solutions=}")
            solutions = [[0]*(s-1) + [1] + [0]*(4-s) if s != 0 else [0]*4 for s in solutions]
            # print(f"{solutions=}")

            self.x.append(paragraphs)
            self.y.append(torch.tensor(solutions, dtype=torch.int64, device=DEVICE))

        if num_large_problems > 0:
            print(f"Warning: {num_large_problems} problems had more than {self.paragraphs_per_doc} paragraphs and were truncated")
            
    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

# Example data
paragraphs = [
    "This is the first paragraph.",
    "Here is another paragraph.",
    # Add more paragraphs
]
labels = [0, 1]  # Corresponding labels for the paragraphs

paragraphs_per_doc = 15
batch_size = 1

train_dataset = ParagraphDataset("pan21/train", "pan21/train", paragraphs_per_doc=paragraphs_per_doc)
val_dataset = ParagraphDataset("pan21/validation", "pan21/validation", paragraphs_per_doc=paragraphs_per_doc)
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


Loading problem and ground truth files: 11200it [00:01, 8323.90it/s]




Loading problem and ground truth files: 2400it [00:00, 9665.75it/s]






In [6]:
tokens_per_paragraph = 256  # Fixed number of tokens per paragraph
embedding_dim = 768         # Dimension of word embeddings

In [27]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel

class ParagraphLabelingModelWithBERT(nn.Module):
    def __init__(self, bert_model_name, tokens_per_paragraph, output_dim):
        hidden_dim = tokens_per_paragraph
        super(ParagraphLabelingModelWithBERT, self).__init__()
        # Initialize the BERT tokenizer and model
        self.tokens_per_paragraph = tokens_per_paragraph
        self.tokenizer = BertTokenizer.from_pretrained(bert_model_name)
        self.bert_model = BertModel.from_pretrained(bert_model_name).to(DEVICE)
        self.fc1 = nn.Linear(self.bert_model.config.hidden_size, hidden_dim).to(DEVICE)
        self.relu = nn.ReLU().to(DEVICE)
        self.fc2 = nn.Linear(hidden_dim, output_dim).to(DEVICE)
        self.softmax = nn.LogSoftmax(dim=-1).to(DEVICE)

    def forward(self, paragraphs):
        # Tokenize paragraphs
        # print(paragraphs)
        encoded_input = [self.tokenizer(p, padding='max_length', truncation=True, return_tensors='pt', max_length=self.tokens_per_paragraph).to(DEVICE) for p in paragraphs]
        input_ids = [e['input_ids'] for e in encoded_input]
        attention_mask = [e['attention_mask'] for e in encoded_input]
        
        # Get BERT embeddings
        with torch.no_grad():
            outputs = [self.bert_model(i, attention_mask=a) for i, a in zip(input_ids,attention_mask)]
            embeddings = [o.last_hidden_state for o in outputs]  # shape: (batch_size, max_length, hidden_dim)
        
        embeddings = torch.squeeze(torch.stack(embeddings, dim=1))
        # print(f"{embeddings.shape=}")

        x = self.fc1(embeddings)
        # print(f"{x.shape=}")
        x = self.relu(x)
        # print(f"{x.shape=}")
        x = self.fc2(x)
        # print(f"{x.shape=}")
        x = self.softmax(x)
        # print(f"{x.shape=}")
        return x

# Model parameters
bert_model_name = 'bert-base-uncased'
output_dim = 4    # Number of classes

model = ParagraphLabelingModelWithBERT(bert_model_name, tokens_per_paragraph, output_dim)


In [28]:
import datetime

criterion = nn.CrossEntropyLoss()  # Cross-Entropy Loss for multi-class classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 2

for epoch in range(num_epochs):
    model.train()
    for paragraphs, labels in tqdm(train_dataset):
        # paragraphs, labels = batch
        # print(f"{type(paragraphs[0])} {paragraphs=}")
        # print(f"{labels=}")
        optimizer.zero_grad()

        # Forward pass
        outputs = model(paragraphs)
        # print(f"{outputs.shape=}")

        # Compute the loss
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
torch.save(model.state_dict(), Path(f"{datetime.datetime.now().strftime('%Y_%m_%d-%I_%M_%S_%p')}.torch"))


100%|██████████| 11200/11200 [28:33<00:00,  6.54it/s] 


Epoch [1/2], Loss: 1.5812


100%|██████████| 11200/11200 [25:57<00:00,  7.19it/s]


Epoch [2/2], Loss: 1.5789


In [31]:
val_input = [v[0] for v in val_dataset][:5]
print(val_input[0])
model.eval()
with torch.no_grad():
    val_outputs = model(val_input)
    print(val_outputs)
    val_loss = criterion(val_outputs.view(-1, output_dim), val_Y.view(-1))

    print(f'Validation Loss: {val_loss.item():.4f}')

["Once you turn on Flip Ahead, you can swipe through content spread across multiple pages to go to the next page within the same article, post or thread. When browsing sequenced content, such as blogs or news sites, and whenever you've reached the end of your multi-page content, flip ahead will suggest an appropriate next article, post or thread to continue your exploration. Using Flip Ahead requires end user opt-in, and sends your browsing history to Microsoft to improve the quality of the experience.\n", 'It\'s a new feature for navigating IE pages. A good description from this page - Windows 8 Release Preview detailed impressions, under the "Web browsing" section: \n', 'Enhanced touch browsing: In the Release Preview, IE10’s Metro style experience offers a new way of browsing multi-page and sequenced content. Flip ahead enables you to navigate your favorite sites like you read a magazine by replacing the need to click on links with a more natural forward swipe gesture on touch-centr

NameError: name 'val_Y' is not defined

In [None]:
def predictions_to_json(predictions, gt, threshold=0.5):
    adjusted_predictions = [p.index(max(p)) for p in predictions]

    true_num_paragraphs = len(gt["paragraph-authors"])
    
    # Construct the JSON object
    data = {
        "authors": max(adjusted_predictions),
        "structure": [999],  # Placeholder or specific requirement
        "site": "googole.com",
        "multi-author": max(adjusted_predictions) > 1,
        "changes": [int(adjusted_predictions[i] != adjusted_predictions[i + 1]) for i in range(len(adjusted_predictions) - 1)][:true_num_paragraphs],
        "paragraph-authors": adjusted_predictions[:true_num_paragraphs]
    }
    return data

In [None]:
from evaluation.evaluator import compute_score_single_predictions, compute_score_multiple_predictions
dict_of_jsons_result = [predictions_to_json(p) for p in val_outputs]

truth = read_ground_truth_files("pan21/validation")

task1_result = compute_score_single_predictions(truth, dict_of_jsons_result, 'multi-author')
task2_result = compute_score_multiple_predictions(truth, dict_of_jsons_result, 'changes', labels=[0, 1])
task3_result = compute_score_multiple_predictions(truth, dict_of_jsons_result, 'paragraph-authors', labels=[1, 2, 3, 4])

print(
    # f'Model: {model_path.stem}\n' +
    f'\tTask 1 Score: {task1_result}\n'+
    f'\tTask 2 Score: {task2_result}\n'+
    f'\tTask 3 Score: {task3_result}\n'
)
