In [55]:
!python3 -m pip install torch

Defaulting to user installation because normal site-packages is not writeable


In [56]:
# Install or upgrade (-U) scikit-learn, which provides `train_test_split`.
%pip install -U scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [57]:
import json
import os
from typing import Optional

import pandas as pd

In [58]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

In [59]:
# Top-level folder that contains the dataset folders.
PARENT_FOLDER = "release"
# Training split of dataset 1, relative to PARENT_FOLDER.
DATASET1_TRAIN = "pan23-multi-author-analysis-dataset1/pan23-multi-author-analysis-dataset1-train"
# Slash-separated path to the training files.
FILES_PATH = "/".join((PARENT_FOLDER, DATASET1_TRAIN))

# Data collection

In [60]:
def _discover_problem_indices(folder: str, first: int) -> list:
    """Return the sorted indices ``>= first`` of the ``problem-<i>.txt`` files in ``folder``."""
    if not os.path.isdir(folder):
        return []
    prefix, suffix = "problem-", ".txt"
    found = set()
    for name in os.listdir(folder):
        if name.startswith(prefix) and name.endswith(suffix):
            stem = name[len(prefix):-len(suffix)]
            if stem.isdecimal() and int(stem) >= first:
                found.add(int(stem))
    return sorted(found)


def create_dataframe_from_files(parent_folder: str, dataset_folder: str, ini_range: int = 1, last_range: Optional[int] = None) -> Optional[pd.DataFrame]:
    """Load the problem/truth file pairs of a dataset folder into a DataFrame.

    For each index ``i`` the function looks for ``problem-{i}.txt`` and
    ``truth-problem-{i}.json`` inside ``parent_folder/dataset_folder``. An index
    is skipped when either of its two files is missing.

    Args:
        parent_folder: Folder that contains the dataset folders.
        dataset_folder: Dataset folder, relative to ``parent_folder``.
        ini_range: First problem index to load (inclusive).
        last_range: Last problem index to load (inclusive). ``None`` (the
            default) loads every ``problem-<i>.txt`` found in the folder, so
            the result does not depend on a hard-coded upper bound.

    Returns:
        A DataFrame with one row per loaded problem and the columns ``X``
        (file text with surrounding whitespace stripped), ``authors`` and
        ``y`` (the ``changes`` entry of the truth file). Returns ``None``,
        after printing a message, when no pair could be loaded.
    """
    folder = os.path.join(parent_folder, dataset_folder)

    if last_range is None:
        indices = _discover_problem_indices(folder, ini_range)
    else:
        indices = range(ini_range, last_range + 1)

    rows = []
    for i in indices:
        problem_file = os.path.join(folder, f"problem-{i}.txt")
        truth_file = os.path.join(folder, f"truth-problem-{i}.json")

        if not (os.path.exists(problem_file) and os.path.exists(truth_file)):
            continue

        # Explicit encoding so the loaded text does not depend on the
        # platform's default locale.
        with open(problem_file, 'r', encoding='utf-8') as problem_f, open(truth_file, 'r', encoding='utf-8') as truth_f:
            x_value = problem_f.read().strip()
            y_value = json.load(truth_f)
        rows.append({'X': x_value, 'authors': y_value['authors'], 'y': y_value['changes']})

    if not rows:
        print("No data found in the specified files.")
        return None
    return pd.DataFrame(rows)

In [61]:
# Load the dataset-1 training split; the trailing expression displays its row count.
data = create_dataframe_from_files(
    parent_folder=PARENT_FOLDER,
    dataset_folder=DATASET1_TRAIN,
)
len(data)

4199

In [62]:
class CustomDataset(Dataset):
    """Dataset over three parallel, index-aligned sequences.

    ``X``, ``authors`` and ``y`` are stored exactly as passed in (no copy or
    conversion) and are indexed with the same position.
    """

    def __init__(self, X, authors, y):
        self.X, self.authors, self.y = X, authors, y

    def __len__(self):
        # The size of the dataset is taken from X alone.
        return len(self.X)

    def __getitem__(self, index):
        # One sample is the triple (X[index], authors[index], y[index]).
        columns = (self.X, self.authors, self.y)
        return tuple(column[index] for column in columns)


# Exploratory Analysis

# Model creation

In [63]:
class Transformer(nn.Module):
    """Transformer encoder stack followed by a linear projection.

    Args:
        input_dim: Feature size of the input (the encoder's ``d_model``); must
            be divisible by ``n_heads``.
        output_dim: Feature size produced by the final linear layer.
        hidden_dim: Size of the feed-forward sub-layer in each encoder layer.
        n_heads: Number of attention heads per encoder layer.
        n_layers: Number of stacked encoder layers.
        dropout: Dropout probability used inside the encoder layers.
    """

    def __init__(self, input_dim, output_dim, hidden_dim, n_heads, n_layers, dropout):
        super().__init__()
        # nn.Transformer is the full encoder-decoder model and its forward()
        # requires both `src` and `tgt`, so calling it with `src` alone raises
        # a TypeError. Only the encoder half is needed here; the final
        # LayerNorm mirrors the encoder that nn.Transformer builds internally.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim,
            nhead=n_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout,
        )
        self.encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=n_layers,
            norm=nn.LayerNorm(input_dim),
        )
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, src):
        """Encode ``src`` and project its last dimension to ``output_dim``.

        ``src`` is a float tensor whose last dimension is ``input_dim``; with
        the default ``batch_first=False`` the encoder reads it as
        ``(sequence, batch, input_dim)``. The result has the same leading
        dimensions and ``output_dim`` as its last dimension.
        """
        encoded = self.encoder(src)
        return self.fc(encoded)

In [64]:
# Hyperparameters passed positionally to Transformer(...) when the model is built.
# Feature size of the model input (the encoder's d_model).
INPUT_DIM = 512
# Size of the feed-forward sub-layer inside each encoder layer.
HIDDEN_DIM = 256
# Number of attention heads; INPUT_DIM must be divisible by it.
N_HEADS = 8
# Number of encoder layers.
N_LAYERS = 6
# Dropout probability inside the transformer layers.
DROPOUT = 0.2
# Feature size produced by the model's final linear layer.
OUTPUT_DIM = 10

In [65]:
# Build the model; keyword arguments make the mapping of each constant explicit.
model = Transformer(
    input_dim=INPUT_DIM,
    output_dim=OUTPUT_DIM,
    hidden_dim=HIDDEN_DIM,
    n_heads=N_HEADS,
    n_layers=N_LAYERS,
    dropout=DROPOUT,
)



In [66]:
# Loss function and optimizer shared by the training and evaluation code.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

## Model training

In [67]:
# Number of samples per batch, passed as `batch_size=` to DataLoader.
BATCH_SIZE = 64
# Passed as `shuffle=` to DataLoader.
SHUFFLE = True

In [82]:
# Hold out 20% of the rows for validation. A fixed random_state makes the
# split identical on every run, so results stay comparable across restarts.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

dataset_train = CustomDataset(train_data['X'].values, train_data['authors'].values, train_data['y'].values)
dataset_val = CustomDataset(val_data['X'].values, val_data['authors'].values, val_data['y'].values)

# NOTE(review): each sample is a (text, authors, changes) triple of raw Python
# objects, and the `changes` lists presumably differ in length between
# documents. The default collate function cannot batch lists of unequal size,
# so a tokenising/padding `collate_fn` is still needed before these loaders
# can feed the model - TODO.
train_loader = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=SHUFFLE)
# Validation only aggregates metrics, so its order is kept fixed.
val_loader = DataLoader(dataset_val, batch_size=BATCH_SIZE, shuffle=False)

In [94]:
# Number of batches the training loader yields per epoch.
len(train_loader)

53

In [95]:
# Preview only the first few documents; printing the whole array dumps every
# training text into the notebook output.
print(dataset_train.X[:3])

['I just don\'t want the OP to walk away from this thread thinking that if she tries to move and her husband files for the "emergency hearing" that it would make her a criminal or cause her to lose custody, when all it would actually mean is that she would need to bring the kids back.\nWould OP be worse off, legally, if she took the kids and the husband filed the emergency hearing and she brought the kids back, versus OP not leaving at all? (Ignoring any logistical issues that may or may not exist with OP leaving and then returning.) It seems to me that if she leaves, there\'s a possibility she would be able to stay in the new location, whereas if she sticks around she simply makes it much easier for her husband to keep her there in the future.\nI don\'t know. I don\'t the particulars of OP\'s life. There may be a secondary option that she move out, but stay in the state, file for custody and have it transferred to the state she has more ties to. It sounds like neither parent has any r

In [88]:
# Number of rows in the training split.
print(len(train_data))

3359


In [79]:
def train(model, train_loader, optimizer, criterion):
    """Run one optimisation pass over ``train_loader``.

    The model is switched to training mode. Every item yielded by the loader
    is unpacked as ``(inputs, targets)`` and used for one optimizer step.

    Returns:
        The sum (not the mean) of the per-batch loss values as a float;
        ``0.0`` when the loader yields nothing.
    """
    model.train()
    batch_losses = []

    for inputs, targets in train_loader:
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())

    return sum(batch_losses, 0.0)

In [81]:
def evaluate(model, val_loader, criterion):
    """Compute the summed loss and the accuracy of ``model`` over ``val_loader``.

    The model is switched to evaluation mode and gradients are disabled.
    Every item yielded by the loader is unpacked as ``(inputs, labels)``; the
    predicted class of a sample is the index of the largest value along
    dimension 1 of the model output.

    Returns:
        ``(total_loss, accuracy)`` where ``total_loss`` is the sum (not the
        mean) of the per-batch losses and ``accuracy`` is a percentage in
        ``[0, 100]``. When the loader yields no samples the accuracy is
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_data, batch_labels in val_loader:
            output = model(batch_data)
            loss = criterion(output, batch_labels)
            total_loss += loss.item()
            _, predicted = output.max(1)
            total += batch_labels.size(0)
            correct += predicted.eq(batch_labels).sum().item()
    # Guard against an empty loader: there is nothing to divide by.
    accuracy = 100 * correct / total if total else 0.0
    return total_loss, accuracy

In [96]:
# Inspect one row, selected by its index label.
data.loc[2767]

X          Fighting between Israel and Arab states, namel...
authors                                                    2
y                                                        [1]
Name: 2767, dtype: object

In [None]:
# Number of passes over the training loader made by the training loop.
num_epochs = 10

In [84]:
# NOTE(review): this loop unpacks two values per batch and feeds the first one
# straight to the model, whereas CustomDataset.__getitem__ returns three raw
# values per sample (text, authors, changes). The loaders need a collate step
# that turns those into (input tensor, label tensor) pairs before this loop
# can run - TODO.
for epoch in range(num_epochs):
    total_train_loss = 0.0
    model.train()

    for batch_idx, (batch_data, batch_labels) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(batch_data)
        loss = criterion(output, batch_labels)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
        if batch_idx % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}] Batch [{batch_idx+1}/{len(train_loader)}] Train Loss: {loss.item():.4f}')
    # max(..., 1) keeps the average defined when a loader is empty.
    avg_train_loss = total_train_loss / max(len(train_loader), 1)

    # evaluate() returns the loss summed over batches; average it so that it
    # is reported on the same per-batch scale as the training loss.
    val_loss, val_accuracy = evaluate(model, val_loader, criterion)
    avg_val_loss = val_loss / max(len(val_loader), 1)
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')


RuntimeError: each element in list of batch should be of equal size

## Model test