# PAN 2020 Authorship Verification

Authors:
- David García Guillén
- Daniel García Algora

## Library and environment setup

In [1]:
%pip install -q torch transformers numpy pandas sentence-transformers
%pip install -U scikit-learn

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/86.0 kB[0m [31m1.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-le

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import json
import pandas as pd
from typing import List

In [4]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split

Some of the lines are commented in order to access the data in different ways, since we tried several different runtime environments.
We created the `get_dataframe_from_file` function in order to transform .jsonl files directly to pandas dataframes, and then merged values and their corresponding ground truths into a single dataframe.

In [7]:
PARENT_FOLDER = "PAN2020-authorship-verification"
DATASET_TRUTH_TRAIN = f"{PARENT_FOLDER}/pan20-authorship-verification-training-small/pan20-authorship-verification-training-small-truth.jsonl"
DATASET_TRAIN = f"{PARENT_FOLDER}/pan20-authorship-verification-training-small/pan20-authorship-verification-training-small.jsonl"

In [6]:
def get_dataframe_from_file (file_path : str) -> List:
    data = []

    with open(file_path, 'r') as file:
        for line in file:
            try:
                parsed_data = json.loads(line)
                data.append(parsed_data)
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON: {e}")

    return pd.DataFrame(data)

In [8]:
df_ground_truth = get_dataframe_from_file(DATASET_TRUTH_TRAIN)
df_inputs = get_dataframe_from_file(DATASET_TRAIN)

df_combined = pd.merge(df_ground_truth, df_inputs, on='id')

print(len(df_combined))

52623


In [None]:
df_ground_truth.head()

Unnamed: 0,id,same,authors
0,6cced668-6e51-5212-873c-717f2bc91ce6,True,"[1446633, 1446633]"
1,3c6c188a-db28-59aa-8c09-3d0f799ff579,True,"[1446633, 1446633]"
2,b0cfa94f-c9ec-5aa5-8331-a5a249b664cf,True,"[1446633, 1446633]"
3,e6e86e73-9a7b-58f2-a652-a17b4a1bcabf,True,"[1446633, 1446633]"
4,4fe541af-912e-5a86-81a5-94c6d3891509,True,"[1446633, 1446633]"


In [None]:
len(df_ground_truth)

52601

In [None]:
def check_not_nulls(df: pd.DataFrame) -> None:
    print(df.isnull().sum())

In [None]:
def count_duplicate_ids(df: pd.DataFrame) -> pd.Series:
    # Find duplicate IDs
    duplicate_ids = df[df.duplicated(subset=['id'], keep=False)]

    # Calculate the sum of repetitions
    sum_repetitions = len(duplicate_ids)

    return sum_repetitions

In [None]:
check_not_nulls(df_ground_truth)

id         0
same       0
authors    0
dtype: int64


In [None]:
check_not_nulls(df_inputs)

id         0
fandoms    0
pair       0
dtype: int64


Only on training data

## Dataset generation

In order to create a robust dataset, we decided to separate pairs and their respective fandoms, utilizing the fandom information to generate a new dataset of pairs.


In [None]:
assert count_duplicate_ids(df_ground_truth) == count_duplicate_ids(df_inputs)

Column 'same' is removed as it doesn't provide relevant information for model training. This is because it's a comparison between two IDs that are outputs of the model.

In [None]:
df_combined = df_combined.drop("authors", axis=1).drop("fandoms", axis=1)

Rename "authors" to "y"

In [None]:
df_combined = df_combined.rename(columns={'same': 'y'})

In [None]:
df_combined.head()

Unnamed: 0,id,y,pair
0,6cced668-6e51-5212-873c-717f2bc91ce6,True,"[I shift a bit, warily letting my eyes dart fr..."
1,3c6c188a-db28-59aa-8c09-3d0f799ff579,True,"[I shift a bit, warily letting my eyes dart fr..."
2,b0cfa94f-c9ec-5aa5-8331-a5a249b664cf,True,[A single tear escaped me as I left. I did hav...
3,e6e86e73-9a7b-58f2-a652-a17b4a1bcabf,True,"[""Ja."" Ludwig kept his gaze upon her, solidly...."
4,4fe541af-912e-5a86-81a5-94c6d3891509,True,"[And he did. Slowly, hesitantly...but coming f..."


In [None]:
df_combined.iloc[0]

id                   6cced668-6e51-5212-873c-717f2bc91ce6
y                                                    True
pair    [I shift a bit, warily letting my eyes dart fr...
Name: 0, dtype: object

In [None]:
df_combined[['text1', 'text2']] = df_combined['pair'].apply(pd.Series)
df_combined = df_combined.drop("pair", axis=1)

In [None]:
df_combined.head()

Unnamed: 0,id,y,text1,text2
0,6cced668-6e51-5212-873c-717f2bc91ce6,True,"I shift a bit, warily letting my eyes dart fro...","""All will become one with Russia,"" he said, al..."
1,3c6c188a-db28-59aa-8c09-3d0f799ff579,True,"I shift a bit, warily letting my eyes dart fro...","Suddenly, a piece of ice falls into the pit of..."
2,b0cfa94f-c9ec-5aa5-8331-a5a249b664cf,True,A single tear escaped me as I left. I did have...,"got the Yang yoyo."" Kimiko pulled the other ha..."
3,e6e86e73-9a7b-58f2-a652-a17b4a1bcabf,True,"""Ja."" Ludwig kept his gaze upon her, solidly. ...",SilverGray lll...YellowRagged llll...GrayMilli...
4,4fe541af-912e-5a86-81a5-94c6d3891509,True,"And he did. Slowly, hesitantly...but coming fr...","""Let""s go,"" Raimondo said and then started in ..."


In [None]:
df_combined.iloc[1, 1]

True

In [None]:
mean_length = 0
for i in range(len(df_combined)):
    mean_length += len(df_combined.iloc[i, 2]) + len(df_combined.iloc[i, 3])

mean_length /= len(df_combined) * 2
mean_length = int(mean_length)
mean_length

21441

For extra scalability we created the `CustomDataset` class, which  tokenizes a dataframe's text data using a specified model, and provides individual encoded texts along with their targets.

In [9]:
class CustomDataset(Dataset):
    def __init__(self, df, model_name, max_len=512):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.data = df
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        encoded_input_text1 = self.tokenizer(self.data.iloc[index, 2], max_length=self.max_len, padding=True, truncation=True, return_tensors='pt')
        encoded_input_text2 = self.tokenizer(self.data.iloc[index, 3], max_length=self.max_len, padding=True, truncation=True, return_tensors='pt')

        return {
            "encoded_input_text1": encoded_input_text1,
            "encoded_input_text2": encoded_input_text2,
            "targets": torch.tensor(int(self.data.iloc[index, 1]), dtype=torch.float)
        }

# Model

The Transformer requires 2 dimensions, yet the tokenizer yields 3: batch, 1, and text length. To retain the batch (enabling the capture of all batches and their respective text), an additional dimension is inserted in the second place, signifying what the Transformer would receive if the batch weren't used.

In [10]:
class TransformerModel(nn.Module):
    def __init__(self, model_name, freeze_transformer):
        super(TransformerModel, self).__init__()
        self.transformer = AutoModel.from_pretrained(model_name)

        if freeze_transformer:
            for param in self.transformer.parameters():
                param.requires_grad = False

        self.dense1 = nn.Linear(768, 512)
        self.dropout = nn.Dropout(0.4)
        self.cosine = nn.CosineSimilarity(dim=1)
        self.dense = nn.Linear(1, 1)
        self.gelu = nn.GELU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, encoded_input_text1, encoded_input_text2):
        input_ids_text1 = encoded_input_text1['input_ids']
        attention_mask_text1 = encoded_input_text1['attention_mask']

        input_ids_text2 = encoded_input_text2['input_ids']
        attention_mask_text2 = encoded_input_text2['attention_mask']

        # Move tensors to GPU
        if torch.cuda.is_available():
            input_ids_text1 = input_ids_text1.cuda()
            attention_mask_text1 = attention_mask_text1.cuda()
            input_ids_text2 = input_ids_text2.cuda()
            attention_mask_text2 = attention_mask_text2.cuda()

        model_output_text1 = self.transformer(
            input_ids=input_ids_text1[:, 0, :],
            attention_mask=attention_mask_text1[:, 0, :],
        ).last_hidden_state[:, 0]
        model_output_text2 = self.transformer(
            input_ids=input_ids_text2[:, 0, :],
            attention_mask=attention_mask_text2[:, 0, :],
        ).last_hidden_state[:, 0]

        x_a, x_b = self.dense1(model_output_text1), self.dense1(model_output_text2)
        x_a, x_b = self.gelu(self.dropout(x_a)), self.gelu(self.dropout(x_b))
        sem_sim = self.cosine(x_a, x_b)

        return self.sigmoid(sem_sim)

# Training model

In order to train the model, we defined the `evaluate` function, which assess the model's performance by computing the loss and accuracy metrics based on predictions and actual targets from a given data loader, and the `training_step` function, which performs a single training iteration by predicting outputs, computing loss, backpropagating, and updating model weights using the provided optimizer and loss function, returning the computed loss value.

Since we are working on comparing styles or embeddings, the `AnnaWegmann/Style-Embedding` model might yield good results, as it's trained to capture aspects of style in text.

In [11]:
model_name = 'AnnaWegmann/Style-Embedding' # chosen pretrained model

In [None]:
train_df, val_df = train_test_split(df_combined, test_size=0.2, random_state=42)
train_dataset = CustomDataset(train_df, model_name, max_len=mean_length)
train_data_loader = DataLoader(train_dataset, batch_size=1, pin_memory=True, shuffle=True)

In [12]:
def evaluate(model, data_loader, criterion):
    model.eval()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    with torch.no_grad():
        for batch in data_loader:
            encoded_input_text1 = batch['encoded_input_text1']
            encoded_input_text2 = batch['encoded_input_text2']
            targets = batch['targets']

            y_pred = model.forward(encoded_input_text1, encoded_input_text2)

            if torch.cuda.is_available():
                targets = targets.cuda()

            # Calculate loss
            loss = criterion(y_pred, targets)
            total_loss += loss.item()

            # Calculate accuracy
            predictions = (y_pred > 0.5).float()  # Assuming a binary classification task
            correct_predictions += (predictions == targets).sum().item()
            total_samples += targets.size(0)

    accuracy = correct_predictions / total_samples
    average_loss = total_loss / len(data_loader)

    return average_loss, accuracy

In [13]:
def training_step(encoded_input_text1, encoded_input_text2, targets, model, optimizer, criterion):
    # forward pass
    y_pred = model.forward(encoded_input_text1, encoded_input_text2)

    if torch.cuda.is_available():
        targets = targets.cuda()

    loss = criterion(y_pred, targets)

    # baccpropagate
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()

In the next cell, we define a training loop for a given batch size and epoch quantity, printing the progress for our loss and accuracy, saving the model once its training is complete.

In [None]:
model = TransformerModel(model_name=model_name, freeze_transformer=True)

if torch.cuda.is_available():
    model.cuda()

# loss function
criterion = nn.BCELoss()

# Define optimizer (e.g., Adam)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# training and validation sets
train_df, val_df = train_test_split(df_combined, test_size=0.2, random_state=42)

# Training loop
NUM_EPOCHS = 10
BATCH_SIZE = 32

train_dataset = CustomDataset(train_df, model_name)
validate_dataset = CustomDataset(val_df, model_name)

train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_data_loader = DataLoader(validate_dataset, batch_size=BATCH_SIZE, shuffle=True)

i = 0

for epoch in range(NUM_EPOCHS):
    model.train()
    running_loss = 0.0

    for i, batch in enumerate(train_data_loader):
        input_text1 = batch['encoded_input_text1']
        input_text2 = batch['encoded_input_text2']
        targets = batch['targets']

        loss = training_step(input_text1, input_text2, targets, model, optimizer, criterion)
        running_loss += loss

        if i % 100 == 99:  # Print every 100 mini-batches
            print(f"Epoch [{epoch + 1}/{NUM_EPOCHS}], "
                  f"Step [{i + 1}/{len(train_data_loader)}], "
                  f"Loss: {running_loss / 100}")
            running_loss = 0.0

    # Save the model weights after each epoch
    if epoch == 9:
        print(f'Saving model "model_epoch_{epoch + 1}"')
        checkpoint_path = f"model_epoch_{epoch + 1}.pt"
        torch.save(model.state_dict(), checkpoint_path)

    # The model is validated on the validation set after each epoch
    val_loss, val_accuracy = evaluate(model, val_data_loader)
    print(f'Epoch [{epoch + 1}/{NUM_EPOCHS}], Validation Loss: {val_loss}, Accuracy: {val_accuracy}')

print("Saving model after training")
checkpoint_path = "finish_training_model.pt"
torch.save(model.state_dict(), checkpoint_path)
print('Finished Training')

Epoch [1/10], Step [100/1316], Loss: 0.6702577978372574
Epoch [1/10], Step [200/1316], Loss: 0.669662925004959
Epoch [1/10], Step [300/1316], Loss: 0.6717429214715958
Epoch [1/10], Step [400/1316], Loss: 0.6716173785924912
Epoch [1/10], Step [500/1316], Loss: 0.6701306539773941
Epoch [1/10], Step [600/1316], Loss: 0.6685152971744537
Epoch [1/10], Step [700/1316], Loss: 0.6625740367174149
Epoch [1/10], Step [800/1316], Loss: 0.6663932853937149
Epoch [1/10], Step [900/1316], Loss: 0.6653453743457795
Epoch [1/10], Step [1000/1316], Loss: 0.6729839885234833
Epoch [1/10], Step [1100/1316], Loss: 0.6659513080120086
Epoch [1/10], Step [1200/1316], Loss: 0.66863156914711
Epoch [1/10], Step [1300/1316], Loss: 0.6662837010622025
Epoch [1/10], Validation Loss: 0.6644689328402371, Accuracy: 0.5383372921615202
Epoch [2/10], Step [100/1316], Loss: 0.6622751587629319
Epoch [2/10], Step [200/1316], Loss: 0.6659164321422577
Epoch [2/10], Step [300/1316], Loss: 0.6680008327960968
Epoch [2/10], Step [400

The model consistently maintaining an accuracy around 0.54 suggests a consistent but modest performance, possibly due to the briefness of the training (only 10 epochs) or slight underfitting.

## Testing the model


We will now repeat the process with the provided test dataset:

In [14]:
# FOLDER = f"{PARENT_FOLDER}/pan20-authorship-verification-test/pan20-authorship-verification-test"
VALUES_FILE = f"{PARENT_FOLDER}/pan20-authorship-verification-test/pan20-authorship-verification-test/pan20-authorship-verification-test"
GROUND_TRUTH = f"{PARENT_FOLDER}/pan20-authorship-verification-test/pan20-authorship-verification-test/pan20-authorship-verification-test-truth.jsonl"

In [16]:
df_ground_truth = get_dataframe_from_file(f"{GROUND_TRUTH}")
df_inputs = get_dataframe_from_file(f"{VALUES_FILE}")

df_combined_val = pd.merge(df_ground_truth, df_inputs, on='id')

In [17]:
df_combined_val.head()

Unnamed: 0,id,same,authors,fandoms,pair
0,c04fdf1e-ddf5-5542-96e7-13ce18cae176,True,"[1555420, 1555420]","[CSI: New York, Four Brothers]","[""Calm down, Nicolas. You don""t wanna do somet..."
1,49dc4cae-3d32-5b4d-b240-a080a1dbb659,False,"[301516, 98554]","[Final Fantasy VIII, Escaflowne]","[""Squall!?"" Zell was panicking. Squall was mov..."
2,f326fe7c-fc10-566f-a70f-0f36e3f92399,False,"[3374404, 632668]","[Pretty Little Liars, Sonic the Hedgehog]","[""Just talk to the first girl you bump into an..."
3,16daa0d1-61b8-5650-b7ee-5e265bd40910,True,"[2639199, 2639199]","[Hetalia - Axis Powers, Kuroko no Basuke/黒子のバスケ]","[""I""ll be fine,"" Alfred said, his grin not wav..."
4,08b536a8-4fed-5f62-97bb-e57f79e841d2,False,"[1437540, 4527525]","[Evangelion, Transformers/Beast Wars]","[dominated by a huge desk. Behind the desk, an..."


In [18]:
df_combined_val = df_combined_val.drop("authors", axis=1)
df_combined_val = df_combined_val.drop("fandoms", axis=1)
df_combined_val = df_combined_val.rename(columns={'same': 'y'})

In [19]:
df_combined_val.head()

Unnamed: 0,id,y,pair
0,c04fdf1e-ddf5-5542-96e7-13ce18cae176,True,"[""Calm down, Nicolas. You don""t wanna do somet..."
1,49dc4cae-3d32-5b4d-b240-a080a1dbb659,False,"[""Squall!?"" Zell was panicking. Squall was mov..."
2,f326fe7c-fc10-566f-a70f-0f36e3f92399,False,"[""Just talk to the first girl you bump into an..."
3,16daa0d1-61b8-5650-b7ee-5e265bd40910,True,"[""I""ll be fine,"" Alfred said, his grin not wav..."
4,08b536a8-4fed-5f62-97bb-e57f79e841d2,False,"[dominated by a huge desk. Behind the desk, an..."


In [20]:
df_combined_val.iloc[0]

id                   c04fdf1e-ddf5-5542-96e7-13ce18cae176
y                                                    True
pair    ["Calm down, Nicolas. You don"t wanna do somet...
Name: 0, dtype: object

In [21]:
df_combined_val[['text1', 'text2']] = df_combined_val['pair'].apply(pd.Series)
df_combined_val = df_combined_val.drop("pair", axis=1)

In [30]:
model = TransformerModel(model_name=model_name, freeze_transformer=True)
model.load_state_dict(torch.load("./drive/MyDrive/NLP/finish_training_model.pt"))
model.eval()

if torch.cuda.is_available():
    model.cuda()

correct = 0
total = 0
BATCH_SIZE = 32

validate_dataset = CustomDataset(df_combined_val, model_name)
val_data_loader = DataLoader(validate_dataset, batch_size=BATCH_SIZE, shuffle=True)

with torch.no_grad():
    for i, batch in enumerate(val_data_loader):
        input_text1 = batch['encoded_input_text1']
        input_text2 = batch['encoded_input_text2']
        targets = batch['targets'].cuda()

        outputs = model.forward(input_text1, input_text2)

        _, predicted = torch.max(outputs, 0)

        total += targets.size(0)
        correct += (predicted == targets).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {100 * accuracy:.2f}%')

Test Accuracy: 2.60%


In contrast to the training accuracy, the test accuracy has turned out to be remarkably low, which, again, can be attributed to the training's briefness or underfitting.