In [1]:
# !python3 -m pip install -q torch transformers numpy pandas sentence-transformers -U scikit-learn

In [2]:
!jupyter nbextension enable --py --sys-prefix widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
Traceback (most recent call last):
  File "/usr/bin/jupyter-nbextension", line 33, in <module>
    sys.exit(load_entry_point('notebook==6.4.8', 'console_scripts', 'jupyter-nbextension')())
  File "/usr/lib/python3/dist-packages/jupyter_core/application.py", line 264, in launch_instance
    return super(JupyterApp, cls).launch_instance(argv=argv, **kwargs)
  File "/usr/lib/python3/dist-packages/traitlets/config/application.py", line 846, in launch_instance
    app.start()
  File "/usr/lib/python3/dist-packages/notebook/nbextensions.py", line 980, in start
    super().start()
  File "/usr/lib/python3/dist-packages/jupyter_core/application.py", line 253, in start
    self.subapp.start()
  File "/usr/lib/python3/dist-packages/notebook/nbextensions.py", line 888, in start
    self.toggle_nbextension_python(self.extra_args[0])
  File "/usr/lib/python3/dist-packages/notebook/nbextensions.py", line 861, in toggle_nbextension_python
  

In [3]:
import os
import json
import pandas as pd
from typing import List

In [4]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split



In [5]:
PARENT_FOLDER = "PAN2020-authorship-verification"
DATASET1_TRAIN = "pan20-authorship-verification-training-small/pan20-authorship-verification-training-small-truth.jsonl"
DATASET2_TRAIN = "pan20-authorship-verification-training-small/pan20-authorship-verification-training-small.jsonl"
FILE_PATH_1 = f'{PARENT_FOLDER}/{DATASET1_TRAIN}'
FILE_PATH_2 = f'{PARENT_FOLDER}/{DATASET2_TRAIN}'

In [6]:
def get_dataframe_from_file (file_path : str) -> List:
    data = []

    with open(file_path, 'r') as file:
        for line in file:
            try:
                parsed_data = json.loads(line)
                data.append(parsed_data)
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON: {e}")

    return pd.DataFrame(data)

In [7]:
df_ground_truth = get_dataframe_from_file(FILE_PATH_1)
df_inputs = get_dataframe_from_file(FILE_PATH_2)

df_combined = pd.merge(df_ground_truth, df_inputs, on='id')

In [8]:
df_ground_truth.head()

Unnamed: 0,id,same,authors
0,6cced668-6e51-5212-873c-717f2bc91ce6,True,"[1446633, 1446633]"
1,3c6c188a-db28-59aa-8c09-3d0f799ff579,True,"[1446633, 1446633]"
2,b0cfa94f-c9ec-5aa5-8331-a5a249b664cf,True,"[1446633, 1446633]"
3,e6e86e73-9a7b-58f2-a652-a17b4a1bcabf,True,"[1446633, 1446633]"
4,4fe541af-912e-5a86-81a5-94c6d3891509,True,"[1446633, 1446633]"


In [9]:
len(df_ground_truth)

52601

In [10]:
def check_not_nulls(df: pd.DataFrame) -> None:
    print(df.isnull().sum())

In [11]:
def count_duplicate_ids(df: pd.DataFrame) -> pd.Series:
    # Find duplicate IDs
    duplicate_ids = df[df.duplicated(subset=['id'], keep=False)]

    # Calculate the sum of repetitions
    sum_repetitions = len(duplicate_ids)

    return sum_repetitions

In [12]:
check_not_nulls(df_ground_truth)

id         0
same       0
authors    0
dtype: int64


In [13]:
check_not_nulls(df_inputs)

id         0
fandoms    0
pair       0
dtype: int64


Only on training data

## Generate Dataset

- Robust dataset: Separate pairs and with its fandoms. Use fandoms to generate new dataset of pairs.

In [14]:
assert count_duplicate_ids(df_ground_truth) == count_duplicate_ids(df_inputs)

In [15]:
assert len(df_combined) - len(df_inputs) == 22

Se elimina la columna "same" ya que no da información relevante para el entrenamiento del modelo. Debido a que es una comparación entre dos ids que son las salidas del modelo.

In [16]:
df_combined = df_combined.drop("authors", axis=1).drop("fandoms", axis=1)

Rename "authors" to "y"

In [17]:
df_combined = df_combined.rename(columns={'same': 'y'})

In [18]:
df_combined.head()

Unnamed: 0,id,y,pair
0,6cced668-6e51-5212-873c-717f2bc91ce6,True,"[I shift a bit, warily letting my eyes dart fr..."
1,3c6c188a-db28-59aa-8c09-3d0f799ff579,True,"[I shift a bit, warily letting my eyes dart fr..."
2,b0cfa94f-c9ec-5aa5-8331-a5a249b664cf,True,[A single tear escaped me as I left. I did hav...
3,e6e86e73-9a7b-58f2-a652-a17b4a1bcabf,True,"[""Ja."" Ludwig kept his gaze upon her, solidly...."
4,4fe541af-912e-5a86-81a5-94c6d3891509,True,"[And he did. Slowly, hesitantly...but coming f..."


In [19]:
df_combined.iloc[0]

id                   6cced668-6e51-5212-873c-717f2bc91ce6
y                                                    True
pair    [I shift a bit, warily letting my eyes dart fr...
Name: 0, dtype: object

In [20]:
df_combined[['text1', 'text2']] = df_combined['pair'].apply(pd.Series)
df_combined = df_combined.drop("pair", axis=1)

In [21]:
df_combined.head()

Unnamed: 0,id,y,text1,text2
0,6cced668-6e51-5212-873c-717f2bc91ce6,True,"I shift a bit, warily letting my eyes dart fro...","""All will become one with Russia,"" he said, al..."
1,3c6c188a-db28-59aa-8c09-3d0f799ff579,True,"I shift a bit, warily letting my eyes dart fro...","Suddenly, a piece of ice falls into the pit of..."
2,b0cfa94f-c9ec-5aa5-8331-a5a249b664cf,True,A single tear escaped me as I left. I did have...,"got the Yang yoyo."" Kimiko pulled the other ha..."
3,e6e86e73-9a7b-58f2-a652-a17b4a1bcabf,True,"""Ja."" Ludwig kept his gaze upon her, solidly. ...",SilverGray lll...YellowRagged llll...GrayMilli...
4,4fe541af-912e-5a86-81a5-94c6d3891509,True,"And he did. Slowly, hesitantly...but coming fr...","""Let""s go,"" Raimondo said and then started in ..."


In [22]:
df_combined.iloc[1, 1]

True

In [23]:
mean_length = 0
for i in range(len(df_combined)):
    mean_length += len(df_combined.iloc[i, 2]) + len(df_combined.iloc[i, 3])

mean_length /= len(df_combined) * 2
mean_length = int(mean_length)
mean_length

21441

In [24]:
class CustomDataset(Dataset):
    def __init__(self, df, model_name, max_len=512):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.data = df
        self.max_len = max_len

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        encoded_input_text1 = self.tokenizer(self.data.iloc[index, 2], max_length=512, padding=True, truncation=True, return_tensors='pt')
        encoded_input_text2 = self.tokenizer(self.data.iloc[index, 3], max_length=512, padding=True, truncation=True, return_tensors='pt')

        return {
            "encoded_input_text1": encoded_input_text1,
            "encoded_input_text2": encoded_input_text2,
            "targets": torch.tensor(int(self.data.iloc[index, 1]), dtype=torch.float)
        }

# Model

In [25]:
"""
# transformer without woth pairs
class TransformerModel(nn.Module):
    def __init__(self, model_name):
        super(TransformerModel, self).__init__()
        # self.transformer = AutoModel.from_pretrained(model_name)
        self.dense1 = nn.Linear(768, 512)
        self.dropout = nn.Dropout(0.1)
        self.cosine = nn.CosineSimilarity(dim=1)
        self.dense = nn.Linear(1, 1)
        self.gelu = nn.GELU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, encoded_input_text1, encoded_input_text2):
        model_output_text1 = self.transformer(
            input_ids=encoded_input_text1['input_ids'][0, :, :],
            attention_mask=encoded_input_text1['attention_mask'],
        ).last_hidden_state[:, 0]
        model_output_text2 = self.transformer(
            input_ids=encoded_input_text2['input_ids'][0, :, :],
            attention_mask=encoded_input_text2['attention_mask'],
        ).last_hidden_state[:, 0]

        x_a, x_b = self.dense1(model_output_text1), self.dense1(model_output_text2)
        x_a, x_b = self.gelu(self.dropout(x_a)), self.gelu(self.dropout(x_b))
        sem_sim = self.cosine(x_a, x_b)
        weighted_sem_sim = self.dense(sem_sim)

        return self.sigmoid(weighted_sem_sim)

"""

"\n# transformer without woth pairs\nclass TransformerModel(nn.Module):\n    def __init__(self, model_name):\n        super(TransformerModel, self).__init__()\n        # self.transformer = AutoModel.from_pretrained(model_name)\n        self.dense1 = nn.Linear(768, 512)\n        self.dropout = nn.Dropout(0.1)\n        self.cosine = nn.CosineSimilarity(dim=1)\n        self.dense = nn.Linear(1, 1)\n        self.gelu = nn.GELU()\n        self.sigmoid = nn.Sigmoid()\n\n    def forward(self, encoded_input_text1, encoded_input_text2):\n        model_output_text1 = self.transformer(\n            input_ids=encoded_input_text1['input_ids'][0, :, :],\n            attention_mask=encoded_input_text1['attention_mask'],\n        ).last_hidden_state[:, 0]\n        model_output_text2 = self.transformer(\n            input_ids=encoded_input_text2['input_ids'][0, :, :],\n            attention_mask=encoded_input_text2['attention_mask'],\n        ).last_hidden_state[:, 0]\n\n        x_a, x_b = self.dense1(

In [34]:
class TransformerModel(nn.Module):
    def __init__(self, embedding_size=768, hidden_size=512):
        super(TransformerModel, self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        
        # Define the layers for the Transformer model
        self.embedding = nn.Embedding(10000, embedding_size)  # Change 10000 to your vocabulary size
        self.transformer_layer = nn.TransformerEncoderLayer(d_model=embedding_size, nhead=8)
        self.transformer_encoder = nn.TransformerEncoder(self.transformer_layer, num_layers=6)
        
        self.dense1 = nn.Linear(embedding_size, hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.cosine = nn.CosineSimilarity(dim=1)
        self.dense = nn.Linear(1, 1)
        self.gelu = nn.GELU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, batch_encoding):
        embedded_text1 = self.embedding(batch_encoding['input_ids'][:, 0, :])  # Extract input_ids
        embedded_text2 = self.embedding(batch_encoding['input_ids'][:, 1, :])  # Extract input_ids
        
        encoded_text1 = self.transformer_encoder(embedded_text1)
        encoded_text2 = self.transformer_encoder(embedded_text2)

        model_output_text1 = encoded_text1[:, 0, :]  # Extract the first token's output
        model_output_text2 = encoded_text2[:, 0, :]  # Extract the first token's output
        
        x_a, x_b = self.dense1(model_output_text1), self.dense1(model_output_text2)
        x_a, x_b = self.gelu(self.dropout(x_a)), self.gelu(self.dropout(x_b))
        sem_sim = self.cosine(x_a, x_b)
        weighted_sem_sim = self.dense(sem_sim.unsqueeze(1))

        return self.sigmoid(weighted_sem_sim)


## Test mio para comprobar que funciona y corre el modelo

In [27]:
model_name = 'AnnaWegmann/Style-Embedding' # 'bert-base-uncased'  # Choose the appropriate pretrained model #'AnnaWegmann/Style-Embedding'

In [35]:
train_df, val_df = train_test_split(df_combined, test_size=0.2, random_state=42)
train_dataset = CustomDataset(train_df, model_name, max_len=mean_length)
train_data_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

In [29]:
print(train_df.index)

Index([29857, 11475, 43247, 22425,  4589, 49882, 36655,  8592, 34320, 33051,
       ...
       47191, 21962, 37194, 16850,  6265, 11284, 44732, 38158,   860, 15795],
      dtype='int64', length=42098)


Small test to see that everything works

In [36]:
# anna weinman style embeddings - hard negative mininng
model = TransformerModel() #(model_name=model_name)
model.train() # tell model we are going to train -> https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch

for batch in train_data_loader:
    x = model.forward(batch["encoded_input_text1"], batch["encoded_input_text2"])
    print(x)
    break

TypeError: TransformerModel.forward() takes 2 positional arguments but 3 were given

# Training model

See diapos a partir de la 152 y usar anotación de la diapos (ejemplo: bs_sl -> Batch size - Sequence Length)

In [None]:
# Define your loss function (customize based on your task)
criterion = nn.MSELoss()  # Example: Mean Squared Error

# Define optimizer (e.g., Adam)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Split your data into training and validation sets
train_df, val_df = train_test_split(df_combined, test_size=0.2, random_state=42)

# Define a function to compute accuracy or other evaluation metrics
def evaluate(model, data_loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            pairs = batch['pairs']
            fandoms = batch['fandoms']
            y = batch['y']

            # Forward pass
            y_pred = model(pairs, fandoms)

            # Calculate loss (customize based on your task)
            loss = criterion(y_pred, y)

            total_loss += loss.item()

    return total_loss / len(data_loader)

# Training loop
num_epochs = 10
batch_size = 32

train_dataset = CustomDataset(train_df, model_name)
validate_dataset = CustomDataset(val_df, model_name)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    # Create a DataLoader for training and validation data
    # You'll need to customize this part based on your dataset and preprocessing
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_data_loader = DataLoader(validate_dataset, batch_size=batch_size, shuffle=True)

    
    for batch in train_data_loader:
        # print(batch)
        # print(type(batch))
        
        input_text1 = batch['encoded_input_text1']        # pairs
        input_text2 = batch['encoded_input_text2']        # fandoms
        targets = batch['targets']                        # y
        
        print("------------")
        print(f'Batch keys: {batch.keys()}')
        print("------------")
        print(f'Batch belongs to type: {type(batch)}')
        print("------------")
        print(input_text1)
        print(input_text2)
        print(targets)
        print("------------")
        
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        y_pred = model.forward(input_text1, input_text2)

        # Calculate loss
        loss = criterion(y_pred, targets)

        # Backpropagation and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Evaluate the model on the validation set
    val_loss = evaluate(model, val_data_loader)

    print(f'Epoch {epoch + 1}, Loss: {running_loss / len(train_data_loader)}, Val Loss: {val_loss}')

print('Finished Training')