In [None]:
# !python3 -m pip install -q torch transformers numpy pandas sentence-transformers -U scikit-learn

In [1]:
import os
import json
import pandas as pd
from typing import List

In [2]:
# Location of the PAN 2020 authorship-verification "small" training split.
PARENT_FOLDER = "PAN2020-authorship-verification"
DATASET1_TRAIN = (
    "pan20-authorship-verification-training-small/"
    "pan20-authorship-verification-training-small-truth.jsonl"
)
DATASET2_TRAIN = (
    "pan20-authorship-verification-training-small/"
    "pan20-authorship-verification-training-small.jsonl"
)
# FILE_PATH_1 -> ground-truth labels, FILE_PATH_2 -> the text pairs.
FILE_PATH_1 = "/".join((PARENT_FOLDER, DATASET1_TRAIN))
FILE_PATH_2 = "/".join((PARENT_FOLDER, DATASET2_TRAIN))

In [3]:
def get_dataframe_from_file(file_path: str) -> pd.DataFrame:
    """Load a JSON Lines file into a DataFrame with one row per JSON object.

    Args:
        file_path: Path of a text file holding one JSON document per line.

    Returns:
        A DataFrame built from the successfully parsed lines, in file order.
        Lines that are not valid JSON are reported on stdout and skipped;
        blank lines are skipped silently.

    Raises:
        OSError: If the file cannot be opened.
    """
    records = []

    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # An empty line (e.g. a trailing newline) is not a record.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON: {e}")

    return pd.DataFrame(records)

In [4]:
# Load the labels and the text pairs, then inner-join them on the shared `id`.
df_ground_truth, df_inputs = (
    get_dataframe_from_file(path) for path in (FILE_PATH_1, FILE_PATH_2)
)

df_combined = df_ground_truth.merge(df_inputs, on='id')

In [5]:
# Preview the first rows of the ground-truth frame.
df_ground_truth.head()

Unnamed: 0,id,same,authors
0,6cced668-6e51-5212-873c-717f2bc91ce6,True,"[1446633, 1446633]"
1,3c6c188a-db28-59aa-8c09-3d0f799ff579,True,"[1446633, 1446633]"
2,b0cfa94f-c9ec-5aa5-8331-a5a249b664cf,True,"[1446633, 1446633]"
3,e6e86e73-9a7b-58f2-a652-a17b4a1bcabf,True,"[1446633, 1446633]"
4,4fe541af-912e-5a86-81a5-94c6d3891509,True,"[1446633, 1446633]"


In [6]:
# Number of rows in the ground-truth frame.
len(df_ground_truth)

52601

In [7]:
def check_not_nulls(df: pd.DataFrame) -> None:
    """Print, for every column of ``df``, how many of its values are null."""
    nulls_per_column = df.isnull().sum()
    print(nulls_per_column)

In [8]:
def count_duplicate_ids(df: pd.DataFrame) -> int:
    """Count the rows whose ``id`` value occurs more than once in ``df``.

    Every occurrence of a repeated id is counted (``keep=False``), so an id
    that appears three times contributes 3 to the result.

    Args:
        df: Frame with an ``id`` column.

    Returns:
        The number of rows that share their ``id`` with at least one other row.

    Raises:
        KeyError: If ``df`` has no ``id`` column.
    """
    # Summing the boolean mask avoids materialising the duplicated rows.
    return int(df.duplicated(subset=['id'], keep=False).sum())

In [9]:
# Null count per column of the ground-truth frame.
check_not_nulls(df_ground_truth)

id         0
same       0
authors    0
dtype: int64


In [10]:
# Null count per column of the inputs frame.
check_not_nulls(df_inputs)

id         0
fandoms    0
pair       0
dtype: int64


The following consistency checks apply only to the training data.

In [11]:
# Sanity check: both files contain the same number of rows with a repeated `id`.
assert count_duplicate_ids(df_ground_truth) == count_duplicate_ids(df_inputs)

In [12]:
# The merged frame has exactly 22 more rows than the inputs frame.
# NOTE(review): presumably caused by the repeated ids counted in the previous
# check -- a merge on a key that repeats on both sides yields every combination
# of the matching rows. Confirm before relying on df_combined row counts.
assert len(df_combined) - len(df_inputs) == 22

Se elimina la columna "same" porque no aporta información relevante para el entrenamiento del modelo: es solo el resultado de comparar los dos ids de autor, que son las salidas del modelo.

In [13]:
# errors="ignore" keeps this cell idempotent: re-running it after "same" has
# already been removed is a no-op instead of a KeyError.
df_combined = df_combined.drop(columns="same", errors="ignore")

Rename "authors" to "y"

In [14]:
# The author ids become the target column `y`.
df_combined = df_combined.rename({'authors': 'y'}, axis='columns')

In [15]:
# Preview the merged frame after dropping "same" and renaming "authors" to "y".
df_combined.head()

Unnamed: 0,id,y,fandoms,pair
0,6cced668-6e51-5212-873c-717f2bc91ce6,"[1446633, 1446633]","[Guardians of Ga'Hoole, Hetalia - Axis Powers]","[I shift a bit, warily letting my eyes dart fr..."
1,3c6c188a-db28-59aa-8c09-3d0f799ff579,"[1446633, 1446633]","[Guardians of Ga'Hoole, Warriors]","[I shift a bit, warily letting my eyes dart fr..."
2,b0cfa94f-c9ec-5aa5-8331-a5a249b664cf,"[1446633, 1446633]","[Guardians of Ga'Hoole, Xiaolin Showdown]",[A single tear escaped me as I left. I did hav...
3,e6e86e73-9a7b-58f2-a652-a17b4a1bcabf,"[1446633, 1446633]","[Hetalia - Axis Powers, Warriors]","[""Ja."" Ludwig kept his gaze upon her, solidly...."
4,4fe541af-912e-5a86-81a5-94c6d3891509,"[1446633, 1446633]","[Hetalia - Axis Powers, Xiaolin Showdown]","[And he did. Slowly, hesitantly...but coming f..."


In [16]:
# Inspect the first merged row (positional access).
df_combined.iloc[0]

id                      6cced668-6e51-5212-873c-717f2bc91ce6
y                                         [1446633, 1446633]
fandoms       [Guardians of Ga'Hoole, Hetalia - Axis Powers]
pair       [I shift a bit, warily letting my eyes dart fr...
Name: 0, dtype: object

## Generate Dataset

- Robust dataset: separate the texts of each pair and keep each one together with its fandom. Use the fandoms to generate a new dataset of pairs.

In [17]:
def generate_dataset(df: pd.DataFrame) -> dict:
    """Group the first text of every pair under the first fandom of its row.

    Args:
        df: Frame with the columns ``fandoms``, ``pair``, ``y`` and ``id``.
            ``fandoms`` and ``pair`` hold sequences; only element 0 of each
            is used.

    Returns:
        A dict mapping fandom -> list of ``(text, y, id)`` tuples in row
        order. Within one fandom each distinct text is kept once (its first
        occurrence).

    Notes:
        The previous implementation created the list for a new fandom without
        appending the row that introduced it, so the first row of every
        fandom was lost. Its duplicate test compared the whole ``pair`` list
        against the stored tuples, which can never be equal, so nothing was
        ever filtered. Duplicates are now detected on the stored text itself.
        NOTE(review): de-duplicating on the text is the presumed intent of
        that test -- confirm.
    """
    dataset = {}
    seen_texts = {}  # fandom -> set of texts already stored for it

    # Iterating the columns directly is much cheaper than DataFrame.iterrows().
    for fandoms, pair, y, row_id in zip(df['fandoms'], df['pair'], df['y'], df['id']):
        fandom, text = fandoms[0], pair[0]
        entries = dataset.setdefault(fandom, [])
        seen = seen_texts.setdefault(fandom, set())

        if text in seen:
            continue

        seen.add(text)
        entries.append((text, y, row_id))

    return dataset

In [18]:
# Keep the result in a variable and display only a per-fandom summary:
# echoing the whole dict would write every full text into the cell output.
fandom_dataset = generate_dataset(df_combined)
pd.Series(
    {fandom: len(entries) for fandom, entries in fandom_dataset.items()},
    name="texts",
    dtype="int64",
).sort_values(ascending=False).head(10)

: 

# Model

In [None]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split

In [None]:
# Average combined character length (first text + second text) per pair,
# truncated to an integer.
accumulator = sum(len(texts[0]) + len(texts[1]) for texts in df_combined['pair'])

mean = int(accumulator / len(df_combined['pair']))
mean

In [None]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes the texts stored in each row's ``pair``.

    Every item is a dict with:
      * ``encoded_input``: the tokenizer output for the row's texts, truncated
        and padded to exactly ``max_len`` tokens so that items have a fixed
        shape and can be stacked into batches of any size;
      * ``targets``: the first entry of the row's ``y`` as a float tensor.

    Args:
        df: Frame with a ``pair`` column (sequence of texts) and a ``y``
            column (sequence whose first entry is convertible with ``int``).
        model_name: HuggingFace identifier used to load the tokenizer.
        max_len: Requested token length, capped at ``MAX_SEQUENCE_LENGTH``.
    """

    # The previous implementation always tokenized with max_length=512 and
    # ignored ``max_len``; callers in this notebook pass 514.
    # NOTE(review): 512 is presumably the encoder's real limit -- confirm
    # before raising this cap.
    MAX_SEQUENCE_LENGTH = 512

    def __init__(self, df, model_name='AnnaWegmann/Style-Embedding', max_len=512):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.data = df
        self.targets = self.data['y']
        self.max_len = min(max_len, self.MAX_SEQUENCE_LENGTH)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Access the column by name (not by position 3) and the row by
        # position, so the lookup survives column reordering and the
        # non-contiguous index labels left behind by train_test_split.
        texts = list(self.data['pair'].iloc[index])

        encoded_input = self.tokenizer(
            texts,
            max_length=self.max_len,
            padding='max_length',  # fixed shape -> items can be collated
            truncation=True,
            return_tensors='pt',
        )

        return {
            "encoded_input": encoded_input,
            # ``self.targets`` is a Series, so take the row first and then the
            # first element of its list (``.iloc[index, 0]`` raises on a Series).
            "targets": torch.tensor(int(self.targets.iloc[index][0]), dtype=torch.float),
        }

In [None]:
# Transformer encoder applied to the texts of a pair (no pair-level head yet).
class TransformerModel(nn.Module):
    """Wraps a pretrained encoder and returns its raw output.

    Args:
        max_len: Stored as ``self.max_len``; not used by ``forward``.
        model_name: HuggingFace identifier of the encoder to load.

    NOTE(review): ``forward`` runs the encoder under ``torch.no_grad()`` and
    returns the encoder's output object, so this module does not yet produce
    a trainable prediction. The layers ``dense1``, ``dropout``, ``cosine``,
    ``dense`` and ``gelu`` are registered but unused.
    TODO: wire in the head sketched in an earlier revision -- project both
    text embeddings with dense1 -> dropout -> GELU, take their cosine
    similarity, then weight it with ``dense``.
    """

    def __init__(self, max_len=512, model_name='AnnaWegmann/Style-Embedding'):
        super().__init__()
        self.max_len = max_len
        self.transformer = AutoModel.from_pretrained(model_name)
        self.dense1 = nn.Linear(512, 768)
        self.dropout = nn.Dropout(0.1)
        self.cosine = nn.CosineSimilarity(dim=1)
        self.dense = nn.Linear(1, 1)
        self.gelu = nn.GELU()

    def forward(self, encoded_input):
        """Encode the tokenized texts.

        Args:
            encoded_input: Mapping with ``input_ids`` and ``attention_mask``
                tensors whose last dimension is the sequence length. Any
                leading dimensions (e.g. batch and text-within-pair) are
                collapsed into one before calling the encoder; 2-D inputs
                pass through unchanged.

        Returns:
            Whatever the wrapped encoder returns, computed without gradients.
        """
        input_ids = encoded_input['input_ids']
        attention_mask = encoded_input['attention_mask']

        # A DataLoader stacks per-item (texts, seq_len) tensors into
        # (batch, texts, seq_len); the encoder is called with 2-D tensors.
        input_ids = input_ids.reshape(-1, input_ids.shape[-1])
        attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1])

        with torch.no_grad():
            model_output = self.transformer(input_ids=input_ids, attention_mask=attention_mask)

        return model_output

## Prueba propia para comprobar que el modelo funciona y se ejecuta

In [None]:
# Number of distinct fandom names over all entries of every row's `fandoms` list.
# NOTE(review): `num_fandoms` is not used by any visible cell.
num_fandoms = len(df_combined['fandoms'].explode().unique())
model_name = 'AnnaWegmann/Style-Embedding' # 'bert-base-uncased'  # Choose the appropriate pretrained model
# Anna Wegmann's style embeddings - hard negative mining
# Earlier constructor call, kept for reference:
# model = TransformerModel(num_fandoms, model_name)
model = TransformerModel(max_len=514)

In [None]:
# Put the model in training mode, see
# https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch
model.train()

# Hold out 20% of the rows for validation; the fixed seed makes the split reproducible.
train_df, val_df = train_test_split(df_combined, random_state=42, test_size=0.2)

# One pair per batch, reshuffled on every pass over the loader.
train_dataset = CustomDataset(train_df, model_name, max_len=514)
train_data_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=1)

In [None]:
# Positional access: `train_df['pair'][0]` looks up index *label* 0, which
# raises KeyError whenever the shuffled split placed that row in `val_df`.
train_df['pair'].iloc[0]

In [None]:
# Inspect the index labels of the training split.
print(train_df.index)

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

MODEL_NAME = 'AnnaWegmann/Style-Embedding'

# Texts we want embeddings for: the texts of the first training row.
# `.iloc[0]` is positional; indexing with `[0]` would look up index label 0,
# which may not exist in the shuffled training split.
sentences = train_df['pair'].iloc[0]

# Load tokenizer and encoder from the HuggingFace Hub.
# The bare encoder gets its own name so that `model` keeps referring to the
# TransformerModel instance that the later cells (optimizer, training) use.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
style_encoder = AutoModel.from_pretrained(MODEL_NAME)

# Tokenize sentences
encoded_input = tokenizer(sentences, max_length=512, padding=True, truncation=True, return_tensors='pt')
print(encoded_input)

# Smoke test: run a single batch from the loader through the encoder.
batch = next(iter(train_data_loader))
encoded_input = batch['encoded_input']
print("Input Shape:", encoded_input['input_ids'].shape)

# The loader stacks each item's (texts, seq_len) tensors into
# (batch, texts, seq_len); collapse the leading dimensions so the encoder
# receives 2-D (rows, seq_len) tensors.
input_ids = encoded_input['input_ids'].reshape(-1, encoded_input['input_ids'].shape[-1])
attention_mask = encoded_input['attention_mask'].reshape(-1, encoded_input['attention_mask'].shape[-1])

# Compute token embeddings
with torch.no_grad():
    model_output = style_encoder(input_ids=input_ids, attention_mask=attention_mask)

print(model_output)

In [None]:
# model_1 = AutoModel.from_pretrained('AnnaWegmann/Style-Embedding')
# model_1.train()

# encoded_input = tokenizer(train_df['pair'][0], max_length=512, padding='max_length', truncation=True, return_tensors='pt')

# with torch.no_grad():
#     model_output = model_1(**encoded_input)

# print(model_output.last_hidden_state[:, 0])

# Smoke test: push one batch through `model`.
# `model` must be the TransformerModel wrapper here (its forward takes the
# encoding mapping); re-run the model construction cell if `model` has been
# rebound to a bare AutoModel.
batch = next(iter(train_data_loader))

# The loader stacks each item's (texts, seq_len) tensors into
# (batch, texts, seq_len); collapse the leading dimensions to 2-D.
encoded_input = {
    key: tensor.reshape(-1, tensor.shape[-1])
    for key, tensor in batch["encoded_input"].items()
}

# Show shapes rather than the full tensors to keep the cell output readable.
print({key: tuple(tensor.shape) for key, tensor in encoded_input.items()})
print("targets:", batch["targets"])

# Call the module itself (not `.forward`) so registered hooks run.
x = model(encoded_input)
print(x)

# Training model

Ver las diapositivas a partir de la 152 y usar la notación de las diapositivas (ejemplo: bs_sl -> Batch size - Sequence Length)

In [None]:
# Loss: mean squared error (placeholder -- adapt to the task).
criterion = nn.MSELoss()

# Optimizer: Adam over every parameter registered on `model`.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# 80/20 train/validation split with a fixed seed.
train_df, val_df = train_test_split(df_combined, random_state=42, test_size=0.2)

# Mean validation loss of a model over a data loader.
def evaluate(model, data_loader, loss_fn=None):
    """Return the mean per-batch loss of ``model`` over ``data_loader``.

    Args:
        model: Module called as ``model(batch['encoded_input'])``; it must
            return a prediction tensor that ``loss_fn`` can compare with
            ``batch['targets']``.
        data_loader: Iterable of batches shaped like the items of
            ``CustomDataset``: dicts with the keys ``encoded_input`` and
            ``targets``.
        loss_fn: Callable ``(prediction, target) -> scalar tensor``. Defaults
            to the module-level ``criterion``.

    Returns:
        The average of the per-batch losses, or ``nan`` when the loader
        yields no batches.

    Side effects:
        Switches ``model`` to evaluation mode.

    NOTE(review): the previous version read the keys ``pairs``, ``fandoms``
    and ``y`` and called ``model(pairs, fandoms)``; neither matches what
    ``CustomDataset`` yields or what ``TransformerModel.forward`` accepts.
    """
    if loss_fn is None:
        loss_fn = criterion

    model.eval()
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch in data_loader:
            # Forward pass
            y_pred = model(batch['encoded_input'])

            # Calculate loss (customize based on your task)
            loss = loss_fn(y_pred, batch['targets'])

            total_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        # No data: the mean is undefined (avoid ZeroDivisionError).
        return float('nan')

    return total_loss / num_batches

# Training loop
num_epochs = 10
batch_size = 32

# CustomDataset needs the tokenizer's model name (as in the earlier
# `CustomDataset(train_df, model_name, ...)` call).
train_dataset = CustomDataset(train_df, model_name)
validate_dataset = CustomDataset(val_df, model_name)

# The loaders do not change between epochs, so build them once. Shuffling is
# only useful for training; the validation loss does not depend on order.
# NOTE(review): with batch_size > 1 every dataset item must have the same
# tensor shape to be collated -- confirm CustomDataset pads to a fixed length.
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_data_loader = DataLoader(validate_dataset, batch_size=batch_size, shuffle=False)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for batch in train_data_loader:
        # Keys produced by CustomDataset.__getitem__.
        encoded_input = batch['encoded_input']
        y = batch['targets']

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        # NOTE(review): TransformerModel.forward currently returns the
        # encoder's raw output computed under torch.no_grad(); the loss and
        # backward steps below need a model that returns a prediction tensor
        # shaped like `y` and carrying gradients.
        y_pred = model(encoded_input)

        # Calculate loss
        loss = criterion(y_pred, y)

        # Backpropagation and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Evaluate the model on the validation set
    val_loss = evaluate(model, val_data_loader)

    print(f'Epoch {epoch + 1}, Loss: {running_loss / len(train_data_loader)}, Val Loss: {val_loss}')

print('Finished Training')