In [1]:
%pip install -q torch transformers numpy pandas sentence-transformers -U scikit-learn

In [2]:
%pip install --upgrade numpy
%pip install --upgrade scikit-learn



In [3]:
# Mount Google Drive at /content/drive (requires the Colab runtime); the
# DATASET_DRIVE_* paths defined further down point into ./drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os
import json
import pandas as pd
from typing import List

In [5]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split

In [6]:
# Earlier path variants, kept commented out for reference only.
# PARENT_FOLDER = "PAN2020-authorship-verification"
# DATASET1_TRAIN = "pan20-authorship-verification-training-small/pan20-authorship-verification-training-small/pan20-authorship-verification-training-small-truth.jsonl"
# DATASET2_TRAIN = "pan20-authorship-verification-training-small/pan20-authorship-verification-training-small/pan20-authorship-verification-training-small.jsonl"
# DATASET1_TRAIN = "pan20-authorship-verification-training-small-truth.jsonl"
# DATASET2_TRAIN = "pan20-authorship-verification-training-small.jsonl"
# Active paths (relative to the working directory, inside the mounted Drive):
# the "-truth" file holds the labels, the other file holds the text pairs.
DATASET_DRIVE_TRAIN_GROUND = "./drive/MyDrive/NLP/pan20-authorship-verification-training-small-truth.jsonl"
DATASET_DRIVE_TRAIN = "./drive/MyDrive/NLP/pan20-authorship-verification-training-small.jsonl"

In [7]:
def get_dataframe_from_file(file_path: str) -> pd.DataFrame:
    """Load a JSON Lines file into a DataFrame with one row per parsed line.

    Args:
        file_path: Path to a text file containing one JSON document per line.

    Returns:
        A DataFrame built from the successfully parsed records, in file order.
        Lines that are not valid JSON are reported on stdout and skipped.

    Raises:
        OSError: If the file cannot be opened.
    """
    records = []

    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. a trailing newline) carry no record; skip them
            # quietly instead of reporting them as parse errors.
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON: {e}")

    return pd.DataFrame(records)

In [8]:
# Load the label file and the text-pair file, then join them on the shared `id`.
df_ground_truth = get_dataframe_from_file(DATASET_DRIVE_TRAIN_GROUND)
df_inputs = get_dataframe_from_file(DATASET_DRIVE_TRAIN)

# NOTE(review): the recorded outputs show more merged rows than ground-truth
# rows, which suggests some ids are repeated and fan out in this join --
# confirm whether those extra rows are wanted.
df_combined = pd.merge(df_ground_truth, df_inputs, on='id')

######################
#     CAREFUL!!!!!   #
######################
# Debug switch: uncommenting the next line keeps only the first 1024 rows.
#df_combined = df_combined.head(2**10)
print(len(df_combined))

52623


In [9]:
df_ground_truth.head()

Unnamed: 0,id,same,authors
0,6cced668-6e51-5212-873c-717f2bc91ce6,True,"[1446633, 1446633]"
1,3c6c188a-db28-59aa-8c09-3d0f799ff579,True,"[1446633, 1446633]"
2,b0cfa94f-c9ec-5aa5-8331-a5a249b664cf,True,"[1446633, 1446633]"
3,e6e86e73-9a7b-58f2-a652-a17b4a1bcabf,True,"[1446633, 1446633]"
4,4fe541af-912e-5a86-81a5-94c6d3891509,True,"[1446633, 1446633]"


In [10]:
len(df_ground_truth)

52601

In [11]:
def check_not_nulls(df: pd.DataFrame) -> None:
    """Print the number of missing values found in each column of ``df``."""
    missing_per_column = df.isna().sum()
    print(missing_per_column)

In [12]:
def count_duplicate_ids(df: pd.DataFrame) -> int:
    """Count the rows whose ``id`` value occurs more than once in ``df``.

    Every occurrence is counted (``keep=False``), so an id that appears three
    times contributes 3 to the result.

    Args:
        df: Frame with an ``id`` column.

    Returns:
        Number of rows that share their ``id`` with at least one other row.
    """
    # Summing the boolean mask avoids materialising the filtered frame.
    return int(df.duplicated(subset=['id'], keep=False).sum())

In [13]:
check_not_nulls(df_ground_truth)

id         0
same       0
authors    0
dtype: int64


In [14]:
check_not_nulls(df_inputs)

id         0
fandoms    0
pair       0
dtype: int64


Only on training data

## Generate Dataset

- Robust dataset: separate each pair into its two texts, keeping their fandoms. Use the fandoms to generate a new dataset of pairs.

In [15]:
# Sanity check: both files must contain the same number of rows with a repeated id.
assert count_duplicate_ids(df_ground_truth) == count_duplicate_ids(df_inputs)

In [16]:
# assert len(df_combined) - len(df_inputs) == 22

Se eliminan las columnas "authors" y "fandoms", ya que no dan información relevante para el entrenamiento del modelo. La columna "same" se conserva: es la comparación entre los dos ids de autor y es la salida del modelo.

In [17]:
# Keep only the columns used downstream: the id, the label and the text pair.
df_combined = df_combined.drop(columns=["authors", "fandoms"])

Rename "same" to "y"

In [18]:
# `same` is the boolean label of each pair; expose it as the target column `y`.
df_combined = df_combined.rename(columns={'same': 'y'})

In [19]:
df_combined.head()

Unnamed: 0,id,y,pair
0,6cced668-6e51-5212-873c-717f2bc91ce6,True,"[I shift a bit, warily letting my eyes dart fr..."
1,3c6c188a-db28-59aa-8c09-3d0f799ff579,True,"[I shift a bit, warily letting my eyes dart fr..."
2,b0cfa94f-c9ec-5aa5-8331-a5a249b664cf,True,[A single tear escaped me as I left. I did hav...
3,e6e86e73-9a7b-58f2-a652-a17b4a1bcabf,True,"[""Ja."" Ludwig kept his gaze upon her, solidly...."
4,4fe541af-912e-5a86-81a5-94c6d3891509,True,"[And he did. Slowly, hesitantly...but coming f..."


In [20]:
df_combined.iloc[0]

id                   6cced668-6e51-5212-873c-717f2bc91ce6
y                                                    True
pair    [I shift a bit, warily letting my eyes dart fr...
Name: 0, dtype: object

In [21]:
# Split each two-element `pair` into its own text column. Building the frame
# once from a plain list avoids creating one pd.Series per row, which is what
# `.apply(pd.Series)` does and which is very slow on tens of thousands of rows.
pair_columns = pd.DataFrame(
    df_combined['pair'].tolist(),
    index=df_combined.index,
    columns=['text1', 'text2'],
)
df_combined[['text1', 'text2']] = pair_columns
df_combined = df_combined.drop("pair", axis=1)

In [22]:
df_combined.head()

Unnamed: 0,id,y,text1,text2
0,6cced668-6e51-5212-873c-717f2bc91ce6,True,"I shift a bit, warily letting my eyes dart fro...","""All will become one with Russia,"" he said, al..."
1,3c6c188a-db28-59aa-8c09-3d0f799ff579,True,"I shift a bit, warily letting my eyes dart fro...","Suddenly, a piece of ice falls into the pit of..."
2,b0cfa94f-c9ec-5aa5-8331-a5a249b664cf,True,A single tear escaped me as I left. I did have...,"got the Yang yoyo."" Kimiko pulled the other ha..."
3,e6e86e73-9a7b-58f2-a652-a17b4a1bcabf,True,"""Ja."" Ludwig kept his gaze upon her, solidly. ...",SilverGray lll...YellowRagged llll...GrayMilli...
4,4fe541af-912e-5a86-81a5-94c6d3891509,True,"And he did. Slowly, hesitantly...but coming fr...","""Let""s go,"" Raimondo said and then started in ..."


In [23]:
df_combined.iloc[1, 1]

True

In [24]:
# Mean length of a single text, pooling both text columns. `.str.len()` counts
# characters, so this is a character count and not a token count.
total_characters = (
    df_combined['text1'].str.len().sum()
    + df_combined['text2'].str.len().sum()
)
mean_length = int(total_characters / (len(df_combined) * 2))
mean_length

21441

In [25]:
class CustomDataset(Dataset):
    """Text-pair dataset that tokenizes both texts of a row on access.

    The frame is read positionally: column 1 is the label, columns 2 and 3 are
    the two texts.

    Args:
        df: Frame holding the pairs in the column order described above.
        model_name: Name passed to ``AutoTokenizer.from_pretrained``.
        max_len: Number of tokens each text is truncated/padded to. Values
            above ``MAX_SUPPORTED_TOKENS`` are clamped to it.
    """

    # Ceiling for `max_len`. The previous implementation always tokenized to
    # 512 regardless of `max_len`, and callers in this notebook pass a
    # character-based mean length far above that, so larger requests are
    # clamped rather than honoured.
    # NOTE(review): presumably 512 is also the encoder's positional limit -- confirm.
    MAX_SUPPORTED_TOKENS = 512

    def __init__(self, df, model_name, max_len=512):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.data = df
        self.max_len = min(int(max_len), self.MAX_SUPPORTED_TOKENS)

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Tokenize one text to exactly ``self.max_len`` tokens."""
        # padding='max_length' gives every item the same length, so the default
        # DataLoader collate can stack items when batch_size > 1. With
        # padding=True a single text is never padded and short texts would
        # produce tensors of differing lengths.
        return self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, index):
        """Return the two encodings and the float label of row ``index``."""
        return {
            "encoded_input_text1": self._encode(self.data.iloc[index, 2]),
            "encoded_input_text2": self._encode(self.data.iloc[index, 3]),
            "targets": torch.tensor(int(self.data.iloc[index, 1]), dtype=torch.float)
        }

# Model

In [26]:
# transformer without woth pairs
class TransformerModel(nn.Module):
    """Siamese scorer: one shared pretrained encoder applied to two texts.

    Each text is encoded, the hidden state at position 0 is projected by a
    shared linear layer, and the sigmoid of the cosine similarity between the
    two projections is returned.

    Args:
        model_name: Name passed to ``AutoModel.from_pretrained``.
        freeze_transformer: When true, the encoder's parameters are excluded
            from gradient updates and only the head is trainable.
    """

    def __init__(self, model_name, freeze_transformer):
        super(TransformerModel, self).__init__()
        self.transformer = AutoModel.from_pretrained(model_name)

        if freeze_transformer:
            for param in self.transformer.parameters():
                param.requires_grad = False

        # assumes the encoder's hidden size is 768 -- TODO confirm for model_name
        self.dense1 = nn.Linear(768, 512)
        self.dropout = nn.Dropout(0.4)
        self.cosine = nn.CosineSimilarity(dim=1)
        # NOTE(review): `dense` is not used by forward(); it is kept so the
        # state_dict keys do not change for existing checkpoints.
        self.dense = nn.Linear(1, 1)
        self.gelu = nn.GELU()
        self.sigmoid = nn.Sigmoid()

    def _embed(self, encoded_input, device):
        """Encode one side of the pair; return the hidden state at position 0."""
        input_ids = encoded_input['input_ids'].to(device)
        attention_mask = encoded_input['attention_mask'].to(device)

        # The `[:, 0, :]` indexing drops the middle axis, so inputs are expected
        # as rank-3 tensors (presumably (batch, 1, seq_len) from the dataset's
        # per-item `return_tensors='pt'` -- confirm against the caller).
        return self.transformer(
            input_ids=input_ids[:, 0, :],
            attention_mask=attention_mask[:, 0, :],
        ).last_hidden_state[:, 0]

    def forward(self, encoded_input_text1, encoded_input_text2):
        """Score a batch of text pairs.

        Args:
            encoded_input_text1: Mapping with ``input_ids`` and
                ``attention_mask`` tensors for the first texts.
            encoded_input_text2: Same mapping for the second texts.

        Returns:
            1-D tensor with one score per pair. Because cosine similarity lies
            in [-1, 1], the sigmoid output is confined to roughly (0.27, 0.73).
        """
        # Follow whatever device the model lives on instead of forcing CUDA, so
        # the same code runs on CPU-only machines.
        device = next(self.parameters()).device

        model_output_text1 = self._embed(encoded_input_text1, device)
        model_output_text2 = self._embed(encoded_input_text2, device)

        x_a, x_b = self.dense1(model_output_text1), self.dense1(model_output_text2)
        x_a, x_b = self.gelu(self.dropout(x_a)), self.gelu(self.dropout(x_b))
        sem_sim = self.cosine(x_a, x_b)

        return self.sigmoid(sem_sim)

## Test mio para comprobar que funciona y corre el modelo

In [27]:
model_name = 'AnnaWegmann/Style-Embedding' # 'bert-base-uncased'  # Choose the appropriate pretrained model #'AnnaWegmann/Style-Embedding'

In [28]:
# 80/20 split with a fixed seed so the partition is reproducible.
train_df, val_df = train_test_split(df_combined, test_size=0.2, random_state=42)
# NOTE(review): mean_length is measured in characters, not tokens, so it is not
# a meaningful token limit to pass as max_len.
train_dataset = CustomDataset(train_df, model_name, max_len=mean_length)
# batch_size=1: this loader only feeds the single-sample smoke test.
train_data_loader = DataLoader(train_dataset, batch_size=1, pin_memory=True, shuffle=True)

In [29]:
print(train_df.index)

Index([29857, 11475, 43247, 22425,  4589, 49882, 36655,  8592, 34320, 33051,
       ...
       47191, 21962, 37194, 16850,  6265, 11284, 44732, 38158,   860, 15795],
      dtype='int64', length=42098)


Small test to see that everything works

In [30]:
# Anna Wegmann style embeddings - hard negative mining
model = TransformerModel(model_name=model_name, freeze_transformer=True)

if torch.cuda.is_available():
    model.cuda()

model.train()

# Smoke test: push a single batch through the model and show its score.
for batch in train_data_loader:
    encoded_input_text1 = batch["encoded_input_text1"]
    encoded_input_text2 = batch["encoded_input_text2"]

    # Call the module itself rather than .forward() so nn.Module hooks run.
    x = model(encoded_input_text1, encoded_input_text2)
    print(x)
    break


tensor([0.5861], device='cuda:0', grad_fn=<SigmoidBackward0>)


In [31]:
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(torch.cuda.current_device()))

0
Tesla T4


# Training model

Ver las diapositivas a partir de la 152 y usar la notación de las diapositivas (ejemplo: bs_sl -> Batch size - Sequence Length)

In [32]:
def evaluate(model, data_loader, criterion=None):
    """Compute the mean batch loss and the accuracy of ``model`` on a loader.

    The model is switched to eval mode and is left in that mode on return.

    Args:
        model: Callable module taking the two encoded inputs of a batch and
            returning one score per pair.
        data_loader: Iterable of batches, each a mapping with the keys
            ``encoded_input_text1``, ``encoded_input_text2`` and ``targets``.
        criterion: Loss applied to ``(scores, targets)``. Defaults to
            ``nn.BCELoss()`` when omitted.

    Returns:
        ``(average_loss, accuracy)``. Both are NaN when the loader yields no
        samples.
    """
    if criterion is None:
        criterion = nn.BCELoss()

    model.eval()
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0
    num_batches = 0
    y_pred = predictions = targets = None

    with torch.no_grad():
        for batch in data_loader:
            # The encoded inputs are mappings of tensors and have no .cuda();
            # they are passed through as-is and the model places them on its
            # own device.
            y_pred = model(batch['encoded_input_text1'], batch['encoded_input_text2'])
            # Compare on whatever device the scores were produced on.
            targets = batch['targets'].to(y_pred.device)

            # Calculate loss
            total_loss += criterion(y_pred, targets).item()
            num_batches += 1

            # Calculate accuracy: scores above 0.5 count as the positive class
            predictions = (y_pred > 0.5).float()
            correct_predictions += (predictions == targets).sum().item()
            total_samples += targets.size(0)

    if total_samples == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return float('nan'), float('nan')

    accuracy = correct_predictions / total_samples
    average_loss = total_loss / num_batches

    # Debug output for the last batch only.
    print(f"predictions (real): {y_pred}")
    print(f"predictions: {predictions}")
    print(f"ground_truth: {targets}")

    return average_loss, accuracy

In [33]:
def training_step(encoded_input_text1, encoded_input_text2, targets, model, optimizer, criterion):
    """Run one optimisation step on a single batch and return its loss.

    The caller must have put ``model`` in training mode beforehand.

    Args:
        encoded_input_text1: Encoded first texts of the batch.
        encoded_input_text2: Encoded second texts of the batch.
        targets: Tensor of labels, one per pair.
        model: Module mapping the two encoded inputs to one score per pair.
        optimizer: Optimizer over the model's trainable parameters.
        criterion: Loss applied to ``(scores, targets)``.

    Returns:
        The batch loss as a Python float.
    """
    # Forward pass (call the module, not .forward(), so hooks run).
    y_pred = model(encoded_input_text1, encoded_input_text2)

    # Put the labels on the same device as the scores; forcing .cuda() would
    # crash on CPU-only machines.
    loss = criterion(y_pred, targets.to(y_pred.device))

    # Backpropagate and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()

In [None]:
model = TransformerModel(model_name=model_name, freeze_transformer=True)

if torch.cuda.is_available():
    model.cuda()

# Binary cross entropy on the model's sigmoid scores.
criterion = nn.BCELoss()

# Adam over the model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Split the data into training and validation sets (fixed seed, 80/20).
train_df, val_df = train_test_split(df_combined, test_size=0.2, random_state=42)

# Training configuration
NUM_EPOCHS = 10
BATCH_SIZE = 32

train_dataset = CustomDataset(train_df, model_name)
validate_dataset = CustomDataset(val_df, model_name)

train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Validation does not need shuffling; a fixed order keeps runs comparable.
val_data_loader = DataLoader(validate_dataset, batch_size=BATCH_SIZE, shuffle=False)

for epoch in range(NUM_EPOCHS):
    model.train()
    running_loss = 0.0

    for i, batch in enumerate(train_data_loader):
        input_text1 = batch['encoded_input_text1']
        input_text2 = batch['encoded_input_text2']
        targets = batch['targets']

        loss = training_step(input_text1, input_text2, targets, model, optimizer, criterion)
        running_loss += loss

        if i % 100 == 99:  # Print every 100 mini-batches
            print(f"Epoch [{epoch + 1}/{NUM_EPOCHS}], "
                  f"Step [{i + 1}/{len(train_data_loader)}], "
                  f"Loss: {running_loss / 100}")
            running_loss = 0.0

    # Save the model weights after each epoch. The save used to sit after the
    # loop, so only the last epoch was ever written.
    checkpoint_path = f"model_epoch_{epoch + 1}.pt"
    torch.save(model.state_dict(), checkpoint_path)

    # Evaluate the model on the validation set after each epoch
    val_loss, val_accuracy = evaluate(model, val_data_loader)
    print(f'Epoch [{epoch + 1}/{NUM_EPOCHS}], Validation Loss: {val_loss}, Accuracy: {val_accuracy}')

print('Finished Training')

Epoch [1/10], Step [100/1316], Loss: 0.6712942403554917
Epoch [1/10], Step [200/1316], Loss: 0.6747627449035645
Epoch [1/10], Step [300/1316], Loss: 0.6723224359750748
Epoch [1/10], Step [400/1316], Loss: 0.6653332728147506
Epoch [1/10], Step [500/1316], Loss: 0.6660202199220657


QUEDA:

1. El validate-test serían pasar en un bucle el forward del otro zip y capturar resultados para conseguir las métricas
2. Fine-tuning (mínimo)
3. Escribir cosas

PD: Quitar el print de los shape

In [None]:
raise ValueError('FALTA HACER LA PARTE DE VALIDACIÓN, PONER LAS COSAS BIEN!!!! (SOBRE TODO LOS FOLDER)')

In [None]:
# Location and file names of the test split.
# NOTE(review): PARENT_FOLDER only appears in a commented-out line of the
# configuration cell, so this f-string raises NameError unless it is defined
# somewhere else -- confirm and define it before running.
FOLDER = f"{PARENT_FOLDER}/pan20-authorship-verification-test/pan20-authorship-verification-test"
VALUES_FILE = "pan20-authorship-verification-test.jsonl"
GROUND_TRUTH = "pan20-authorship-verification-test-truth.jsonl"

In [None]:
# Load the test labels and test text pairs and join them on `id`.
# Note: this rebinds df_ground_truth and df_inputs, so the training frames
# loaded earlier under those names are no longer reachable afterwards.
df_ground_truth = get_dataframe_from_file(f"{FOLDER}/{GROUND_TRUTH}")
df_inputs = get_dataframe_from_file(f"{FOLDER}/{VALUES_FILE}")

df_combined_val = pd.merge(df_ground_truth, df_inputs, on='id')

In [None]:
# Apply the training-frame clean-up to the test frame: drop the unused columns
# and expose the label as `y`. This must start from df_combined_val; the
# training frame df_combined no longer has "authors"/"fandoms" at this point.
df_combined_val = df_combined_val.drop("authors", axis=1).drop("fandoms", axis=1)
df_combined_val = df_combined_val.rename(columns={'same': 'y'})

In [None]:
df_combined_val.head()

In [None]:
df_combined_val.iloc[0]

In [None]:
# Split each two-element `pair` into its own text column. Building the frame
# once from a plain list avoids the per-row pd.Series construction that
# `.apply(pd.Series)` performs, which is very slow on large frames.
pair_columns_val = pd.DataFrame(
    df_combined_val['pair'].tolist(),
    index=df_combined_val.index,
    columns=['text1', 'text2'],
)
df_combined_val[['text1', 'text2']] = pair_columns_val
df_combined_val = df_combined_val.drop("pair", axis=1)

In [None]:
# Restore the weights saved for the final epoch, mapped onto the model's
# current device so the checkpoint also loads on a CPU-only runtime.
model.load_state_dict(
    torch.load("model_epoch_10.pt", map_location=next(model.parameters()).device)
)
model.eval()

correct = 0
total = 0

# Evaluate on the held-out test frame (the training frame was used here before).
test_dataset = CustomDataset(df_combined_val, model_name)
test_data_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

with torch.no_grad():
    for batch in test_data_loader:
        input_text1 = batch['encoded_input_text1']
        input_text2 = batch['encoded_input_text2']

        outputs = model(input_text1, input_text2)
        targets = batch['targets'].to(outputs.device)

        # The model returns one sigmoid score per pair, so classify by
        # thresholding at 0.5; torch.max over dim 1 is invalid for a 1-D output.
        predicted = (outputs > 0.5).float()
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {100 * accuracy:.2f}%')
