In [None]:
!pip install -q transformers  rouge-score

In [None]:
import os
# Debugging aid: asks CUDA to run kernel launches synchronously so that a device-side
# error is reported at the call that caused it. It is set before torch is imported below.
# NOTE(review): synchronous launches slow GPU execution; consider removing this once debugging is done.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

import pandas as pd
import transformers
from torch.utils.data import Dataset, DataLoader


from transformers import AutoTokenizer, AutoModel
# Hugging Face Hub id of the pretrained sentence encoder. It is used for the tokenizer
# here and is also the default checkpoint of SentenceBertClass further down.
sentence_model = "sentence-transformers/paraphrase-MiniLM-L3-v2"
# use "sentence-transformers/all-mpnet-base-v2" to get better accuracy
# refer to: https://www.sbert.net/docs/sentence_transformer/pretrained_models.html#sentence-embedding-models for more information
# NOTE: when switching checkpoints, SentenceBertClass's `in_features` (default 384) has to be
# adjusted to the embedding size of the new model.
tokenizer = AutoTokenizer.from_pretrained(sentence_model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Mount Google Drive inside the Colab VM. The dataset and the saved checkpoint are later
# addressed through relative "drive/MyDrive/..." paths.
# NOTE(review): those relative paths presumably rely on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install spacy
!pip install tqdm



In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import os

In [None]:
# Hyper-parameters that seem to work best. You can change them if you want.
MAX_LEN = 512  # handed to the tokenizer as max_length (pad / truncate target) via cnndmData
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 1  # NOTE: the training cells further down reassign EPOCHS = 3 before the epoch loop runs
LEARNING_RATE = 1e-05

# Load the dataframes containing preprocessed samples from the CNN/DailyMail dataset.
# cnndmData reads the `sents`, `docs` and `y` fields of each row.
train_df = pd.read_json("drive/MyDrive/data/merged.json")
test_df = pd.read_json("drive/MyDrive/data/test_bdf.json")
print(f"Train: {train_df.shape}, test shape: {test_df.shape}")

Train: (530199, 3), test shape: (36195, 3)


## Create a Data Loader Class

- Create a dataset class (wrapped by a `DataLoader` below) that yields tokenized sentences and documents together with their labels.

In [None]:
class cnndmData(Dataset):
    """Map-style dataset yielding one tokenised (sentence, document, label) sample per row.

    Args:
        dataframe: table whose rows expose ``sents`` (candidate sentence),
            ``docs`` (document text) and ``y`` (label) fields.
        tokenizer: object providing ``batch_encode_plus``; it is called with the
            sentence and the document as a two-element batch.
        max_len: value forwarded to the tokenizer as ``max_length``; both texts
            are padded / truncated to it.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.len = len(dataframe)

    def __len__(self):
        return self.len

    @staticmethod
    def _squash_whitespace(value):
        """Stringify ``value`` and collapse every whitespace run into a single space."""
        return " ".join(str(value).split())

    def __getitem__(self, index):
        """Return a dict of ``torch.long`` tensors for the row at position ``index``."""
        row = self.data.iloc[index]
        texts = [self._squash_whitespace(row.sents), self._squash_whitespace(row.docs)]

        # Sentence and document are encoded in one call: entry 0 is the sentence, entry 1 the document.
        encoded = self.tokenizer.batch_encode_plus(
            texts,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )
        token_ids = encoded['input_ids']
        attention = encoded['attention_mask']

        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        sample = {}
        sample['sent_ids'] = as_long(token_ids[0])
        sample['doc_ids'] = as_long(token_ids[1])
        sample['sent_mask'] = as_long(attention[0])
        sample['doc_mask'] = as_long(attention[1])
        # The label is wrapped in a list so each sample contributes a length-1 vector.
        sample['targets'] = as_long([row.y])
        return sample

# Wrap both dataframes so that indexing them yields tokenised tensors.
training_set = cnndmData(train_df, tokenizer, MAX_LEN)
testing_set = cnndmData(test_df, tokenizer, MAX_LEN)

# Training batches are reshuffled on every pass over the data.
train_params = {'batch_size': TRAIN_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}

# Evaluation gains nothing from shuffling: the final metrics are order-independent, and a
# fixed order keeps the intermediate validation logs reproducible from run to run.
test_params = {'batch_size': VALID_BATCH_SIZE, 'shuffle': False, 'num_workers': 0}

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

## Build Model

- Build the model on top of a pretrained Sentence-BERT encoder.

In [None]:
# get mean pooling for sentence bert models
# ref https://www.sbert.net/examples/applications/computing-embeddings/README.html#sentence-embeddings-with-transformers
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked-out positions.

    Args:
        model_output: encoder output whose first element holds the token embeddings.
        attention_mask: mask with one entry per token; it is broadcast over the
            embedding axis and used as the averaging weight.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total, with the
        divisor clamped to at least 1e-9 so fully masked rows do not divide by zero.
    """
    token_embeddings = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand_as(token_embeddings).float()
    weighted_total = (token_embeddings * weights).sum(dim=1)
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / weight_total


# Add a dense layer, dropout and a sigmoid classification head on top of the pretrained sentence encoder.
# Note that different sentence-transformer models have different embedding sizes, so `in_features` must match the chosen model.
class SentenceBertClass(torch.nn.Module):
    """Binary classifier over a (sentence, document) pair using a shared sentence encoder.

    Both texts are encoded by the same pretrained model (``self.l1``) and mean-pooled.
    The two embeddings and their element-wise product are concatenated and passed
    through a dense layer, ReLU, dropout and a final sigmoid unit.

    Args:
        model_name: checkpoint name handed to ``AutoModel.from_pretrained``.
        in_features: width of one pooled embedding; the dense layer takes three times this.
    """

    def __init__(self, model_name=sentence_model, in_features=384):
        super().__init__()  # set up torch.nn.Module bookkeeping
        hidden_width = 768
        self.l1 = AutoModel.from_pretrained(model_name)
        self.pre_classifier = torch.nn.Linear(3 * in_features, hidden_width)
        self.dropout = torch.nn.Dropout(p=0.3)
        self.classifier = torch.nn.Linear(hidden_width, 1)
        self.classifierSigmoid = torch.nn.Sigmoid()

    def _embed(self, attention_mask, **encoder_inputs):
        """Run the shared encoder on one text and mean-pool its token embeddings."""
        encoded = self.l1(attention_mask=attention_mask, **encoder_inputs)
        return mean_pooling(encoded, attention_mask)

    def forward(self, sent_ids=None, doc_ids=None, sent_mask=None, doc_mask=None, inputs_embeds=None):
        """Return the sigmoid score for each (sentence, document) pair in the batch.

        When ``inputs_embeds`` is given it replaces ``sent_ids`` as the encoder input for
        the sentence; the document is always encoded from ``doc_ids``.
        """
        # Sentence first, document second.
        if inputs_embeds is not None:
            sentence_embeddings = self._embed(sent_mask, inputs_embeds=inputs_embeds)
        else:
            sentence_embeddings = self._embed(sent_mask, input_ids=sent_ids)
        doc_embeddings = self._embed(doc_mask, input_ids=doc_ids)

        # Pair representation: both embeddings plus their element-wise product.
        interaction = sentence_embeddings * doc_embeddings
        concat_features = torch.cat((sentence_embeddings, doc_embeddings, interaction), dim=1)

        hidden = torch.relu(self.pre_classifier(concat_features))
        hidden = self.dropout(hidden)
        return self.classifierSigmoid(self.classifier(hidden))

In [None]:
from torch import cuda

# Train on the GPU when PyTorch can see one, otherwise stay on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the pair classifier and move its weights to the selected device.
model = SentenceBertClass(model_name=sentence_model).to(device)

# The model's forward pass already ends in a sigmoid, so binary cross-entropy is applied to its output.
loss_function = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

model.safetensors:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

In [None]:
class Discriminator(torch.nn.Module):
    """Small MLP that maps an embedding to a single sigmoid score.

    Layer widths are ``input_dim -> 512 -> 256 -> 1`` with ReLU after the first two
    layers and a sigmoid on the last one.

    Args:
        input_dim: size of the last dimension of the tensors passed to ``forward``.
    """

    def __init__(self, input_dim=768):
        super().__init__()
        first_width, second_width = 512, 256
        self.fc1 = torch.nn.Linear(input_dim, first_width)
        self.fc2 = torch.nn.Linear(first_width, second_width)
        self.fc3 = torch.nn.Linear(second_width, 1)

    def forward(self, x):
        """Return one score per input row."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return torch.sigmoid(self.fc3(hidden))

# Create the discriminator and place it on the same device as the classifier.
# NOTE(review): 768 is hard-coded here while SentenceBertClass defaults to in_features=384;
# input_dim should equal the width of the embeddings the discriminator is fed.
discriminator = Discriminator(input_dim=768).to(device)

# Adversarial objective and the optimiser that updates only the discriminator's weights.
adv_loss_fn = torch.nn.BCELoss()
adv_optimizer = torch.optim.Adam(params=discriminator.parameters(), lr=1e-4)

In [None]:
def generate_adversarial_examples(embeddings, ep=1e-5):
    """Return a copy of ``embeddings`` perturbed by Gaussian noise scaled by ``ep``.

    Args:
        embeddings: tensor to perturb; the result has the same shape.
        ep: multiplier applied to the standard-normal noise.
    """
    noise = torch.randn_like(embeddings).mul(ep)
    return embeddings.add(noise)

In [None]:
# VANILLA
# No adversarial training.
# Plain supervised loop that fine-tunes the sentence encoder together with the classifier head.
# NOTE: the "GAN LIKE" cell defines a function with the same name; whichever cell ran last is the
# `train` that the epoch loop calls.
print_n_steps = 1000
EPOCHS = 3
acc_step_holder, loss_step_holder = [], []


def train(epoch):
    """Run one supervised pass over ``training_loader`` and print running metrics.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_function`` and ``device``.
    Every ``print_n_steps`` batches the running accuracy and mean loss are printed and
    appended to ``acc_step_holder`` / ``loss_step_holder``.

    Args:
        epoch: index of the current epoch, used only in the printed summary.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()

    # tqdm wraps the loader itself (not the enumerate object) so it can read the loader's
    # length and display a total and an ETA.
    for step, data in enumerate(tqdm(training_loader)):
        sent_ids = data['sent_ids'].to(device, dtype=torch.long)
        doc_ids = data['doc_ids'].to(device, dtype=torch.long)
        sent_mask = data['sent_mask'].to(device, dtype=torch.long)
        doc_mask = data['doc_mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(sent_ids, doc_ids, sent_mask, doc_mask)
        loss = loss_function(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Metrics use the predictions made before this step's weight update.
        tr_loss += loss.item()
        predictions = (outputs > 0.5).to(targets.dtype)
        n_correct += (predictions == targets).sum().item()
        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if step % print_n_steps == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples
            print(f"{step * train_params['batch_size']}/{len(train_df)} - Steps. Acc -> {accu_step} Loss -> {loss_step}")
            acc_step_holder.append(accu_step)
            loss_step_holder.append(loss_step)

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f'The Total Accuracy for Epoch {epoch}: {epoch_accu}')
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [None]:
# GAN LIKE
print_n_steps = 1000
EPOCHS = 3
acc_step_holder, loss_step_holder = [], []

# The discriminator scores one pooled sentence embedding per row, so its input width has to
# equal the encoder's embedding width. The classifier head consumes the concatenation
# [sent, doc, sent * doc], i.e. three vectors of that width, which lets the width be
# recovered from the model instead of being hard-coded.
embedding_dim = model.pre_classifier.in_features // 3
if discriminator.fc1.in_features != embedding_dim:
    print(f"Rebuilding discriminator: encoder embeddings are {embedding_dim}-wide, "
          f"discriminator expected {discriminator.fc1.in_features}")
    discriminator = Discriminator(input_dim=embedding_dim).to(device)
    # Keep the learning rate the previous adversarial optimiser was configured with.
    adv_optimizer = torch.optim.Adam(discriminator.parameters(), lr=adv_optimizer.defaults["lr"])


def train(epoch):
    """Run one pass over ``training_loader`` with an added discriminator objective.

    Per batch:
      1. the classifier is run and its BCE loss computed;
      2. the sentence embeddings are recomputed with ``model.l1`` and a noisy copy is made
         with ``generate_adversarial_examples``;
      3. the discriminator is updated to tell clean embeddings (label 1) from noisy ones
         (label 0), using detached inputs so the encoder is not touched;
      4. the classifier/encoder is updated on ``loss + fooling_loss``, where the fooling
         loss rewards noisy embeddings that the discriminator scores as clean.

    Uses the notebook-level ``model``, ``discriminator``, both optimisers, both loss
    functions and ``device``. Running accuracy / loss are printed and stored every
    ``print_n_steps`` batches.

    Args:
        epoch: index of the current epoch, used only in the printed summary.
    """
    tr_loss = 0
    adv_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    discriminator.train()

    # tqdm wraps the loader itself so it can read its length and show a total and an ETA.
    for step, data in enumerate(tqdm(training_loader)):
        sent_ids = data['sent_ids'].to(device, dtype=torch.long)
        doc_ids = data['doc_ids'].to(device, dtype=torch.long)
        sent_mask = data['sent_mask'].to(device, dtype=torch.long)
        doc_mask = data['doc_mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        # Classifier forward pass
        outputs = model(sent_ids, doc_ids, sent_mask, doc_mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()

        # Sentence embeddings, one row per sample. They are deliberately NOT reshaped:
        # forcing a fixed width would merge several samples into one row and fail on a
        # final batch whose element count is not a multiple of that width.
        sent_output = model.l1(input_ids=sent_ids, attention_mask=sent_mask)
        sent_embeddings = mean_pooling(sent_output, sent_mask)
        adv_embeddings = generate_adversarial_examples(sent_embeddings)

        real_labels = torch.ones(sent_embeddings.size(0), 1, device=device)
        fake_labels = torch.zeros(adv_embeddings.size(0), 1, device=device)

        # Discriminator update (inputs detached so no gradient reaches the encoder)
        real_outputs = discriminator(sent_embeddings.detach())
        fake_outputs = discriminator(adv_embeddings.detach())
        real_loss = adv_loss_fn(real_outputs, real_labels)
        fake_loss = adv_loss_fn(fake_outputs, fake_labels)
        discriminator_loss = (real_loss + fake_loss) / 2
        adv_loss += discriminator_loss.item()

        adv_optimizer.zero_grad()
        discriminator_loss.backward()
        adv_optimizer.step()

        # Classifier/encoder update: task loss plus the loss for fooling the discriminator.
        # Gradients this leaves on the discriminator are cleared by adv_optimizer.zero_grad()
        # at the start of the next discriminator update.
        fooling_loss = adv_loss_fn(discriminator(adv_embeddings), real_labels)
        total_loss = loss + fooling_loss

        optimizer.zero_grad()
        total_loss.backward()  # the graph is not needed afterwards, so it is not retained
        optimizer.step()

        predictions = (outputs > 0.5).to(targets.dtype)
        n_correct += (predictions == targets).sum().item()
        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if step % print_n_steps == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples
            print(f"{step * train_params['batch_size']}/{len(train_df)} - Steps. Acc -> {accu_step}, Loss -> {loss_step}")
            acc_step_holder.append(accu_step)
            loss_step_holder.append(loss_step)

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"The Total Accuracy for Epoch {epoch}: {epoch_accu}")
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")
    print(f"Adversarial Loss: {adv_loss / nb_tr_steps}")

    return

In [None]:
# Run EPOCHS passes over the training data using whichever `train` definition was executed last.
for epoch in range(EPOCHS):
    train(epoch=epoch)

In [None]:
# Plot the running metrics collected by `train`: one point per logging step, i.e. one
# every `print_n_steps` batches, across all epochs that were run.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))
ax1.plot(acc_step_holder, label="Accuracy")
ax2.plot(loss_step_holder, label="Loss")

ax1.set_title("Accuracy")
ax2.set_title("Loss")

# Axis labels so the figure can be read on its own.
ax1.set_xlabel(f"Logging step (every {print_n_steps} batches)")
ax2.set_xlabel(f"Logging step (every {print_n_steps} batches)")
ax1.set_ylabel("Running accuracy (%)")
ax2.set_ylabel("Running mean loss")

fig.tight_layout()
plt.show()

## Validation on Test Set

In [None]:
# validation was heavily inspired by a blog and ChatGPT
def validate_model(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` and return the accuracy in percent.

    The model is switched to eval mode and run without gradient tracking. Every
    ``print_n_steps`` batches the running accuracy and mean loss are printed, with
    progress measured against the dataset behind ``testing_loader``. Uses the
    notebook-level ``device``, ``loss_function`` and ``print_n_steps``.

    Args:
        model: network called as ``model(sent_ids, doc_ids, sent_mask, doc_mask)``.
        testing_loader: loader yielding dicts with the keys ``sent_ids``, ``doc_ids``,
            ``sent_mask``, ``doc_mask`` and ``targets``.

    Returns:
        Accuracy over all evaluated samples, as a percentage.
    """
    model.eval()

    n_correct = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    # Progress denominator: the size of the set being evaluated, not of the training set.
    n_total = len(testing_loader.dataset)

    with torch.no_grad():
        for step, data in enumerate(testing_loader):
            sent_ids = data['sent_ids'].to(device, dtype=torch.long)
            doc_ids = data['doc_ids'].to(device, dtype=torch.long)
            sent_mask = data['sent_mask'].to(device, dtype=torch.long)
            doc_mask = data['doc_mask'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            outputs = model(sent_ids, doc_ids, sent_mask, doc_mask)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()

            predictions = (outputs > 0.5).to(targets.dtype)
            n_correct += (predictions == targets).sum().item()

            # Samples processed before this batch; used as the progress counter.
            seen_before = nb_tr_examples
            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            if step % print_n_steps == 0:
                loss_step = tr_loss / nb_tr_steps
                accu_step = (n_correct * 100) / nb_tr_examples
                print(f"{seen_before}/{n_total} - Steps. Acc -> {accu_step} Loss -> {loss_step}")

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu


In [None]:
# Evaluate the trained model on the held-out loader; validate_model returns the accuracy in percent.
acc = validate_model(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

## Save Model 1

In [None]:
import os

# Make sure the target folder exists, then persist only the weights (state_dict) of the model.
os.makedirs("drive/MyDrive/data/models", exist_ok=True)
torch.save(model.state_dict(), "drive/MyDrive/data/models/bal_model.pth")
# NOTE(review): `sum_dir` is not defined anywhere in the visible notebook, and the checkpoint
# above is written under drive/MyDrive/data/models rather than ./models — confirm that this
# copy step still points at the intended source and destination.
!gsutil cp -r models $sum_dir