<a href="https://colab.research.google.com/github/vasan12sp/DocSummarizer/blob/master/Improved_version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
!pip install transformers datasets rouge-score torch evaluate


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ev

In [None]:

# Import libraries
import torch
import random
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import (
    XLNetTokenizer, XLNetForSequenceClassification,
    BartTokenizer, BartForConditionalGeneration,
    AdamW
)
from torch.cuda.amp import GradScaler, autocast
from evaluate import load



In [None]:
# Load dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Randomly sample 10,000 articles from the train set and 500 from the validation set
train_sample = random.sample(range(len(dataset['train'])), 10000)
val_sample = random.sample(range(len(dataset['validation'])), 500)

train_dataset = dataset['train'].select(train_sample)
val_dataset = dataset['validation'].select(val_sample)

print(f"Training on {len(train_dataset)} articles")
print(f"Evaluating on {len(val_dataset)} articles")

# Preprocessing function
def preprocess_function(examples):
    inputs = examples['article']
    labels = examples['highlights']
    return {"input_text": inputs, "target_text": labels}

# Tokenize datasets
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

# Create DataLoaders
train_dataloader = DataLoader(tokenized_train, batch_size=4, shuffle=True)
val_dataloader = DataLoader(tokenized_val, batch_size=4, shuffle=False)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Training on 10000 articles
Evaluating on 500 articles


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
class Summ_xlnet_bart:
    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")

        # Load XLNet
        self.xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        self.xlnet_model = XLNetForSequenceClassification.from_pretrained(
            'xlnet-base-cased', num_labels=1
        ).to(self.device)

        # Load BART
        self.bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
        self.bart_model = BartForConditionalGeneration.from_pretrained(
            'facebook/bart-large-cnn'
        ).to(self.device)

        # Load the saved model weights
        model_state = torch.load('/content/drive/My Drive/summ_xlnet_bart.pth')
        self.xlnet_model.load_state_dict(model_state['xlnet_state'])
        self.bart_model.load_state_dict(model_state['bart_state'])

        # Set models to training mode
        self.xlnet_model.train()
        self.bart_model.train()

        # Mixed precision training
        self.scaler = GradScaler()

    def train(self, train_dataloader, val_dataloader, epochs=5, learning_rate=5e-5):
        optimizer = AdamW(
            list(self.xlnet_model.parameters()) + list(self.bart_model.parameters()),
            lr=learning_rate
        )

        loss_fn = torch.nn.BCEWithLogitsLoss()

        for epoch in range(epochs):
            print(f"Epoch {epoch+1}/{epochs}")

            total_loss = 0
            self.xlnet_model.train()
            self.bart_model.train()

            progress_bar = tqdm(train_dataloader, desc="Training", leave=False)

            for batch_idx, batch in enumerate(progress_bar):
                inputs = batch['input_text']
                labels = batch['target_text']

                # Tokenize inputs for XLNet
                xlnet_inputs = self.xlnet_tokenizer(
                    inputs, padding=True, truncation=True,
                    return_tensors='pt', max_length=512
                ).to(self.device)

                # Mixed precision training with autocast
                with autocast():
                    outputs = self.xlnet_model(**xlnet_inputs)
                    importance_scores = outputs.logits.squeeze()

                    # Compute loss
                    loss = loss_fn(importance_scores, torch.ones_like(importance_scores))

                # Backpropagation with mixed precision
                optimizer.zero_grad()
                self.scaler.scale(loss).backward()
                self.scaler.step(optimizer)
                self.scaler.update()

                total_loss += loss.item()
                progress = f"Loss: {loss.item():.4f}"
                progress_bar.set_description(progress)

            avg_train_loss = total_loss / len(train_dataloader)
            print(f"Avg Training Loss: {avg_train_loss:.4f}")

            # Validation after every epoch
            self.evaluate(val_dataloader)

    def evaluate(self, val_dataloader):
        rouge = load("rouge")
        predictions = []
        references = []

        with torch.no_grad():
            self.xlnet_model.eval()
            self.bart_model.eval()

            for batch in val_dataloader:
                inputs = batch['input_text']
                labels = batch['target_text']

                xlnet_inputs = self.xlnet_tokenizer(
                    inputs, padding=True, truncation=True,
                    return_tensors='pt', max_length=512
                ).to(self.device)

                outputs = self.xlnet_model(**xlnet_inputs)
                importance_scores = outputs.logits.squeeze().cpu().numpy()
                top_sentences = np.argsort(importance_scores)[-3:]

                extractive_summary = ". ".join([inputs[0].split('.')[i] for i in top_sentences]) + '.'

                bart_input = self.bart_tokenizer(
                    extractive_summary, return_tensors='pt',
                    max_length=1024, truncation=True
                ).to(self.device)

                generated_summary_ids = self.bart_model.generate(
                    **bart_input, num_beams=4, max_length=150, early_stopping=True
                )

                generated_summary = self.bart_tokenizer.decode(
                    generated_summary_ids[0], skip_special_tokens=True
                )

                predictions.append(generated_summary)
                references.append(labels[0])

        rouge_scores = rouge.compute(predictions=predictions, references=references)
        print(f"ROUGE Scores: {rouge_scores}")



In [None]:
# Initialize the model and continue training
model = Summ_xlnet_bart()
model.train(train_dataloader, val_dataloader, epochs=5)

# Save the updated model
model_state = {
    'xlnet_state': model.xlnet_model.state_dict(),
    'bart_state': model.bart_model.state_dict()
}
torch.save(model_state, '/content/drive/My Drive/summ_xlnet_bart_updated.pth')
print("Model saved to /content/drive/My Drive/summ_xlnet_bart_updated.pth")


Using device: cuda


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

  model_state = torch.load('/content/drive/My Drive/summ_xlnet_bart.pth')
  self.scaler = GradScaler()


Epoch 1/5


  with autocast():


Avg Training Loss: 0.0000


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ROUGE Scores: {'rouge1': 0.3823671807951099, 'rouge2': 0.16133440239146038, 'rougeL': 0.26253707902486006, 'rougeLsum': 0.326244085643826}
Epoch 2/5




Avg Training Loss: 0.0000
ROUGE Scores: {'rouge1': 0.3796309279498907, 'rouge2': 0.15930613406471392, 'rougeL': 0.26053716718177605, 'rougeLsum': 0.32336800866554427}
Epoch 3/5




Avg Training Loss: 0.0000
ROUGE Scores: {'rouge1': 0.38272360106144543, 'rouge2': 0.16095205829499987, 'rougeL': 0.2630325968215641, 'rougeLsum': 0.32645790773379013}
Epoch 4/5




Avg Training Loss: 0.0000
ROUGE Scores: {'rouge1': 0.3825468573628026, 'rouge2': 0.16063572512674443, 'rougeL': 0.26253400885751044, 'rougeLsum': 0.32613271616451156}
Epoch 5/5




Avg Training Loss: 0.0000
