# Fine Tuning BART for Summarization

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
from transformers import BartTokenizer, BartForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch
import os
from sklearn.model_selection import train_test_split


In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch
import os
from sklearn.model_selection import train_test_split


# Load data from JSON files
json_directory = "/content/drive/MyDrive/PLP Project/updated/focus"
# json_directory = "/content/drive/MyDrive/focus"
dfs = [pd.read_json(os.path.join(json_directory, file), encoding='latin-1').dropna(subset=['Gemini Summary']).iloc[:, 1:3] for file in os.listdir(json_directory) if os.path.isfile(os.path.join(json_directory, file))]
consolidated_df = pd.concat(dfs, ignore_index=True)

# Rename columns
consolidated_df = consolidated_df.rename(columns={'Gemini Summary': 'text', 'Section Text': 'ctext'})

# Add prefix to source text
consolidated_df.ctext = 'summarize: ' + consolidated_df.ctext

# Split data into train and validation sets
train_data, val_data = train_test_split(consolidated_df, train_size=0.8, random_state=42)

# Define tokenizer and model
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

# Define dataset class
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data.iloc[idx]['ctext']
        target_text = self.data.iloc[idx]['text']

        input_encoding = self.tokenizer(input_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt")
        target_encoding = self.tokenizer(target_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt")

        return {
            "input_ids": input_encoding["input_ids"].squeeze(),
            "attention_mask": input_encoding["attention_mask"].squeeze(),
            "labels": target_encoding["input_ids"].squeeze()
        }

# Create datasets and dataloaders
train_dataset = CustomDataset(train_data, tokenizer, max_length=512)
val_dataset = CustomDataset(val_data, tokenizer, max_length=512)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [None]:

# Fine-tune the model
model.resize_token_embeddings(len(tokenizer))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)


In [None]:
from tqdm import tqdm

# Define the total number of epochs
TRAIN_EPOCHS = 10

# Training loop
for epoch in range(TRAIN_EPOCHS):
    model.train()
    total_loss = 0.0
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}")

    for batch_idx, batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)  # Access input_ids using the key 'input_ids'
        attention_mask = batch['attention_mask'].to(device)  # Access attention_mask using the key 'attention_mask'
        labels = batch['labels'].to(device)  # Access labels using the key 'labels'
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix({"loss": total_loss / len(progress_bar)})  # Update progress bar with current loss

    # Print average loss after each epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")

    # Validation loop
    model.eval()
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=150)
        generated_summaries = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        # Store the generated summaries as needed

    # Save the model after each epoch
    model_directory = "/content/drive/MyDrive/PLP Project/updated/focus/model_4-BART"
    model.save_pretrained(model_directory)


Epoch 1: 100%|██████████| 516/516 [03:46<00:00,  2.28it/s, loss=1.4]


Epoch 1, Average Loss: 1.4026


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Epoch 2: 100%|██████████| 516/516 [03:46<00:00,  2.27it/s, loss=0.855]


Epoch 2, Average Loss: 0.8552


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Epoch 3: 100%|██████████| 516/516 [03:47<00:00,  2.26it/s, loss=0.725]


Epoch 3, Average Loss: 0.7246


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Epoch 4: 100%|██████████| 516/516 [03:47<00:00,  2.26it/s, loss=0.629]


Epoch 4, Average Loss: 0.6292


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Epoch 5: 100%|██████████| 516/516 [03:46<00:00,  2.27it/s, loss=0.553]


Epoch 5, Average Loss: 0.5526


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Epoch 6: 100%|██████████| 516/516 [03:47<00:00,  2.27it/s, loss=0.487]


Epoch 6, Average Loss: 0.4872


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Epoch 7: 100%|██████████| 516/516 [03:48<00:00,  2.26it/s, loss=0.424]


Epoch 7, Average Loss: 0.4242


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Epoch 8: 100%|██████████| 516/516 [03:47<00:00,  2.26it/s, loss=0.374]


Epoch 8, Average Loss: 0.3745


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Epoch 9: 100%|██████████| 516/516 [03:48<00:00,  2.26it/s, loss=0.323]


Epoch 9, Average Loss: 0.3226


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Epoch 10: 100%|██████████| 516/516 [03:47<00:00,  2.26it/s, loss=0.286]


Epoch 10, Average Loss: 0.2862


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [None]:
from tqdm import tqdm

# Load the trained model and tokenizer
model_directory = "/content/drive/MyDrive/PLP Project/updated/focus/model_4-BART"
model = BartForConditionalGeneration.from_pretrained(model_directory)
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

# Set the model to evaluation mode
model.eval()

# Create a new column to store the generated summaries
consolidated_df['generated_summary'] = ''

# Move the model to the appropriate device
model.to(device)

# Iterate through consolidated_df
for index, row in tqdm(consolidated_df.iterrows(), total=len(consolidated_df), desc="Generating Summaries"):
    # Tokenize the ctext
    input_encoding = tokenizer(row['ctext'], padding="max_length", truncation=True, return_tensors="pt")

    # Move input tensors to the appropriate device
    input_ids = input_encoding["input_ids"].to(device)
    attention_mask = input_encoding["attention_mask"].to(device)

    # Generate the summary without limiting the maximum length
    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)

    # Decode the generated summary tokens into text
    generated_summary = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Add the generated summary to the DataFrame
    consolidated_df.at[index, 'generated_summary'] = generated_summary

# Print the DataFrame to verify the generated summaries are added
print(consolidated_df.head())

# Save DataFrame to JSON file
json_file_path = "/content/drive/MyDrive/PLP Project/updated/focus/model_4-BART/consolidated_df_with_summaries.json"
consolidated_df.to_json(json_file_path, orient="records")


Generating Summaries: 100%|██████████| 1291/1291 [06:17<00:00,  3.42it/s]


                                               ctext  \
0  summarize: Item 1. Business\nCompany Backgroun...   
1  summarize: Item 1A. Risk Factors\nThe Company’...   
2  summarize: Item 1B. Unresolved Staff Comments\...   
3  summarize: Item 2. Properties\nThe Company’s h...   
4  summarize: Item 3. Legal Proceedings\nEpic Gam...   

                                                text  \
0  **Company Overview:**\n\nApple designs, manufa...   
1  **Business, Financial, and Legal Risks**\n\n* ...   
2  There are no unresolved staff comments for Ite...   
3  The company's headquarters are in Cupertino, C...   
4  Epic Games sued Apple for antitrust violations...   

                                   generated_summary  
0  **Company Overview**\n\nApple designs, manufac...  
1  **Item 1A. Risk Factors**\n\n**Macroeconomic a...  
2  There are no unresolved staff comments for Ite...  
3  Apple's headquarters are in Cupertino, Califor...  
4  Epic Games sued Apple for antitrust violations..

In [None]:
# Install the rouge_score library
!pip install rouge_score

# Import necessary libraries
from rouge_score import rouge_scorer

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Initialize lists to store ROUGE scores
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

# Iterate through the DataFrame to compute ROUGE scores
for index, row in consolidated_df.iterrows():
    # Calculate ROUGE scores for each pair of reference summary and generated summary
    scores = scorer.score(row['text'], row['generated_summary'])

    # Extract and append ROUGE scores to respective lists
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

# Compute average ROUGE scores
avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
avg_rougeL = sum(rougeL_scores) / len(rougeL_scores)

# Print average ROUGE scores
print(f'Average ROUGE-1 score: {avg_rouge1:.4f}')
print(f'Average ROUGE-2 score: {avg_rouge2:.4f}')
print(f'Average ROUGE-L score: {avg_rougeL:.4f}')


Average ROUGE-1 score: 0.2852
Average ROUGE-2 score: 0.2195
Average ROUGE-L score: 0.2641


In [None]:
from rouge_score import rouge_scorer

# Initialize a ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)

# Initialize lists to store scores
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []
rougeLsum_scores = []

# Iterate through the DataFrame rows
for index, row in tqdm(consolidated_df.iterrows(), total=len(consolidated_df), desc="Calculating ROUGE Scores"):
    # Calculate ROUGE scores for each pair of reference summary and generated summary
    scores = scorer.score(row['text'], row['generated_summary'])

    # Extract individual scores
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)
    rougeLsum_scores.append(scores['rougeLsum'].fmeasure)

# Compute average scores
avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
avg_rougeL = sum(rougeL_scores) / len(rougeL_scores)
avg_rougeLsum = sum(rougeLsum_scores) / len(rougeLsum_scores)

# Print average scores
print("Average ROUGE-1 F1 Score:", avg_rouge1)
print("Average ROUGE-2 F1 Score:", avg_rouge2)
print("Average ROUGE-L F1 Score:", avg_rougeL)
print("Average ROUGE-Lsum F1 Score:", avg_rougeLsum)


Calculating ROUGE Scores: 100%|██████████| 1291/1291 [00:27<00:00, 46.13it/s]

Average ROUGE-1 F1 Score: 0.28515637218659495
Average ROUGE-2 F1 Score: 0.21953605025218417
Average ROUGE-L F1 Score: 0.2641083722483588
Average ROUGE-Lsum F1 Score: 0.26666224958096735





In [None]:
model.save_pretrained('bart_epochs_10')
tokenizer.save_pretrained('bart_tokenizer_epochs_10')


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('bart_tokenizer_epochs_10/tokenizer_config.json',
 'bart_tokenizer_epochs_10/special_tokens_map.json',
 'bart_tokenizer_epochs_10/vocab.json',
 'bart_tokenizer_epochs_10/merges.txt',
 'bart_tokenizer_epochs_10/added_tokens.json')

In [2]:
repo_name = 'BART-10K-summarization'

In [None]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/297.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/297.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from t

In [7]:
!pip install accelerate -U



In [3]:
!pip install transformers[torch]

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [3]:
model_path = '/content/drive/MyDrive/PLP Project/updated/focus/model_4-BART'
model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
from transformers import Trainer, TrainingArguments

hf_token = "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"

args = TrainingArguments(
    output_dir='./results',  # where to save model checkpoints
    hub_model_id=f'yatharth97/{repo_name}',  # your HF model repository
    push_to_hub=True,  # enables pushing to hub after training, if you train within this script
    hub_token=hf_token,  # your Hugging Face API token
)

In [5]:
trainer = Trainer(model=model, tokenizer=tokenizer, args=args)


In [6]:
trainer.push_to_hub(
    commit_message="Commit message describing the changes made",
    blocking=True
)

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/yatharth97/BART-10K-summarization/commit/3a7eb1b24d20948eaf611e14292dd86821898953', commit_message='Commit message describing the changes made', commit_description='', oid='3a7eb1b24d20948eaf611e14292dd86821898953', pr_url=None, pr_revision=None, pr_num=None)