In [None]:

!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q
!pip install transformers==4.37.2
!pip install datasets==2.17.0
!pip install evaluate==0.4.1
!pip install rouge-score==0.1.2

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize

import torch

from tqdm import tqdm

from datasets import load_dataset, load_from_disk, load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline, set_seed


In [None]:
import os

import huggingface_hub

# Never store an access token in the notebook itself: it would be committed
# and shared together with the file. The token is read from the HF_TOKEN
# environment variable instead. When the variable is unset, `token` is None
# and `login` is expected to fall back to its own interactive prompt.
huggingface_hub.login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint identifier shared by the tokenizer and the model below.
model_ckpt = "facebook/bart-large-xsum"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Load the seq2seq model first, then move it to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
model = model.to(device)

### Load Dataset

In [None]:
import os
import zipfile

import requests

# Define the URL of the ZIP file
url = 'https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip'

# Define the file paths
zip_file_path = 'summarizer-data.zip'
extracted_folder_path = 'summarizer-data'

# Download the ZIP file.
# - timeout: a stalled connection fails instead of hanging the kernel.
# - raise_for_status: an HTTP error page is never saved as if it were the archive.
# - stream + iter_content: the archive is written chunk by chunk rather than
#   being held in memory as a whole.
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(zip_file_path, 'wb') as zip_file:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            zip_file.write(chunk)

# Extract the contents of the ZIP file, and remove the archive afterwards even
# when extraction fails so that a corrupt download is not left behind.
try:
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extracted_folder_path)
finally:
    os.remove(zip_file_path)

print('Downloaded and extracted the ZIP file successfully.')


In [None]:
# Load the dataset stored under the "summarizer-data" folder. Presumably this
# is the SAMSum corpus, judging by the directory name; confirm against the
# archive contents.
dataset = load_from_disk("summarizer-data/samsum_dataset/")
# Bare last expression so the notebook displays the loaded object.
dataset

In [None]:
# Count the rows of every split, following the iteration order of `dataset`.
split_lengths = []
for split_name in dataset:
    split_lengths.append(len(dataset[split_name]))

print("Split lengths:", split_lengths)

print("Features:", dataset['train'].column_names)

# Look up the second record of the test split once and show both text fields.
test_example = dataset["test"][1]

print("\nDialogue:")
print(test_example["dialogue"])

print("\nSummary:")
print(test_example["summary"])


In [None]:
# Build one pandas DataFrame per split; the tuple order matches the targets.
train_df, test_df, validation_df = (
    pd.DataFrame(dataset[split_name])
    for split_name in ("train", "test", "validation")
)


In [None]:
from datasets import Dataset

# train_df, test_df and validation_df are the pandas DataFrames created from
# the dataset splits; wrap each of them in a Dataset object again.
train_ds, test_ds, val_ds = (
    Dataset.from_pandas(frame)
    for frame in (train_df, test_df, validation_df)
)

# Show the structure of each resulting Dataset.
print(f"Train Dataset:\n{train_ds}\n\n")
print(f"Test Dataset:\n{test_ds}\n\n")
print(f"Validation Dataset:\n{val_ds}\n\n")

In [None]:
def preprocess_example(example, *, max_input_length=1024, max_target_length=128):
    """
    Tokenizes dialogue/summary pairs into model inputs and labels.

    The function is applied with ``Dataset.map(..., batched=True)``, so each
    value in ``example`` holds one entry per row of the batch.

    Args:
        example: A mapping with the keys 'dialogue' (source texts) and
            'summary' (target texts).
        max_input_length: Token limit for the dialogues; longer inputs are
            truncated. Defaults to 1024.
        max_target_length: Token limit for the summaries; longer targets are
            truncated. Defaults to 128.

    Returns:
        model_inputs: The tokenizer output for the dialogues, extended with a
            'labels' entry that holds the token ids of the summaries.
    """

    # Tokenize the source dialogues.
    model_inputs = tokenizer(
        example['dialogue'],
        max_length=max_input_length,
        truncation=True,
    )

    # Tokenize the target summaries. `text_target=` is the supported way to
    # encode targets; the `as_target_tokenizer()` context manager used
    # previously is deprecated.
    labels = tokenizer(
        text_target=example['summary'],
        max_length=max_target_length,
        truncation=True,
    )

    # Attach the target token ids so the model can compute its loss.
    model_inputs['labels'] = labels['input_ids']

    return model_inputs


# Tokenize every split with preprocess_example and drop the raw text columns,
# so only the tokenizer outputs and the labels remain.
raw_columns = ['id', 'dialogue', 'summary']
tokenized_train, tokenized_test, tokenized_val = (
    split_ds.map(preprocess_example, batched=True, remove_columns=raw_columns)
    for split_ds in (train_ds, test_ds, val_ds)
)

for tokenized_split in (tokenized_train, tokenized_test, tokenized_val):
    print(tokenized_split)


In [None]:
# Print the fields of the first tokenized training record, one heading each.
sample = tokenized_train[0]
for heading, field in (
    ("Input IDs:", 'input_ids'),
    ("\nAttention Mask:", 'attention_mask'),
    ("\nLabels:", 'labels'),
):
    print(heading)
    print(sample[field])

## Modeling