In [1]:
from zipfile import ZipFile
import os

# Path to the ZIP file
zip_path = 'Summaries.zip'
extract_path = 'Summaries'

# Extract the ZIP file
with ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# List the contents of the extracted directory
extracted_files = os.listdir(extract_path)
extracted_files

['Summaries', 'News Articles']

In [2]:
# Paths to the inner directories
news_articles_path = os.path.join(extract_path, 'News Articles')
summaries_path = os.path.join(extract_path, 'Summaries')

# List the contents of the News Articles and Summaries directories
news_articles_files = os.listdir(news_articles_path)
summaries_files = os.listdir(summaries_path)

news_articles_files, summaries_files

(['entertainment', 'business', 'tech', 'politics', 'sport'],
 ['entertainment', 'business', 'tech', 'politics', 'sport'])

In [3]:
# Path to one of the category directories for both articles and summaries
category_news_path = os.path.join(news_articles_path, 'business')
category_summaries_path = os.path.join(summaries_path, 'business')

# List some of the files in these directories
category_news_files = os.listdir(category_news_path)[:5]  # List first 5 files
category_summaries_files = os.listdir(category_summaries_path)[
    :5]  # List first 5 files

# Display filenames
category_news_files, category_summaries_files


(['167.txt', '186.txt', '286.txt', '099.txt', '211.txt'],
 ['167.txt', '186.txt', '286.txt', '099.txt', '211.txt'])

In [4]:
# Read the contents of the first news article and its summary
with open(os.path.join(category_news_path, '001.txt'), 'r', encoding='utf-8') as file:
    news_article_sample = file.read()

with open(os.path.join(category_summaries_path, '001.txt'), 'r', encoding='utf-8') as file:
    summary_sample = file.read()

news_article_sample, summary_sample

('Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.\n\nThe firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.\n\nTime Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL\'s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to si

In [5]:
def load_data_with_fallback_encoding(category_path, summaries_path):
    articles = []
    summaries = []

    # Get all article files in the category
    article_files = sorted(os.listdir(category_path))

    for article_file in article_files:
        article_path = os.path.join(category_path, article_file)
        summary_path = os.path.join(summaries_path, article_file)

        # Try to read the article and summary files with a fallback encoding
        try:
            with open(article_path, 'r', encoding='utf-8') as file:
                article = file.read()
        except UnicodeDecodeError:
            with open(article_path, 'r', encoding='iso-8859-1') as file:
                article = file.read()

        try:
            with open(summary_path, 'r', encoding='utf-8') as file:
                summary = file.read()
        except UnicodeDecodeError:
            with open(summary_path, 'r', encoding='iso-8859-1') as file:
                summary = file.read()

        articles.append(article)
        summaries.append(summary)

    return articles, summaries

In [6]:
# Reload data from each category with fallback encoding
categories = ['business', 'entertainment', 'politics', 'sport', 'tech']
all_articles = []
all_summaries = []
all_categories = []

In [7]:
import pandas as pd
for category in categories:
    category_news_path = os.path.join(news_articles_path, category)
    category_summaries_path = os.path.join(summaries_path, category)
    articles, summaries = load_data_with_fallback_encoding(
        category_news_path, category_summaries_path)

    all_articles.extend(articles)
    all_summaries.extend(summaries)
    all_categories.extend([category] * len(articles))

# Create a DataFrame with the loaded data
data_df = pd.DataFrame({
    'Category': all_categories,
    'Article': all_articles,
    'Summary': all_summaries
})

data_df.head(), data_df.shape

(   Category                                            Article  \
 0  business  Ad sales boost Time Warner profit\n\nQuarterly...   
 1  business  Dollar gains on Greenspan speech\n\nThe dollar...   
 2  business  Yukos unit buyer faces loan claim\n\nThe owner...   
 3  business  High fuel prices hit BA's profits\n\nBritish A...   
 4  business  Pernod takeover talk lifts Domecq\n\nShares in...   
 
                                              Summary  
 0  TimeWarner said fourth quarter sales rose 2% t...  
 1  The dollar has hit its highest level against t...  
 2  Yukos' owner Menatep Group says it will ask Ro...  
 3  Rod Eddington, BA's chief executive, said the ...  
 4  Pernod has reduced the debt it took on to fund...  ,
 (2225, 3))

In [8]:
from sklearn.model_selection import train_test_split

# Split the data into training and temporary sets (temporary set will be further split into validation and test)
train_df, temp_df = train_test_split(data_df, test_size=0.2, random_state=42)

# Split the temporary set into validation and test sets
validation_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42)

train_df.shape, validation_df.shape, test_df.shape


((1780, 3), (222, 3), (223, 3))

In [9]:
from transformers import BartTokenizer, BartForConditionalGeneration
from torch.utils.data import DataLoader, Dataset
import torch

# Use a smaller model if possible, like BART-base
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
class SummarizationDataset(Dataset):
    def __init__(self, tokenizer, articles, summaries, max_length=256):
        self.tokenizer = tokenizer
        self.articles = articles
        self.summaries = summaries
        self.max_length = max_length

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        article_encoded = self.tokenizer(self.articles[idx], return_tensors='pt',
                                         truncation=True, padding='max_length', max_length=self.max_length)
        summary_encoded = self.tokenizer(self.summaries[idx], return_tensors='pt',
                                         truncation=True, padding='max_length', max_length=self.max_length)
        return {
            'input_ids': article_encoded['input_ids'].squeeze(0),
            'attention_mask': article_encoded['attention_mask'].squeeze(0),
            'labels': summary_encoded['input_ids'].squeeze(0)
        }

train_dataset = SummarizationDataset(tokenizer, train_df['Article'].tolist(), train_df['Summary'].tolist())
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)  # Reduced batch size

In [11]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)
model.train()

for epoch in range(3):  # Maintain or adjust the number of epochs as necessary
    for batch in train_loader:
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")



Epoch: 0, Loss: 5.66681432723999
Epoch: 0, Loss: 5.7609100341796875
Epoch: 0, Loss: 6.137808799743652
Epoch: 0, Loss: 4.6346611976623535
Epoch: 0, Loss: 4.375171184539795
Epoch: 0, Loss: 5.533237457275391
Epoch: 0, Loss: 4.785006046295166
Epoch: 0, Loss: 4.30952787399292
Epoch: 0, Loss: 4.129088401794434
Epoch: 0, Loss: 4.824732780456543
Epoch: 0, Loss: 3.848839282989502
Epoch: 0, Loss: 4.196347713470459
Epoch: 0, Loss: 3.4949588775634766
Epoch: 0, Loss: 4.281175136566162
Epoch: 0, Loss: 4.053802490234375
Epoch: 0, Loss: 4.114142417907715
Epoch: 0, Loss: 3.7110671997070312
Epoch: 0, Loss: 3.3556606769561768
Epoch: 0, Loss: 3.377974033355713
Epoch: 0, Loss: 3.281545639038086
Epoch: 0, Loss: 3.3881051540374756
Epoch: 0, Loss: 3.2720024585723877
Epoch: 0, Loss: 2.9274868965148926
Epoch: 0, Loss: 2.7139949798583984
Epoch: 0, Loss: 3.047201633453369
Epoch: 0, Loss: 2.741421699523926
Epoch: 0, Loss: 3.129354953765869
Epoch: 0, Loss: 2.1966893672943115
Epoch: 0, Loss: 2.502671241760254
Epoch:

In [15]:
from rouge_score import rouge_scorer

def compute_rouge(predictions, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

    for pred, ref in zip(predictions, references):
        score = scorer.score(ref, pred)
        scores['rouge1'].append(score['rouge1'].fmeasure)
        scores['rouge2'].append(score['rouge2'].fmeasure)
        scores['rougeL'].append(score['rougeL'].fmeasure)

    # Average the scores
    avg_scores = {key: sum(values) / len(values) for key, values in scores.items()}
    return avg_scores

# Example usage within an evaluation function
def evaluate_model(model, data_loader, tokenizer, device='cuda'):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=150, num_beams=5)
            decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            all_preds.extend(decoded_preds)
            all_labels.extend(decoded_labels)

    # Compute ROUGE scores using the custom function
    rouge_scores = compute_rouge(all_preds, all_labels)
    return rouge_scores

# Save the model
model.save_pretrained('./saved_bart_model')
tokenizer.save_pretrained('./saved_bart_model')


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('./saved_bart_model/tokenizer_config.json',
 './saved_bart_model/special_tokens_map.json',
 './saved_bart_model/vocab.json',
 './saved_bart_model/merges.txt',
 './saved_bart_model/added_tokens.json')

In [16]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Path to the saved model
model_dir = './saved_bart_model'

# Load the tokenizer and model
tokenizer = BartTokenizer.from_pretrained(model_dir)
model = BartForConditionalGeneration.from_pretrained(model_dir)


In [17]:
def prepare_input(text, tokenizer, max_length=512):
    inputs = tokenizer.encode_plus(text, return_tensors='pt', max_length=max_length, truncation=True, padding='max_length')
    return inputs


In [20]:
def generate_summary(input_text, tokenizer, model, device='cpu'):
    # Prepare the model and input data
    model.to(device)
    model.eval()
    inputs = prepare_input(input_text, tokenizer)

    # Generate summary
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=150, num_beams=5, early_stopping=True)

    # Decode generated ids to text
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary


In [21]:
# Example text
input_text = "The quick brown fox jumps over the lazy dog. This famous sentence contains every letter in the English language, making it a pangram used in typing practice and testing typewriters."

# Generate the summary using the updated function
summary = generate_summary(input_text, tokenizer, model)
print("Generated Summary:", summary)

Generated Summary: This famous sentence contains every letter in the English language, making it a pangram used in typing practice and testing typewriters.
