**Importing Required Libraries and Setting the Data Path**

In [84]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import os
from tqdm import tqdm

# CSV file read by ArticleDataProcessor.load_data; the pipeline below reads its
# 'article' and 'highlights' columns.
DATA_PATH = '/content/article_highlights.csv'

**BERT-based Text Classification Pipeline**

In [None]:
class ArticleDataProcessor:
    """Load the article/highlight CSV and derive labelled train/validation splits.

    Args:
        data_path: Path of the CSV file to read.
        tokenizer: Callable tokenizer used by ``prepare_features``.
        max_length: Sequence length passed to the tokenizer.
    """

    # Columns the labelling step reads from every row.
    REQUIRED_COLUMNS = ('article', 'highlights')
    # Number of leading highlight characters searched for inside the article.
    LABEL_PREFIX_CHARS = 50

    def __init__(self, data_path, tokenizer, max_length=512):
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.max_length = max_length

    def load_data(self):
        """Read the CSV at ``self.data_path`` and return it as a DataFrame."""
        return pd.read_csv(self.data_path)

    def preprocess_data(self, df):
        """Label every row and split the result into train and validation frames.

        A row is labelled 1 when the first ``LABEL_PREFIX_CHARS`` characters of
        its lower-cased highlight occur verbatim in the lower-cased article,
        otherwise 0. Missing values are stringified first, so NaN is compared
        as the text ``'nan'``.

        The input frame is left untouched; labels are added to a copy.

        Args:
            df: DataFrame with ``article`` and ``highlights`` columns.

        Returns:
            tuple: ``(train_df, val_df)`` from an 80/20 split with a fixed
            random state, each carrying the extra ``label`` column.

        Raises:
            KeyError: If a required column is missing.
        """
        missing = [col for col in self.REQUIRED_COLUMNS if col not in df.columns]
        if missing:
            raise KeyError(f"Input data is missing required column(s): {missing}")

        prefix_len = self.LABEL_PREFIX_CHARS

        # Work on a copy so the caller's DataFrame does not silently gain a column.
        labelled = df.copy()
        labelled['label'] = [
            1 if str(highlight).lower()[:prefix_len] in str(article).lower() else 0
            for article, highlight in zip(labelled['article'], labelled['highlights'])
        ]

        # Split data into train and validation sets
        train_df, val_df = train_test_split(labelled, test_size=0.2, random_state=42)
        return train_df, val_df

    def prepare_features(self, text):
        """Tokenize ``text`` into fixed-length PyTorch tensors.

        The text is padded/truncated to ``self.max_length``; the tokenizer's
        return value is passed through unchanged.
        """
        return self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

class ArticleDataset(Dataset):
    """Torch dataset yielding tokenized article/highlight pairs.

    Args:
        dataframe: Frame with ``article`` and ``highlights`` columns, plus a
            ``label`` column when ``is_training`` is true.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors with a leading batch dimension of 1.
        max_length: Sequence length every example is padded/truncated to.
        is_training: When true, each item also carries its ``label``.
    """

    def __init__(self, dataframe, tokenizer, max_length=512, is_training=True):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_training = is_training

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors as a dict.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` (1-D tensors), plus a
            scalar long ``label`` tensor when ``is_training`` is true.
        """
        # Positional lookup, done once per item instead of once per column.
        row = self.data.iloc[idx]

        # str() turns missing values (NaN) into text instead of failing.
        article = str(row['article'])
        highlight = str(row['highlights'])

        # Combine article and highlight for classification.
        # NOTE(review): the highlight comes last, so with truncation=True a long
        # article can push it past max_length — confirm this is intended.
        text = f"Article: {article} Highlight: {highlight}"

        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Drop only the leading batch dimension. A bare squeeze() would also
        # collapse the sequence dimension when max_length == 1.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }

        if self.is_training:
            item['label'] = torch.tensor(int(row['label']), dtype=torch.long)

        return item

class BERTModelManager:
    """Own the pretrained tokenizer/classifier pair and persist it after training.

    Args:
        num_labels: Number of output classes for the classification head.
        model_name: Name or path handed to ``from_pretrained`` for both the
            tokenizer and the model.
    """

    def __init__(self, num_labels=2, model_name='bert-base-uncased'):
        """Load the tokenizer and the sequence-classification model."""
        self.model_name = model_name
        self.num_labels = num_labels
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )

    def get_tokenizer(self):
        """Return the tokenizer loaded in ``__init__``."""
        return self.tokenizer

    def get_model(self):
        """Return the model loaded in ``__init__``."""
        return self.model

    def save_model(self, output_dir):
        """Save the model and tokenizer into ``output_dir``.

        The directory, including missing parents, is created when needed.
        ``exist_ok=True`` replaces the separate exists-check so a directory
        created between check and creation cannot raise.
        """
        os.makedirs(output_dir, exist_ok=True)

        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
        print(f"Model saved to {output_dir}")

class BERTTrainer:
    """Fine-tune a sequence-classification model, evaluating after every epoch.

    Args:
        model: Model whose forward call accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object exposing
            ``loss`` and ``logits``.
        tokenizer: Stored for reference; not used by the training loop.
        train_dataloader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``label`` tensors.
        val_dataloader: Same batch format, used by ``evaluate``.
        device: Torch device the model and batches are moved to.
        epochs: Number of passes over ``train_dataloader``.
        learning_rate: Initial learning rate for the optimizer.
    """

    def __init__(self, model, tokenizer, train_dataloader, val_dataloader,
                 device, epochs=3, learning_rate=2e-5):
        self.model = model
        self.tokenizer = tokenizer
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.device = device
        self.epochs = epochs
        self.learning_rate = learning_rate

    def train(self):
        """Run the fine-tuning loop and return the trained model.

        Each epoch prints the average training loss followed by the
        validation loss and accuracy.
        """
        self.model.to(self.device)

        # torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
        # weight_decay are pinned to that class's former defaults, because
        # PyTorch's own defaults (1e-8 and 0.01) differ.
        optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=self.learning_rate,
            eps=1e-6,
            weight_decay=0.0
        )
        # Linear decay to zero over all optimisation steps, with no warm-up.
        total_steps = len(self.train_dataloader) * self.epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

        # Training loop
        for epoch in range(self.epochs):
            print(f"\nEpoch {epoch+1}/{self.epochs}")
            # evaluate() leaves the model in eval mode, so re-enable train mode.
            self.model.train()
            total_loss = 0

            # Progress bar for training
            progress_bar = tqdm(self.train_dataloader, desc="Training")

            for batch in progress_bar:
                optimizer.zero_grad()

                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)

                # Passing labels makes the model return its loss.
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                loss = outputs.loss
                total_loss += loss.item()

                loss.backward()
                # Cap the gradient norm at 1.0 before the update.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

                optimizer.step()
                scheduler.step()

                progress_bar.set_postfix({'loss': f"{loss.item():.4f}"})

            avg_train_loss = total_loss / len(self.train_dataloader)
            print(f"Average training loss: {avg_train_loss:.4f}")

            # Validation after each epoch
            val_loss, val_accuracy = self.evaluate()
            print(f"Validation Loss: {val_loss:.4f}")
            print(f"Validation Accuracy: {val_accuracy:.4f}")

        return self.model

    def evaluate(self):
        """Score the model on the validation data.

        Prints a per-class classification report.

        Returns:
            tuple: ``(mean validation loss per batch, accuracy)``.
        """
        self.model.eval()
        val_loss = 0
        predictions = []
        true_labels = []

        with torch.no_grad():
            for batch in tqdm(self.val_dataloader, desc="Evaluating"):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                val_loss += outputs.loss.item()

                # Predicted class = index of the largest logit.
                preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()

                predictions.extend(preds)
                true_labels.extend(labels.cpu().numpy())

        val_loss = val_loss / len(self.val_dataloader)
        val_accuracy = accuracy_score(true_labels, predictions)

        print("\nClassification Report:")
        # zero_division=0 reports 0.0 for a class that is never predicted,
        # without emitting UndefinedMetricWarning.
        print(classification_report(true_labels, predictions, zero_division=0))

        return val_loss, val_accuracy

**Training the Model and Saving the Model**

In [None]:

def main():
    """Fine-tune BERT on the article/highlight data and save the result.

    Steps: seed the RNGs, pick the device, load the pretrained model and
    tokenizer, build labelled train/validation datasets from ``DATA_PATH``,
    train for three epochs, then write the model and tokenizer to disk.
    """
    # Seed before the model is created, so the newly initialised classifier
    # head and the DataLoader shuffling are repeatable between runs.
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Setup BERT model and tokenizer
    model_manager = BERTModelManager()
    tokenizer = model_manager.get_tokenizer()
    model = model_manager.get_model()

    # Load and preprocess data
    data_processor = ArticleDataProcessor(DATA_PATH, tokenizer)
    df = data_processor.load_data()
    train_df, val_df = data_processor.preprocess_data(df)

    print(f"Training samples: {len(train_df)}")
    print(f"Validation samples: {len(val_df)}")

    # Create datasets
    train_dataset = ArticleDataset(train_df, tokenizer)
    val_dataset = ArticleDataset(val_df, tokenizer)

    # Create dataloaders; only the training data is shuffled
    batch_size = 8
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    # Train the model
    trainer = BERTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        device=device,
        epochs=3
    )

    # train() returns the model that was passed in, which model_manager also
    # holds, so the return value is not needed for saving.
    trainer.train()

    # Save the model
    drive_root = '/content/drive/MyDrive/tdl_orange_problem'
    output_dir = os.path.join(drive_root, 'fine_tuned_bert')
    model_manager.save_model(output_dir)

if __name__ == "__main__":
    main()

Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training samples: 6540
Validation samples: 1636





Epoch 1/3


Training: 100%|██████████| 818/818 [10:31<00:00,  1.30it/s, loss=0.0000]


Average training loss: 0.0077


Evaluating: 100%|██████████| 205/205 [00:50<00:00,  4.08it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       1.00      1.00      1.00      1633

    accuracy                           1.00      1636
   macro avg       0.50      0.50      0.50      1636
weighted avg       1.00      1.00      1.00      1636

Validation Loss: 0.0019
Validation Accuracy: 0.9982

Epoch 2/3


Training: 100%|██████████| 818/818 [10:30<00:00,  1.30it/s, loss=0.0000]


Average training loss: 0.0003


Evaluating: 100%|██████████| 205/205 [00:50<00:00,  4.08it/s]



Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00      1633

    accuracy                           1.00      1636
   macro avg       1.00      1.00      1.00      1636
weighted avg       1.00      1.00      1.00      1636

Validation Loss: 0.0001
Validation Accuracy: 1.0000

Epoch 3/3


Training: 100%|██████████| 818/818 [10:30<00:00,  1.30it/s, loss=0.0000]


Average training loss: 0.0000


Evaluating: 100%|██████████| 205/205 [00:50<00:00,  4.09it/s]



Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00      1633

    accuracy                           1.00      1636
   macro avg       1.00      1.00      1.00      1636
weighted avg       1.00      1.00      1.00      1636

Validation Loss: 0.0000
Validation Accuracy: 1.0000
Model saved to /content/drive/MyDrive/tdl_orange_problem/fine_tuned_bert


**Importing Required Libraries**

In [43]:
import torch
import numpy as np
import re
from transformers import BertTokenizer, BertForSequenceClassification

**Set the Path Where the Fine-Tuned BERT Model Is Saved**

In [None]:
# Directory passed to from_pretrained() below to load the tokenizer and model.
bert_model_path = '/content/drive/MyDrive/tdl_orange_problem/fine_tuned_bert'  # Update this path as needed

**Load the Tokenizer and the Model**

In [None]:
# Load the tokenizer and the classifier from the directory in bert_model_path.
tokenizer = BertTokenizer.from_pretrained(bert_model_path)
model = BertForSequenceClassification.from_pretrained(bert_model_path)

**Move the Model to the Available Device (GPU if Present) and Switch to Evaluation Mode**

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the loaded model onto that device.
model.to(device)
# Switch to evaluation mode for inference. As the cell's last expression,
# this also displays the model structure.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

**Extractive Summarization Function using BERT**

In [None]:
def simple_sentence_tokenize(text):
    """
    Split text into sentences with a regular expression (no NLTK required).

    A boundary is any run of whitespace that directly follows '.', '!' or '?'.
    The rule is purely lexical, so an abbreviation such as "Mr. Smith" is also
    split.

    Args:
        text (str): The text to split; None or an empty string is allowed.

    Returns:
        list[str]: The non-empty sentences, in order, each stripped of
        surrounding whitespace.
    """
    if not text:
        return []

    # Trim first so a leading newline does not stay glued to the first sentence.
    pieces = re.split(r'(?<=[.!?])\s+', text.strip())

    # Filter out empty sentences
    return [piece.strip() for piece in pieces if piece.strip()]

def extractive_summarize(article, num_sentences=3, batch_size=16):
    """
    Use BERT to perform extractive summarization by ranking sentences.

    Every sentence is scored with the positive-class logit of the module-level
    ``model``. The ``num_sentences`` highest-scoring sentences are returned in
    their original order.

    NOTE(review): the classifier in this notebook is fine-tuned on
    "Article: ... Highlight: ..." inputs, whereas here it scores bare
    sentences — confirm the positive logit is a meaningful importance signal.

    Args:
        article (str): The input article text
        num_sentences (int): Number of sentences to include in the summary
        batch_size (int): Number of sentences scored per forward pass

    Returns:
        str: The extractive summary. An empty string when ``num_sentences``
        is not positive; the unchanged article when it has no more than
        ``num_sentences`` sentences.
    """
    # A slice of [-0:] would select every sentence, so handle this explicitly.
    if num_sentences <= 0:
        return ''

    # Split the article into sentences using our custom function
    sentences = simple_sentence_tokenize(article)

    if len(sentences) <= num_sentences:
        return article

    step = max(1, batch_size)
    scores = []

    with torch.no_grad():
        for start in range(0, len(sentences), step):
            chunk = sentences[start:start + step]

            # padding=True pads only to the longest sentence in the chunk
            # instead of always padding to 512 tokens.
            inputs = tokenizer(
                chunk,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt'
            ).to(device)

            # Use the positive class logit as the sentence importance score
            logits = model(**inputs).logits
            scores.extend(logits[:, 1].tolist())

    # Take the top-scoring indices, then restore document order
    top_indices = sorted(np.argsort(scores)[-num_sentences:])

    # Form the summary
    return ' '.join(sentences[i] for i in top_indices)

**Generating Summary**

In [48]:
# Example article (the manager's name is stored as proper UTF-8 text)
article_example = """
Liverpool secured a commanding 4-1 victory over Luton Town in the Premier League match. The Reds initially fell behind in the opening minutes but quickly recovered with goals from Van Dijk, Gakpo, and a brace from Salah. The win helps Liverpool maintain their position at the top of the Premier League table. Manager Jürgen Klopp praised the team's resilience and attacking prowess, particularly highlighting the performance of their Egyptian forward. Luton Town, despite the defeat, showed moments of quality but ultimately couldn't match Liverpool's class throughout the 90 minutes.
"""

# Print the source text first so it can be compared with the summary below
print("Original Article:")
print(article_example)
print("\n" + "-"*50 + "\n")

# Generate and print the summary (default: the three top-ranked sentences)
print("BERT Extractive Summary:")
summary = extractive_summarize(article_example)
print(summary)

Original Article:

Liverpool secured a commanding 4-1 victory over Luton Town in the Premier League match. The Reds initially fell behind in the opening minutes but quickly recovered with goals from Van Dijk, Gakpo, and a brace from Salah. The win helps Liverpool maintain their position at the top of the Premier League table. Manager Jürgen Klopp praised the team's resilience and attacking prowess, particularly highlighting the performance of their Egyptian forward. Luton Town, despite the defeat, showed moments of quality but ultimately couldn't match Liverpool's class throughout the 90 minutes.


--------------------------------------------------

BERT Extractive Summary:

Liverpool secured a commanding 4-1 victory over Luton Town in the Premier League match. The Reds initially fell behind in the opening minutes but quickly recovered with goals from Van Dijk, Gakpo, and a brace from Salah. The win helps Liverpool maintain their position at the top of the Premier League table.
