In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, BertForSequenceClassification, BertTokenizer, EncoderDecoderModel


In [None]:
# Read the three pre-split CSV files from the working directory, in
# train / validation / test order.
train_df, val_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "validation.csv", "test.csv")
)

In [None]:
# Tokenizer loaded from the `bert-base-uncased` checkpoint; reused by later cells.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode every training article as a list of token ids (special tokens
# included), truncated to at most 512 ids.
train_df['article_tokens'] = train_df['article'].map(
    lambda text: tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=512,
    )
)

# Same encoding for the reference highlights, truncated to at most 150 ids.
train_df['highlights_tokens'] = train_df['highlights'].map(
    lambda text: tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=150,
    )
)


In [None]:
# Encode every validation article as a list of token ids (special tokens
# included), truncated to at most 512 ids.
val_df['article_tokens'] = val_df['article'].map(
    lambda text: tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=512,
    )
)

# Same encoding for the reference highlights, truncated to at most 150 ids.
val_df['highlights_tokens'] = val_df['highlights'].map(
    lambda text: tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=150,
    )
)


In [None]:
# Encode every test article as a list of token ids (special tokens
# included), truncated to at most 512 ids.
test_df['article_tokens'] = test_df['article'].map(
    lambda text: tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=512,
    )
)

# Same encoding for the reference highlights, truncated to at most 150 ids.
test_df['highlights_tokens'] = test_df['highlights'].map(
    lambda text: tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=150,
    )
)


In [None]:
class SummarizationDataset(Dataset):
    """Map-style dataset pairing tokenised articles with tokenised highlights.

    Both collections are stored exactly as passed in; the item at position
    ``idx`` pairs the entries found at ``idx`` in each of them.
    """

    def __init__(self, article_tokens, highlights_tokens):
        """Keep references to the two parallel, index-aligned collections."""
        self.article_tokens, self.highlights_tokens = article_tokens, highlights_tokens

    def __len__(self):
        """Return the number of articles (the highlights are not counted)."""
        articles = self.article_tokens
        return len(articles)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with ``input_ids`` and ``labels``."""
        source = self.article_tokens[idx]
        target = self.highlights_tokens[idx]
        return {'input_ids': source, 'labels': target}

In [None]:
# Dataset items are ragged Python lists of token ids. The default collate
# function cannot turn unequal-length lists into a (batch, seq_len) tensor, so
# every loader below pads its batches with `collate_summarization_batch`.
PAD_TOKEN_ID = tokenizer.pad_token_id
LABEL_IGNORE_ID = -100  # default `ignore_index` of torch.nn.CrossEntropyLoss
BATCH_SIZE = 4


def collate_summarization_batch(examples):
    """Pad a list of dataset items to a common length and stack them.

    Args:
        examples: items as returned by ``SummarizationDataset.__getitem__``,
            i.e. dicts holding ``input_ids`` and ``labels`` token-id lists.

    Returns:
        A dict of ``torch.long`` tensors of shape (batch, longest sequence):
        ``input_ids`` padded with ``PAD_TOKEN_ID``, ``attention_mask`` with 1
        on real tokens and 0 on padding, and ``labels`` padded with
        ``LABEL_IGNORE_ID``.
    """
    def _pad(sequences, fill_value):
        # Right-pad every sequence to the longest one in this batch only.
        width = max(len(seq) for seq in sequences)
        return torch.tensor(
            [list(seq) + [fill_value] * (width - len(seq)) for seq in sequences],
            dtype=torch.long,
        )

    sources = [example['input_ids'] for example in examples]
    targets = [example['labels'] for example in examples]
    return {
        'input_ids': _pad(sources, PAD_TOKEN_ID),
        # Built from the true lengths rather than by comparing against the pad id.
        'attention_mask': _pad([[1] * len(seq) for seq in sources], 0),
        'labels': _pad(targets, LABEL_IGNORE_ID),
    }


train_dataset = SummarizationDataset(train_df['article_tokens'].tolist(), train_df['highlights_tokens'].tolist())
val_dataset = SummarizationDataset(val_df['article_tokens'].tolist(), val_df['highlights_tokens'].tolist())
test_dataset = SummarizationDataset(test_df['article_tokens'].tolist(), test_df['highlights_tokens'].tolist())

# Only the training split is shuffled; evaluation order stays deterministic.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_summarization_batch)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_summarization_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_summarization_batch)

In [None]:
# Summarisation is sequence-to-sequence: the targets are token-id sequences and
# evaluation calls `model.generate()`. A sequence-classification head emits a
# single class distribution per article and has no decoder, so a BERT encoder
# is paired with a BERT decoder (cross-attention is added by
# `from_encoder_decoder_pretrained`).
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')

# The combined model has no defaults for these ids; they are needed to build
# the decoder inputs from `labels` during training and to start/stop generation.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.vocab_size = model.config.decoder.vocab_size

# torch's AdamW replaces the deprecated `transformers.AdamW`; weight decay is
# pinned to 0.0 because that was the default of the transformers optimizer.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move the model's parameters to the chosen device; as the last expression of
# the cell, the returned module is also what the notebook displays.
model.to(device)

In [None]:
NUM_EPOCHS = 3


def _forward_batch(batch):
    """Run the model on one collated batch and return its output (exposes ``.loss``).

    Args:
        batch: dict with ``input_ids`` and ``labels`` tensors and, optionally,
            an ``attention_mask`` tensor.
    """
    input_ids = batch['input_ids'].to(device)
    labels = batch['labels'].to(device)
    if 'attention_mask' in batch:
        attention_mask = batch['attention_mask'].to(device)
    else:
        # No mask supplied by the loader: treat every non-pad position as real
        # so that padding is never attended to.
        attention_mask = (input_ids != tokenizer.pad_token_id).long()
    return model(input_ids, attention_mask=attention_mask, labels=labels)


for epoch in range(NUM_EPOCHS):
    # --- training pass: one optimizer step per batch ---
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        loss = _forward_batch(batch).loss
        loss.backward()
        optimizer.step()

    # --- validation pass: no gradients, collect the per-batch losses ---
    model.eval()
    val_losses = []
    with torch.no_grad():
        for batch in val_dataloader:
            val_losses.append(_forward_batch(batch).loss.item())

    # An empty validation loader reports NaN instead of raising ZeroDivisionError.
    avg_val_loss = sum(val_losses) / len(val_losses) if val_losses else float('nan')
    print(f"Epoch {epoch + 1}, Average Validation Loss: {avg_val_loss}")

# Writes the model files into the current working directory.
model.save_pretrained('./')

In [None]:
ROUGE_VARIANTS = ['rouge1', 'rouge2', 'rougeL']
MAX_ARTICLE_TOKENS = 512
MAX_SUMMARY_TOKENS = 150  # same budget the reference highlights were truncated to
NUM_BEAMS = 4


def _mean(values):
    """Arithmetic mean of ``values``; NaN for an empty list instead of raising."""
    return sum(values) / len(values) if values else float('nan')


model.eval()
scorer = rouge_scorer.RougeScorer(ROUGE_VARIANTS, use_stemmer=True)
references = []
predictions = []

# Generate one summary per test article and keep it next to its reference.
for article_text, reference_summary in zip(test_df['article'], test_df['highlights']):
    inputs = tokenizer(article_text, return_tensors="pt", max_length=MAX_ARTICLE_TOKENS, truncation=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    with torch.no_grad():
        # Without an explicit limit, generation stops at the library's short
        # default length, far below the length of the reference highlights.
        summary_ids = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=MAX_SUMMARY_TOKENS,
            num_beams=NUM_BEAMS,
            early_stopping=True,
        )
    predicted_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    references.append(reference_summary)
    predictions.append(predicted_summary)

# `RougeScorer.score` compares ONE target string with ONE prediction string, so
# score each pair and average the F-measure of every variant over the test set.
per_example_rouge = [scorer.score(ref, pred) for ref, pred in zip(references, predictions)]
rouge_scores = {
    variant: _mean([scores[variant].fmeasure for scores in per_example_rouge])
    for variant in ROUGE_VARIANTS
}

# Current NLTK releases require pre-tokenised references and hypothesis, hence
# the whitespace split. METEOR also needs the NLTK WordNet corpus to be
# installed (`nltk.download('wordnet')`).
meteor_scores = [meteor_score([ref.split()], pred.split()) for ref, pred in zip(references, predictions)]

print("ROUGE Scores:", rouge_scores)
print("METEOR Scores:", meteor_scores)

# One bar per metric: mean ROUGE F-measures followed by the mean METEOR score.
metric_names = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'METEOR']
metric_values = [rouge_scores[variant] for variant in ROUGE_VARIANTS] + [_mean(meteor_scores)]
plt.bar(metric_names, metric_values)
plt.ylabel('Score')
plt.title('Evaluation Metrics')
plt.show()