In [2]:
'''
# Instructions:

Before running this Jupyter notebook, make sure the required Python packages are installed. Any that are missing can be installed with the following commands:
!pip install nltk
!pip install torch
!pip install transformers
!pip install scikit-learn
!pip install rouge
!pip install pickle5
!pip install numpy

Run this notebook to generate summaries and compute the scores (ROUGE, BLEU, METEOR) used to evaluate their quality.
'''

import data_processing as dp
import train
import evaluate
import torch
from transformers import DistilBertTokenizer

# ---------------------------------------------------------------------------
# Run configuration shared by the cells below.
# ---------------------------------------------------------------------------

# Seed PyTorch's global RNG up front so that torch-driven randomness is
# repeatable after a kernel restart.
# NOTE(review): only torch's generator is seeded here; whether data_processing
# or train also draw from `random` / NumPy is not visible in this notebook --
# seed those as well if they do.
seed = 42
torch.manual_seed(seed)

directory = '../cnn/stories'  # folder of raw `.story` files given to dp.load_and_process_data
batch_size = 32               # forwarded to dp.create_data_loaders
learning_rate = 2e-5          # forwarded to train.train_model
num_epochs = 5                # forwarded to train.train_model

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer for the pretrained 'distilbert-base-uncased' checkpoint; it is
# passed to both the preprocessing and the evaluation steps.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/rajagopalmenon.v/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/rajagopalmenon.v/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Preprocess the raw stories found under `directory`.
# NOTE(review): the meaning of the two trailing literals is not visible in this
# notebook -- presumably 512 is a token-length limit and 5000 the number of
# stories to process; confirm against data_processing.load_and_process_data.
dp.load_and_process_data(
    tokenizer,
    device,
    directory,
    512,
    5000,
)

Processed file 0 - 0002c17436637c4fe1837c935c04de47adb18e9a.story - time taken: 0.0748739242553711
Processed file 1 - 0004306354494f090ee2d7bc5ddbf80b63e80de6.story - time taken: 0.028778553009033203
Processed file 2 - 00083697263e215e5e7eda753070f08aa374dd45.story - time taken: 0.026996612548828125
Processed file 3 - 000940f2bb357ac04a236a232156d8b9b18d1667.story - time taken: 0.022791385650634766
Processed file 4 - 00120f91cfcab17bac165f7a4719019a628a9db3.story - time taken: 0.017772674560546875
Processed file 5 - 001789cf9b865dcac3d9fc032a6b1533e3318eda.story - time taken: 0.026652097702026367
Processed file 6 - 001b4673dbb3437282cd2ea58d9eca471e25780f.story - time taken: 0.03696298599243164
Processed file 7 - 0020ede07ee7ad1f6cf654c7dc678e7341d0c0e5.story - time taken: 0.04904818534851074
Processed file 8 - 002175ac42ef0c91b9fb7e07259413a8ee3979a3.story - time taken: 0.051445722579956055
Processed file 9 - 002509a01890dd51476aa84c634b6c1db306f995.story - time taken: 0.0375568866729

Token indices sequence length is longer than the specified maximum sequence length for this model (939 > 512). Running this sequence through the model will result in indexing errors


Processed file 1225 - 1656cf86bac4931f4904a1661808496181f93d96.story - time taken: 0.026322603225708008
Processed file 1226 - 1660498fb3bd2c499be68a1a52760219c37dff4d.story - time taken: 0.022155046463012695
Processed file 1227 - 1668f1eb7d7f597d145a6941270b064355165259.story - time taken: 0.025333881378173828
Processed file 1228 - 16694dbfdcb6adb1238a16676b267a2514af13da.story - time taken: 0.03923320770263672
Processed file 1229 - 167136458fbfe21a58768f18e15cc324594ec504.story - time taken: 0.023464441299438477
Processed file 1230 - 1673bc2eb8c51f63fa7d5606a127c22340a8b174.story - time taken: 0.019938230514526367
Processed file 1231 - 167426eb7e7b44dde1c6454a2383ae3f49113993.story - time taken: 0.019230127334594727
Processed file 1232 - 16755dd5d5c0134cee17fd41f43b92e3eb845771.story - time taken: 0.026882648468017578
Processed file 1233 - 167dfcfa36a50bed7f42220d6ffbafce62ec8693.story - time taken: 0.02854132652282715
Processed file 1234 - 167f20b433f58d0cff462708c7424c35307ba4df.sto

In [5]:
# Read the preprocessed dataset back from disk; the result is unpacked into
# the four values that the loader factory expects.
processed_data_path = '../processed-data-distilbert-6'
(
    tokenized_sentences,
    attention_masks,
    scores,
    reference_summaries,
) = dp.load_data(processed_data_path)

# Build the train / validation / test loaders using the configured batch size.
train_loader, val_loader, test_loader = dp.create_data_loaders(
    tokenized_sentences,
    attention_masks,
    scores,
    reference_summaries,
    batch_size,
)

Total dataset size: 5000
Train dataset size: 3000
Validation dataset size: 1000
Test dataset size: 1000
Number of batches in train_loader: 94


In [6]:
# Train with the hyper-parameters from the configuration cell.
model = train.train_model(
    train_loader,
    val_loader,
    learning_rate,
    num_epochs,
)

# Persist only the state dict (not the whole object) so it can later be
# loaded into a freshly constructed model.
trained_state = model.state_dict()
torch.save(trained_state, 'summarization_model_5.pth')

94
Batch 1: 32 samples
training: 0
Batch 2: 32 samples
training: 1
Batch 3: 32 samples
training: 2
Batch 4: 32 samples
training: 3
Batch 5: 32 samples
training: 4
Batch 6: 32 samples
training: 5
Batch 7: 32 samples
training: 6
Batch 8: 32 samples
training: 7
Batch 9: 32 samples
training: 8
Batch 10: 32 samples
training: 9
Batch 11: 32 samples
training: 10
Batch 12: 32 samples
training: 11
Batch 13: 32 samples
training: 12
Batch 14: 32 samples
training: 13
Batch 15: 32 samples
training: 14
Batch 16: 32 samples
training: 15
Batch 17: 32 samples
training: 16
Batch 18: 32 samples
training: 17
Batch 19: 32 samples
training: 18
Batch 20: 32 samples
training: 19
Batch 21: 32 samples
training: 20
Batch 22: 32 samples
training: 21
Batch 23: 32 samples
training: 22
Batch 24: 32 samples
training: 23
Batch 25: 32 samples
training: 24
Batch 26: 32 samples
training: 25
Batch 27: 32 samples
training: 26
Batch 28: 32 samples
training: 27
Batch 29: 32 samples
training: 28
Batch 30: 32 samples
training:

In [13]:
# Import the module under an alias: the original `import model` followed by
# `model = model.SummarizationModel()` rebound the module name to the instance,
# hiding the module from the rest of the notebook.
import model as model_module

# Fresh, untrained network; after this cell `model` is the instance, exactly
# as before.
model = model_module.SummarizationModel()

# Load the saved state dictionary into the model.
# map_location remaps the stored tensors onto the device chosen in the
# configuration cell, so a checkpoint written during a CUDA run can still be
# restored on a CPU-only machine.
state_dict = torch.load('summarization_model_5.pth', map_location=device)
model.load_state_dict(state_dict)

# NOTE(review): the model itself is not moved to `device` or switched to eval
# mode here; presumably evaluate.evaluate_model does both, since it receives
# `device` -- confirm.
generated_summaries, reference_summaries = evaluate.evaluate_model(
    model,
    test_loader,
    tokenizer,
    device,
)

In [14]:
# Eyeball check: first entry of the references, then of the model output.
print(reference_summaries[0])
print(generated_summaries[0])


# Flatten the reference and generated summaries before scoring.
flattened_reference_summaries = evaluate.flatten_summaries(reference_summaries)
flattened_generated_summaries = evaluate.flatten_summaries(generated_summaries)

# calculate_metrics takes the generated text first and the references second.
metrics = evaluate.calculate_metrics(
    flattened_generated_summaries,
    flattened_reference_summaries,
)

# NOTE(review): the recorded run shows NLTK zero-overlap warnings and a BLEU
# of ~2e-84; any smoothing would have to be applied inside
# evaluate.calculate_metrics -- confirm how BLEU is computed there.
for label, key in (
    ("ROUGE Scores:", 'rouge'),
    ("BLEU Score:", 'bleu'),
    ("METEOR Score:", 'meteor'),
):
    print(label, metrics[key])

['Police detained, questioned Fat Joe, entourage. Woman said she "found herself" in Fat Joe\'s limo. Incident followed rapper\'s show in Madison, Wisconsin. Rapper\'s lawyer calls woman a "groupie pretender". ', 'Nomination of Robert A. Harding to be announced Monday, sources say. Harding "has the experience ... to make a real difference," administration official says. Harding, who served 33 years in Army, would be the TSA\'s first African-American administrator. He once was Defense Department\'s senior human intelligence officer. ', 'Video of the scene show concertgoers returning to help. They fashioned chairs as stretchers and lifted scaffolding. Indiana Gov. Mitch Daniels said people "ran to the trouble, not from the trouble". "It\'s the character that we associate with our state," he said. ', "Man United overturns a first-leg deficit and advances in the Champions League. Dutchman Robin van Persie scores a hat-trick in a 3-0 win over Olympiacos. The win at Old Trafford boosts United

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


ROUGE Scores: {'rouge-1': {'r': 0.03532907436985732, 'p': 0.3209202909432689, 'f': 0.06303159856655474}, 'rouge-2': {'r': 0.003963774270471483, 'p': 0.043541559196122216, 'f': 0.007179646942064529}, 'rouge-l': {'r': 0.0327841279846554, 'p': 0.3001724875366168, 'f': 0.05858819445207221}}
BLEU Score: 2.0531748702430436e-84
METEOR Score: 0.031446376587667135
