In [1]:
!pip install evaluate
!pip install rouge_score

import numpy as np
import pandas as pd
import re
import nltk
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from nltk.corpus import wordnet
from nltk import pos_tag
from scipy import spatial
import networkx as nx
import zipfile
from datasets import load_dataset
from operator import itemgetter

nltk.download('punkt')
nltk.download('stopwords')

stop_words = stopwords.words('english')

# The number of sentences to keep in the extractive summary

def getExtraction(sentence, N = 5):
    train_report = sentence
    train_sentences = sent_tokenize(train_report)
    sentence_tokens = []
    for sentence in train_sentences:
        new_sentence = re.sub(r'[^\w\s]', '', sentence.lower())
        words_in_sentence = word_tokenize(new_sentence)
        words_in_sentence = [word for word in words_in_sentence if word not in stop_words]
        sentence_tokens.append(words_in_sentence)
        
    word_to_vector = Word2Vec(sentences = sentence_tokens, vector_size = 1, min_count = 1, epochs = 1000)
    # Create a list of sentence embeddings for each sentence in the report
    sentence_embeddings = []
    # Find the maximum sentence length in the report to pad the sentence embeddings
    maximum_sentence_length = max([len(sentence_token) for sentence_token in sentence_tokens])
    # For each sentence in the report, calculate the mean of the word embeddings for each word in the sentence
    for sentence in sentence_tokens:
        sentence_embedding = [np.mean(word_to_vector.wv[word]) for word in sentence]
        # Pad the sentence embeddings to the maximum sentence length
        sentence_embedding = np.pad(sentence_embedding, (0, maximum_sentence_length - len(sentence_embedding)), 'constant')
        sentence_embeddings.append(sentence_embedding)

    # Use cosine similarity to calculate the similarity between each sentence in the report
    # Create a matrix to store the cosine similarity between each sentence
    similarity_matrix = np.zeros((len(sentence_tokens), len(sentence_tokens)))
    for j in range(len(sentence_tokens)):
        for k in range(len(sentence_tokens)):
            if j != k:
                # used 1 - cosine similarity to calculate the cosine distance
                similarity_matrix[j][k] = 1 - spatial.distance.cosine(sentence_embeddings[j], sentence_embeddings[k])
    # Create a network to represent the similarity between each sentence in the report
    similarity_network = nx.from_numpy_array(similarity_matrix)
    text_rank_scores = nx.pagerank(similarity_network, max_iter = 10000)

    # keep track of the score and the corresponding sentence
    sentence_scores = {}
    for j in range(len(sentence_tokens)):
        sentence_scores[train_sentences[j]] = text_rank_scores[j]

    # Sort the sentences based on the text rank scores and keep the top N sentences
    top_n_sent = dict(sorted(sentence_scores.items(), key = itemgetter(1), reverse = True)[:N])
    summary = ''
    # Generate the extractive summary by concatenating the top N sentences in the same order appearing in the report
    for train_sentence in train_sentences:
        if train_sentence in top_n_sent:
            summary += " "+train_sentence
    
    return summary


Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.1
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l- \ done
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=34c95eb2404fcb390fab46e0e6f5a3716f7e37d46ab66f874f04c5c4044a7f4e
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
[nltk_data] Downlo

In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

from datasets import load_dataset
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_data = load_dataset("ccdv/govreport-summarization", split="train[:500]")

print("Train Shape: ", np.shape(train_data))

SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2: "UNK"}
        self.n_words = 3  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)
    
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1


def normalizeString(s):
    s = s.lower().strip()
    return s.strip()

def readLangs(lang1, lang2):
    print("Reading lines...")

    # Split every line into pairs and normalize
    
    pairs = [[normalizeString(l['report']), normalizeString(l['summary'])] for l in train_data]

    report_lang = Lang(lang1)
    summary_lang = Lang(lang2)

    return report_lang, summary_lang, pairs

max_report_size = 500 #10000
max_summary_size = 150 #1500

def filterPair(p):
    v1 = len(p[0].split(' '))
    v2 = len(p[1].split(' '))
    return v1 < max_report_size and v2 < max_summary_size


def filterPairs(pairs):
    pairs_new = []
    index = 0
    print("Got Pairs: ", len(pairs))
    
    for p in pairs:
        index+=1
        rep = getExtraction(p[0], 10)
        summ = getExtraction(p[1], 2)
        p_new = [rep, summ]
        
        if filterPair(p_new):
            pairs_new.append(p_new)
        
        if index%100 == 0:
            print("Done with a 100 pairs")
        
    return pairs_new

def prepareData(lang1, lang2):
    report_lang, summary_lang, pairs = readLangs(lang1, lang2)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        report_lang.addSentence(pair[0])
        summary_lang.addSentence(pair[1])
    print("Counted words:")
    print(report_lang.name, report_lang.n_words)
    print(summary_lang.name, summary_lang.n_words)

    return report_lang, summary_lang, pairs

Downloading builder script:   0%|          | 0.00/3.22k [00:00<?, ?B/s]

Downloading and preparing dataset gov_report_summarization_dataset/document to /root/.cache/huggingface/datasets/ccdv___gov_report_summarization_dataset/document/1.0.0/57ca3042de9c40c218cc94084cbc80a99a161036134bfc88112c57d251443590...


Downloading data:   0%|          | 0.00/271M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/15.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset gov_report_summarization_dataset downloaded and prepared to /root/.cache/huggingface/datasets/ccdv___gov_report_summarization_dataset/document/1.0.0/57ca3042de9c40c218cc94084cbc80a99a161036134bfc88112c57d251443590. Subsequent calls will reuse this data.
Train Shape:  (500, 2)


In [3]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def showPlot(points):
    print("Length of losses: ", len(points))    
    plt.figure()
    plt.plot(range(len(points)), points)
    plt.savefig("loss_plot_100_epoch_r.png")
    
def indexesFromSentence(lang, sentence):
    ret = []
    for word in sentence.split(' '):
        if word in lang.word2index.keys():
            ret.append(lang.word2index[word])
        else:
            ret.append(2)
    return ret #[lang.word2index[word] for word in sentence.split(' ')]
#     return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(report_lang, pair[0])
    target_tensor = tensorFromSentence(summary_lang, pair[1])
    return (input_tensor, target_tensor)

def get_dataloader(batch_size):
    input_lang, output_lang, pairs = prepareData('report', 'summary')

    n = len(pairs)
    input_ids = np.zeros((n, max_report_size), dtype=np.int32)
    target_ids = np.zeros((n, max_summary_size), dtype=np.int32)

    for idx, (inp, tgt) in enumerate(pairs):
        inp_ids = indexesFromSentence(input_lang, inp)
        tgt_ids = indexesFromSentence(output_lang, tgt)
        inp_ids.append(EOS_token)
        tgt_ids.append(EOS_token)
        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids

    train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(target_ids).to(device))

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    return input_lang, output_lang, train_dataloader, pairs


def load_checkpoint(filepath):
    checkpoint = torch.load(filepath)
    model = checkpoint['model']
    model.load_state_dict(checkpoint['state_dict'])
    for parameter in model.parameters():
        parameter.requires_grad = False
    
    model.eval()
#     model.train()
    
    return model

In [4]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):

    counter = 0

    total_loss = 0
    for data in dataloader:
        if counter%200==0:
            print("Counter: ", counter)
        input_tensor, target_tensor = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

        counter+=1
    return total_loss / len(dataloader)


def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        print("Epoch: ", epoch)
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print("LOSS: ", loss)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    
    showPlot(plot_losses)
    

def evaluate_s(encoder, decoder, sentence, input_lang, output_lang):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, _ = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[idx.item()])
    return decoded_words, decoder_hidden

In [5]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden
    

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(max_summary_size):
            decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training loop

    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [6]:
hidden_size = 256
batch_size = 1
print("Getting Dataloader")
input_lang, output_lang, train_dataloader, pairs = get_dataloader(batch_size)
print("Got DataLoader")

encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, output_lang.n_words).to(device)

print("ENCODER PARAMS: ", sum(p.numel() for p in encoder.parameters() if p.requires_grad))
print("DECODER PARAMS: ", sum(p.numel() for p in decoder.parameters() if p.requires_grad))

Getting Dataloader
Reading lines...
Read 500 sentence pairs
Got Pairs:  500


  dist = 1.0 - uv / np.sqrt(uu * vv)


Done with a 100 pairs
Done with a 100 pairs
Done with a 100 pairs
Done with a 100 pairs
Done with a 100 pairs
Trimmed to 420 sentence pairs
Counting words...
Counted words:
report 13976
summary 5051
Got DataLoader
ENCODER PARAMS:  3972608
DECODER PARAMS:  2985915


In [7]:
print("Training Started")
epochs = 100
train(train_dataloader, encoder, decoder, epochs, print_every=5, plot_every=1)

checkpointEncoder = {'model': encoder,
                     'state_dict': encoder.state_dict()}
checkpointDecoder = {'model': decoder,
                     'state_dict': decoder.state_dict()}

torch.save(checkpointEncoder, 'checkpointEncoder100_4_r.pth')
torch.save(checkpointDecoder, 'checkpointDecoder100_4_r.pth')

print("Training Done")

Training Started
Epoch:  1
Counter:  0
Counter:  200
Counter:  400
LOSS:  2.65268925172942
Epoch:  2
Counter:  0
Counter:  200
Counter:  400
LOSS:  2.246552967457544
Epoch:  3
Counter:  0
Counter:  200
Counter:  400
LOSS:  1.9803907979102362
Epoch:  4
Counter:  0
Counter:  200
Counter:  400
LOSS:  1.6953342091469539
Epoch:  5
Counter:  0
Counter:  200
Counter:  400
LOSS:  1.396934099424453
21m 48s (- 414m 19s) (5 5%) 1.9944
Epoch:  6
Counter:  0
Counter:  200
Counter:  400
LOSS:  1.1044480152073362
Epoch:  7
Counter:  0
Counter:  200
Counter:  400
LOSS:  0.8466016772957076
Epoch:  8
Counter:  0
Counter:  200
Counter:  400
LOSS:  0.6259649715253285
Epoch:  9
Counter:  0
Counter:  200
Counter:  400
LOSS:  0.4493781292367549
Epoch:  10
Counter:  0
Counter:  200
Counter:  400
LOSS:  0.31391634188947226
43m 30s (- 391m 36s) (10 10%) 0.6681
Epoch:  11
Counter:  0
Counter:  200
Counter:  400
LOSS:  0.21823479857827935
Epoch:  12
Counter:  0
Counter:  200
Counter:  400
LOSS:  0.154251774879438

In [8]:
def evaluateAndShowAttention(input_sentence):
    output_words, decoder_hidden = evaluate_s(encoder, decoder, input_sentence, input_lang, output_lang)
    s = ' '.join(output_words)
    return s


s1 = evaluateAndShowAttention(pairs[10][0])
print("Original Summary: ", pairs[10][1])
print("New Summary: ", s1)

Original Summary:   individual staff members face no consequences for failing to meet the training requirement, however, and vba has not tracked training completion by individuals. the department of veterans affairs (va) has not examined the ratings distribution, but acknowledges a potential issue with its formula and is considering changes.
New Summary:   individual staff members face no consequences for failing to meet the training requirement, however, and vba has not tracked training completion by individuals. the department of veterans affairs (va) has not examined the ratings distribution, but acknowledges a potential issue with its formula and is considering changes. <EOS>


In [9]:
import evaluate
from evaluate import load
rouge = evaluate.load('rouge')

# encoder = load_checkpoint('checkpointEncoder100_4_r.pth')
# decoder = load_checkpoint('checkpointDecoder100_4_r.pth')

summaries_og = []
summaries = []
for p in pairs:
    s = evaluateAndShowAttention(p[0])
    summaries_og.append(p[1])
    summaries.append(s)

results = rouge.compute(predictions=summaries, references=summaries_og)
print("Train Set Metrics: ")
print(results)

2024-04-16 13:22:09.836195: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-16 13:22:09.836424: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-16 13:22:10.013320: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Train Set Metrics: 
{'rouge1': 0.9204858532968208, 'rouge2': 0.9095797088399289, 'rougeL': 0.9165332908151574, 'rougeLsum': 0.9166026169567599}


In [10]:
# Then do rouge metrics for some test data. lets say another 100 from the test pool
import evaluate
from evaluate import load
rouge = evaluate.load('rouge')

test_data = load_dataset("ccdv/govreport-summarization", split="test[:100]")

pairs_test = [[normalizeString(l['report']), normalizeString(l['summary'])] for l in test_data]

print("Read %s sentence pairs" % len(pairs_test))
pairs_test = filterPairs(pairs_test)
print("Trimmed to %s sentence pairs" % len(pairs_test))

summaries_original = []
summaries_test = []
for p in pairs_test:
    s = evaluateAndShowAttention(p[0])
    summaries_original.append(p[1])
    summaries_test.append(s)

results = rouge.compute(predictions=summaries_test, references=summaries_original)
print("Test Set Metrics: ")
print(results)

print("Original Test Summary: ", summaries_original[10])
print("New Test Summary: ", summaries_test[10])



Read 100 sentence pairs
Got Pairs:  100


  dist = 1.0 - uv / np.sqrt(uu * vv)


Done with a 100 pairs
Trimmed to 100 sentence pairs
Test Set Metrics: 
{'rouge1': 0.16070116556519218, 'rouge2': 0.007886436434556407, 'rougeL': 0.1040808798153002, 'rougeLsum': 0.10434727271283714}
Original Test Summary:   nationwide, about 1.4 million elderly or disabled individuals receive care in more than 15,500 nursing homes. in light of the increased number and severity of abuse deficiencies, it is imperative that cms have strong nursing home oversight in place to protect residents from abuse.
New Test Summary:   the federal government has been pursuing electronic initiatives to strengthen its buying processes, reduce costs, and create a competitive "virtual" marketplace. each of the four business assistance programs gao examined had taken steps to educate its clients on electronic commerce as part of its operations. <EOS>
