In [23]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms

import gensim
import gensim.downloader
import nltk
from gensim.models.keyedvectors import KeyedVectors

In [2]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("device:", device)

device: cpu


## Step 1: Load datasets

In [3]:
def load_dataset(dataset_path: str) -> pd.DataFrame:
    """Read one CSV split and return it without the `id` column or incomplete rows.

    Args:
        dataset_path: Path to the CSV file; it must contain an `id` column.

    Returns:
        The remaining columns, with every row that holds a null value removed.
        The original row labels are kept (the index is not reset).
    """
    raw = pd.read_csv(dataset_path)
    without_id = raw.drop(columns=['id'])
    return without_id.dropna()

In [4]:
# Read the training split and report how many rows remain after cleaning.
df_train = load_dataset(dataset_path=r'../data/cnn_dailymail/train.csv')
print("Number of records in training set:", df_train.shape[0])

Number of records in training set: 287113


In [5]:
# Read the validation split and report how many rows remain after cleaning.
df_val = load_dataset(dataset_path=r'../data/cnn_dailymail/validation.csv')
print("Number of records in validation set:", df_val.shape[0])

Number of records in validation set: 13368


In [6]:
# Read the test split and report how many rows remain after cleaning.
df_test = load_dataset(dataset_path=r'../data/cnn_dailymail/test.csv')
print("Number of records in test set:", df_test.shape[0])

Number of records in test set: 11490


## Step 2: Data pre-processing

In [7]:
def _clean_highlights(highlights: pd.Series) -> pd.Series:
    """Normalise the whitespace of a `highlights` column.

    Args:
        highlights: String column holding the reference summaries.

    Returns:
        A new Series in which every newline is replaced by a single space and
        the stray space in front of a period (" .") is removed.
    """
    return (
        highlights
        # Newlines are matched literally.
        .str.replace('\n', ' ', regex=False)
        # `regex=True` must be explicit: pandas >= 2.0 defaults to literal
        # matching, under which the pattern ' \.' would never match anything.
        .str.replace(r' \.', '.', regex=True)
    )


# Apply the same clean-up to all three splits.
df_train['highlights'] = _clean_highlights(df_train['highlights'])
df_val['highlights'] = _clean_highlights(df_val['highlights'])
df_test['highlights'] = _clean_highlights(df_test['highlights'])

## Step 3: Sentence Encoding

In this step, we write a function that encodes any given sentence into a vector representation. To do this, we first use the GloVe word embedding (`glove-wiki-gigaword-200`) to convert each word in the sentence into a 200-dimensional vector $\mathbf{w}_i$. Then, we simply average these vectors to get the sentence representation $\mathbf{s}_j = \frac{1}{L_j}\sum_{i=1}^{L_j} \mathbf{w}_i$, where $L_j$ is the number of words in sentence $j$.

In [42]:
wv = gensim.downloader.load('glove-wiki-gigaword-200') # word vectors
encoding_len = 200

# Layout shared by `vocab` and `embedding`:
#   index 0 -> '<pad>' (all-zero vector)
#   index 1 -> '<unk>' (mean of all GloVe vectors)
#   index 2 and up -> the GloVe vocabulary in its original order.
try:
    vocab = np.load(file='./word_vectors/vocab.npy')
    embedding = np.load(file='./word_vectors/embedding.npy')
    # Reject caches whose special tokens are not where the embedding rows
    # expect them (e.g. files written with '<unk>' and '<pad>' swapped).
    if len(vocab) != len(embedding) or vocab[0] != '<pad>' or vocab[1] != '<unk>':
        raise ValueError('stale or inconsistent word-vector cache')
except (FileNotFoundError, ValueError):
    # No usable cache: rebuild both arrays from the GloVe vectors.
    glove_vocab = np.array(wv.index_to_key)
    glove_vectors = np.array(wv.vectors)

    # Prepend the special tokens in the same order as their embedding rows.
    vocab = np.concatenate((np.array(['<pad>', '<unk>']), glove_vocab))
    embedding = np.vstack((
        np.zeros(encoding_len),          # embedding for '<pad>'.
        np.mean(glove_vectors, axis=0),  # embedding for '<unk>'.
        glove_vectors,
    ))

    # The cache directory may not exist on a fresh checkout.
    os.makedirs('./word_vectors', exist_ok=True)
    with open('./word_vectors/vocab.npy','wb') as f:
        np.save(f, vocab)
    with open('./word_vectors/embedding.npy','wb') as f:
        np.save(f, embedding)

# Read the special vectors back from the table so that both branches agree.
pad_emb = embedding[0].copy()   # embedding for '<pad>'.
unk_emb = embedding[1].copy()   # embedding for '<unk>'.

In [11]:
# Try the sentence splitter on one article: tokenise it into sentences, then
# keep only those longer (in characters) than half of the mean sentence length.
text = df_train['article'][0]
sents = nltk.tokenize.sent_tokenize(text)
sent_lens = list(map(len, sents))
avg_sent_len = sum(sent_lens) / len(sent_lens)
sents = [s for s, n_chars in zip(sents, sent_lens) if n_chars > avg_sent_len * 0.5]

In [12]:
# Show the sentences of the sample article that passed the length filter.
sents

['The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October.',
 'The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion.',
 'Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A .',
 "State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure.",
 'The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A.',
 'The diocese says he contracted the infection through contaminated food while attending a conference for newly ordained bishops in Italy last month.',
 'Symptoms of hepatitis A include fever, tiredness, loss o

In [46]:
def encode_sentence(sent: str, wv: KeyedVectors, unk_emb: np.ndarray) -> np.ndarray:
    """Encode a sentence as the average of its word vectors.

    The sentence is split on whitespace and each token is lower-cased before
    the lookup. Tokens missing from `wv` contribute `unk_emb` instead.

    Args:
        sent: The sentence to encode.
        wv: Word-vector lookup supporting `token in wv` and `wv[token]`.
        unk_emb: Vector used for out-of-vocabulary tokens.

    Returns:
        The element-wise mean of the token vectors. A sentence without any
        token (empty or whitespace-only) yields a zero vector shaped like
        `unk_emb` rather than NaN.
    """
    vectors = []
    for token in sent.split():
        token = token.lower()
        vectors.append(np.array(wv[token]) if token in wv else unk_emb)

    # Averaging an empty list would produce a NaN scalar and a RuntimeWarning.
    if not vectors:
        return np.zeros_like(unk_emb)

    return np.array(vectors).mean(axis=0)

In [None]:
def encode_article(article: list[str], pad_emb: np.ndarray):
    """Encode an article given as a list of sentences (not implemented yet).

    Args:
        article: The sentences of one article.
        pad_emb: Embedding reserved for padding.

    Raises:
        NotImplementedError: Always, until the output type is decided.
    """
    # TODO: get to know the output type, and then implement this function.
    # An explicit raise keeps the cell syntactically valid so the notebook can
    # still be run top to bottom while this remains a stub.
    raise NotImplementedError("encode_article is not implemented yet")

## Step 4: Sentence Extracting

In [None]:
# Caps the number of decoding steps run by AttnDecoderRNN.forward.
MAX_LENGTH = 100

In [None]:
class EncoderRNN(nn.Module):
    """Single-layer GRU encoder with dropout applied to its input.

    Args:
        input_size: Number of features of each input element.
        hidden_size: Size of the GRU hidden state.
        dropout_p: Dropout probability applied to the input before the GRU.
    """

    def __init__(self, input_size, hidden_size, dropout_p):
        super().__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        """Run the (dropped-out) input through the GRU.

        Returns:
            The `(output, hidden)` pair produced by the GRU.
        """
        regularised = self.dropout(input)
        return self.gru(regularised)

In [None]:
class BahdanauAttention(nn.Module):
    """Additive attention: score = Va(tanh(Wa(query) + Ua(keys))).

    Args:
        hidden_size: Feature size of both the query and the keys.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """Attend over `keys` with `query`.

        Returns:
            context: Attention-weighted sum of `keys` (via `torch.bmm`).
            weights: Softmax-normalised scores over the key positions.
        """
        projected_query = self.Wa(query)
        projected_keys = self.Ua(keys)
        energy = self.Va(torch.tanh(projected_query + projected_keys))

        # Move the key axis last so the softmax normalises across keys.
        energy = energy.squeeze(2).unsqueeze(1)
        weights = F.softmax(energy, dim=-1)

        return torch.bmm(weights, keys), weights


class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step.

    Args:
        hidden_size: Size of the embeddings and of the GRU hidden state.
        output_size: Number of output classes (also the embedding table size).
        dropout_p: Dropout probability applied to the embedded input.
    """

    def __init__(self, hidden_size, output_size, dropout_p):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        # The GRU consumes the embedded token concatenated with the context.
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward_step(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            input: Index tensor holding the current input token per sample.
            hidden: Current GRU hidden state.
            encoder_outputs: Encoder outputs used as attention keys.

        Returns:
            (output, hidden, attn_weights): unnormalised scores for this step,
            the updated hidden state and the attention weights.
        """
        embedded = self.dropout(self.embedding(input))
        # Swap the first two axes of the hidden state so it can be used as the
        # attention query next to the batch-first encoder outputs.
        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)
        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode step by step, starting from the encoder's final hidden state.

        Args:
            encoder_outputs: Encoder outputs; the first axis is the batch.
            encoder_hidden: Encoder hidden state used to initialise the decoder.
            target_tensor: Optional targets, indexed as `target_tensor[:, i]`.
                When given, teacher forcing is used and decoding runs for
                `min(MAX_LENGTH, target_tensor.size(1))` steps; otherwise the
                decoder feeds back its own predictions for `MAX_LENGTH` steps.

        Returns:
            (decoder_outputs, decoder_hidden, attentions): log-probabilities
            concatenated over the steps, the final hidden state and the
            attention weights concatenated over the steps.
        """
        batch_size = encoder_outputs.size(0)
        # Allocate on the same device as the encoder outputs so the module does
        # not depend on a notebook-global `device` variable.
        decoder_input = torch.zeros(batch_size, 1, dtype=torch.long,
                                    device=encoder_outputs.device) # Start of Document
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        # Never index past the end of a target shorter than MAX_LENGTH.
        num_steps = MAX_LENGTH
        if target_tensor is not None:
            num_steps = min(MAX_LENGTH, target_tensor.size(1))

        for i in range(num_steps):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: feed the target as the next input.
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Without teacher forcing: use its own prediction as the next input.
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions