In [1]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

## Module Imports

In [3]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
import os
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from matplotlib import pyplot as plt
from torch.nn.utils.rnn import pad_sequence
import glob
import itertools
import pickle
import re
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
!pip install rouge-score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Constants

In [4]:
drive_prefix = '/content/drive/MyDrive/ese546/'
PATH_TRAIN_X = drive_prefix + 'cnndm/train.txt.src'
PATH_TRAIN_Y = drive_prefix + 'cnndm/train.txt.tgt.tagged'

PATH_VAL_X = drive_prefix + 'cnndm/val.txt.src'
PATH_VAL_Y = drive_prefix + 'cnndm/val.txt.tgt.tagged'

PATH_TEST_X = drive_prefix + 'cnndm/test.txt.src'
PATH_TEST_Y = drive_prefix + 'cnndm/test.txt.tgt.tagged'

# STOP_WORDS = set(stopwords.words('english')) 
STOP_WORDS = set()
EMB_SIZE = 300
GLOVE_EMB = drive_prefix + f'glove.6B.{EMB_SIZE}d.txt'
BATCH_SIZE = 32
EPSILON = 0.5 # for deciding between feeding (model's output OR target) as input

START_CHAR = 'starttoken'
END_CHAR = 'endtoken'
PAD_CHAR = 'padtoken'

MAX_ARTICLE_LEN = 300 # the article can have at most 300 tokens
MAX_DECODER_OUTPUT = 100 # labels can have at most 100 tokens

# NN Hyper-parameters
E_HIDDEN_DIM = 200
D_HIDDEN_DIM = 200

EP = 4000
PRINT_EVERY_EP = 100
SAVE_MODEL_EVERY_EP = 500
FORCE_CREATE_DICT = True # force to recreate the word features from scratch


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
torch.cuda.empty_cache()

cuda


## Things to copy to local MyDrive
- glove.6B.300d.txt
- news_summary.csv (only need this if you want to reproduce) -> 90k rows (body and labels)
- cnndm/test.txt.src -> 900k rows (body)
- cnndm/test.txt.tgt.tagged -> 900k rows (labels)


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Helper Functions

In [28]:
def clean_text(s):
    s = s.lower().strip()
    s = s.replace('<t>', '').replace('</t>', '').replace('\n', '')
    s = s.replace('-lrb-', '').replace('-rrb-', '')
    s = re.sub(r'\([^)]*\)', '', s)
    s = re.sub('"','', s)
    s = re.sub(r"'s\b","",s)
    s = re.sub("[^a-zA-Z]", " ", s)
    # s = re.sub(r"([.!?])", r" \1", s)
    # s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s


def create_word_vec(input_text, input_label, force_create):
    word_idx_dict = {} # { word : the index of that word in the dictionary}
    idx_word_dict = {} # { index of word : word }
    
    # 1. Create mapping between words and the corresponding embedding values
    embed_file_path = drive_prefix + f'{EMB_SIZE}d_embed_dict'
    if os.path.exists(embed_file_path) and not force_create:
        print('Embedding dictionary exists, loading from file...')
        embedding_dict = pickle.load(open(embed_file_path, 'rb'))
    else:
        embedding_dict = {} 
        
        for line in glove:
            tokens = line.split()
            embedding_dict[tokens[0]] = np.array(tokens[1:], dtype='float32')
        pickle.dump(embedding_dict, open(embed_file_path, 'wb'))
        print('Saved embedding dictionary')

    # 2. Tokenize the input_text and labels
    if os.path.exists(drive_prefix + 'train_word_idx_dict') and not force_create:
        print('Word-to-index dictionary exists, loading from file...')
        word_idx_dict = pickle.load(open(drive_prefix + 'train_word_idx_dict', 'rb'))
    if os.path.exists(drive_prefix + 'train_idx_word_dict') and not force_create:
        print('Index-to-word dictionary exists, loading from file...')
        idx_word_dict = pickle.load(open(drive_prefix + 'train_idx_word_dict', 'rb'))
    else:
        unique_tokens = set([])
        for line in input_text:
            unique_tokens = unique_tokens.union(word_tokenize(line))
        for line in input_label:
            unique_tokens = unique_tokens.union(word_tokenize(line))

        for token in unique_tokens:
            word_idx_dict[token] = len(word_idx_dict)
                
        # 2.1 Add in the special tokens to the dictionary, note that the START_CHAR and END_CHAR have been added
        # during the preprocessing stage
        word_idx_dict[PAD_CHAR] = len(word_idx_dict)
        
        idx_word_dict = dict(zip(word_idx_dict.values(), word_idx_dict.keys()))
    
    # 3. Build the word vector for all the words in our dictionary
    if os.path.exists(drive_prefix + 'train_word_vector') and not force_create:
        print('Word Vector exists, loading from file...')
        word_vector = pickle.load(open(drive_prefix + 'train_word_vector', 'rb'))
    else:
        word_vector = []
        for idx, token in idx_word_dict.items():            
            if token in embedding_dict:
                word_vector.append(embedding_dict[token])
            # Append the special tokens to the word vector and assign random values
            elif token in [START_CHAR, END_CHAR, PAD_CHAR]:
                word_vector.append(np.random.normal(0, 1, EMB_SIZE))
            # if the token doesn't have an embedding, we set to 0
            else:
                word_vector.append(np.zeros([EMB_SIZE]))
    
    ## Save the dictionaries
    pickle.dump(word_idx_dict, open(drive_prefix + 'train_word_idx_dict', 'wb'))
    pickle.dump(idx_word_dict, open(drive_prefix + 'train_idx_word_dict', 'wb'))
    pickle.dump(word_vector, open(drive_prefix + 'train_word_vector', 'wb'))

    # The index in word_vec corresponds to the article index in the original X_Test array
    return np.array(word_vector), word_idx_dict, idx_word_dict

def sentence_to_idx(sentence, word_to_idx):
    tokens = word_tokenize(sentence)
    return [word_to_idx[token] for token in tokens if token in word_to_idx]

def decontracted(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text) 
    return text

def pre_process(column, max_len):
    preprocessed_text = []
    # tqdm is for printing the status bar
    for sentence in tqdm(column.astype(str)):
        sent = decontracted(sentence)
        sent = sent.replace('\\r', ' ')
        sent = sent.replace('\\"', ' ')
        sent = sent.replace('\\n', ' ')
        sent = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", sent)
        sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
        sent=re.sub('<[^>]*>', '', sent)
        # trim longer articles
        sent = sent.lower().strip()
        sent = ' '.join(sent.split()[:max_len])
        preprocessed_text.append(sent)
    return preprocessed_text

## TODO: REFERENCE
def zeroPadding(l, fillvalue=PAD_CHAR):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

def binaryMatrix(l, value=PAD_CHAR):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_CHAR:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

def plot(train_loss, val_loss):
    plt.plot(train_loss, label='Train')
    plt.plot(val_loss, label='Val')
    plt.title('Training vs Validation Loss')
    plt.xlabel('Episodes')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

## Read Data

### Data Source 2

In [14]:
with open(PATH_TEST_X, 'r') as f:
    X_Test = f.readlines()
    
with open(PATH_TEST_Y, 'r') as f:
    Y_Test = f.readlines()
    
with open(GLOVE_EMB, 'r', encoding='utf-8') as f:
    glove = f.readlines()

In [10]:
X_Test[0]

"marseille , france -lrb- cnn -rrb- the french prosecutor leading an investigation into the crash of germanwings flight 9525 insisted wednesday that he was not aware of any video footage from on board the plane . marseille prosecutor brice robin told cnn that `` so far no videos were used in the crash investigation . '' he added , `` a person who has such a video needs to immediately give it to the investigators . '' robin 's comments follow claims by two magazines , german daily bild and french paris match , of a cell phone video showing the harrowing final seconds from on board germanwings flight 9525 as it crashed into the french alps . all 150 on board were killed . paris match and bild reported that the video was recovered from a phone at the wreckage site . the two publications described the supposed video , but did not post it on their websites . the publications said that they watched the video , which was found by a source close to the investigation . `` one can hear cries of 

In [15]:
X_Test = pre_process(np.array(X_Test), max_len=MAX_ARTICLE_LEN)[:3000]
Y_Test = pre_process(np.array(Y_Test), max_len=MAX_DECODER_OUTPUT)[:3000]
Y_Test = list(map(lambda x: f"{START_CHAR} {x} {END_CHAR}", Y_Test))

X_Test, X_Val, Y_Test, Y_Val = train_test_split(X_Test, Y_Test, test_size=0.3, random_state=23, shuffle=True)
print(f'\nTotal # of stories: {len(X_Test)}')

100%|██████████| 11490/11490 [00:07<00:00, 1542.81it/s]
100%|██████████| 11490/11490 [00:00<00:00, 11563.76it/s]



Total # of stories: 2100


In [12]:
max(list(map(len, X_Test)))

2026

## Features

In [16]:
train_word_vector, train_word_idx_dict, train_idx_word_dict = create_word_vec(X_Test, Y_Test, force_create=FORCE_CREATE_DICT)
print(f'Word Vector Shape: {train_word_vector.shape}')
assert train_word_vector.shape == (len(train_idx_word_dict.keys()), EMB_SIZE)

Saved embedding dictionary
Word Vector Shape: (30872, 300)


## Model

In [17]:
# Create the embedding layer weights based on the pre-trained word vector
def create_pretrained_emb_layer(word_vector):
    # vocab_size, embed_dim = word_vector.shape
    embed_layer = nn.Embedding.from_pretrained(torch.tensor(word_vector).float(), freeze=False)    
    return embed_layer

In [18]:
class Encoder(nn.Module):
    def __init__(self, hidden_dim, embedding_layer, n_layers=1, dropout=0):
        super(Encoder, self).__init__()

        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = embedding_layer
        self.gru = nn.GRU(EMB_SIZE,
                          hidden_dim,
                          num_layers=n_layers,
                          dropout=dropout,
                          bidirectional=True)

    def forward(self, x, lengths, x_h=None):
        embedded = self.embedding(x)
        # Pack padded batch of sequences for RNN module
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, lengths, enforce_sorted=False)
        # Forward pass through GRU
        y_hat, x_h = self.gru(packed, x_h)
        # Unpack padding
        y_hat, _ = torch.nn.utils.rnn.pad_packed_sequence(y_hat)
        # Sum bidirectional GRU outputs
        y_hat = y_hat[:, :, :self.hidden_dim] + y_hat[:, : ,self.hidden_dim:]
        # Return output and final hidden state

        return y_hat, x_h

class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.hidden_dim = hidden_dim

        self.attention = nn.Linear(hidden_dim * 2, hidden_dim)
        self.value = nn.Parameter(torch.FloatTensor(hidden_dim).normal_())

    def forward(self, x_h, encoder_output):
        energy = self.attention(torch.cat((x_h.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        attn_score = torch.sum(self.value * energy, dim=2)

        # Transpose max_length and batch_size dimensions
        attn_score = attn_score.t()

        # the softmax normalized probability scores (with added dimension)
        attn_weights = F.softmax(attn_score, dim=1).unsqueeze(1)
        return attn_weights


class Decoder(nn.Module):
    def __init__(self, hidden_dim, output_dim, embedding_layer, n_layers=1, dropout=0.1):
        super(Decoder, self).__init__()

        self.embedding = embedding_layer # TODO
        self.dropout = nn.Dropout(dropout)
        self.n_layers = n_layers
        self.gru = nn.GRU(input_size=EMB_SIZE,
                          hidden_size=hidden_dim,
                          num_layers=n_layers,
                          dropout=dropout)
        
        self.attn_gru_combined = nn.Linear(hidden_dim * 2, hidden_dim)
        self.out = nn.Linear(hidden_dim, output_dim)

        self.attention = Attention(hidden_dim)

    def forward(self, x, x_h, encoder_output):
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)

        y_hat, x_h = self.gru(embedded, x_h)

        attn_weights = self.attention(y_hat, encoder_output)

        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_output.transpose(0, 1))

        # Concatenate weighted context vectors
        y_hat = y_hat.squeeze(0)
        context = context.squeeze(1)
        weighted_context = torch.cat((y_hat, context), 1)

        y_hat = torch.tanh(self.attn_gru_combined(weighted_context))

        y_hat = F.softmax(self.out(y_hat), dim=1)
        return y_hat, x_h

## Train Functions

In [22]:
def maskNLLLoss(inp, target, mask):
    # TODO: Reference
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()

def one_pass(x, y, e_model, d_model, e_optim, d_optim, lengths, mask, max_target_len):
    e_optim.zero_grad()
    d_optim.zero_grad()

    x = torch.tensor(x).to(device)
    y = torch.tensor(y).to(device)
    mask = mask.to(device)

    loss = 0 # loss accumulated from each timestep
    pass_loss = [] # loss for one pass

    e_output, e_hidden = e_model(x, lengths)

    # Create initial decoder input (start with START_CHAR for each sequence)
    d_input = torch.LongTensor([[train_word_idx_dict[START_CHAR] for _ in range(BATCH_SIZE)]])
    d_input = d_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    d_hidden = e_hidden[:d_model.n_layers]
    
    for i in range(max_target_len):
        d_output, d_hidden = d_model(d_input, d_hidden, e_output)

        if random.random() < EPSILON:
            d_input = y[i].view(1, -1)

        else:
            # TODO
            _, topi = d_output.topk(1) # topi = torch.argmax(d_output, axis=1)
            d_input = torch.LongTensor([[topi[i][0] for i in range(BATCH_SIZE)]])
            d_input = d_input.to(device)

        # Calculate and accumulate loss
        mask_loss, nTotal = maskNLLLoss(d_output, y[i], mask[i])
        loss += mask_loss

        pass_loss.append(mask_loss.item())


    loss.backward()
    torch.nn.utils.clip_grad_norm_(e_model.parameters(), 1)
    torch.nn.utils.clip_grad_norm_(d_model.parameters(), 1)

    e_optim.step()
    d_optim.step()

    return sum(pass_loss) / len(pass_loss)

def train(e_model, d_model, e_optim, d_optim, model_name):
    print('Training')
    e_model.train()
    d_model.train()
    ep_loss = []

    for ep in range(1, EP + 1):
        X_samples = []
        Y_samples = []
        for _ in range(BATCH_SIZE):            
            rand_idx = random.randint(0, len(X_Test)-1)
            X_samples.append(X_Test[rand_idx])
            Y_samples.append(Y_Test[rand_idx])

        # article
        indexes_batch = [sentence_to_idx(sentence, train_word_idx_dict) for sentence in X_samples]
        lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
        padList = zeroPadding(indexes_batch, fillvalue=train_word_idx_dict[PAD_CHAR])
        X_batch = torch.LongTensor(padList)
        
        # labels
        indexes_batch = [sentence_to_idx(sentence, train_word_idx_dict) for sentence in Y_samples]
        max_target_len = max([len(indexes) for indexes in indexes_batch])
        padList = zeroPadding(indexes_batch, fillvalue=train_word_idx_dict[PAD_CHAR])
        mask = binaryMatrix(padList)
        mask = torch.ByteTensor(mask)
        Y_batch = torch.LongTensor(padList)

        


        # TODO
        loss = one_pass(X_batch, Y_batch, e_model, d_model, e_optim, d_optim, lengths, mask, max_target_len)

        ep_loss.append(loss)

        if ep % PRINT_EVERY_EP == 0 and ep > 0:
            print(f'EP:{ep} | Loss: {np.array(ep_loss[-PRINT_EVERY_EP:]).mean()}')

        # if ep % SAVE_MODEL_EVERY_EP == 0 and ep > 0:
        #     torch.save({
        #     'epoch': ep,
        #     'encoder_model': e_model.state_dict(),
        #     'decoder_model': d_model.state_dict(),
        #     'encoder_optimizer': e_optim.state_dict(),
        #     'decoder_optimizer': d_optim.state_dict(),
        #     'loss': loss
        #     }, f'{drive_prefix}models/{model_name}/checkpoint_{ep}')

    return ep_loss

In [30]:
def evaluate(article, encoder, decoder):
    encoder.eval()
    decoder.eval()

    with torch.no_grad():
        indexes_batch = [sentence_to_idx(article, train_word_idx_dict)]

        # Create lengths tensor
        lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
        # Transpose dimensions of batch to match models' expectations
        input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
        # Use appropriate device
        input_batch = input_batch.to(device)
        # lengths = lengths.to(device)

        e_output, e_hidden = encoder(input_batch, lengths)
        d_hidden = e_hidden[:decoder.n_layers]

        # Initialize decoder input with SOS_token
        d_input = torch.ones(1, 1, device=device, dtype=torch.long) * train_word_idx_dict[START_CHAR]

        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        
        for i in range(MAX_DECODER_OUTPUT):
            d_output, d_hidden = decoder(d_input, d_hidden, e_output)

            scores, d_input = torch.max(d_output, dim=1)

            all_tokens = torch.cat((all_tokens, d_input), dim=0)

            d_input = torch.unsqueeze(d_input, 0)

        tokens = all_tokens

        # mask_loss, nTotal = maskNLLLoss(d_output, y[i], mask[i])
        # loss += mask_loss

        # indexes -> words
        decoded_words = [train_idx_word_dict[token.item()] for token in tokens]

    return decoded_words

## Run
- the performance below is from Data Source 2

In [23]:
# MAIN DRIVER
model_name = 'autoencoder_attn'

e_embedding_layer = create_pretrained_emb_layer(train_word_vector)
d_embedding_layer = create_pretrained_emb_layer(train_word_vector)

assert train_word_vector.shape[0] == len(train_word_idx_dict) == len(train_idx_word_dict)

encoder = Encoder(E_HIDDEN_DIM, embedding_layer=e_embedding_layer, n_layers=3, dropout=0.3).to(device)
decoder = Decoder(D_HIDDEN_DIM, embedding_layer=d_embedding_layer, output_dim=train_word_vector.shape[0], n_layers=3).to(device)

e_optim = optim.Adam(encoder.parameters(), lr=1e-3)
d_optim = optim.Adam(decoder.parameters(), lr=1e-3)

plot_loss = train(encoder, decoder, e_optim, d_optim, model_name=model_name)

Training


  del sys.path[0]
  
  """


EP:100 | Loss: 4.446352240916184
EP:200 | Loss: 3.710299094790394
EP:300 | Loss: 3.615888891889903
EP:400 | Loss: 3.472700108971394
EP:500 | Loss: 3.405246657326145
EP:600 | Loss: 3.288994682086927
EP:700 | Loss: 3.21933060930879
EP:800 | Loss: 3.131595725347544
EP:900 | Loss: 3.128815248643815
EP:1000 | Loss: 2.9941362464451795
EP:1100 | Loss: 2.9908835688025484
EP:1200 | Loss: 2.816330959904549
EP:1300 | Loss: 2.8853740278752067
EP:1400 | Loss: 2.8121644522736733
EP:1500 | Loss: 2.711385091581623
EP:1600 | Loss: 2.6774941475698184
EP:1700 | Loss: 2.658153160525816
EP:1800 | Loss: 2.552131453041313
EP:1900 | Loss: 2.6016928012075637
EP:2000 | Loss: 2.517440751837539
EP:2100 | Loss: 2.439135944897421
EP:2200 | Loss: 2.4024145385530438
EP:2300 | Loss: 2.336813087090621
EP:2400 | Loss: 2.3198812460119727
EP:2500 | Loss: 2.280757819100453
EP:2600 | Loss: 2.2362779608307632
EP:2700 | Loss: 2.2383336900696578
EP:2800 | Loss: 2.1804251188899646
EP:2900 | Loss: 2.1532227915037425
EP:3000 | Lo

## Train from checkpoint (Optional)

In [None]:
ep = 4000
checkpoint = torch.load(f'{drive_prefix}models/{model_name}/checkpoint_{ep}')

e_embedding_layer = create_pretrained_emb_layer(train_word_vector)
d_embedding_layer = create_pretrained_emb_layer(train_word_vector)
encoder = Encoder(E_HIDDEN_DIM, n_layers=3, embedding_layer=e_embedding_layer).to(device)
decoder = Decoder(D_HIDDEN_DIM, embedding_layer=d_embedding_layer, output_dim=train_word_vector.shape[0], n_layers=3).to(device)

encoder.load_state_dict(checkpoint['encoder_model'])
decoder.load_state_dict(checkpoint['decoder_model'])
print('Loaded model for evaluation')

random_idx = np.random.randint(0, len(X_Test))
pred_summary = ' '.join([token for token in evaluate(X_Test[random_idx], encoder, decoder) if token != PAD_CHAR])
true_summary = Y_Test[random_idx]
print(X_Test[random_idx])
print(f'True: {true_summary}')
print(f'Pred: {pred_summary}')

Loaded model for evaluation
charlie austin has revealed how his good mate joey barton ignored him on his first day at queens park rangers before realising his mistake with the duo then becoming good friends the striker has had a premier league debut season to remember and is being touted as a possible england debutant in the future after scoring goals in the competition his close friendship with club captain barton is well publicised but they are first encounter is one austin is unlikely to forget charlie austin lrb left rrb and joey barton have become good friends despite their very early setback speaking
True: starttoken t charlie austin revealed that joey barton ignored him on his first day t t the pair are good friends now following the hilarious incident t t the 25yearold says qpr ca not afford to lose in their double header t t austin is delighted and humbled to be fourth endtoken
Pred: starttoken t charlie austin revealed he joey barton ignored him on the first season t t the pa

In [24]:
def compute_rouge(pred_summaries, true_summaries):
  from rouge_score import rouge_scorer
  scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
  rogue1_precision = []
  rogue1_recall = []
  rogue1_fmeasure = []
  rogueL_precision = []
  rogueL_recall = []
  rogueL_fmeasure = []
  for i in range(len(pred_summaries)):
    score = scorer.score(pred_summaries[i], true_summaries[i])
    rogue1_precision.append(score['rouge1'].precision)
    rogue1_recall.append(score['rouge1'].recall)
    rogue1_fmeasure.append(score['rouge1'].fmeasure)
    rogueL_precision.append(score['rougeL'].precision)
    rogueL_recall.append(score['rougeL'].recall)
    rogueL_fmeasure.append(score['rougeL'].fmeasure)
  
  scores = {'rogue1_precision':rogue1_precision, 'rogue1_recall':rogue1_recall, 'rogue1_fmeasure': rogue1_fmeasure, 'rogueL_precision':rogueL_precision, 'rogueL_recall':rogueL_recall, 'rogueL_fmeasure': rogueL_fmeasure }
  return scores

In [25]:
def compute_rouge_summary_stats(scores):
  def mean_confidence_interval(data, confidence=0.95):
    import scipy.stats
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
    return m, m-h, m+h
  stats = {}
  for key in scores.keys():
    stats[key] = mean_confidence_interval(scores[key])
  return stats

In [26]:
def evaluate_all(X, Y):
  pred_summaries, true_summaries = [], []
  for ind in range(len(X)):
    pred_summary = ' '.join([token for token in evaluate(X[ind], encoder, decoder) if token != PAD_CHAR])
    true_summary = Y[ind]
    pred_summaries.append(pred_summary)
    true_summaries.append(true_summary)
  return pred_summaries, true_summaries

In [31]:
pred_summaries, true_summaries = evaluate_all(X_Val, Y_Val)

In [47]:
pred_summaries_filtered  = [' '.join([token for token in summary.split() if token != 'starttoken' and token != 't' and token != 'endtoken']) for summary in pred_summaries]
true_summaries_filtered  = [' '.join([token for token in summary.split() if token != 'starttoken' and token != 't' and token != 'endtoken']) for summary in true_summaries]

In [50]:
scores = compute_rouge(pred_summaries_filtered, true_summaries_filtered)

In [51]:
compute_rouge_summary_stats(scores)

{'rogue1_fmeasure': (0.17642904634314047,
  0.1715228988669806,
  0.18133519381930033),
 'rogue1_precision': (0.16803381509205856,
  0.1633746845625774,
  0.17269294562153972),
 'rogue1_recall': (0.19664710905593996,
  0.19027010862922733,
  0.2030241094826526),
 'rogueL_fmeasure': (0.12998914272363157,
  0.1264605188316175,
  0.13351776661564566),
 'rogueL_precision': (0.12441533363197836,
  0.12094140801327362,
  0.12788925925068312),
 'rogueL_recall': (0.14401958192143843,
  0.1396030806098731,
  0.14843608323300378)}