In [None]:
import requests
from IPython.display import HTML

# Fetch the course stylesheet and render it in the notebook.
# A timeout keeps the cell from hanging forever on a dead connection, and
# raise_for_status() stops an HTTP error page from being rendered as "styles".
response = requests.get(
    "https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css",
    timeout=30,
)
response.raise_for_status()
styles = response.text
HTML(styles)

In [None]:
import pathlib
import os
import matplotlib.pyplot as plt
import seaborn as sns

import json
import lzma

import nltk
nltk.download('punkt')
from nltk.tokenize import RegexpTokenizer
import string
import datetime as dt

import numpy as np  
import pandas as pd 
import re           
pd.set_option("display.max_colwidth", 200)

# Pre-Processing for Extractive Model

## <a id='1'>1. Tokenizing sentences and words</a>
Unlike abstractive modeling, we need to tokenize by sentence as well as by words for this particular extractive model. Our model needs data that has clear sentence markers, to be able to extract/select the best sentences for making a summary.

In [None]:
# Sentence splitter: NLTK's pre-trained English Punkt model.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Word tokenizer: a run of word characters, or a '$' followed by digits/dots,
# or any other run of non-whitespace. Written as a raw string so the regex
# escapes are not parsed as (invalid) Python string escapes.
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

In [None]:
# Rename path for North Carolina cases, .txt files, on your own machine.
# Each line of the xz-compressed file is one JSON-encoded case.
with lzma.open("./NC_text/data/data.jsonl.xz", 'r') as jsonl_file:
    cases = [json.loads(raw_line.decode('utf-8')) for raw_line in jsonl_file]

headnotes = []
text_ops = []

for case_record in cases:
    case_data = case_record['casebody']['data']
    head_matter = case_data['head_matter']
    opinions = case_data['opinions']

    # Keep a case only when it has both a headnote and at least one opinion
    if not (head_matter and opinions):
        continue
    headnotes.append(head_matter)
    # Only the text of the first listed opinion is used
    text_ops.append(opinions[0]['text'])

In [None]:
%%time
# Cap body texts to 3000 words, due to limits of our GPU
max_body_word_count = 3000
token_ops = []
op_word_counts = []
for i, op in enumerate(text_ops):
    results = sentence_tokenizer.tokenize(op)
    results = ' ##SENT## '.join(results)
    # Cut off tokens at max allowed count
    tokens = tokenizer.tokenize(results)[:max_body_word_count]
    op_word_counts.append(len(tokens))
    results = ' '.join(tokens)
    token_ops.append(results)

In [None]:
%%time
# Cap head notes to 2000 words
max_head_word_count = 2000
token_heads = []
head_word_counts = []
for i, head in enumerate(headnotes):
    results = sentence_tokenizer.tokenize(head)
    results = ' ##SENT## '.join(results)
    tokens = tokenizer.tokenize(results)[:max_head_word_count]
    head_word_counts.append(len(tokens))
    results = ' '.join(tokens)
    token_heads.append(results)

In [None]:
# One row per case: tokenized opinion, tokenized headnote, and both token counts.
case_rows = list(zip(token_ops, token_heads, op_word_counts, head_word_counts))
df = pd.DataFrame(case_rows, columns=('token_ops', 'token_heads', 'op counts', 'head counts'))

# Drop all cases with headnotes of 50 words or fewer, because our EDA shows
# that they tend to not be true head notes, but just lists of names.
has_real_headnote = df['head counts'] > 50
df = df.loc[has_real_headnote]

# Keep only cases whose opinion is more than twice as long as the headnote:
# when a headnote is as long as its opinion text, it is arguably
# not really a "summary" anymore.
opinion_dominates = df['op counts'] > df['head counts'] * 2
df = df.loc[opinion_dominates]
len(df)

In [None]:
# Export csv to prepare for Oracle step.
# Only the first 24000 filtered rows are written.
df[:24000].to_csv("true_df.csv")

## <a id='2'>2. Oracle</a>
This is used as an unsupervised way to generate the "true labels" (e.g. best sentences to extract) from the true head note. 
This particular algorithm is from Zhou et al., "Neural Document Summarization by Jointly Learning to Score and Select Sentences," found here: https://www.aclweb.org/anthology/P18-1061.pdf. The model they use is called NeuSum.

This algorithm constructs the training data set by labelling sentences in a given document as part of the summary or not, based on maximizing the Rouge-2 F1 score. 

Because the documents for these cases are so much longer than those in the CNN/Daily Mail dataset used by Zhou et al. for NeuSum, the process below takes far longer to find labels. It also does not explore all possible combinations of sentences; otherwise, the algorithm would take far too long to find the best labels.

In [None]:
# Borrowed from: https://github.com/magic282/cnndm_acl18/blob/master/find_oracle.py

# Calculates Rouge Scores
from collections import Counter
from nltk.stem.porter import PorterStemmer
import scipy.stats as st

stemmer = PorterStemmer()


class Rouge(object):
    """ROUGE-1 / ROUGE-2 precision, recall and F1 from n-gram overlap.

    Text is lower-cased, every character outside ``[0-9a-z]`` becomes a
    space, and tokens are optionally stemmed with the module-level
    ``stemmer`` before n-grams are counted.

    Args:
        stem: stem tokens before counting n-grams.
        use_ngram_buf: cache the n-gram counts of each sentence in
            ``self.ngram_buf`` so repeated sentences are not re-processed.
    """

    def __init__(self, stem=True, use_ngram_buf=False):
        self.N = 2  # highest n-gram order scored (unigrams and bigrams)
        self.stem = stem
        self.use_ngram_buf = use_ngram_buf
        self.ngram_buf = {}

    @staticmethod
    def _format_sentence(sentence):
        """Lower-case, replace non-alphanumerics with spaces, collapse whitespace."""
        s = sentence.lower()
        s = re.sub(r"[^0-9a-z]", " ", s)
        s = re.sub(r"\s+", " ", s)
        return s.strip()

    def _create_n_gram(self, raw_sentence, n, stem):
        """Return ``{order - 1: Counter(ngram -> count)}`` for orders 1..n."""
        # The key includes n and stem so a result cached for one setting is
        # never handed back for a different one.
        buf_key = (raw_sentence, n, stem)
        if self.use_ngram_buf and buf_key in self.ngram_buf:
            return self.ngram_buf[buf_key]

        # split() with no argument yields [] for an empty sentence; split(' ')
        # would yield [''] and register a spurious empty-string unigram.
        tokens = Rouge._format_sentence(raw_sentence).split()
        if stem:
            tokens = [stemmer.stem(t) for t in tokens]

        res = {}
        for _n in range(n):
            res[_n] = Counter(
                ' '.join(tokens[idx: idx + _n + 1])
                for idx in range(len(tokens) - _n)
            )

        if self.use_ngram_buf:
            self.ngram_buf[buf_key] = res
        return res

    def get_ngram(self, sents, N, stem=False):
        """N-gram counts for one sentence (str) or summed over a list of sentences.

        Raises:
            ValueError: if ``sents`` is neither a ``str`` nor a ``list``.
        """
        if isinstance(sents, list):
            res = {_n: Counter() for _n in range(N)}
            for sent in sents:
                for this_n, counter in self._create_n_gram(sent, N, stem).items():
                    # Counter.update adds counts, it does not overwrite them
                    res[this_n].update(counter)
            return res
        elif isinstance(sents, str):
            return self._create_n_gram(sents, N, stem)
        else:
            raise ValueError("sents must be a str or a list of str")

    def get_mean_sd_internal(self, x):
        """Return (mean, standard error, 95% t-interval) of the samples in ``x``."""
        mean = np.mean(x)
        sd = st.sem(x)
        res = st.t.interval(0.95, len(x) - 1, loc=mean, scale=sd)
        return (mean, sd, res)

    def compute_rouge(self, references, systems):
        """Score each system output against the reference at the same position.

        Args:
            references: list of references; each is a str or a list of sentences.
            systems: list of system outputs, same length and element forms.

        Returns:
            ``{'rouge-1': {...}, 'rouge-2': {...}}`` where each inner dict maps
            ``'p'``, ``'r'``, ``'f'`` to a tuple ``(mean, sem, interval)``.
            With fewer than 50 pairs the last two entries are ``0`` and ``(0, 0)``.
        """
        assert (len(references) == len(systems))

        result_buf = {}
        for n in range(self.N):
            result_buf[n] = {'p': [], 'r': [], 'f': []}

        for ref_sent, sys_sent in zip(references, systems):
            ref_ngrams = self.get_ngram(ref_sent, self.N, self.stem)
            sys_ngrams = self.get_ngram(sys_sent, self.N, self.stem)
            for n in range(self.N):
                ref_ngram = ref_ngrams[n]
                sys_ngram = sys_ngrams[n]
                ref_count = sum(ref_ngram.values())
                sys_count = sum(sys_ngram.values())
                # Clipped overlap: an n-gram matches at most as often as it
                # occurs in the reference.
                match_count = 0
                for k, v in sys_ngram.items():
                    if k in ref_ngram:
                        match_count += min(v, ref_ngram[k])
                p = match_count / sys_count if sys_count != 0 else 0
                r = match_count / ref_count if ref_count != 0 else 0
                f = 0 if (p == 0 or r == 0) else 2 * p * r / (p + r)
                result_buf[n]['p'].append(p)
                result_buf[n]['r'].append(r)
                result_buf[n]['f'].append(f)

        res = {}
        for n in range(self.N):
            n_key = 'rouge-{0}'.format(n + 1)
            res[n_key] = {}
            if len(result_buf[n]['p']) >= 50:
                res[n_key]['p'] = self.get_mean_sd_internal(result_buf[n]['p'])
                res[n_key]['r'] = self.get_mean_sd_internal(result_buf[n]['r'])
                res[n_key]['f'] = self.get_mean_sd_internal(result_buf[n]['f'])
            else:
                # not enough samples to calculate confidence interval
                res[n_key]['p'] = (np.mean(np.array(result_buf[n]['p'])), 0, (0, 0))
                res[n_key]['r'] = (np.mean(np.array(result_buf[n]['r'])), 0, (0, 0))
                res[n_key]['f'] = (np.mean(np.array(result_buf[n]['f'])), 0, (0, 0))

        return res

In [None]:
# Borrowed from: https://github.com/magic282/cnndm_acl18/blob/master/find_oracle.py

import sys
import itertools
import gc
import math
import datetime
#from PyRouge.Rouge.Rouge import Rouge

class Document(object):
    """A source document paired with its reference summary, both as sentence lists.

    Attributes set on construction: ``doc_sents``, ``summary_sents``, their
    lengths ``doc_len`` / ``summary_len``, and ``concat_summary`` (the summary
    sentences joined with single spaces).
    """

    def __init__(self, doc_sents, summary_sents):
        self.doc_sents, self.summary_sents = doc_sents, summary_sents
        self.doc_len, self.summary_len = len(doc_sents), len(summary_sents)
        self.concat_summary = " ".join(summary_sents)

# Shared scorer used by solve_one/solve; per-sentence n-gram caching is on.
rouge = Rouge(use_ngram_buf=True)

# Search limits read by solve_one: largest combination size tried, and the
# largest number of combinations enumerated for any one size.
MAX_COMB_L = 5
MAX_COMB_NUM = 100000


def c_n_x(n, x):
    """Return the binomial coefficient C(n, x) as an exact integer.

    Args:
        n: size of the set.
        x: number of elements chosen.

    Returns:
        Number of x-element subsets of an n-element set; 0 when ``x`` is
        negative or larger than ``n``.
    """
    if x < 0 or x > n:
        return 0
    # C(n, x) == C(n, n - x): use the smaller of the two to shorten both loops.
    x = min(x, n - x)
    res = 1
    for i in range(n, n - x, -1):
        res *= i
    # The full product is divisible by x!, so stepwise floor division is exact.
    for i in range(x, 0, -1):
        res //= i
    return res


def solve_one(document):
    """Find the sentence combination that best matches the reference summary.

    Sentences with zero ROUGE-2 recall against the summary are discarded.
    Combinations of the remaining candidates are then scored by ROUGE-2 F1,
    size by size, until the size exceeds MAX_COMB_L, the number of
    combinations exceeds MAX_COMB_NUM, or a larger size fails to improve on
    the best score so far.

    Args:
        document: object exposing ``doc_sents``, ``summary_sents``,
            ``doc_len`` and ``summary_len``.

    Returns:
        ``(best_indices, best_f1)`` where ``best_indices`` is a tuple of
        sentence indices, or ``(None, 0)`` when no combination scores above 0.
    """
    if document.doc_len == 0 or document.summary_len == 0:
        return None, 0

    # Candidate sentences: those sharing at least one bigram with the summary.
    candidates = []
    for idx, sent in enumerate(document.doc_sents):
        scores = rouge.compute_rouge([document.summary_sents], [sent])
        if scores['rouge-2']['r'][0] > 0:
            candidates.append(idx)

    all_best_l = 1
    all_best_score = 0
    all_best_comb = None
    # Upper bound is inclusive of len(candidates): otherwise a document with a
    # single candidate sentence is never scored and comes back as None.
    for comb_len in range(1, len(candidates) + 1):
        if comb_len > MAX_COMB_L:
            break
        # c_n_x returns an exact int, so a plain comparison is sufficient.
        if c_n_x(len(candidates), comb_len) > MAX_COMB_NUM:
            break
        l_best_score = 0
        l_best_choice = None
        for comb in itertools.combinations(candidates, comb_len):
            c_string = [document.doc_sents[idx] for idx in comb]
            rouge_scores = rouge.compute_rouge([document.summary_sents], [c_string])
            rouge_bigram_f1 = rouge_scores['rouge-2']['f'][0]
            if rouge_bigram_f1 > l_best_score:
                l_best_score = rouge_bigram_f1
                l_best_choice = comb
        if l_best_score > all_best_score:
            all_best_l = comb_len
            all_best_score = l_best_score
            all_best_comb = l_best_choice
        elif comb_len > all_best_l:
            # A larger size did not help; stop growing the combination.
            break
    return all_best_comb, all_best_score


def solve(documents, output_file):
    """Run solve_one on every document and write one result line per document.

    Each line is ``"<indices>\\t <score>"``. The file is line-buffered, and the
    ``with`` block closes it even if scoring raises part-way through.

    Args:
        documents: iterable of documents accepted by solve_one.
        output_file: path of the text file to (over)write.
    """
    with open(output_file, 'w', encoding='utf-8', buffering=1) as writer:
        for idx, doc in enumerate(documents):
            if idx % 100 == 0:
                # Progress timestamp; also reset the n-gram cache to bound memory.
                print(datetime.datetime.now())
                rouge.ngram_buf = {}
                gc.collect()
            best_comb, best_score = solve_one(doc)
            writer.write('{0}\t {1}'.format(best_comb, best_score) + '\n')


def load_data(src_file, tgt_file):
    """Pair source and target lines into Document objects.

    Each line is stripped of surrounding whitespace and split into sentences
    on the ``##SENT##`` marker.

    Args:
        src_file: iterable of source-document strings.
        tgt_file: iterable of summary strings, aligned with ``src_file``.

    Returns:
        List of Document, one per (source, target) pair.
    """
    return [
        Document(src_line.strip().split('##SENT##'),
                 tgt_line.strip().split('##SENT##'))
        for src_line, tgt_line in zip(src_file, tgt_file)
    ]


def find_highlights(src_file, tgt_file, outfile_name):
    """Build documents from the aligned inputs and write their oracle labels to ``outfile_name``."""
    solve(load_data(src_file, tgt_file), outfile_name)

In [None]:
%%time
# This takes 72+ hours of processing
# This was run parallel on multiple machines to speed up processing
# start/end select the slice of df that is processed; the values after '#'
# are alternative bounds left by the author.
# NOTE(review): this writes "full_oracle.txt" in the working directory, while
# the label-loading cell later reads "./oracle/full_oracle.txt" - confirm the
# file is moved, or align the two paths.
start = 0#6000
end = 2#24000
find_highlights(df['token_ops'][start:end], df['token_heads'][start:end], "full_oracle.txt")

# The Extractive Model

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping


## <a id='1'>1. Further Pre-processing: X_train</a>

In [None]:
# true_df.csv can be found in this zip file that Song generated for parallelizing pre-processing:
# https://drive.google.com/file/d/1yDXjd7seRCp3_YZDHky3utDVtBdIsSYL/view?usp=sharing
# It is the file written by the export cell earlier in this notebook.

df = pd.read_csv("true_df.csv")

In [None]:
# Adjust df to match the y_train that we pre-processed by batching:
# keep at most the first 24000 rows.
df = df.iloc[:24000]

<img src="./images/model_img1.png">

### <a id='a'>a. Generate vocabulary via tokenization for embeddings</a>

In [None]:
# I don't need to get embeddings for headnote tokens, because our
# RNN will only extract sentences from the opinion body text;
# thus, my model never needs to "read" the headnotes
# NOTE(review): this assignment is overwritten by `all_text = []` in the
# cleaning cell below before it is read, so it currently has no effect.
all_text = df['token_ops']

# Text cleaner removes numbers and punctuation, as they are probably not needed
# for the model to evaluate the IMPORTANCE of a sentence (not meaning).
# It also decreases the size of our input, for the sake of our GPU

def txt_cleaner(text):
    """Normalize text down to lower-case alphabetic words of two or more letters.

    Steps, in order: drop double quotes, drop parenthesized spans, drop
    possessive ``'s``, turn every non-letter into a space, lower-case, and
    discard one-letter words.

    Args:
        text: raw string to clean.

    Returns:
        The surviving words, each followed by a single space (so a non-empty
        result ends with a trailing space; an empty result is ``''``).
    """
    cleaned = re.sub('"', '', text)
    cleaned = re.sub(r'\([^)]*\)', '', cleaned)   # remove "( ... )" spans
    cleaned = re.sub(r"'s\b", "", cleaned)        # remove possessive 's
    cleaned = re.sub("[^a-zA-Z]", " ", cleaned).lower()
    return ''.join(word + ' ' for word in cleaned.split() if len(word) > 1)

In [None]:
%%time
# all_text: all the text, tokenized for generating the vocabulary, which we need for embedding
# df_cleaned: the cleaned dataframe with punctuations/numbers removed

all_text = []
df_cleaned = []

for doc in df['token_ops']:
    x = doc.replace("##SENT##", "newsenthere")
    x = txt_cleaner(x)
    df_cleaned.append(x)
    
    x_tok = nltk.tokenize.WordPunctTokenizer().tokenize(x)
    all_text.extend(x_tok)

In [None]:
# words: all the unique words in our entire training text set
# We will use this to generate embedding layer later
words = np.unique(all_text)
n_words = len(words)

# Add padding at position 0, and 'zzzzzzz' at position 1 because for some reason
# it was not in our unique words list.
# NOTE(review): word_index therefore has n_words + 2 entries; any lookup built
# from it must cover len(word_index) positions or its last word is lost.
word_index = ['_PADDING_'] +  ['zzzzzzz'] + list(words)

print("Vocab size: {}".format(n_words))

<img src="./images/model_img2.png">

### <a id='b'>b. Find maximum length of sentences to cap dataset at</a>

In [None]:
%%time
# Collect one length per document, intended to guide the choice of max sentence length.
# We need max_sent_len in order to do padding.
# We need padding because we will MaxPool our words into sentences, so need consistent sentence length
# NOTE(review): len(sents_in_doc) is the number of '##SENT##'-separated
# sentences in each document, not the number of words in each sentence as the
# surrounding text describes - confirm which quantity is intended.

sent_lengths = []
for doc in df['token_ops']:
    sents_in_doc = doc.split('##SENT##')
    sent_lengths.append(len(sents_in_doc))

In [None]:
# Histogram of the values collected in sent_lengths.
# NOTE(review): sent_lengths holds one count per document (its number of
# sentences), so the title and axis labels below may not describe what is
# plotted - confirm.
plt.hist(np.sort(sent_lengths), bins=50)
plt.title("Length of sentences in dataset")
plt.xlabel("Word length")
plt.ylabel("# of sentences with x word length")

<img src="./images/model_img3.png">

In [None]:
# Fraction of the entries in sent_lengths that are at most 130 (about 91% on
# our data), which is why 130 is used as the cap.
# NOTE: I tried larger caps, but the JupyterHub GPU crashes.
# Computed as a boolean mean so it is also correct when no entry exceeds 130
# (argmax over an all-False mask returns 0, which would report 0%).
frac_within_cap = np.mean(np.asarray(sent_lengths) <= 130)
frac_within_cap

In [None]:
# Set max words per sentence to 130 (fixed cap rather than max(sent_lengths)).
# Use this size for maxpooling layer: it is the MaxPool1D pool_size in the model cell.
max_words_in_sent = 130 #max(sent_lengths)
print(max_words_in_sent)

In [None]:
# Dictionaries for the word index (vocabulary).
# Built with enumerate so every entry of word_index gets an id: word_index has
# n_words + 2 entries, and zipping it against range(n_words + 1) silently
# dropped the last vocabulary word.
word2idx = {word: idx for idx, word in enumerate(word_index)}
idx2word = dict(enumerate(word_index))

### <a id='c'>c. Generate X_train from cleaned data</a>

In [None]:
%%time
# convert to numeric using word2idx and add padding
X = []
for doc in df_cleaned:
    sents_in_doc = doc.split('newsenthere')
    
    mod_doc = []
    for sent in sents_in_doc:
        mod_sent=[]
        x_tokens = nltk.tokenize.WordPunctTokenizer().tokenize(sent)
        # Convert tokens in a sentence to index numbers
        for token in x_tokens:
            mod_sent.append(word2idx[token])
        mod_doc.append(mod_sent[:max_words_in_sent])
    X.append(pad_sequences(mod_doc, maxlen=max_words_in_sent, padding='post', value=0))

In [None]:
# Generate X_train: flatten each document's padded sentence rows into one
# long sequence of word ids, sentence after sentence.
X_train = [[word_id for sent in doc for word_id in sent] for doc in X]
len(X_train)

In [None]:
# Cap documents at 110 sentences * 130 words per sentence, because anything
# larger is too big for our hardware.
sents_per_doc =110
max_doc_len = sents_per_doc*max_words_in_sent
max_doc_len

In [None]:
# Pad (or cut) documents so that they all span the same number of sentences.
# truncating='post' keeps the FIRST max_doc_len tokens of an over-long document.
# The default ('pre') keeps the last ones, which would misalign the inputs with
# the labels, since those only mark sentence indices below sents_per_doc.
X_tr_final = pad_sequences(X_train, maxlen=max_doc_len, padding='post',
                           truncating='post', value=0)

In [None]:
X_tr_final.shape

## <a id='2'>2. Further Pre-processing: Y_train labels</a>

In [None]:
# 2000 oracle iterations take 8 hours, so doing the full pre-processing
# was not a feasible option.

In [None]:
# Builds unprocessed y_train (y_unproc): per document, the text of a tuple of
# best sentence indices (or 'None').
# Also collects the F1 Rouge-2 scores calculated during pre-processing (as strings).

# Borrowed from https://stackoverflow.com/questions/6633678/finding-words-after-keyword-in-python
y_unproc = []
rouge_scores = []

# Each line is "<indices>\t <score>"; the with-block guarantees the file is closed.
with open("./oracle/full_oracle.txt", "r", encoding="utf-8") as oracle_file:
    for line in oracle_file:
        y_tup, _sep, rouge_score = line.partition('\t')
        y_unproc.append(y_tup)
        # strip() removes the newline and the space written after the tab
        rouge_scores.append(rouge_score.strip())

In [None]:
# Positions of documents with no oracle label ('None'); these are dropped
# from both the inputs and the labels so the two stay aligned.
null_y = [row for row, label_text in enumerate(y_unproc) if label_text == 'None']

print("# of nulls: {}".format(len(null_y)))

# Drop nulls
X_tr_final = np.delete(X_tr_final, null_y, axis=0)
y_unproc = np.delete(y_unproc, null_y, axis=0)

In [None]:
# Set length of our y_train: number of documents left after dropping nulls.
y_len = len(y_unproc)
y_len

In [None]:
# Generate the binary labels: one row per document, one column per sentence
# slot, with 1 marking a sentence chosen for the extracted summary.

# initialize zeros in the correct y_train shape
y_full = np.zeros(shape=(y_len, sents_per_doc))
for row in range(y_len):
    label_text = y_unproc[row]
    if label_text == 'None':
        continue
    # "(3, 17, 42)" or "(3,)" -> [3, 17, 42] / [3]
    sent_ids = [int(piece) for piece in label_text.strip('(),').split(', ')]
    for sent_id in sent_ids:
        # Indices beyond the sentence cap have no column and are ignored
        if sent_id < sents_per_doc:
            y_full[row, sent_id] = 1.

In [None]:
y_full.shape

## <a id='3'>3. The Model</a>

### <a id='3a'>a. Build embedding layer</a>
Get GloVe word embeddings for our model

In [None]:
# https://nlp.stanford.edu/projects/glove/
# I used wikipedia 2014+ Gigaword

# Extract word vectors: each line is a word followed by its coefficients.
# The with-block closes the file even if a line fails to parse.
embeddings_index = {}
with open('glove.6B.50d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

In [None]:
# Generate the embedding matrix: row i holds the GloVe vector of the word with id i.
EMBEDDING_DIM = 50
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, EMBEDDING_DIM))
for word, i in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # words not found in embedding index stay all-zeros
        continue
    embedding_matrix[i] = embedding_vector

# Frozen embedding layer initialised from the matrix above
embedding_layer = Embedding(vocab_rows,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=max_doc_len,
                            trainable=False)

### <a id='3b'>b. Build GRU-GRU model</a>
Our model draws predominantly from the Nallapati, Zhai, and Zhou 2016 SummaRuNNer paper: https://arxiv.org/pdf/1611.04230.pdf.

I have two bidirectional GRUs: One operates at the word level, and the other operates at the sentence level. Because we padded all our sentences to be the same number of words, we can use MaxPooling to consolidate all the word outputs into a single sentence output, and feed those into our 2nd GRU layer that looks at the sentence level.

Due to the difficulty of building a classification layer that handles salience, novelty, absolute position, and relative position (it would have been easier to do in straight tensorflow or Pytorch), I opt for a simple Dense layer at the end to classify the labels (unlike the real SummaRuNNer).

In [None]:
# architecture inspired by SummaRunner
# https://github.com/hpzhao/SummaRuNNer/blob/master/models/RNN_RNN.py
n_units = 50
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, clipvalue=1.0)
loss = 'binary_crossentropy'  # weighted_bce
metrics = ["accuracy"]

# Input: one flattened document of max_doc_len word ids
seq_input = Input(shape=(max_doc_len,))
embedded_seq = embedding_layer(seq_input)

# Word-level GRU
x = Bidirectional(tf.keras.layers.GRU(n_units, return_sequences=True))(embedded_seq)

# MaxPool combines words into sentences: every sentence occupies exactly
# max_words_in_sent positions, so pooling over windows of that size is meant
# to give one vector per sentence.
wordout = tf.keras.layers.MaxPool1D(pool_size=max_words_in_sent, padding='same')(x)

# Sentence-level GRU after maxpooling all words in a sentence
sentout = Bidirectional(tf.keras.layers.GRU(n_units, return_sequences=True))(wordout)

# Classification at the sentence level: one sigmoid probability per sentence
output = TimeDistributed(Dense(units=1, activation='sigmoid'))(sentout)


model = tf.keras.Model(inputs=seq_input, outputs=output)

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

<img src="./images/model_img4.png">

In [None]:
# Size of the validation set: 10% of the documents
n_documents = X_tr_final.shape[0]
val_size = int(n_documents*0.1)

# Shuffle inputs and labels with the same seeded permutation
np.random.seed(2143)
indices = np.arange(n_documents)
np.random.shuffle(indices)
X_tr_final = X_tr_final[indices]
y_full = y_full[indices]

# The first val_size shuffled rows form the validation set, the rest the training set
x_val, x_train = X_tr_final[:val_size], X_tr_final[val_size:]
y_val, y_train = y_full[:val_size], y_full[val_size:]

In [None]:
%%time
# Train model
verbose = 1

callback=tf.keras.callbacks.EarlyStopping(monitor='val_loss',patience=1)
history = model.fit(x_train, y_train, batch_size=64, epochs=5,#epochs, 
                    validation_data=(x_val, y_val), verbose=verbose,
                    shuffle=True,
                   )

<img src="./images/model_img5.png">

In [None]:
## Save model weights (not the architecture) to extract_model_v3_2.h5
model.save_weights("{}.h5".format('extract_model_v3_2'))

In [None]:
## Load the saved weights into the model built above
model.load_weights("extract_model_v3_2.h5")

## <a id='4'>4. Generating and Scoring Summary</a>
We use both the Rouge-1 and Rouge-2 F1 scores in order to measure how well our predicted/generated summaries match up to the true summaries.
The Rouge-2 F1 score is used in both the NeuSum and SummaRuNNer papers that influence this extractive model, primarily because matching bigrams is a more complex task than matching unigrams (Rouge-1), and the F1 score includes both precision and recall, making it a more "well-rounded" measure of how well our model performs.

In [None]:
# Generate probability predictions for each sentence in each validation document
y_val_prob = model.predict(x_val)

In [None]:
# Following the SummaRunner article, we decide on our best sentences NOT by p>0.5,
# because our positive true labels are too sparse. Instead, we pick the 5 largest
# probabilities per document for our 5-sentence summaries.
val_best_sents = [
    # indices of the 5 highest-probability sentences, highest first
    np.argsort(doc_probs.reshape(-1))[::-1][0:5]
    for doc_probs in y_val_prob
]

In [None]:
# Builds document and loads it for scoring
# Uses custom object, Document, based off Summarunner code

def load_data(src_file, tgt_file):
    """Build one Document per aligned (opinion, headnote) pair.

    Same behaviour as the ``load_data`` defined with the oracle code earlier
    in this notebook; it is re-declared here so the scoring section can be
    run on its own.

    Args:
        src_file: iterable of opinion strings with ``##SENT##`` sentence markers.
        tgt_file: iterable of headnote strings, aligned with ``src_file``.

    Returns:
        List of Document objects in input order.
    """
    def to_sentences(line):
        # Trim surrounding whitespace, then split on the sentence marker
        return line.strip().split('##SENT##')

    docs = []
    for src_line, tgt_line in zip(src_file, tgt_file):
        docs.append(Document(to_sentences(src_line), to_sentences(tgt_line)))
    return docs

In [None]:
# Drop nulls from original imported DF and then select the validation indices.
# `indices` is the shuffled order that was applied to X_tr_final / y_full, so
# the first val_size rows selected here correspond to x_val / y_val.
# NOTE(review): this assumes df has exactly one row per line of the oracle
# file, in the same order - confirm.
val_df = df.drop(null_y).iloc[indices][:val_size]
docs = load_data(val_df['token_ops'], val_df['token_heads'])

### <a id='4a'>a. Find Rouge Score</a>

In [None]:
# Scorer used by score_one below; rebinding `rouge` starts with an empty n-gram cache.
rouge = Rouge(use_ngram_buf=True)

def score_one(comb, document):
    """Score the sentences selected by ``comb`` against the document's summary.

    Args:
        comb: iterable of sentence indices into ``document.doc_sents``.
        document: Document holding the source sentences and reference summary.

    Returns:
        The dict produced by ``rouge.compute_rouge`` for this single pair.
    """
    # Indices at or beyond the document's sentence count are skipped
    picked = [document.doc_sents[idx] for idx in comb if idx < document.doc_len]
    return rouge.compute_rouge([document.summary_sents], [picked])

def get_scores(best_sents, documents):
    """Return the score_one result for each (selected indices, document) pair, in order."""
    return [score_one(comb, document) for comb, document in zip(best_sents, documents)]

def find_avg_rouge(scores_pred, rouge_type, p_r_f):
    """Average one ROUGE statistic over a list of per-document scores.

    Args:
        scores_pred: output of get_scores; one score dict per document.
        rouge_type: key of the metric, ``"rouge-1"`` or ``"rouge-2"``
            (lower-case, as produced by the scorer).
        p_r_f: ``'p'`` (precision), ``'r'`` (recall) or ``'f'`` (F1).

    Returns:
        Mean of the selected statistic over all documents, or ``0.0`` when
        ``scores_pred`` is empty.
    """
    # Element 0 of each (mean, sem, interval) tuple is the score itself
    values = [score[rouge_type][p_r_f][0] for score in scores_pred]
    if not values:
        return 0.0
    return sum(values) / len(values)

### Model-generated summary's Rouge-2 F1 score

In [None]:
%%time
# Score the model's top-5 sentence picks for every validation document, then
# average the Rouge-1 and Rouge-2 F1 values.
model_scores = get_scores(val_best_sents, docs)
model_r1_score = find_avg_rouge(model_scores, "rouge-1", "f")
model_r2_score = find_avg_rouge(model_scores, "rouge-2", "f")

In [None]:
print("Model's Rouge-1 F1 Score: {}".format(model_r1_score))
print("Model's Rouge-2 F1 Score: {}".format(model_r2_score))

<img src="./images/model_img6.png">

### Model-generated summaries: Qualitative Examples

#### Example 1: Generated Summary

In [None]:
doc_num = 30
example_doc = docs[doc_num]
# Predictions cover every sentence slot, including padding; skip indices past
# the document's real sentence count instead of raising an IndexError.
[example_doc.doc_sents[idx] for idx in val_best_sents[doc_num] if idx < example_doc.doc_len]

<img src="./images/model_img7.png">

#### Example 1: Actual Head Note

In [None]:
docs[doc_num].summary_sents

<img src="./images/model_img8.png">

#### Example 2: Generated Summary

In [None]:
doc_num = 300
example_doc = docs[doc_num]
# Predictions cover every sentence slot, including padding; skip indices past
# the document's real sentence count instead of raising an IndexError.
[example_doc.doc_sents[idx] for idx in val_best_sents[doc_num] if idx < example_doc.doc_len]

<img src="./images/model_img9.png">

#### Example 2: Actual Head Note

In [None]:
docs[doc_num].summary_sents

<img src="./images/model_img10.png">

### True average rouge score for our entire training set 
Based off true y labels

In [None]:
# Make sure to drop nulls, or the y's will not align with the documents.
# NOTE(review): df.drop(null_y) removes rows by index label, so this assumes
# df's index labels match the line positions in the oracle file - confirm.
docs2 = load_data(df['token_ops'].drop(null_y), df['token_heads'].drop(null_y))
# Check that our document # and # of y labels match
display(len(docs2))
display(len(y_unproc))

In [None]:
# Parse the oracle label text of every document into a tuple of sentence indices.
y_true_combos = []
for label_text in y_unproc:
    if label_text == 'None':
        # Placeholder index that no document can contain, so nothing is selected
        y_true_combos.append((9999999,))
        continue
    # "(3, 17, 42)" or "(3,)" -> (3, 17, 42) / (3,)
    pieces = label_text.strip('(),').split(', ')
    y_true_combos.append(tuple(int(piece) for piece in pieces))

In [None]:
%%time
# Score the oracle ("true") sentence selections for every labelled document,
# then average the Rouge-1 and Rouge-2 F1 values.
true_scores = get_scores(y_true_combos, docs2)
true_r1_score = find_avg_rouge(true_scores, "rouge-1", "f")
true_r2_score = find_avg_rouge(true_scores, "rouge-2", "f")

In [None]:
print("True Rouge-1 F1 Score for entire dataset: {}".format(true_r1_score))
print("True Rouge-2 F1 Score for entire dataset: {}".format(true_r2_score))

<img src="./images/model_img11.png">

### Remarks on Results

Our model is not too far off from the actual Rouge scores of our true labels, meaning that our model learned our labels relatively well. In order to further improve performance, we would need to improve our pre-processing step, where we better select our "gold label" sentences for extraction.

Still, our model outperforms our baseline BERT, with Rouge-1 = 0.36 and Rouge-2 = 0.135

Looking at our qualitative summary outputs, we also see that our extractive summaries are relatively intelligible and do appear to capture some of the important information from the original true summaries.