In [8]:
# dependencies
from tqdm.notebook import tqdm
from pickle import load
import pandas as pd
import numpy as np
import string

If you are on Colab, make sure `stories.pkl` is in your present working directory. In our case, we saved it to a Drive folder in the earlier step, so before running the workflow of Step 2 we first copied `stories.pkl` from Drive to Colab.

In [None]:
!ls -l

In [10]:
# load stories and summaries' list
# NOTE: unpickling can execute arbitrary code -- only load a stories.pkl that
# you produced yourself or otherwise trust.
# The context manager closes the file handle as soon as loading finishes.
with open('stories.pkl', 'rb') as stories_file:
    stories = load(stories_file)
print('Loaded Stories %d' % len(stories))

Loaded Stories 92579


In [11]:
# Create clean lines function
def clean_lines(lines):
    """Normalise an iterable of text lines and drop the ones that end up empty.

    For every line: cut everything up to and including a leading-source
    '(CNN)' tag (only when the line contains '(CNN) -- '), split on
    whitespace, lower-case each token, delete punctuation characters, and
    keep only purely alphabetic tokens. The surviving tokens are re-joined
    with single spaces.

    Args:
        lines: iterable of strings.

    Returns:
        list of cleaned, non-empty strings, in input order.
    """
    # translation table that deletes every character in string.punctuation
    strip_punct = str.maketrans('', '', string.punctuation)

    def _normalise(raw):
        # strip the source office prefix if present; the ' -- ' left behind
        # becomes an empty token below and is discarded by the isalpha filter
        pos = raw.find('(CNN) -- ')
        if pos != -1:
            raw = raw[pos + len('(CNN)'):]

        # tokenize on whitespace, lower-case, then remove punctuation
        tokens = (tok.lower().translate(strip_punct) for tok in raw.split())

        # drop tokens containing digits (or left empty) and re-join
        return ' '.join(tok for tok in tokens if tok.isalpha())

    # keep only the lines that still have content after cleaning
    return [text for text in map(_normalise, lines) if text]

In [13]:
# clean the stories and summaries
for example in tqdm(stories):
    story = example['story']

    # On the first run the story is one newline-delimited string; once this
    # cell has run it is already a list of cleaned sentences. Accept both so
    # re-running the cell does not fail on list.split (cleaning an already
    # cleaned list leaves it unchanged).
    story_lines = story.split('\n') if isinstance(story, str) else story

    example['story'] = clean_lines(story_lines)
    example['highlights'] = clean_lines(example['highlights'])

  0%|          | 0/92579 [00:00<?, ?it/s]

In [14]:
# install the Rouge module for calculating the Rouge scores
!pip install -q Rouge

In [15]:
# import the Rouge module and instantiate it
from rouge import Rouge 
# one shared scorer instance; get_rouge_f1 calls rouge.get_scores on it
rouge = Rouge()

In [16]:
# utility for calculating Rouge score between pairs of sentences
def get_rouge_f1(references, sentence):
    """Return the best ROUGE-1 F1 score of ``sentence`` against any reference.

    Uses the module-level ``rouge`` scorer instance.

    Args:
        references: iterable of reference summary strings.
        sentence: the candidate sentence string.

    Returns:
        The maximum ROUGE-1 'f' value over all references, or 0.0 when
        ``references`` is empty (previously this raised ValueError from
        ``max`` on an empty list).
    """
    # NOTE(review): the reference is passed as the first argument and the
    # candidate as the second; this should not matter for the F1 value, but
    # confirm against the rouge package's get_scores(hyps, refs) signature.
    return max(
        (rouge.get_scores(ans, sentence)[0]['rouge-1']['f'] for ans in references),
        default=0.0,
    )

In [17]:
def get_list_ans_each_story(story_inp, references_inp, top_k=5):
    """Select the story sentences that best match the reference summaries.

    Every sentence in ``story_inp`` is scored with ``get_rouge_f1`` against
    ``references_inp`` and the ``top_k`` highest-scoring sentences are kept.

    Args:
        story_inp: sequence of sentence strings belonging to one story.
        references_inp: sequence of reference (abstractive) summary strings.
        top_k: how many sentences to keep; defaults to 5, the value that was
            previously hard-coded.

    Returns:
        A ``(list_ref, scores)`` tuple. ``list_ref`` is a list of at most
        ``top_k`` sentences (NumPy string scalars) ordered from highest to
        lowest score; ``scores`` is a NumPy array with the matching scores
        in the same order. Both are empty when the story has no sentences.
    """
    # sentences and their scores as parallel NumPy arrays
    sentences = np.array(list(story_inp))
    scores = np.array([get_rouge_f1(references_inp, sent) for sent in story_inp])

    # indices of the top_k sentences, best score first
    order = np.argsort(scores)[::-1][:top_k]

    return list(sentences[order]), scores[order]

In [18]:
# story index -> top-scoring sentences / their scores
dict_id_summary, dict_id_score = {}, {}

# walk over the stories, keeping the positional index as the dictionary key
for s_id, example in enumerate(tqdm(stories)):

    # score the story's sentences against its highlights (abstractive
    # summaries) and keep the best matches with their scores
    list_ref, list_score = get_list_ans_each_story(
        example['story'], example['highlights']
    )

    # record both results under the story's index
    dict_id_summary[s_id] = list_ref
    dict_id_score[s_id] = list_score

  0%|          | 0/92579 [00:00<?, ?it/s]

As this is a time-consuming operation, it's better to store the dictionaries created above as `.pkl` files so that they can be used later as needed. 

In [19]:
# parallel per-sentence columns: owning story, position, text, label
story_id, sent_id, list_sent, label_sent = [], [], [], []

# walk over the stories together with their positional index
for i, example in enumerate(tqdm(stories)):

    # sentences selected as the extractive summary of this story
    list_ref = dict_id_summary[i]

    # emit one row per sentence of the current story
    for j, sentence in enumerate(example['story']):
        story_id.append(i)
        sent_id.append(j)
        list_sent.append(sentence)

        # label is 1 when the sentence is among the selected ones, else 0
        label_sent.append(int(sentence in list_ref))

  0%|          | 0/92579 [00:00<?, ?it/s]

In [20]:
# build the dataframe in one step from the parallel per-sentence lists
df_story_summary = pd.DataFrame({
    'story_id': story_id,
    'sent_id': sent_id,
    'sentence': list_sent,
    'label_sent': label_sent,
})

In [21]:
# preview the dataframe (first rows only, one row per story sentence)
df_story_summary.head()

Unnamed: 0,story_id,sent_id,sentence,label_sent
0,0,0,at the start of a big week for the higgs boson...,0
1,0,1,the scientists outlined their final analysis b...,0
2,0,2,what is the higgs boson and why is it important,1
3,0,3,their announcement came two days before resear...,0
4,0,4,our data strongly point toward the existence o...,0


In [22]:
# serialize the labelled-sentence dataframe to disk as a pickle
# (relative path, so the file is written to the current working directory)
df_story_summary.to_pickle('dataframe_extractive.pkl')

In [None]:
!ls -l