# Get CNN/DailyMail dataset from GitHub repo

In [1]:
import os, pprint, re
from datetime import datetime

# Remember the notebook's own directory the first time this cell runs, so that
# re-running the cell does not climb one directory higher on every execution.
if 'workbookDir' not in globals():
  workbookDir = os.getcwd()
# Work from the parent of the notebook directory; relative paths used by later
# cells are resolved from there.
os.chdir(os.path.join(workbookDir, os.pardir))

# Optional plot theming. __set_jtplot__ is not defined anywhere in this
# notebook (presumably injected by a local startup script -- confirm), so a
# failure here is deliberately ignored. Exception is caught instead of using a
# bare except so that KeyboardInterrupt / SystemExit still propagate.
try:
  __set_jtplot__('dark')
except Exception:
  pass

# Shared pretty-printer used by the cells below.
pp = pprint.PrettyPrinter(indent=2)

## Load the dataset

In [2]:
# Execute datasets.py in this notebook's namespace; it is expected to provide
# CNNDailyMail (the file itself is not shown here).
%run datasets.py
# Load one split of the corpus and report how many documents it holds.
ds = CNNDailyMail(split='train')  # train, val, or test
print(f'{len(ds)} documents.')

Using existing data.
287227 documents.


# Train LexRank model
The LexRank model finds the most significant sentences from a passage.

**Training time (Yenson's laptop):**
  - 500  segments: ~13s
  - 1000 segments: ~30s
  - 2000 segments: ~60s
  - 4000 segments: ~130s

Looks roughly linear (maybe $n \log n$?) -- can't tell for sure at this limited scale.

In [3]:
from lexrank import STOPWORDS, LexRank

# Number of leading documents used to build the model, capped at the size of
# the loaded split.
trainSegs = min(500, len(ds))

# While building the model, make the dataset hand back only the source text of
# each item. This mutates ds; outFilt is reset to None further down before
# source/target pairs are needed.
ds.outFilt = lambda out: out.src  # use sources only
startTime = datetime.now()
# NOTE(review): LexRank documents its input as a list of documents, each being
# a list of sentence strings. If ds[:trainSegs] yields one plain string per
# document, every character would be treated as a sentence -- confirm, and
# split each document into sentences first if so.
lxr = LexRank(ds[:trainSegs], stopwords=STOPWORDS['en'])
# total_seconds() covers the whole elapsed interval; timedelta.seconds is only
# the seconds component and wraps around once a run exceeds 24 hours.
elapsedSeconds = int((datetime.now() - startTime).total_seconds())
print(f'Creating model took {elapsedSeconds}s.')

Creating model took 15s.


In [4]:
# Remove the source-only filter installed for training so that items expose
# both fields again (.src and .tgt are both read further down).
ds.outFilt = None    # get source and target

def to_sentences(seg):
  """Split a passage into cleaned-up sentence strings.

  The text is cut at every period except one that sits between two digits
  (a decimal point such as ``3.5``), so numbers are not torn apart. Each
  piece is stripped, empty pieces are dropped, space-separated commas
  (`` , ``) are re-attached to the preceding word, the first character is
  upper-cased and a trailing ``.`` is appended.

  Known limitation: an abbreviation that ends in a period (e.g. ``st.``)
  is still treated as the end of a sentence.

  Args:
    seg: passage text (str).

  Returns:
    list of str, one entry per non-empty sentence; ``[]`` for blank input.
  """
  # A period is a sentence boundary unless it has a digit on both sides.
  pieces = re.split(r'(?<!\d)\.|\.(?!\d)', seg)
  sentences = []
  for piece in pieces:
    piece = piece.strip()
    if not piece:
      continue  # skip blanks left by trailing or repeated periods
    piece = re.sub(r' , ', ', ', piece)
    sentences.append(piece[0].upper() + piece[1:] + '.')
  return sentences


# Use the last document of the split as the example passage; it lies outside
# the first trainSegs documents the model was built from whenever the split
# holds more than trainSegs documents.
testSeg = ds[-1]
testSents = to_sentences(testSeg.src)
# Report the sentence count once and preview the first few sentences.
print(f'Test segment has {len(testSents)} sentences. Printing 5:')
pp.pprint(testSents[:5])

Test segment has 41 sentences.

Test segment has 41 sentences. Printing 5:
[ "A facebook page seeking to preserve the ` black pete ' clowns in blackface "
  'who accompany st.',
  'Nicholas to the netherlands during the holidays has become the '
  "fastest-growing dutch-language page ever, receiving 1 million ` likes ' in "
  'a single day.',
  "The popularity of the ` pete-ition ' page reflects the emotional attachment "
  'most dutch have to a figure that helped launch the tradition of santa '
  'claus.',
  'It also reflects their anger at critics who call it racist.',
  "Those critics include foreigners who they feel do n't understand the "
  'tradition.']


In [5]:
# Number of sentences to request from the model for the example passage.
summary_size = 2
summary = lxr.get_summary(
  testSents,
  summary_size=summary_size,
  threshold=0.1,
)
print(f'Test summary of {summary_size} sentences:')
pp.pprint(summary)

Test summary of 2 sentences:
[ 'Controversial : many claim that the tradition is offensive towards black '
  'people.',
  'Opponents say pete is an offensive caricature of black people.']


In [6]:
# Build the reference summary: split the target with the dataset's split_tags
# helper (defined outside this notebook), join the pieces with spaces, and run
# the result through the same sentence clean-up as the source text.
tgtPieces = ds.split_tags(testSeg.tgt)
tgt = to_sentences(' '.join(tgtPieces))
print('Target summary:')
pp.pprint(tgt)

Target summary:
[ "Facebook page supporting tradition gains one million ` likes ' in a day.",
  "` do n't let the netherlands ' most beautiful tradition disappear, ' it "
  'says.',
  'Un has condemned the tradition claiming it reflects racial prejudice.']


# Use the ROUGE metric to evaluate performance
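Each ROUGE variant reports precision (`'p'`), recall (`'r'`), and F1 (`'f'`); choose one of the F1 (`'f'`) scores as the headline metric.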

In [7]:
from rouge import Rouge 

rouge = Rouge()
# Score the generated summary (first argument) against the target summary
# (second argument), each flattened into a single string.
hypothesisText = ' '.join(summary)
referenceText = ' '.join(tgt)
scores = rouge.get_scores(hypothesisText, referenceText)
pp.pprint(scores)  # choose one of the F1 (f) scores

[ { 'rouge-1': { 'f': 0.06779660549267486,
                 'p': 0.09090909090909091,
                 'r': 0.05405405405405406},
    'rouge-2': { 'f': 0.03508771464450662,
                 'p': 0.047619047619047616,
                 'r': 0.027777777777777776},
    'rouge-l': { 'f': 0.08333332864583361,
                 'p': 0.1111111111111111,
                 'r': 0.06666666666666667}}]
