In [1]:
import os
import pickle
from collections import Counter
from math import log, sqrt
import nltk

In [2]:
from priberam_summarizer.extractive_summarization_decoder import ExtractiveCoverageSummarizationDecoder
from priberam_summarizer.parts import SentencePart, ConceptPart
from priberam_summarizer.document import Document
from priberam_summarizer.sentence import Sentence
from priberam_summarizer.token import Token

In [3]:
def read_CNN_Dailymail(data_path):    
    corpus = []
    files = os.listdir(data_path)
    for index, doc_filename in enumerate(files):
        text = {}
        field = 'UNK'
        with open(os.path.join(data_path, doc_filename), 'r', encoding='utf8') as fp:
            for line in fp:
                line = line.strip()                
                if line.startswith('[SN]'):
                    field = line[4:-4]
                elif len(line) != 0:
                    text[field] = text.get(field, []) + [line]

            if 'HighlightsOrg' not in text or 'StoryOrg' not in text:
                # print('Not found highlights or story for {}'.format(doc_filename))
                continue

            doc = Document()
            for sentence in nltk.sent_tokenize('\n'.join(text['StoryOrg'])):
                sent = Sentence(sentence)
                for token in nltk.word_tokenize(sentence):
                    sent.tokens.append(Token(token))
                doc.body.append(sent)

            summary = Document()
            for sentence in text['HighlightsOrg']:
                sent = Sentence(sentence)
                for token in nltk.word_tokenize(sentence):
                    sent.tokens.append(Token(token))
                summary.body.append(sent)
            corpus.append((doc, summary))
            
        if index % 1000 == 0:
            print('Processed {} / {}'.format(index, len(files)))

    return corpus

In [4]:
corpus = read_CNN_Dailymail('/home/ppb/Data/summarization/orignals/CNN_Dailymail/cnn/test')

Processed 0 / 1090
Processed 1000 / 1090


In [5]:
def compute_tfs(corpus):
    # Compute TF frequencies for unigrams and bigrams
    for doc, summary in corpus:
        for document in [doc, summary]:            
            setattr(document, 'bigrams', Counter())
            for sentence in document.sentences + summary.sentences:                
                setattr(sentence, 'bigrams', Counter())
                tokens = [token.word for token in sentence.tokens]
                tokens = [token.lower() for token in tokens]                
                tokens = ['__start__'] + tokens + ['__end__']
                sentence.bigrams = Counter(zip(tokens, tokens[1:]))
                # update the count in document                
                document.bigrams.update(sentence.bigrams)

In [6]:
compute_tfs(corpus)

In [7]:
def compute_bigrams_idfs(corpus):    

    # Compute IDFs. Number of docs / number of documents the term happen    
    bigram_idfs = Counter()    
    for doc, summary in corpus:
        for document in [doc, summary]:            
            bigram_idfs.update({token: 1 for token in document.bigrams.keys()})

    num_docs = len(corpus) * 2    

    # inverse document frequency smooth    
    bigram_idfs = {key: log(num_docs / (1 + value)) for key, value in bigram_idfs.items()}
    bigram_idfs['__oov__'] = log(float(num_docs), 10)
    
    return bigram_idfs

In [8]:
bigram_idfs = compute_bigrams_idfs(corpus)

In [9]:
def compute_tf_idfs(corpus, bigram_idfs):
    for doc, summary in corpus:
        for document in [doc, summary]:                
            for sentence in document.sentences:                
                setattr(sentence, 'bigram_tfidf', Counter())                
                # compute tf_idf for bigrams
                sentence.bigram_tfidf = {key: freq * bigram_idfs[key] for key, freq in sentence.bigrams.items()}
                norm_factor = sqrt(sum([value * value for value in sentence.bigram_tfidf.values()]))
                sentence.bigram_tfidf = {key: value / norm_factor if norm_factor > 0 else value for key, value in sentence.bigram_tfidf.items()}                                
                delattr(sentence, 'bigrams')            
            delattr(document, 'bigrams')

In [10]:
compute_tf_idfs(corpus, bigram_idfs)

In [11]:
corpus[0][1].sentences[0].bigram_tfidf

{('__start__', 'a'): 0.10168304590465235,
 ('a', 'magnitude-7.8'): 0.421108005423122,
 ('earthquake', 'struck'): 0.35602536926803,
 ('magnitude-7.8', 'earthquake'): 0.40267439041025765,
 ('nepal', 'on'): 0.39494632164604654,
 ('on', 'saturday'): 0.2532715187541065,
 ('saturday', '__end__'): 0.37570221426466976,
 ('struck', 'nepal'): 0.40267439041025765}

In [12]:
def extract_oracle(bigram_idfs, document, highlight):    
    parts = []

    # compute unigram and bigram count for the summary
    bigrams = Counter()

    tokens = [token.word for token in highlight.tokens]
    tokens = [token.lower() for token in tokens]
    tokens = ['__start__'] + tokens + ['__end__']
    bigrams = Counter(zip(tokens, tokens[1:]))

    # compute tf_idf for bigrams
    bigram_tfidf = {key: freq * bigram_idfs.get(key, bigram_idfs['__oov__']) for key, freq in bigrams.items()}
    norm_factor = sqrt(sum([value * value for value in bigram_tfidf.values()]))
    bigram_tfidf = {key: value / norm_factor if norm_factor > 0 else value for key, value in bigram_tfidf.items()}

    sentence_scores = []
    concept_sentences = {}
    for sentence_index, sentence in enumerate(document.sentences):
        part = SentencePart(sentence)
        sentence_score = 0
        for concept in set(bigram_tfidf).intersection(set(sentence.bigram_tfidf)):
            concept_sentences[concept] = concept_sentences.get(concept, []) + [sentence_index]
            sentence_score += bigram_tfidf[concept]
        sentence_scores.append(sentence_score)
        part.active = True
        part.value = sentence_score
        parts.append(part)

    # Concept scores are based on summary tf-idf concepts
    for concept in sorted(bigram_tfidf):
        part = ConceptPart(concept, bigram_tfidf[concept], concept_sentences.get(concept, []))
        parts.append(part)

    scores = [part.value for part in parts]
    decoder_aux = ExtractiveCoverageSummarizationDecoder()
    decoder_aux.max_words = 30
    selected_sentences, _, predicted_concepts = decoder_aux.summarize_coverage(parts, scores)

    # Final summary and its score.
    summary = Document()
    summary.name = 'Oracle: '
    for selected_sentence in selected_sentences:
        summary.body.append(document.sentences[selected_sentence])

    return summary

In [13]:
f_source = open('source.txt', 'w', encoding='utf8')
f_target = open('target.txt', 'w', encoding='utf8')
for index, (document, summary) in enumerate(corpus):
    for sentence in summary.sentences:
        oracle = extract_oracle(bigram_idfs, document, sentence)        
        bigram_tfidf = Counter()
        for s in oracle.sentences:
            bigram_tfidf.update(s.bigram_tfidf)
        score = sum([bigram_tfidf[k] for k in set(bigram_tfidf).intersection(set(sentence.bigram_tfidf))]) / len(sentence.bigram_tfidf)

        if score >= 0.05:
            f_source.write(oracle.get_text().replace('\n', ' ') + '\n')
            f_target.write(sentence.text.replace('\n', ' ') + '\n')            

    if index % 1000 == 0:
        print('Processed {} / {}'.format(index, len(corpus)))
f_source.close()
f_target.close()

Processed 0 / 1090
Processed 1000 / 1090
