# Header

In [None]:
# !pip install transformers
# !pip install sentencepiece
# !pip install rouge-score
# !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ" -O cnn_stories.tgz && rm -rf /tmp/cookies.txt
# !tar -xzf "cnn_stories.tgz"

In [7]:
import json
import os
import stanza
import numpy as np
import pandas as pd
import re

from rouge_score import rouge_scorer
from tqdm import tqdm


# Data pre-processing

In [2]:
# The per-split file lists are derived from the URL lists published at
# https://github.com/abisee/cnn-dailymail
with open('data/cnndm/filenames/cnn_files.json') as f:
    filenames = json.load(f)

# Pull the three splits out of the loaded mapping in one step.
train_files, valid_files, test_files = (
    filenames[split] for split in ('train', 'valid', 'test')
)


# stanza.download(lang='en')
# English pipeline restricted to the tokenize processor; sent_tokenize
# relies on it for sentence and token boundaries.
nlp = stanza.Pipeline(processors='tokenize', lang='en')

HBox(children=(FloatProgress(value=0.0, description='Downloading https://raw.githubusercontent.com/stanfordnlp…

2021-09-16 13:46:52 INFO: Downloading default packages for language: en (English)...





HBox(children=(FloatProgress(value=0.0, description='Downloading http://nlp.stanford.edu/software/stanza/1.2.2…




2021-09-16 13:48:23 INFO: Finished downloading models and saved to /home/aimenext/stanza_resources.
2021-09-16 13:48:23 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2021-09-16 13:48:23 INFO: Use device: gpu
2021-09-16 13:48:23 INFO: Loading: tokenize
2021-09-16 13:48:34 INFO: Done loading processors!


In [16]:
# Presumably toggles lower-casing of the text during preprocessing; the flag
# is not read anywhere in the visible code — TODO confirm where it is used.
LOWER = False
# sent_tokenize keeps a sentence only if its space-joined token string is
# strictly longer than this many characters.
LENGTH_THRESHOLD = 10
# Weights applied to the per-metric ROUGE scores in make_label. Only the
# 'rouge2' and 'rougeL' entries are read there; the 'rouge1' term is
# commented out in the scoring expression.
rouge_factors = {'rouge1': 0.4, 'rouge2': 0.3, 'rougeL': 0.3}

def sent_tokenize(doc):
    """Split raw text into sentences using the module-level stanza pipeline.

    Every sentence is rebuilt by joining the text of its tokens with single
    spaces. Sentences whose rebuilt string is not longer than
    LENGTH_THRESHOLD characters are discarded.

    Args:
        doc: Raw document text.

    Returns:
        List of space-joined sentence strings, in document order.
    """
    parsed = nlp(doc)
    rebuilt = (
        ' '.join(tok.text for tok in sent.tokens)
        for sent in parsed.sentences
    )
    return [text for text in rebuilt if len(text) > LENGTH_THRESHOLD]

def reconstruct_text(text):
    """Re-attach sentence punctuation that tokenization split off with a space.

    A whitespace character that directly precedes one of ``? . ! "`` is
    removed when that mark is itself followed by whitespace or the end of the
    string, e.g. ``"Hello world . Bye !"`` becomes ``"Hello world. Bye!"``.

    Args:
        text: Space-tokenized text.

    Returns:
        The text with the space before each matching punctuation mark removed.
    """
    # The replacement must put the captured mark (and its trailing
    # whitespace) back; replacing with '' would delete the punctuation and
    # glue the neighbouring words together.
    return re.sub(r'\s([?.!"](?:\s|$))', r'\1', text)

def parse_file(file):
    """Read one story file and separate the article from its highlights.

    The file content (with trailing whitespace stripped) is split on the
    "@highlight" marker surrounded by blank lines. The first piece is the
    article body, which is sentence-tokenized; every following piece is kept
    as one summary string.

    Args:
        file: Path of the story file (read as UTF-8).

    Returns:
        Tuple ``(doc, summary)``: the list of article sentences and the list
        of highlight strings (empty when the file has no highlight marker).
    """
    with open(file, encoding='utf-8') as story:
        raw = story.read()
    body, *summary = raw.rstrip().split("\n\n@highlight\n\n")
    return sent_tokenize(body), summary


def make_label(doc, sum, scorer, weights=None):
    """Greedily mark the document sentence that best matches each summary sentence.

    For each of the first ``min(len(sum), len(doc))`` summary sentences, every
    document sentence is scored with a weighted sum of the index-2 entries of
    the scorer's 'rouge2' and 'rougeL' results (the F-measure in the
    ``rouge_score`` package). The best-scoring sentence that has not been
    selected yet is labelled 1, so each processed summary sentence claims a
    distinct document sentence.

    Args:
        doc: List of document sentences.
        sum: List of summary sentences (the name shadows the builtin; it is
            kept for interface compatibility).
        scorer: Object whose ``score(target, prediction)`` returns a mapping
            from ROUGE type to a tuple indexable at position 2.
        weights: Optional mapping with 'rouge2' and 'rougeL' weights.
            Defaults to the module-level ``rouge_factors``.

    Returns:
        List of 0/1 ints, one per document sentence.
    """
    if weights is None:
        weights = rouge_factors
    doc_size = len(doc)
    res = [0] * doc_size
    n = min(len(sum), doc_size)
    for j in range(n):
        rouge = [scorer.score(sum[j], sent_i) for sent_i in doc]
        score = [
            x['rouge2'][2] * weights['rouge2']
            + x['rougeL'][2] * weights['rougeL']
            for x in rouge
        ]
        # Mask already-selected sentences with -inf rather than 0: a 0 mask
        # ties with unselected zero-score sentences, and argmax (which returns
        # the first maximum) could keep landing on a taken index, leaving this
        # summary sentence without any label.
        masked = [
            float('-inf') if res[i] == 1 else s
            for i, s in enumerate(score)
        ]
        # n <= doc_size guarantees at least one unselected sentence remains.
        sent_pos = int(np.argmax(masked))
        res[sent_pos] = 1
    return res

def process(data_dir, files):
    """Parse and label every story file of one split.

    A file is skipped, and its name recorded, when it yields no sentences,
    no summary, or fewer sentences than summary entries. All other files are
    labelled with make_label using a stemming ROUGE scorer.

    Args:
        data_dir: Directory that contains the story files.
        files: Sequence of file names relative to ``data_dir``.

    Returns:
        Tuple ``(docs, labels, summaries, remove_files)``: three dicts keyed
        by file name and the list of skipped file names.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    docs, labels, summaries = {}, {}, {}
    remove_files = []
    for name in tqdm(files):
        doc, summary = parse_file(os.path.join(data_dir, name))
        # Unusable when either side is empty or the article is shorter than
        # its summary.
        if not doc or not summary or len(doc) < len(summary):
            remove_files.append(name)
            continue
        docs[name] = doc
        labels[name] = make_label(doc, summary, scorer)
        summaries[name] = summary
    return docs, labels, summaries, remove_files

def json_dump(obj, file):
    """Serialize ``obj`` as indented JSON into ``file``.

    The file is written as UTF-8 with a 4-space indent, and non-ASCII
    characters are stored verbatim rather than escaped.

    Args:
        obj: JSON-serializable object.
        file: Destination path; overwritten if it already exists.
    """
    with open(file, mode='w', encoding='utf8') as handle:
        json.dump(obj, handle, indent=4, ensure_ascii=False)

def process_and_write(data_dir, files, write_dir):
    """Process one split and store the results as JSON files.

    Runs process on ``files`` and writes docs.json, labels.json,
    summaries.json and remove_files.json into ``write_dir``, creating the
    directory when it does not exist.

    Args:
        data_dir: Directory that contains the story files.
        files: Sequence of file names relative to ``data_dir``.
        write_dir: Output directory for the JSON files.
    """
    docs, labels, summaries, remove_files = process(data_dir, files)

    os.makedirs(write_dir, exist_ok=True)
    outputs = (
        ('docs.json', docs),
        ('labels.json', labels),
        ('summaries.json', summaries),
        ('remove_files.json', remove_files),
    )
    for out_name, payload in outputs:
        json_dump(payload, os.path.join(write_dir, out_name))


In [20]:
# base_write_dir = 'data/cnndm/cnn'
# process_and_write('cnn/stories', valid_files, os.path.join(base_write_dir, 'valid'))
# process_and_write('cnn/stories', train_files, os.path.join(base_write_dir, 'train'))

# Data processing

In [None]:
class Config:
    """Static settings holder; values are read as class attributes."""
    # Presumably the maximum number of tokens kept per sentence — TODO confirm
    # against the code that consumes it.
    max_seq_len = 64
    # Presumably the maximum number of sentences kept per document — TODO
    # confirm against the code that consumes it.
    max_doc_len = 32
    # Device string; looks like a torch-style identifier for the first CUDA
    # GPU — verify against the consumer.
    device = 'cuda:0'

