In [4]:
import numpy as np

In [5]:
import json

# Parse the JSON-lines corpus (one JSON object per line) into two parallel
# lists: article bodies ('content') and headlines ('title').
contents = []
titles = []

# Name the encoding explicitly: JSON interchange text is UTF-8, and without
# this the decoding would follow the platform's locale default.
with open('../../signalmedia-1m.jsonl', encoding='utf-8') as newsir_file:
    for line in newsir_file:
        # Tolerate blank lines (e.g. a trailing newline at end of file)
        # instead of failing inside json.loads.
        if not line.strip():
            continue
        sample = json.loads(line)
        contents.append(sample['content'])
        titles.append(sample['title'])

# Last expression of the cell: shown as output (article count, title count).
len(contents), len(titles)

(1000000, 1000000)

In [7]:
# Average number of whitespace-separated tokens per article body.
np.mean(list(map(lambda content: len(content.split()), contents)))

393.08864899999998

In [8]:
# Average number of whitespace-separated tokens per title.
np.mean(list(map(lambda title: len(title.split()), titles)))

9.3770670000000003

In [9]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/eval/decode partition is identical on every run.
SPLIT_SEED = 42

# Wrap the raw lists under the 'full' key only once, so re-running this cell
# does not nest a dict inside a dict and then try to split the dict.
if not isinstance(contents, dict):
    contents = {'full': contents}
if not isinstance(titles, dict):
    titles = {'full': titles}

# First split: 80% train, 20% hold-out. Articles and titles are passed
# together so each pair stays aligned.
contents['train'], contents_holdout, titles['train'], titles_holdout = train_test_split(
    contents['full'], titles['full'],
    train_size=0.8, test_size=0.2, random_state=SPLIT_SEED
)
# Second split: halve the hold-out. The first half becomes 'decode' and the
# second half becomes 'eval' (10% of the corpus each).
contents['decode'], contents['eval'], titles['decode'], titles['eval'] = train_test_split(
    contents_holdout, titles_holdout, test_size=0.5, random_state=SPLIT_SEED
)

In [10]:
# Special tokens
# Structural markers, declared as (opening, closing) pairs from the outermost
# level (document) to the innermost (sentence).
DOCUMENT_START, DOCUMENT_END = '<d>', '</d>'
PARAGRAPH_START, PARAGRAPH_END = '<p>', '</p>'
SENTENCE_START, SENTENCE_END = '<s>', '</s>'
# Placeholder tokens for out-of-vocabulary words and for padding.
UNKNOWN_TOKEN, PAD_TOKEN = '<UNK>', '<PAD>'

In [11]:
def process_text(text):
    """Wrap raw text in document, paragraph and sentence markup tokens.

    The text is treated as a single paragraph and split into sentences on
    '.'; the periods themselves are not kept. Every markup token in the
    result is separated from its neighbours by a single space, so splitting
    the result on whitespace yields each marker as a token of its own.

    Args:
        text: the raw article body or title (a str).

    Returns:
        A str of the form
        '<d> <p> <s> sentence one </s> <s> sentence two </s> </p> </d>'.
        Text without any non-blank sentence yields '<d> <p> <s> </s> </p> </d>'.
    """
    # Trim each fragment so the markers are not doubled up with stray spaces.
    sentences = [sentence.strip() for sentence in text.split('.')]
    # Spaces on both sides keep the end marker from being glued to the last
    # word of a sentence (e.g. 'word</s>').
    sentence_break = ' {} {} '.format(SENTENCE_END, SENTENCE_START)
    # Blank fragments (such as the one after a trailing '.') are dropped so
    # no empty '<s> </s>' sentence is emitted.
    body = sentence_break.join(sentence for sentence in sentences if sentence)
    pieces = [DOCUMENT_START, PARAGRAPH_START, SENTENCE_START, body,
              SENTENCE_END, PARAGRAPH_END, DOCUMENT_END]
    # Skip an empty body so the output never contains a double space.
    return ' '.join(piece for piece in pieces if piece)

In [13]:
from tensorflow.core.example import example_pb2
import struct
from tqdm import tqdm

# Serialize each split to 'data/<split>_newsir' as a stream of records.
# Every record is a struct 'q' (native signed long long) length header
# followed by that many bytes of a serialized tf.Example holding the
# marked-up article and its marked-up title ('abstract').
for mode in ['train', 'eval', 'decode']:
    with open('data/' + mode + '_newsir', 'wb') as data_file:
        articles = contents[mode]
        abstracts = titles[mode]
        for index, article in enumerate(tqdm(articles)):
            record = example_pb2.Example()
            features = record.features.feature
            features['article'].bytes_list.value.append(
                process_text(article).encode()
            )
            features['abstract'].bytes_list.value.append(
                process_text(abstracts[index]).encode()
            )
            payload = record.SerializeToString()
            # Length header first, then the raw serialized bytes.
            data_file.write(struct.pack('q', len(payload)))
            data_file.write(payload)


100%|██████████| 800000/800000 [05:35<00:00, 2381.71it/s]
100%|██████████| 100000/100000 [00:41<00:00, 2387.54it/s]
100%|██████████| 100000/100000 [00:39<00:00, 2504.24it/s]


In [None]:
import nltk

# Counter is taken from its documented home in the standard library rather
# than through the nltk namespace, where it is not part of the public API.
from collections import Counter

# Token frequencies over every article body and every title. Each text is
# wrapped by process_text first and then tokenized with NLTK.
# NOTE(review): word_tokenize runs on the marked-up text, so the markup
# tokens themselves are presumably broken into punctuation fragments --
# confirm whether tokenizing the raw text was intended.
token_counter = Counter()

# One progress bar per corpus: articles first, then titles.
for corpus in (contents['full'], titles['full']):
    for text in tqdm(corpus):
        token_counter.update(nltk.word_tokenize(process_text(text)))


 56%|█████▌    | 561233/1000000 [39:44<30:56, 236.29it/s]   51%|█████▏    | 514570/1000000 [36:40<34:35, 233.89it/s]

In [None]:
# Write the vocabulary file: one '<token> <count>' pair per line, holding the
# vocab_size most frequent tokens followed by the special tokens.
token_count = sum(token_counter.values())
vocab_size = 10000
document_count = len(contents['full'])

# Tokens may contain non-ASCII characters, so the encoding is fixed to UTF-8
# instead of depending on the platform's locale default.
with open('data/vocab_newsir', 'w', encoding='utf-8') as vocab_file:
    most_common_token_count = 0
    for token, num in token_counter.most_common(vocab_size):
        print(token, num, file=vocab_file)
        most_common_token_count += num
    # Everything outside the top vocab_size tokens is accounted to <UNK>.
    print(UNKNOWN_TOKEN, token_count - most_common_token_count, file=vocab_file)
    # NOTE(review): the count 5 looks like an arbitrary placeholder -- confirm.
    print(PAD_TOKEN, 5, file=vocab_file)
    # Document and paragraph markers: counted once per article.
    # NOTE(review): titles are wrapped by process_text as well, so the true
    # marker counts are presumably higher -- confirm whether it matters.
    for token in [PARAGRAPH_END, PARAGRAPH_START, DOCUMENT_END, DOCUMENT_START]:
        print(token, document_count, file=vocab_file)
    # Sentence markers: one per article plus one per counted '.' token.
    # NOTE(review): process_text splits on '.', so token_counter['.'] is
    # likely 0 here -- verify this estimate.
    for token in [SENTENCE_END, SENTENCE_START]:
        print(token, document_count + token_counter['.'], file=vocab_file)