In [46]:
from nltk.corpus import reuters
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [2]:
# Split the Reuters file ids into two arrays according to their id prefix.
fids_train = np.array(list(filter(lambda name: name.startswith('train'), reuters.fileids())))
fids_test = np.array(list(filter(lambda name: name.startswith('test'), reuters.fileids())))

In [3]:
def gen_sents(fids):
    """Lazily produce every sentence of the given Reuters files as one string.

    Each sentence returned by ``reuters.sents(fid)`` is a list of tokens; the
    tokens are joined with single spaces. Files are visited in the order given.
    """
    return (' '.join(words) for file_id in fids for words in reuters.sents(file_id))

In [7]:
# Fit a Tokenizer with default settings on every sentence of the training files.
# NOTE(review): `tokenizer` is not referenced by the later visible cells, which
# build their own Tokenizer instances — confirm whether this cell is still needed.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(gen_sents(fids_train))

In [113]:
from itertools import zip_longest
def grouper(iterable, n, fillvalue=None):
    """Split ``iterable`` into consecutive ``n``-tuples.

    All ``n`` positions of each tuple are fed from one shared iterator, so the
    tuples are non-overlapping. The final tuple is padded with ``fillvalue``
    when the input length is not a multiple of ``n``. The result is lazy.
    """
    shared = iter(iterable)
    return zip_longest(*(shared for _ in range(n)), fillvalue=fillvalue)

class ReutersGenerator():
    """Batch generator over the sentences of the NLTK Reuters corpus.

    Sentences are turned into integer sequences with a Keras ``Tokenizer``,
    padded to ``max_seq_length`` and yielded together with a one-hot encoding
    of the very same sequences (input and target are the same text).

    Parameters
    ----------
    max_seq_length : int
        Length every sequence is padded/truncated to by ``pad_sequences``.
    num_words : int
        Passed to ``Tokenizer(num_words=...)`` and used as the number of
        classes of the one-hot encoding.
    """

    def __init__(self, max_seq_length=250, num_words=5000):
        self.tok = Tokenizer(num_words=num_words)
        self.max_seq_length = max_seq_length
        self.num_words = num_words

    @staticmethod
    def _fids(fid_startswith):
        """Return the corpus file ids that start with ``fid_startswith``."""
        return [fid for fid in reuters.fileids() if fid.startswith(fid_startswith)]

    def _gen_sents(self, fids):
        """Lazily yield each sentence of the given files as a space-joined string."""
        return (' '.join(sent) for fid in fids for sent in reuters.sents(fid))

    def fit(self, fid_startswith='train'):
        """Fit the tokenizer on all sentences of the selected files; returns ``self``."""
        self.tok.fit_on_texts(self._gen_sents(self._fids(fid_startswith)))
        return self

    def count(self, fid_startswith='train'):
        """Return the number of sentences contained in the selected files."""
        return sum(1 for _ in self._gen_sents(self._fids(fid_startswith)))

    def inverse_transform(self, X):
        """Map integer sequences back to text using the fitted tokenizer."""
        return self.tok.sequences_to_texts(X)

    def generate_pairs(self, fid_startswith='train', bs=32,
                       max_seq_len=250, forever=False, shuffle=True):
        """Yield ``(X, one_hot(X))`` batches of at most ``bs`` sentences.

        Parameters
        ----------
        fid_startswith : str
            Prefix selecting the corpus files to read.
        bs : int
            Number of sentences per batch. The last batch of a pass may be
            smaller because the padding added by ``grouper`` is dropped.
        max_seq_len : int
            Currently unused: sequences are always padded to the instance's
            ``max_seq_length``. Kept so existing calls keep working.
        forever : bool
            If true, start a new pass over the files after each one finishes.
        shuffle : bool
            If true, the file order is reshuffled before every pass. Sentences
            inside a file always keep their order.
        """
        fids_in = np.array(self._fids(fid_startswith))
        if fids_in.shape[0] == 0:
            # Nothing to read; without this guard `forever=True` would spin
            # endlessly without ever yielding a batch.
            return
        index = np.arange(fids_in.shape[0])
        while True:
            if shuffle:
                np.random.shuffle(index)
            sents = self._gen_sents(fids_in[index])
            for batch in grouper(sents, bs):
                # Drops the None padding of the last group (and empty sentences).
                texts = [text for text in batch if text]
                seqs = self.tok.texts_to_sequences_generator(texts)
                X = pad_sequences(list(seqs), self.max_seq_length)
                yield X, to_categorical(X, self.num_words)
            if not forever:
                break

In [114]:
# Generator with num_words=10000; max_seq_length keeps its default of 250.
data = ReutersGenerator(num_words=10000)

In [115]:
# Fit the tokenizer on the files selected by the default 'train' prefix.
# fit() returns the generator itself, which is why its repr is displayed.
data.fit()

<__main__.ReutersGenerator at 0x1825c9278>

In [117]:
# Number of sentences in the files whose id starts with 'test'.
data.count('test')

14439

In [100]:
# First batch of a single pass over the training files:
# X holds the padded id sequences, X_hat the one-hot encoding of X.
X, X_hat = next(data.generate_pairs())

In [101]:
# (batch size, padded sequence length) — defaults bs=32 and max_seq_length=250.
X.shape

(32, 250)

In [102]:
# Same leading axes as X plus a one-hot axis of size num_words.
X_hat.shape

(32, 250, 10000)

In [103]:
# Decode the batch's id sequences back into text with the fitted tokenizer.
data.inverse_transform(X)

["saudi output said at year low to help opec saudi arabian oil output has fallen to its lowest level in more than a year giving fresh evidence of the kingdom ' s determination to keep oil prices at 18 dlrs a barrel as agreed by opec last december oil industry sources said",
 'they said saudi output in the first eight days of march averaged 2 6 mln barrels per day bpd including oil from the neutral zone shared with kuwait compared to a february average of 3 5 mln bpd',
 "they said saudi arabia was also selling oil from its crude oil stocks in tankers around the world which opec says must be towards a member ' s production quota",
 "saudi arabia ' s quota is 4 133 mln bpd",
 'the lower production levels indicated saudi arabia the world s largest oil exporter was insisting on getting opec official prices even at the cost of lower production the sources said',
 "king fahd reiterated yesterday in an interview with reuters and the television news agency visnews the saudi commitment to opec '

In [121]:
# First sentence of the first corpus file, space-joined the same way as the
# text that is handed to the tokenizer.
" ".join(reuters.sents(reuters.fileids()[0])[0])

"ASIAN EXPORTERS FEAR DAMAGE FROM U . S .- JAPAN RIFT Mounting trade friction between the U . S . And Japan has raised fears among many of Asia ' s exporting nations that the row could inflict far - reaching economic damage , businessmen and officials said ."

In [4]:
# implement moving window approach: generate (n-gram (min < n < max), target)-pairs 
# and move by stride m, e.g. "dieser text ist ein test ." => [dieser text] -> ist,
# [text ist ein] -> test (for m=1, 1 < n < 50)
import numpy as np
from nltk import word_tokenize
from nltk.corpus import reuters
from keras.preprocessing.text import Tokenizer
from itertools import zip_longest, islice

# The filter list leaves out ! , . : ; ? so these characters are not stripped
# from the word_tokenize output before it is split into tokens.
tok = Tokenizer(filters='"#$%&()*+-/<=>@[\\]^_`{|}~\t\n')

# First pass over the whole corpus: build the vocabulary.
tok.fit_on_texts(
    ' '.join(word_tokenize(reuters.raw(file_id)))
    for file_id in reuters.fileids()
)

# Second (lazy) pass: one id sequence per file ...
sents = tok.texts_to_sequences_generator(
    ' '.join(word_tokenize(reuters.raw(file_id)))
    for file_id in reuters.fileids()
)
# ... flattened into a single stream of token ids across all files.
tokens = (token_id for seq in sents for token_id in seq)

In [5]:
def grouper(iterable, n, fillvalue=None):
    """Return a lazy iterator of fixed-size, non-overlapping ``n``-tuples.

    One iterator over ``iterable`` is repeated ``n`` times, so each tuple takes
    the next ``n`` items; a short final tuple is padded with ``fillvalue``.
    """
    source = iter(iterable)
    columns = (source,) * n
    return zip_longest(*columns, fillvalue=fillvalue)

def generate_ngram_pairs(tokens, min_toks=2, max_toks=5, chunk_len=50):
    """Yield ``(context, target)`` pairs from a stream of tokens.

    The stream is cut into consecutive chunks of up to ``chunk_len`` tokens.
    Inside each chunk, every position from ``min_toks`` onwards is used once as
    the target; its context is the run of tokens directly before it. Contexts
    never cross a chunk boundary.

    Parameters
    ----------
    tokens : iterable
        Token stream (e.g. integer ids); consumed lazily.
    min_toks, max_toks : int
        Inclusive bounds of the randomly drawn context length. A context is
        shorter than the drawn length only when it is clipped at the start of
        the chunk.
    chunk_len : int
        Maximum number of tokens per chunk.

    Yields
    ------
    (numpy.ndarray, scalar)
        The context tokens and the token that follows them.

    Raises
    ------
    ValueError
        From ``np.random.randint`` when ``min_toks > max_toks``.
    """
    stream = iter(tokens)
    while True:
        # Slice the stream directly instead of padding it: a short final chunk
        # then simply yields fewer pairs rather than None contexts/targets.
        chunk = np.array(list(islice(stream, chunk_len)))
        if len(chunk) == 0:
            return
        for pred in range(min_toks, len(chunk)):
            # randint's upper bound is exclusive, hence the +1.
            n_toks = np.random.randint(min_toks, max_toks + 1)
            data = chunk[max(0, pred - n_toks):pred]
            target = chunk[pred]
            yield data, target
            
# Print the first 50 (context, target) pairs decoded back to text.
# NOTE: the loop variable `data` rebinds the notebook-level name that an earlier
# cell assigned to the ReutersGenerator instance.
for data, target in islice(generate_ngram_pairs(tokens), 50):
    # sequences_to_texts returns one string per input sequence; unpack the single result.
    print('DATA:', *tok.sequences_to_texts([data]))
    print('TARGET:', *tok.sequences_to_texts([[target]]))

DATA: asian exporters
TARGET: fear
DATA: asian exporters fear
TARGET: damage
DATA: asian exporters fear damage
TARGET: from
DATA: exporters fear damage from
TARGET: u.s.
DATA: exporters fear damage from u.s.
TARGET: japan
DATA: fear damage from u.s. japan
TARGET: rift
DATA: damage from u.s. japan rift
TARGET: mounting
DATA: japan rift mounting
TARGET: trade
DATA: u.s. japan rift mounting trade
TARGET: friction
DATA: rift mounting trade friction
TARGET: between
DATA: trade friction between
TARGET: the
DATA: mounting trade friction between the
TARGET: u.s.
DATA: between the u.s.
TARGET: and
DATA: the u.s. and
TARGET: japan
DATA: the u.s. and japan
TARGET: has
DATA: and japan has
TARGET: raised
DATA: and japan has raised
TARGET: fears
DATA: and japan has raised fears
TARGET: among
DATA: japan has raised fears among
TARGET: many
DATA: fears among many
TARGET: of
DATA: fears among many of
TARGET: asia
DATA: among many of asia
TARGET: 's
DATA: of asia 's
TARGET: exporting
DATA: many of asia 