Skip to content

Commit

Permalink
try to accelerate
Browse files Browse the repository at this point in the history
  • Loading branch information
sxjscience committed Mar 12, 2018
1 parent 704a87e commit b7d93f9
Showing 1 changed file with 13 additions and 7 deletions.
20 changes: 13 additions & 7 deletions python/mxnet/gluon/data/text/base.py
Expand Up @@ -29,6 +29,7 @@
from ..datareader import DataReader
from .utils import flatten_samples, collate


class CorpusReader(DataReader):
"""Text reader that reads a whole corpus and produces a dataset based on provided
sample splitter and word tokenizer.
Expand Down Expand Up @@ -63,13 +64,18 @@ def __init__(self, filename, encoding='utf8', flatten=False,
def read(self):
    """Read the whole corpus file and build a dataset of its samples.

    The file at ``self._filename`` is read in one go using
    ``self._encoding``, split into samples with ``self._sample_splitter``,
    and every sample is stripped of surrounding whitespace. Samples that
    are empty after stripping are discarded.

    If ``self._tokenizer`` is set, each remaining sample is tokenized.
    With ``self._flatten`` also set, the tokens of all samples are
    concatenated into one flat list; otherwise each sample contributes
    one entry (its tokenizer output). Without a tokenizer the stripped
    sample strings are stored as-is and ``self._flatten`` is ignored.

    Returns
    -------
    SimpleDataset
        Dataset wrapping the list of processed samples.
    """
    with io.open(self._filename, 'r', encoding=self._encoding) as fin:
        content = fin.read()

    # Bind once: these are looked up for every sample in the loop below.
    tokenizer = self._tokenizer
    flatten = self._flatten

    samples = []
    for raw_sample in self._sample_splitter(content):
        # Strip BEFORE the emptiness check so whitespace-only samples are
        # dropped instead of being stored (or tokenized) as empty strings.
        sample = raw_sample.strip()
        if not sample:
            continue
        if not tokenizer:
            samples.append(sample)
            continue
        tokens = tokenizer(sample)
        if flatten:
            # Single pass: grow the flat token list directly rather than
            # building a nested list and flattening it afterwards.
            samples.extend(tokens)
        else:
            samples.append(tokens)
    return SimpleDataset(samples)


Expand Down

0 comments on commit b7d93f9

Please sign in to comment.