Example of data conversion and fix comments #379

Merged · 2 commits · Sep 30, 2016

14 changes: 10 additions & 4 deletions textsum/README.md
@@ -27,6 +27,9 @@ for example vocabulary format. In <b>How To Run</b> below, users can use toy
data and vocab provided in the data/ directory to run the training by replacing
the data directory flag.

data_convert_example.py contains an example of converting the data between the binary and text formats.


<b>Experiment Result</b>

8000 examples from the test set are sampled to generate summaries and the rouge score is
@@ -73,10 +76,13 @@ Install TensorFlow and Bazel.

```shell
# cd to your workspace
# clone the code to your workspace and create empty WORKSPACE file.
# move the data to your workspace. If don't have full dataset yet, copy
# the toy data from the data/ directory from code directory and rename
# the files.
# 1. Clone the textsum code to your workspace 'textsum' directory.
# 2. Create an empty 'WORKSPACE' file in your workspace.
# 3. Move the train/eval/test data to your workspace 'data' directory.
# In the following example, I named the data training-*, test-*, etc.
# If your data files have different names, update the --data_path.
# If you don't have data but want to try out the model, copy the toy
# data from textsum/data/data to the data/ directory in the workspace.
ls -R
.:
data textsum WORKSPACE
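The text side of the conversion is one example per line, with tab-separated key=value pairs (this is what `_binary_to_text` in the new data_convert_example.py script below writes and `_text_to_binary` reads back). A minimal sketch of one such record, assuming the feature keys are `article` and `abstract` as in the toy data:

```python
# Build one text-format record and parse it back into a dict.
# Assumption: feature keys 'article' and 'abstract', matching the toy data.
record = '\t'.join([
    'article=first sentence of the article . second sentence .',
    'abstract=a one sentence summary .',
])

features = dict(pair.split('=', 1) for pair in record.split('\t'))
print(features['abstract'])  # a one sentence summary .
```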
12 changes: 8 additions & 4 deletions textsum/data.py
@@ -70,11 +70,15 @@ def NumIds(self):
    return self._count


def ExampleGen(recordio_path, num_epochs=None):
  """Generates tf.Examples from path of recordio files.
def ExampleGen(data_path, num_epochs=None):
  """Generates tf.Examples from path of data files.

  Binary data format: <length><blob>. <length> represents the byte size
  of <blob>. <blob> is serialized tf.Example proto. The tf.Example contains
  the tokenized article text and summary.

  Args:
    recordio_path: CNS path to tf.Example recordio
    data_path: path to tf.Example data files.
    num_epochs: Number of times to go through the data. None means infinite.

  Yields:
@@ -86,7 +90,7 @@ def ExampleGen(recordio_path, num_epochs=None):
  while True:
    if num_epochs is not None and epoch >= num_epochs:
      break
    filelist = glob.glob(recordio_path)
    filelist = glob.glob(data_path)
    assert filelist, 'Empty filelist.'
    random.shuffle(filelist)
    for f in filelist:
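The `<length><blob>` framing described in the updated ExampleGen docstring is a plain `struct` length prefix in front of each serialized tf.Example. A minimal writer sketch, not part of this PR (the shard name `data/training-0` and the feature keys are only illustrative):

```python
import struct

from tensorflow.core.example import example_pb2

# Serialize one tf.Example and prepend its byte length, which is the
# record framing ExampleGen (and data_convert_example.py) expects.
example = example_pb2.Example()
example.features.feature['article'].bytes_list.value.append(b'some tokenized article text')
example.features.feature['abstract'].bytes_list.value.append(b'its one sentence summary')

serialized = example.SerializeToString()
with open('data/training-0', 'wb') as writer:  # hypothetical shard name
  writer.write(struct.pack('q', len(serialized)))
  writer.write(struct.pack('%ds' % len(serialized), serialized))
```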
65 changes: 65 additions & 0 deletions textsum/data_convert_example.py
@@ -0,0 +1,65 @@
"""Example of Converting TextSum model data.
Usage:
python data_convert_example.py --command binary_to_text --in_file data/data --out_file data/text_data
python data_convert_example.py --command text_to_binary --in_file data/text_data --out_file data/binary_data
python data_convert_example.py --command binary_to_text --in_file data/binary_data --out_file data/text_data2
diff data/text_data2 data/text_data
"""

import struct
import sys

import tensorflow as tf
from tensorflow.core.example import example_pb2

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('command', 'binary_to_text',
'Either binary_to_text or text_to_binary.'
'Specify FLAGS.in_file accordingly.')
tf.app.flags.DEFINE_string('in_file', '', 'path to file')
tf.app.flags.DEFINE_string('out_file', '', 'path to file')

def _binary_to_text():
reader = open(FLAGS.in_file, 'rb')
writer = open(FLAGS.out_file, 'w')
while True:
len_bytes = reader.read(8)
if not len_bytes:
sys.stderr.write('Done reading\n')
return
str_len = struct.unpack('q', len_bytes)[0]
tf_example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
tf_example = example_pb2.Example.FromString(tf_example_str)
examples = []
for key in tf_example.features.feature:
examples.append('%s=%s' % (key, tf_example.features.feature[key].bytes_list.value[0]))
writer.write('%s\n' % '\t'.join(examples))
reader.close()
writer.close()


def _text_to_binary():
inputs = open(FLAGS.in_file, 'r').readlines()
writer = open(FLAGS.out_file, 'wb')
for inp in inputs:
tf_example = example_pb2.Example()
for feature in inp.strip().split('\t'):
(k, v) = feature.split('=')
tf_example.features.feature[k].bytes_list.value.extend([v])
tf_example_str = tf_example.SerializeToString()
str_len = len(tf_example_str)
writer.write(struct.pack('q', str_len))
writer.write(struct.pack('%ds' % str_len, tf_example_str))
writer.close()


def main(unused_argv):
assert FLAGS.command and FLAGS.in_file and FLAGS.out_file
if FLAGS.command == 'binary_to_text':
_binary_to_text()
elif FLAGS.command == 'text_to_binary':
_text_to_binary()


if __name__ == '__main__':
tf.app.run()
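To spot-check a converted binary file, the same framing can be read back one record at a time, along the lines of `_binary_to_text` above (here `data/binary_data` is just the output name used in the usage notes):

```python
import struct

from tensorflow.core.example import example_pb2

# Read the first length-prefixed record and print its feature keys.
with open('data/binary_data', 'rb') as reader:
  len_bytes = reader.read(8)
  if len_bytes:
    str_len = struct.unpack('q', len_bytes)[0]
    example = example_pb2.Example.FromString(reader.read(str_len))
    print(sorted(example.features.feature.keys()))
```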