stitch preprocessing pipeline

Summary:
1. Add a call to binarization to complete the preprocessing pipeline.
2. Add the ability to specify a task to select the dictionary, and add a BERT task.
3. Remove function calls that are no longer needed after moving functions from fairseq here.

Reviewed By: jingfeidu
Differential Revision: D13977842
fbshipit-source-id: ec9bbb4e98e62e12c20ba68bb52b8bcc94aee91d
wouterkool · Feb 7, 2019 · cea0e4b · cea0e4b
1 parent c49c292
commit cea0e4b
Showing 1 changed file with 9 additions and 2 deletions.
diff --git a/preprocess.py b/preprocess.py
@@ -232,15 +232,22 @@ def make_all(lang):
                 print("{} {}".format(src_dict[k], tgt_dict[v]), file=f)
 
 
-def binarize(args, filename, dict, output_prefix, lang, offset, end):
+def binarize(args, filename, dict, output_prefix, lang, offset, end, append_eos=True):
     ds = indexed_dataset.IndexedDatasetBuilder(
         dataset_dest_file(args, output_prefix, lang, "bin")
     )
 
     def consumer(tensor):
         ds.add_item(tensor)
 
-    res = Tokenizer.binarize(filename, dict, consumer, offset=offset, end=end)
+    res = Tokenizer.binarize(
+        filename,
+        dict,
+        consumer,
+        offset=offset,
+        end=end,
+        append_eos=append_eos
+    )
     ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
     return res