Improve detokenization performance with more recent NLTK versions

tomaarsen · Jan 2, 2023 · f994465
1 parent 4c33740
commit f994465
Showing 1 changed file with 3 additions and 2 deletions.
diff --git a/Tokenizer.py b/Tokenizer.py
@@ -99,11 +99,12 @@ def detokenize(tokenized: List[str]) -> str:
     Returns:
         str: The correct string sentence, e.g. "Hello, I'm Tom"
     """
-    indices = [index for index, token in enumerate(tokenized) if token in ("''", "'")]
+    indices = [index for index, token in enumerate(tokenized) if token in ("''", "'", '"')]
+    # Replace '' with ", works better with more recent NLTK versions
+    tokenized_copy = [token if token != "''" else '"' for token in tokenized]
     # We get the reverse of the enumerate, as we modify the list we took the indices from
     enumerated = list(enumerate(indices))
 
-    tokenized_copy = deepcopy(tokenized)
     for i, index in enumerated[::-1]:
         # Opening quote
         if i % 2 == 0: