Preserve terminal punctuation marks during sentence tokenization

wannaphong · Sep 23, 2023 · a809b35
1 parent 6310bab
commit a809b35
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 2 deletions.
diff --git a/laonlp/tokenize/__init__.py b/laonlp/tokenize/__init__.py
@@ -14,6 +14,7 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
+import re
 from typing import List
 from pythainlp.tokenize import Tokenizer
 from laonlp.corpus import lao_words
@@ -42,4 +43,7 @@ def sent_tokenize(txt: str) -> List[str]:
     :return: returns a list of lao sentence
     :rtype: list
     """
-    return txt.split(".")
+    sentences = []
+    for part in re.split(r"(?<=\.)(?!(?:\.|$))", txt):
+        sentences.append(part.strip())
+    return sentences
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
@@ -9,4 +9,7 @@ def test_word_tokenize(self):
         self.assertIsNotNone(word_tokenize("ພາສາລາວໃນປັດຈຸບັນ."))
 
     def test_sent_tokenize(self):
-        self.assertIsNotNone(sent_tokenize("ພາສາລາວໃນປັດຈຸບັນ.ນະຄອນຫຼວງວຽງຈັນ"))
+        self.assertEqual(
+            sent_tokenize("ພາສາລາວໃນປັດຈຸບັນ.ນະຄອນຫຼວງວຽງຈັນ"),
+            ["ພາສາລາວໃນປັດຈຸບັນ.", "ນະຄອນຫຼວງວຽງຈັນ"]
+        )