diff --git a/laonlp/tokenize/__init__.py b/laonlp/tokenize/__init__.py
index d38cc7d..3206713 100644
--- a/laonlp/tokenize/__init__.py
+++ b/laonlp/tokenize/__init__.py
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
+import re
 from typing import List
 from pythainlp.tokenize import Tokenizer
 from laonlp.corpus import lao_words
 
@@ -42,4 +43,7 @@ def sent_tokenize(txt: str) -> List[str]:
     :return: returns a list of lao sentence
     :rtype: list
     """
-    return txt.split(".")
+    sentences = []
+    for part in re.split(r"(?<=\.)(?!(?:\.|$))", txt):
+        sentences.append(part.strip())
+    return sentences
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 32e71ec..d817d1e 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -9,4 +9,7 @@ def test_word_tokenize(self):
         self.assertIsNotNone(word_tokenize("ພາສາລາວໃນປັດຈຸບັນ."))
 
     def test_sent_tokenize(self):
-        self.assertIsNotNone(sent_tokenize("ພາສາລາວໃນປັດຈຸບັນ.ນະຄອນຫຼວງວຽງຈັນ"))
+        self.assertEqual(
+            sent_tokenize("ພາສາລາວໃນປັດຈຸບັນ.ນະຄອນຫຼວງວຽງຈັນ"),
+            ["ພາສາລາວໃນປັດຈຸບັນ.", "ນະຄອນຫຼວງວຽງຈັນ"]
+        )
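
For reviewers, here is a minimal standalone sketch of the new splitting behaviour. The helper name split_sentences is hypothetical (the patch inlines this logic in sent_tokenize), and it assumes Python 3.7+, where re.split accepts zero-width patterns. The pattern splits after each "." unless the next character is another "." (so an ellipsis stays attached to its sentence) or the end of the string (so no trailing empty sentence is produced, unlike the old txt.split(".")).

# Sketch only, not part of the patch; split_sentences is a hypothetical
# helper mirroring the regex used in sent_tokenize. Requires Python 3.7+.
import re

def split_sentences(txt):
    # Split at each position that follows a ".", unless the next character
    # is another "." (keep "..." intact) or the end of the string (avoid a
    # trailing empty element), then strip surrounding whitespace.
    return [part.strip() for part in re.split(r"(?<=\.)(?!(?:\.|$))", txt)]

print(split_sentences("ພາສາລາວໃນປັດຈຸບັນ.ນະຄອນຫຼວງວຽງຈັນ"))
# ['ພາສາລາວໃນປັດຈຸບັນ.', 'ນະຄອນຫຼວງວຽງຈັນ']
print(split_sentences("one...two. three."))
# ['one...', 'two.', 'three.']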