Skip to content

Commit

Permalink
Preserve terminal punctuation marks during sentence tokenization
Browse files: browse the repository at this point in the history
  • Loading branch information
BLKSerene committed Sep 23, 2023
1 parent 6310bab commit a809b35
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 2 deletions.
6 changes: 5 additions & 1 deletion laonlp/tokenize/__init__.py
Original file line number Diff line number Diff line change
@@ -14,6 +14,7 @@
See the License for the specific language governing permissions and
limitations under the License.
"""
import re
from typing import List
from pythainlp.tokenize import Tokenizer
from laonlp.corpus import lao_words
@@ -42,4 +43,7 @@ def sent_tokenize(txt: str) -> List[str]:
:return: returns a list of lao sentence
:rtype: list
"""
return txt.split(".")
sentences = []
for part in re.split(r"(?<=\.)(?!(?:\.|$))", txt):
sentences.append(part.strip())
return sentences
5 changes: 4 additions & 1 deletion tests/test_tokenize.py
Original file line number Diff line number Diff line change
@@ -9,4 +9,7 @@ def test_word_tokenize(self):
self.assertIsNotNone(word_tokenize("ພາສາລາວໃນປັດຈຸບັນ."))

def test_sent_tokenize(self):
self.assertIsNotNone(sent_tokenize("ພາສາລາວໃນປັດຈຸບັນ.ນະຄອນຫຼວງວຽງຈັນ"))
self.assertEqual(
sent_tokenize("ພາສາລາວໃນປັດຈຸບັນ.ນະຄອນຫຼວງວຽງຈັນ"),
["ພາສາລາວໃນປັດຈຸບັນ.", "ນະຄອນຫຼວງວຽງຈັນ"]
)

0 comments on commit a809b35

Please sign in to comment.