# Workshop 3: Practice PyThaiNLP - Thai Language Processing
This workshop covers Thai word tokenization, POS tagging, transliteration, spell checking, normalization, and named entity recognition using PyThaiNLP.

In [21]:

# Install PyThaiNLP if not installed
!pip install pythainlp transformers python-crfsuite torch


Collecting torch
  Using cached torch-2.7.1-cp310-none-macosx_11_0_arm64.whl.metadata (29 kB)
Using cached torch-2.7.1-cp310-none-macosx_11_0_arm64.whl (68.6 MB)
Installing collected packages: torch
Successfully installed torch-2.7.1


In [22]:

from pythainlp import word_tokenize, pos_tag
from pythainlp.transliterate import transliterate
from pythainlp.spell import correct
from pythainlp.util import normalize
from pythainlp.tag import NER

# Sample sentence shared by the NER and tokenization cells below.
# English gloss: "I went on a trip to Pran Buri last week."
text = "ฉันไปเที่ยวปราณบุรีเมื่ออาทิตย์ที่แล้ว"

In [23]:
# 1. Named Entity Recognition
# Build a tagger backed by the "thainer" engine, then tag the sample sentence.
# The cell's last expression is displayed as a list of (token, tag) pairs.
ner = NER(engine="thainer")
ner.tag(text)

[('ฉัน', 'O'),
 ('ไปเที่ยว', 'O'),
 ('ปราณ', 'O'),
 ('บุรี', 'O'),
 ('เมื่อ', 'O'),
 ('อาทิตย์', 'B-TIME'),
 ('ที่แล้ว', 'I-TIME')]

In [24]:

# 2. Word Tokenization
# The "newmm" engine segments the sentence by dictionary-based maximum
# matching; the resulting token list is reused by the later cells.
tokens = word_tokenize(
    text,
    engine="newmm",
)
print("Tokenized words:", tokens)


Tokenized words: ['ฉัน', 'ไปเที่ยว', 'ปราณ', 'บุรี', 'เมื่อ', 'อาทิตย์', 'ที่แล้ว']


In [25]:

# 2b. Part-of-Speech Tagging
# Tag the tokens produced in step 2 using the ORCHID tagset; the result is
# printed as a list of (token, tag) pairs.
pos_tags = pos_tag(
    tokens,
    corpus="orchid",
)
print("POS tags:", pos_tags)


POS tags: [('ฉัน', 'PPRS'), ('ไปเที่ยว', 'VACT'), ('ปราณ', 'VSTA'), ('บุรี', 'VACT'), ('เมื่อ', 'JSBR'), ('อาทิตย์', 'NCMN'), ('ที่แล้ว', 'DIAC')]


In [26]:

# 3. Transliteration: Thai script to phonemic (IPA) transcription
# The "thaig2p" engine is applied token by token; its output is a string of
# IPA phonemes with tone letters rather than a plain Roman-alphabet spelling.
# The first run downloads the thai-g2p corpus.
transliterated = [
    transliterate(token, engine="thaig2p")
    for token in tokens
]
print("Transliterated words:", transliterated)


Corpus: thai-g2p
- Downloading: thai-g2p 0.1


  0%|          | 0/12164095 [00:00<?, ?it/s]

Transliterated words: ['t͡ɕʰ a n ˩˩˦', 'p a j ˧ . tʰ i a̯ w ˥˩', 'p r aː n ˧', 'b u ˨˩ . r iː ˧', 'm ɯ a̯ ˥˩', 'ʔ aː ˧ . tʰ i t̚ ˦˥', 'tʰ iː ˥˩ . l ɛː w ˦˥']


In [32]:

# 4. Spell Checking
# Feed a deliberately misspelled word to the spell checker and print the
# spelling it proposes instead.
text_with_typo = "ส้นตรบ"
corrected = correct(
    text_with_typo,
)
print("Corrected Text:", corrected)


Corrected Text: เส้นตรง


In [37]:

# 5. Text Normalization
# The input ends with a repeated vowel sign ("ะะ"); normalize() is used here
# to clean it up before the result is printed.
text_unnormalized = "สวัสดีค่ะะ"
normalized_text = normalize(
    text_unnormalized,
)
print("Normalized Text:", normalized_text)


Normalized Text: สวัสดีค่ะ
