# Workshop 5: Practice Text Processing using NLTK and PyThaiNLP
This workshop covers fundamental text processing tasks for English and Thai texts using NLTK and PyThaiNLP.

## Part 1: English Text Processing with NLTK

In [1]:

# Install NLTK if not installed
!pip install nltk

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import pos_tag

# Download required datasets
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

text_en = "Natural language processing (NLP) is a fascinating field of artificial intelligence."

# Tokenization
tokens = word_tokenize(text_en)
print("Tokens:", tokens)

# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_tokens = [w for w in tokens if w.lower() not in stop_words]
print("Filtered Tokens (no stopwords):", filtered_tokens)

# Stemming
ps = PorterStemmer()
stemmed = [ps.stem(w) for w in filtered_tokens]
print("Stemmed Tokens:", stemmed)

# POS Tagging
pos_tags = pos_tag(tokens)
print("POS Tags:", pos_tags)




[nltk_data] Downloading package punkt to
[nltk_data]     /Users/veerasakkritsanapraphan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/veerasakkritsanapraphan/nltk_data...


Tokens: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', 'of', 'artificial', 'intelligence', '.']
Filtered Tokens (no stopwords): ['Natural', 'language', 'processing', '(', 'NLP', ')', 'fascinating', 'field', 'artificial', 'intelligence', '.']
Stemmed Tokens: ['natur', 'languag', 'process', '(', 'nlp', ')', 'fascin', 'field', 'artifici', 'intellig', '.']
POS Tags: [('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('fascinating', 'JJ'), ('field', 'NN'), ('of', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('.', '.')]


[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/veerasakkritsanapraphan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Part 2: Thai Text Processing with PyThaiNLP

In [3]:

# Install PyThaiNLP if not installed
!pip install pythainlp

from pythainlp import word_tokenize, pos_tag
from pythainlp.util import normalize

text_th = "การประมวลผลภาษาธรรมชาติเป็นสาขาที่น่าสนใจของปัญญาประดิษฐ์"

# Word Tokenization
tokens_th = word_tokenize(text_th, engine="newmm")
print("Tokenized Thai words:", tokens_th)

# POS Tagging
pos_tags_th = pos_tag(tokens_th, corpus="orchid")
print("POS tags (Thai):", pos_tags_th)


Tokenized Thai words: ['การประมวลผล', 'ภาษาธรรมชาติ', 'เป็น', 'สาขา', 'ที่', 'น่าสนใจ', 'ของ', 'ปัญญาประดิษฐ์']
POS tags (Thai): [('การประมวลผล', 'NCMN'), ('ภาษาธรรมชาติ', 'NCMN'), ('เป็น', 'VSTA'), ('สาขา', 'NCMN'), ('ที่', 'PREL'), ('น่าสนใจ', 'VATT'), ('ของ', 'RPRE'), ('ปัญญาประดิษฐ์', 'NCMN')]


## Part 3: Combined Exercise
Try tokenizing and tagging texts in both English and Thai.

In [4]:

# English text
print("English text processing")
tokens = word_tokenize(text_en)
pos_tags = pos_tag(tokens)
print(pos_tags)

# Thai text
print("\nThai text processing")
tokens_th = word_tokenize(text_th, engine="newmm")
pos_tags_th = pos_tag(tokens_th, corpus="orchid")
print(pos_tags_th)


English text processing
[('Natural', 'NCMN'), (' ', 'PUNC'), ('language', 'NCMN'), (' ', 'PUNC'), ('processing', 'NCMN'), (' ', 'PUNC'), ('(NLP)', 'NCMN'), (' ', 'PUNC'), ('is', 'NCMN'), (' ', 'PUNC'), ('a', 'NCMN'), (' ', 'PUNC'), ('fascinating', 'NCMN'), (' ', 'PUNC'), ('field', 'NCMN'), (' ', 'PUNC'), ('of', 'NCMN'), (' ', 'PUNC'), ('artificial', 'NCMN'), (' ', 'PUNC'), ('intelligence', 'NCMN'), ('.', 'PUNC')]

Thai text processing
[('การประมวลผล', 'NCMN'), ('ภาษาธรรมชาติ', 'NCMN'), ('เป็น', 'VSTA'), ('สาขา', 'NCMN'), ('ที่', 'PREL'), ('น่าสนใจ', 'VATT'), ('ของ', 'RPRE'), ('ปัญญาประดิษฐ์', 'NCMN')]
