# Tokenization
Splitting the text into smaller units such as words or sub-words, i.e. segmenting the unstructured data into tokens so that information can be extracted from them.
#### Reduces size of data
#### Feature extraction and information retrieval

## Types of tokenization
Word, sub-word, sentence, character

In [16]:
## Sentence-level tokenization with NLTK's sent_tokenize.
## The sample text is a single sentence, so the displayed result is a one-element list.
from nltk.tokenize import sent_tokenize

text = "Hello world, here's an illustration for tokenization with nltk!"
sent_tokenize(text)

["Hello world, here's an illustration for tokenization with nltk!"]

In [18]:
## Word-level tokenization with NLTK's word_tokenize.
## In the displayed result, punctuation marks and the clitic "'s" are separate tokens.
from nltk.tokenize import word_tokenize

text = "Hello world, here's an illustration for tokenization with nltk!"
word_tokenize(text)

['Hello',
 'world',
 ',',
 'here',
 "'s",
 'an',
 'illustration',
 'for',
 'tokenization',
 'with',
 'nltk',
 '!']

In [20]:
## Regular-expression tokenization.
## The pattern \w+ keeps only runs of word characters, so punctuation is dropped
## and "here's" comes out as 'here', 's' in the displayed result.
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
text = "Hello world, here's an illustration for tokenization with nltk!"
tokenizer.tokenize(text)

['Hello',
 'world',
 'here',
 's',
 'an',
 'illustration',
 'for',
 'tokenization',
 'with',
 'nltk']

In [24]:
## WordPunctTokenizer: keeps the punctuation.
## Each punctuation mark becomes its own token; the apostrophe in "here's" is
## separated from both 'here' and 's' in the displayed result.
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
tokenizer.tokenize("Hello world, here's an illustration for tokenization with nltk!")

['Hello',
 'world',
 ',',
 'here',
 "'",
 's',
 'an',
 'illustration',
 'for',
 'tokenization',
 'with',
 'nltk',
 '!']

### SpaCy: pretrained tokenizing models

In [None]:
import spacy

# Tokenize one sample sentence with the "en_core_web_sm" spaCy pipeline.
text = "Hello world, here's an illustration for tokenization with SpaCy!"
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Collect the text of every token, in document order.
tokens = []
for piece in doc:
    tokens.append(piece.text)
print(tokens)

### Bert Tokenizer
WordPiece tokenizer, which splits words into sub-words based on frequency. Frequent words are kept whole and rare words are split into sub-words, which makes it effective for handling rare words.

In [6]:
from transformers import BertTokenizer

# Load the tokenizer that belongs to the pretrained "bert-base-uncased" checkpoint.
# `tokenizer`, `text` and `tokens` are reused by the following two cells.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
text = "Hello world, here's an illustration for tokenization with BertTokenizer!"
# The printed tokens are lowercased; words that are split up show their later
# pieces with a "##" prefix (e.g. 'token', '##ization').
tokens = tokenizer.tokenize(text)
print(tokens)

['hello', 'world', ',', 'here', "'", 's', 'an', 'illustration', 'for', 'token', '##ization', 'with', 'bert', '##tok', '##eni', '##zer', '!']


In [8]:
## Tokens can be converted to integer ids.
## Uses `tokenizer` and `tokens` from the previous cell; prints one id per token.
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(input_ids)

[7592, 2088, 1010, 2182, 1005, 1055, 2019, 14614, 2005, 19204, 3989, 2007, 14324, 18715, 18595, 6290, 999]


In [10]:
## when used in BERT models, padding and attention masks are required
# NOTE: with truncation=True and max_length=10 this sentence is cut down to 10 ids,
# so the printed attention mask is all 1s and no padding (0) is actually shown.
# A max_length larger than the encoded sentence would be needed to see padding.
# NOTE(review): the ids 101 and 102 at the two ends do not occur in the
# convert_tokens_to_ids output — presumably special tokens added by the call; confirm.
encoded = tokenizer(text, padding="max_length", truncation=True, max_length=10, return_tensors="pt")
print(encoded["input_ids"])  # Token IDs
print(encoded["attention_mask"])  # 1 for real tokens, 0 for padding

tensor([[  101,  7592,  2088,  1010,  2182,  1005,  1055,  2019, 14614,   102]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


### Byte-Pair Encoding (BPE)
Sub-word tokenization algorithm: split the text into characters first, then merge the most frequent bigram (adjacent pair of symbols), and repeat the process until the vocabulary size limit is reached.

In [17]:
from transformers import AutoTokenizer

# Load the tokenizer of the pretrained "gpt2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # GPT-2 uses BPE
text = "Hello world, here's an illustration for tokenization with BPE!"
# In the printed tokens, the 'Ġ' prefix appears on tokens that followed a space
# in the text; unlike the BERT example, case is preserved.
tokens = tokenizer.tokenize(text)
print(tokens)

['Hello', 'Ġworld', ',', 'Ġhere', "'s", 'Ġan', 'Ġillustration', 'Ġfor', 'Ġtoken', 'ization', 'Ġwith', 'ĠB', 'PE', '!']


### Sentence Piece
Sub-word algorithm based on BPE or unigram language models, but it treats the input text as a raw stream of characters and does not rely on spaces as word boundaries, so it works better for non-space-separated languages (Chinese, Japanese, etc.).

In [30]:
import sentencepiece as spm

# Train a tokenizer on a corpus
# Reads the local file "de-en.txt" and trains a model with an 8000-entry vocabulary.
# No model type is passed; the training log below reports model_type: UNIGRAM.
spm.SentencePieceTrainer.train(input="de-en.txt", model_prefix="mymodel", vocab_size=8000)
# Load the trained model file (model_prefix + ".model"); `sp` is reused by the next cells.
sp = spm.SentencePieceProcessor(model_file="mymodel.model")

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: de-en.txt
  input_format: 
  model_prefix: mymodel
  model_type: UNIGRAM
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  different

In [32]:
# Tokenize a sentence
# NOTE: the sample text still says "BPE" (same sentence as the GPT-2 example), but the
# model used here is the SentencePiece model trained above (UNIGRAM per its log).
text = "Hello world, here's an illustration for tokenization with BPE!"
# out_type=str returns the pieces as strings; '▁' appears where the text had a space.
tokens = sp.encode(text, out_type=str)
print(tokens)

['▁Hel', 'lo', '▁wo', 'r', 'l', 'd', ',', '▁her', 'e', "'", 's', '▁an', '▁', 'ill', 'ust', 'r', 'ation', '▁', 'for', '▁', 'to', 'ken', 'iz', 'ation', '▁', 'wi', 'th', '▁B', 'P', 'E', '!']


In [34]:
# Decode into sentence
# Joins the string pieces from the previous cell back together; the printed
# result is identical to the original text.
decoded_text = sp.decode(tokens)
print(decoded_text)

Hello world, here's an illustration for tokenization with BPE!


# Lemmatization
reduce words to base or root form

In [1]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource)

lemmatizer = WordNetLemmatizer()

# Without a POS argument the word is treated as a noun:
# "running" → "running"
lemma_default = lemmatizer.lemmatize("running")
print(lemma_default)

# Tagged as a verb, the same word is reduced: "running" → "run"
lemma_as_verb = lemmatizer.lemmatize("running", pos=wordnet.VERB)
print(lemma_as_verb)

[nltk_data] Downloading package wordnet to /Users/xiao/nltk_data...
[nltk_data] Downloading package omw-1.4 to /Users/xiao/nltk_data...


running
run


In [3]:
import spacy

# English pipeline used to process the sample sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("You're looking at illustrations for tokenization with SpaCy!")

# Build one "text -> lemma" line per token, then print them in order.
lemma_lines = [f"{token.text} -> {token.lemma_}" for token in doc]
for line in lemma_lines:
    print(line)

You -> you
're -> be
looking -> look
at -> at
illustrations -> illustration
for -> for
tokenization -> tokenization
with -> with
SpaCy -> SpaCy
! -> !


# Stemming
Reduces words to their root by removing suffixes. Common stemmers: Porter / Lancaster / Snowball / Lovins / rule-based stemmers.

The Porter stemmer is moderate with regard to aggressiveness and only supports English.

The Lancaster stemmer is the most aggressive and only supports English.

Snowball is a modification of Porter; it is moderately aggressive and supports over 30 languages.

In [18]:
import nltk
# PorterStemmer is imported here but first used in the next cell.
from nltk.stem import PorterStemmer

# Download necessary data from NLTK (if needed)
nltk.download('punkt')

# Sample sentence
sentence = "You're looking at illustrations for stemming with nltk!"
# Tokenize the sentence into words
# `words` is shared by the Porter, Lancaster and Snowball cells that follow.
words = nltk.word_tokenize(sentence)

[nltk_data] Downloading package punkt to /Users/xiao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
# Apply the Porter stemmer to every token in `words` (defined in the previous cell).
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, words))

# Show the input tokens and their stems.
print(f"Original words: {words}")
print(f"Stemmed words: {stemmed_words}")

Original words: ['You', "'re", 'looking', 'at', 'illustrations', 'for', 'stemming', 'with', 'nltk', '!']
Stemmed words: ['you', "'re", 'look', 'at', 'illustr', 'for', 'stem', 'with', 'nltk', '!']


In [22]:
# Lancaster stemmer applied to the same `words` list as the Porter example.
from nltk.stem import LancasterStemmer

lancaster_stemmer = LancasterStemmer()
stemmed_words = list(map(lancaster_stemmer.stem, words))
print(f"Stemmed words: {stemmed_words}")

Stemmed words: ['you', "'re", 'look', 'at', 'illust', 'for', 'stem', 'with', 'nltk', '!']


In [24]:
# Snowball stemmer (English) applied to the same `words` list as the Porter example.
from nltk.stem import SnowballStemmer

snowball_stemmer = SnowballStemmer('english')
stemmed_words = list(map(snowball_stemmer.stem, words))
print(f"Stemmed words: {stemmed_words}")

Stemmed words: ['you', 're', 'look', 'at', 'illustr', 'for', 'stem', 'with', 'nltk', '!']


# Stopword removal
remove common words

In [30]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK data this cell needs (only needed once)
for package in ('punkt', 'stopwords'):
    nltk.download(package)

# Sample text, split into word tokens
text = "This is an example sentence to demonstrate stopword removal."
words = word_tokenize(text)

# English stopwords as a set for fast membership tests
stop_words = set(stopwords.words("english"))

# Keep only tokens whose lowercase form is not a stopword
filtered_words = list(filter(lambda w: w.lower() not in stop_words, words))

print("Filtered Words:", filtered_words)

[nltk_data] Downloading package punkt to /Users/xiao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/xiao/nltk_data...


Filtered Words: ['example', 'sentence', 'demonstrate', 'stopword', 'removal', '.']


[nltk_data]   Unzipping corpora/stopwords.zip.


In [32]:
import spacy

# English pipeline used to process the sample text.
nlp = spacy.load("en_core_web_sm")

text = "This is an example sentence to demonstrate stopword removal."
doc = nlp(text)

# Keep the text of every token that is not flagged as a stopword.
filtered_words = []
for token in doc:
    if token.is_stop:
        continue
    filtered_words.append(token.text)

print("Filtered Words:", filtered_words)

Filtered Words: ['example', 'sentence', 'demonstrate', 'stopword', 'removal', '.']


# Parts of speech tagging
Assigns a part of speech to each word in a sentence based on its definition and context.

In [26]:
import spacy

# English pipeline used to tag the sample sentence.
nlp = spacy.load("en_core_web_sm")

sentence = "You're looking at illustrations for POS tagging with Spacy!"
doc = nlp(sentence)

# Build one "text -> POS" line per token, then print them in order.
pos_lines = [f"{token.text} -> {token.pos_}" for token in doc]
for line in pos_lines:
    print(line)

You -> PRON
're -> AUX
looking -> VERB
at -> ADP
illustrations -> NOUN
for -> ADP
POS -> PROPN
tagging -> VERB
with -> ADP
Spacy -> PROPN
! -> PUNCT


In [28]:
import nltk
from nltk.tokenize import word_tokenize

# Download necessary NLTK data
for package in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(package)

# Tokenize the sample sentence and tag every token
sentence = "You're looking at illustrations for POS tagging with nltk!"
tokens = word_tokenize(sentence)
tags = nltk.pos_tag(tokens)

# `tags` holds (word, tag) pairs; print one pair per line
for pair in tags:
    word, tag = pair
    print(f"{word} -> {tag}")

[nltk_data] Downloading package punkt to /Users/xiao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/xiao/nltk_data...


You -> PRP
're -> VBP
looking -> VBG
at -> IN
illustrations -> NNS
for -> IN
POS -> NNP
tagging -> VBG
with -> IN
nltk -> NN
! -> .


[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
