<a href="https://colab.research.google.com/github/susuhlaingmyk26-tech/Colab-project/blob/main/UNK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ==========================================
# OOV (Out-Of-Vocabulary) Handling Example
# ==========================================

# 1️⃣ Training sentences
# Tiny toy corpus; every word that appears here is a vocabulary candidate.
train_sentences = ["I love cats", "I love dogs", "cats are cute"]

# 2️⃣ Tokenization function
def tokenize(sentence):
    """Lowercase *sentence* and split it on runs of whitespace.

    Returns the resulting list of tokens. An empty or all-whitespace
    sentence produces an empty list.
    """
    lowered = sentence.lower()
    return lowered.split()

# 3️⃣ Build vocabulary
from collections import Counter

def build_vocab(sentences, vocab_size=5, *, unk_token="<UNK>"):
    """Build a frequency-capped vocabulary from training sentences.

    Args:
        sentences: Iterable of raw sentence strings; each one is split
            with ``tokenize``.
        vocab_size: Maximum vocabulary size *including* the unknown
            token, so at most ``vocab_size - 1`` real words are kept.
            Values below 1 yield a vocabulary that contains only the
            unknown token.
        unk_token: Placeholder added to the vocabulary to stand in for
            unseen words. Defaults to ``"<UNK>"``.

    Returns:
        A ``(vocab, word_counts)`` tuple: ``vocab`` is the set of kept
        words plus ``unk_token``; ``word_counts`` is a ``Counter`` of
        every token seen in ``sentences``.
    """
    word_counts = Counter()
    for sent in sentences:
        word_counts.update(tokenize(sent))

    # One slot is reserved for the unknown token. Clamp at zero so a tiny
    # vocab_size never turns into a negative request to most_common().
    keep = max(vocab_size - 1, 0)

    vocab = {word for word, _ in word_counts.most_common(keep)}
    vocab.add(unk_token)

    return vocab, word_counts

# 4️⃣ Replace OOV words with <UNK>
def replace_oov(sentence, vocab, *, unk_token="<UNK>"):
    """Tokenize *sentence* and replace out-of-vocabulary words.

    Args:
        sentence: Raw sentence string; it is split with ``tokenize``.
        vocab: Container of known words, queried with ``in``.
        unk_token: Replacement emitted for every token not found in
            ``vocab``. Defaults to ``"<UNK>"``; pass the same value that
            was used when the vocabulary was built.

    Returns:
        The sentence's tokens in their original order, with each unknown
        token replaced by ``unk_token``.
    """
    return [
        word if word in vocab else unk_token
        for word in tokenize(sentence)
    ]

# 5️⃣ Build vocab from training data
vocab, word_counts = build_vocab(train_sentences, vocab_size=5)
print("Vocabulary:", vocab)

# 6️⃣ Normalize training data: map every training sentence through the
# vocabulary so rare words collapse to the unknown token.
normalized_train = [replace_oov(sent, vocab) for sent in train_sentences]

print("\nNormalized Training Data:")
for s in normalized_train:
    print(s)

# 7️⃣ Test sentence: "elephants" never occurs in the training data, so it
# is out of vocabulary.
test_sentence = "I love elephants"
normalized_test = replace_oov(test_sentence, vocab)

print("\nOriginal:", test_sentence)
print("After <UNK> handling:", normalized_test)


Vocabulary: {'cats', 'love', '<UNK>', 'i', 'dogs'}

Normalized Training Data:
['i', 'love', 'cats']
['i', 'love', 'dogs']
['cats', '<UNK>', '<UNK>']

Original: I love elephants
After <UNK> handling: ['i', 'love', '<UNK>']
