<a href="https://colab.research.google.com/github/tlkahn/tlkahn.github.io/blob/main/Demonstrates_fastText_composing_an_OOV_word_vector_from_subword_n_grams.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-3.0.1-py3-none-any.whl.metadata (10.0 kB)
Using cached pybind11-3.0.1-py3-none-any.whl (293 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp312-cp312-linux_x86_64.whl size=4498213 sha256=6570009c83f97f4c441d1cc86f13717f0beacb02475eea5567d8af49c8cb232b
  Stored in directory: /root/.cache/pip/wheels/20/27/95/a7baf1b435f1cbde017cabd

In [2]:
!pip install fasttext-wheel

Collecting fasttext-wheel
  Downloading fasttext_wheel-0.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Downloading fasttext_wheel-0.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fasttext-wheel
Successfully installed fasttext-wheel-0.9.2


In [18]:
import os, tempfile, sys, subprocess
import fasttext
import numpy as np

In [9]:
# Toy training corpus: three short lines, each terminated by a newline.
corpus = "\n".join(
    [
        "buddha dharma sangha lotus sutra mantra tantra vinaya abhidharma",
        "the path to awakening includes compassion wisdom and meditation",
        "we learn representations for words and subwords with fasttext",
        "",  # trailing empty item yields the final newline
    ]
)

# The training call further down takes a file path, so persist the corpus
# to a temp file. delete=False keeps the file on disk after the handle closes.
with tempfile.NamedTemporaryFile(
    mode="w", delete=False, encoding="utf-8"
) as corpus_file:
    corpus_file.write(corpus)
    corpus_path = corpus_file.name


In [11]:
# Hyperparameters for a small skip-gram model with subword information.
train_options = {
    "model": "skipgram",
    "dim": 50,       # embedding size; the word vector shape shown later is (50,)
    "epoch": 20,
    "minn": 3,       # shortest character n-gram
    "maxn": 6,       # longest character n-gram
    "lr": 0.05,
    "thread": 1,
    "minCount": 1,   # keep words seen only once; most words in the toy corpus are singletons
}

# Fit the model on the temp-file corpus written above.
model = fasttext.train_unsupervised(corpus_path, **train_options)



In [12]:
# Probe word that does not occur anywhere in the toy corpus
word = "bodhisattvahood"

# Look the word up in the trained vocabulary and report the result
known_words = model.get_words()
in_vocab = word in known_words
print(f"OOV: {not in_vocab}")


OOV: True


In [14]:
# Vector that fastText itself produces for the OOV word; the cells below
# rebuild it by hand from subword n-gram vectors and compare the two.
v_fasttext = model.get_word_vector(word)

In [16]:
# Sanity check: the vector length should equal the training dimensionality (dim=50)
v_fasttext.shape

(50,)

In [19]:
# Decompose the word into its character n-grams and their input-matrix IDs
subwords, ids = model.get_subwords(word)

# Fetch the input embedding for every n-gram ID
vectors = list(map(model.get_input_vector, ids))

# Two candidate compositions: plain sum and arithmetic mean of the n-gram vectors
v_sum = np.asarray(vectors).sum(axis=0)
n_parts = len(vectors) or 1  # fall back to 1 so an empty n-gram list cannot divide by zero
v_avg = v_sum / n_parts


In [23]:
# Peek at the first ten character n-grams; the leading "<" is the word-start boundary marker
subwords[:10]

['<bo',
 '<bod',
 '<bodh',
 '<bodhi',
 'bod',
 'bodh',
 'bodhi',
 'bodhis',
 'odh',
 'odhi']

In [20]:
def dist(a, b) -> float:
    """Return the norm of the difference between two vectors.

    For 1-D inputs this is the Euclidean (L2) distance.

    Parameters
    ----------
    a, b : array-like
        Vectors of the same (or broadcast-compatible) shape. Plain lists
        and tuples are accepted as well as numpy arrays.

    Returns
    -------
    float
        ``np.linalg.norm(a - b)`` as a built-in Python float.
    """
    # Coerce first so the subtraction also works for list/tuple inputs;
    # ndarray inputs pass through unchanged (no copy, dtype preserved).
    diff = np.asarray(a) - np.asarray(b)
    return float(np.linalg.norm(diff))


In [21]:
# Compare fastText's own OOV vector with both hand-built compositions.
# A distance of ~0 for one of them identifies which composition fastText uses.
print(
    f"num_subwords: {len(ids)}",
    f"||fastText - sum||: {dist(v_fasttext, v_sum):.6f}",
    f"||fastText - avg||: {dist(v_fasttext, v_avg):.6f}",
    sep="\n",
)


num_subwords: 54
||fastText - sum||: 0.236811
||fastText - avg||: 0.000000
