## Importing from JSON

In [38]:
import json
from typing import List

def load_words_from_json(filepath: str) -> List[str]:
    """Load words from a JSON file where keys are words

    Args:
        filepath (str): Path to the JSON file.

    Returns:
        List[str]: List of words.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return list(data.keys())


In [39]:
words = load_words_from_json("words_dictionary.json")[:20]
words

['a',
 'aa',
 'aaa',
 'aah',
 'aahed',
 'aahing',
 'aahs',
 'aal',
 'aalii',
 'aaliis',
 'aals',
 'aam',
 'aani',
 'aardvark',
 'aardvarks',
 'aardwolf',
 'aardwolves',
 'aargh',
 'aaron',
 'aaronic']

## Preprocessing

In [40]:
from typing import Set

import spacy

nlp = spacy.load("en_core_web_sm") # load once globally

def preprocess_words(words: List[str]) -> List[str]:
    """Clean and lemmatize words using spaCy. 
    Removes stopwords and non-alphabetic tokens.

    Args:
        words (List[str]): Raw word list.

    Returns:
        List[str]: Cleaned and lemmatized word list.
    """
    cleaned: Set[str] = set()
    for word in words:
        doc = nlp(word.strip().lower())
        token = doc[0]
        if token.is_alpha and not token.is_stop:
            lemma = token.lemma_
            cleaned.add(lemma)
    return list(cleaned)

In [41]:
cleaned_words = preprocess_words(words)
cleaned_words

['aa',
 'aaa',
 'aahed',
 'aah',
 'aal',
 'aalii',
 'aam',
 'aaron',
 'aahs',
 'aani',
 'aardwolf',
 'aardvark',
 'aargh',
 'aaliis',
 'aaronic']

## Embed words with FastText (Gensim)

In [42]:
import gensim.downloader as api
from gensim.models import KeyedVectors

def load_word2vec_model(model_name: str = "word2vec-google-news-300") -> KeyedVectors:
    """
    Load a pretrained Word2Vec model using Gensim.

    Args:
        model_name (str): Gensim model name.

    Returns:
        KeyedVectors: Loaded Word2Vec model.
    """
    return api.load(model_name)


In [43]:
import numpy as np
from typing import Tuple

def get_vectors(words: List[str], model: KeyedVectors) -> Tuple[List[str], np.ndarray]:
    """
    Get vectors for words using a pretrained model.

    Args:
        words (List[str]): Words to embed.
        model (KeyedVectors): Loaded embedding model.

    Returns:
        Tuple[List[str], np.ndarray]: (Words with vectors, Corresponding vectors)
    """
    valid_words = [w for w in words if w in model]
    vectors = np.array([model[w] for w in valid_words])
    return valid_words, vectors


## Cluster with HDBSCAN

In [44]:
import hdbscan

def cluster_words(vectors: np.ndarray, min_cluster_size: int = 30) -> np.ndarray:
    """
    Cluster word vectors using HDBSCAN.

    Args:
        vectors (np.ndarray): Word vectors.
        min_cluster_size (int): Minimum size of clusters.

    Returns:
        np.ndarray: Cluster labels (-1 = outlier).
    """
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
    return clusterer.fit_predict(vectors)


## Auto-label clusters with WordNet

In [45]:
from collections import defaultdict, Counter
from nltk.corpus import wordnet as wn
from typing import Dict

import nltk
nltk.download("wordnet")

def get_lexname(word: str) -> str:
    """
    Get WordNet semantic field for a word.

    Args:
        word (str): Input word.

    Returns:
        str: WordNet lexname or 'unknown'.
    """
    synsets = wn.synsets(word)
    return synsets[0].lexname() if synsets else "unknown"

def label_clusters(words: List[str], labels: np.ndarray) -> Tuple[Dict[int, str], Dict[int, List[str]]]:
    """
    Assign semantic labels to clusters.

    Args:
        words (List[str]): Clustered words.
        labels (np.ndarray): Cluster labels.

    Returns:
        Tuple[Dict[int, str], Dict[int, List[str]]]: Cluster label names, cluster contents.
    """
    cluster_dict = defaultdict(list)
    for word, label in zip(words, labels):
        if label != -1:
            cluster_dict[label].append(word)

    cluster_labels = {}
    for label, cluster_words in cluster_dict.items():
        lexnames = [get_lexname(w) for w in cluster_words]
        top_label = Counter(lexnames).most_common(1)[0][0]
        cluster_labels[label] = top_label
    return cluster_labels, cluster_dict


[nltk_data] Downloading package wordnet to /home/thtkha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Visualization

In [46]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

def visualize_clusters(words: List[str], vectors: np.ndarray, labels: np.ndarray) -> None:
    """
    Visualize word clusters using t-SNE.

    Args:
        words (List[str]): Words.
        vectors (np.ndarray): Word vectors.
        labels (np.ndarray): Cluster labels.
    """
    tsne = TSNE(n_components=2, random_state=42)
    reduced = tsne.fit_transform(vectors)

    plt.figure(figsize=(12, 8))
    for label in set(labels):
        idxs = [i for i, l in enumerate(labels) if l == label]
        x = [reduced[i][0] for i in idxs]
        y = [reduced[i][1] for i in idxs]
        label_name = f"Cluster {label}" if label != -1 else "Outliers"
        plt.scatter(x, y, alpha=0.6, label=label_name)
    plt.legend()
    plt.title("t-SNE Visualization of Word Clusters")
    plt.show()


In [47]:
words = load_words_from_json("words_dictionary.json")

cleaned_words = preprocess_words(words)

model = load_word2vec_model()

valid_words, vectors = get_vectors(cleaned_words, model)

labels = cluster_words(vectors, min_cluster_size=30)

cluster_labels, cluster_dict = label_clusters(valid_words, labels)

for cid, label in cluster_labels.items():
    print(f"\n[Cluster {cid}: {label}]")
    print(", ".join(cluster_dict[cid][:10]))

visualize_clusters(valid_words, vectors, labels)


KeyboardInterrupt: 