# Importing and setup

In [None]:
# Import necessary libraries
import json
from typing import List, Tuple, Dict, Set
import numpy as np
import spacy
from gensim.models import KeyedVectors
import hdbscan
from collections import defaultdict
import nltk
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Download the NLTK "wordnet" resource.
# NOTE(review): none of the cells visible here read WordNet — confirm this
# download is still needed.
nltk.download("wordnet")

# Load the spaCy English pipeline a single time at module level;
# preprocess_words() reads this global instead of reloading the model.
nlp = spacy.load("en_core_web_sm")  # Load once globally

## Importing from JSON

In [None]:
def load_words_from_json(filepath: str) -> List[str]:
    """Return the top-level keys of the JSON object stored at ``filepath``.

    The file is opened as UTF-8 text and parsed with ``json.load``. Only the
    keys of the parsed object are returned; the values are ignored.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        entries = json.load(handle)
    return [*entries.keys()]

## Preprocessing

In [None]:
def preprocess_words(words: List[str]) -> List[str]:
    """Clean and lemmatize words using spaCy.

    Each entry is stripped and lower-cased, then run through the module-level
    ``nlp`` pipeline. Only the first token of each entry is inspected; its
    lemma is kept when the token is alphabetic and not a stop word.

    Args:
        words: Raw word strings.

    Returns:
        The unique lemmas, sorted so the result is reproducible across runs
        (plain set order varies with string hash randomization).
    """
    normalized = (word.strip().lower() for word in words)
    lemmas: Set[str] = set()
    # nlp.pipe processes the texts as a stream instead of one call per word.
    for doc in nlp.pipe(normalized):
        if len(doc) == 0:
            # Empty or whitespace-only entries yield a Doc with no tokens;
            # indexing doc[0] on it would raise IndexError.
            continue
        token = doc[0]
        if token.is_alpha and not token.is_stop:
            lemmas.add(token.lemma_)
    return sorted(lemmas)

## Embed words with Word2Vec (Gensim)

In [None]:
def load_word2vec_model(model_name: str = "word2vec-google-news-300") -> KeyedVectors:
    """Fetch pretrained embeddings by name through Gensim's downloader.

    ``model_name`` is handed to ``gensim.downloader.load`` and whatever that
    call returns is passed straight back to the caller.
    """
    # Imported inside the function so gensim.downloader is only loaded when a
    # model is actually requested.
    from gensim import downloader

    pretrained = downloader.load(model_name)
    return pretrained

In [None]:
def reduce_dimensionality(vectors: np.ndarray, n_components: int = 50) -> np.ndarray:
    """
    Reduce the dimensionality of word vectors using PCA.

    PCA cannot produce more components than ``min(n_samples, n_features)``,
    so the requested ``n_components`` is capped at that value instead of
    letting scikit-learn raise on small inputs.

    Args:
        vectors (np.ndarray): Original word vectors, one row per word.
        n_components (int): Maximum number of dimensions to reduce to.

    Returns:
        np.ndarray: Reduced vectors with
        ``min(n_components, n_samples, n_features)`` columns.

    Raises:
        ValueError: If ``vectors`` is not a non-empty 2-D array.
    """
    vectors = np.asarray(vectors)
    if vectors.ndim != 2 or 0 in vectors.shape:
        raise ValueError(
            "reduce_dimensionality expects a non-empty 2-D array, "
            f"got shape {vectors.shape}"
        )

    # Cap the request at what PCA can actually compute for this input.
    effective_components = min(n_components, min(vectors.shape))

    # Fixed random_state keeps the projection reproducible.
    pca = PCA(n_components=effective_components, random_state=42)
    return pca.fit_transform(vectors)

In [None]:
def filter_words_by_similarity(model: KeyedVectors, seed_words: List[str], top_n: int = 60) -> List[str]:
    """Collect the words most similar to the seed words.

    The ``top_n`` budget is split evenly across the seeds that are actually
    present in ``model``, so a missing seed no longer shrinks the result.
    Every known seed gets at least one neighbour even when ``top_n`` is
    smaller than the number of seeds.

    Args:
        model: Embedding model supporting ``in`` and ``most_similar``.
        seed_words: Words whose neighbours should be collected.
        top_n: Upper bound on the number of words returned.

    Returns:
        Up to ``top_n`` unique words in first-seen order (seed order, then
        similarity rank), which keeps the output reproducible across runs.
        Empty when no seed is in the model or ``top_n`` is not positive.
    """
    known_seeds = [seed for seed in seed_words if seed in model]
    if not known_seeds or top_n <= 0:
        return []

    # Integer division can hit 0 when top_n < number of seeds; keep at least 1.
    per_seed = max(1, top_n // len(known_seeds))

    # A dict is used as an insertion-ordered set to drop duplicates.
    ordered = {}
    for seed in known_seeds:
        for word, _score in model.most_similar(seed, topn=per_seed):
            ordered.setdefault(word, None)

    # The max(1, ...) floor above may overshoot the budget; trim to top_n.
    return list(ordered)[:top_n]

In [None]:
def get_vectors(words: List[str], model: KeyedVectors) -> Tuple[List[str], np.ndarray]:
    """Look up embeddings for the words that ``model`` contains.

    Words absent from ``model`` are skipped. Returns the retained words, in
    their input order, together with an array built from their vectors in
    that same order.
    """
    present: List[str] = []
    rows = []
    for candidate in words:
        if candidate in model:
            present.append(candidate)
            rows.append(model[candidate])
    return present, np.array(rows)

## Cluster with HDBSCAN

In [None]:
def cluster_words(vectors: np.ndarray, min_cluster_size: int = 30) -> np.ndarray:
    """Fit HDBSCAN on ``vectors`` and return its per-row cluster assignments.

    ``min_cluster_size`` is forwarded to the ``hdbscan.HDBSCAN`` constructor.
    Other code in this file treats the label ``-1`` as an outlier.
    """
    return hdbscan.HDBSCAN(min_cluster_size=min_cluster_size).fit_predict(vectors)

## Auto-label clusters using the centroid-nearest word

In [None]:
def label_clusters_centroid(words: List[str], vectors: np.ndarray, labels: np.ndarray, model: KeyedVectors) -> Dict[int, str]:
    """
    Assign labels to clusters using the centroid-nearest word approach.

    For every non-outlier cluster the mean of its member vectors is computed
    and the model word closest to that centroid becomes the cluster's label.

    The centroid is used to query ``model``, so it must live in the model's
    own embedding space. When ``vectors`` has a different width than
    ``model.vector_size`` (for example PCA-reduced vectors), the original
    embeddings are looked up again from ``model`` by word.

    Args:
        words (List[str]): Words, aligned row-for-row with ``vectors``.
        vectors (np.ndarray): Word vectors, one row per word.
        labels (np.ndarray): Cluster label per word; ``-1`` marks outliers.
        model (KeyedVectors): Pre-trained word embedding model.

    Returns:
        Dict[int, str]: Mapping of cluster IDs to their labels. Outliers get
        no entry.

    Raises:
        KeyError: If the embeddings must be rebuilt from ``model`` and one of
            ``words`` is not in it.
    """
    vectors = np.asarray(vectors)
    if vectors.ndim != 2 or vectors.shape[1] != model.vector_size:
        # Vectors are not in the model's space; a centroid built from them
        # could not be compared against the model's vocabulary.
        vectors = np.array([model[word] for word in words])

    # Group member vectors by cluster, skipping outliers.
    members = defaultdict(list)
    for vector, label in zip(vectors, labels):
        if label != -1:
            # Plain int keys match the declared Dict[int, str] return type.
            members[int(label)].append(vector)

    # Label each cluster with the model word nearest to its centroid.
    cluster_labels: Dict[int, str] = {}
    for label, cluster_vectors in members.items():
        centroid = np.mean(cluster_vectors, axis=0)
        cluster_labels[label] = model.similar_by_vector(centroid, topn=1)[0][0]

    return cluster_labels

## Visualization

In [None]:
def visualize_clusters(words: List[str], vectors: np.ndarray, labels: np.ndarray) -> None:
    """Visualize word clusters using t-SNE.

    Projects ``vectors`` to two dimensions and draws one scatter series per
    cluster label, with label ``-1`` shown as "Outliers".

    Args:
        words: Words aligned with ``vectors``; accepted for interface
            compatibility but not drawn.
        vectors: Word vectors, one row per word.
        labels: Cluster label per row of ``vectors``.

    Raises:
        ValueError: If fewer than two vectors are supplied.
    """
    vectors = np.asarray(vectors)
    labels = np.asarray(labels)

    n_samples = vectors.shape[0]
    if n_samples < 2:
        raise ValueError("visualize_clusters needs at least two vectors for t-SNE")

    # t-SNE requires perplexity < n_samples; keep the usual 30 when the data
    # allows it and shrink it for small word sets.
    perplexity = min(30.0, float(n_samples - 1))
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    reduced = tsne.fit_transform(vectors)

    plt.figure(figsize=(12, 8))
    # Sorted labels give a stable legend order (outliers first).
    for label in sorted(set(labels.tolist())):
        mask = labels == label
        label_name = f"Cluster {label}" if label != -1 else "Outliers"
        plt.scatter(reduced[mask, 0], reduced[mask, 1], alpha=0.6, label=label_name)
    plt.legend()
    plt.title("t-SNE Visualization of Word Clusters")
    plt.show()

# Main workflow

In [None]:
# Load the vocabulary: the keys of the JSON object in words_dictionary.json.
words = load_words_from_json("words_dictionary.json")

# Clean and lemmatize the vocabulary with spaCy.
# NOTE(review): cleaned_words is not read by any later cell visible here —
# the clustering below runs on neighbours of the seed words; confirm intent.
cleaned_words = preprocess_words(words)

In [None]:
# Load the pretrained embeddings (default model name:
# "word2vec-google-news-300").
model = load_word2vec_model()

# Seed words that define the topic of interest.
seed_words = ["language", "learning", "education"]

# Collect the model's nearest neighbours of the seed words (top_n=60 overall).
filtered_words = filter_words_by_similarity(model, seed_words, top_n=60)
print("Filtered Words:", filtered_words)

In [None]:
# Look up embeddings for the filtered words; valid_words and vectors are
# aligned row-for-row.
valid_words, vectors = get_vectors(filtered_words, model)

# Project the embeddings down to 50 PCA components before clustering.
reduced_vectors = reduce_dimensionality(vectors, n_components=50)
print(f"Reduced Vectors Shape: {reduced_vectors.shape}")

# Cluster in the reduced space; labels has one entry per row of
# reduced_vectors, and -1 is treated as an outlier by the cells below.
labels = cluster_words(reduced_vectors, min_cluster_size=20)

In [None]:
# Label clusters using the centroid-nearest word.
# The centroid is used to query the embedding model, so it has to be built
# from the original model-space vectors, not the PCA-reduced ones that were
# used for clustering. Both arrays share the same row order, so `labels`
# still lines up.
cluster_labels = label_clusters_centroid(valid_words, vectors, labels, model)

# Print cluster labels
for cid, label in cluster_labels.items():
    print(f"Cluster {cid}: {label}")

In [None]:
# Visualize clusters: t-SNE is run on the original (unreduced) vectors, and
# points are coloured by the labels computed on the PCA-reduced vectors.
visualize_clusters(valid_words, vectors, labels)