# Contextual embeddings in Python
Notebook created by A.Pournaki for the NLP course (ENS/Ecole nationale des chartes)
Revu par T. Poibeau (22 oct 2025)


In [1]:
## install new libraries
!pip install umap-learn
!pip install altair
!pip install sentence-transformers

Collecting umap-learn
  Downloading umap_learn-0.5.9.post2-py3-none-any.whl.metadata (25 kB)
Collecting numba>=0.51.2 (from umap-learn)
  Downloading numba-0.62.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.8 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Collecting tqdm (from umap-learn)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting llvmlite<0.46,>=0.45.0dev0 (from numba>=0.51.2->umap-learn)
  Downloading llvmlite-0.45.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (4.8 kB)
Downloading umap_learn-0.5.9.post2-py3-none-any.whl (90 kB)
Downloading numba-0.62.1-cp311-cp311-macosx_11_0_arm64.whl (2.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m1.6 MB/s[0m  [33m0:00:01[0mm0:00:01[0m00:01[0m0m
[?25hDownloading llvmlite-0.45.1-cp311-cp311-macosx_11_0_arm64.whl (37.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.3/37.3 MB[0m [3

In [None]:
## import
import umap
import pandas as pd
import altair as alt
from sentence_transformers import SentenceTransformer

In [None]:
## read the article table from disk, then pull its 'text' column out
## as a plain Python list (the input format used for encoding below)
df = pd.read_csv("./data/articles_chatgpt.csv")
documents = df["text"].tolist()

In [None]:
## load the multilingual sentence-embedding model
## model card (a starting point for browsing other sentence-transformers models):
## https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v1
model = SentenceTransformer("distiluse-base-multilingual-cased-v1")

In [None]:
## encode the list of documents into an array of embedding vectors
## (show_progress_bar=True displays progress while encoding)
document_vectors = model.encode(documents,show_progress_bar=True)

In [6]:
## let's inspect the shape of that array
## (presumably rows = documents and columns = embedding dimensions;
##  confirm with len(documents))
document_vectors.shape

(493, 512)

In [7]:
## instantiate UMAP method
## UMAP is stochastic: without a seed the 2D layout changes on every run.
## random_state fixes the layout so the plot is reproducible; a seeded UMAP
## runs single-threaded, so n_jobs=1 is passed explicitly to avoid the
## "n_jobs value overridden" warning.
umap_embedding = umap.UMAP(n_components=2,
                           metric='euclidean',
                           random_state=42,
                           n_jobs=1)

In [8]:
## reduce dimensionality: fit UMAP on the document vectors and return their
## low-dimensional coordinates (n_components=2 was set when UMAP was instantiated)
document_vectors_2D = umap_embedding.fit_transform(document_vectors)

In [9]:
## store the 2D coordinates in the existing dataframe:
## transposing the (n, 2) array yields two rows, i.e. the X and the Y column
df['X'], df['Y'] = document_vectors_2D.T

In [10]:
## plot
## Altair serialises the whole DataFrame it receives into the chart
## specification, so hand it only the columns the chart actually uses;
## this keeps the full article text out of the rendered notebook.
plot_data = df[['X', 'Y', 'title']]

chart = (
    alt.Chart(plot_data)
    .mark_circle(size=60)
    .encode(
        alt.X('X'),
        alt.Y('Y'),
        tooltip=['title'],   # hovering a point shows the article title
    )
    .properties(width=800, height=600, title='Article embedding')
    .interactive()           # enables pan / zoom
)

## configure_title returns a new chart; as the last expression of the cell
## it is the object that gets displayed
chart.configure_title(fontSize=18, anchor='start', dx=20)

### Exercice

📝 Do the same for the article text instead of titles. What changes?

📝 Do some clustering and display the clusters as colors in the interactive plot. Which method would you choose and why?


In [17]:
import umap
import numpy as np
import pandas as pd
import altair as alt

# hdbscan is an optional dependency: record whether it could be imported so
# the clustering step can fall back to k-means when it is missing.
# Any Exception (not only ImportError) counts as "unavailable".
try:
    import hdbscan
except Exception:
    HDBSCAN_AVAILABLE = False
else:
    HDBSCAN_AVAILABLE = True

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import umap

# ----- 2) Scale + UMAP
# Standardise every embedding dimension before computing the UMAP layout.
X_scaled = StandardScaler().fit_transform(document_vectors)

# UMAP is stochastic and the clusters below are computed on its output, so a
# fixed seed is needed for reproducible clusters. A seeded UMAP runs
# single-threaded; n_jobs=1 is explicit to avoid the "n_jobs overridden" warning.
UMAP_SEED = 42
reducer = umap.UMAP(
    n_components=2,
    n_neighbors=15,       # raise for broader structure, lower for local detail
    min_dist=0.05,        # smaller -> tighter clusters in 2D
    metric='euclidean',
    random_state=UMAP_SEED,
    n_jobs=1,
)
emb = reducer.fit_transform(X_scaled)   # shape (n, 2)

# Re-centre the coordinates on 0 and rescale each axis to [-1, 1] (display only;
# clustering below uses the raw embedding). A degenerate axis (all values equal)
# is left at 0 instead of triggering a division by zero.
emb_centered = emb - emb.mean(axis=0)
axis_span = np.abs(emb_centered).max(axis=0)
axis_span[axis_span == 0] = 1.0
emb_scaled = emb_centered / axis_span


# ----- 3) Cluster on the UMAP embedding
if HDBSCAN_AVAILABLE:
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=25,   # tune: ↑ for bigger, fewer clusters; ↓ for smaller, more clusters
        min_samples=None,      # None -> equals min_cluster_size by default
        metric='euclidean',
        cluster_selection_epsilon=0.0
    ).fit(emb)
    labels = clusterer.labels_            # -1 = noise
    probs  = clusterer.probabilities_     # cluster membership strength [0..1]
    label_names = np.where(labels==-1, "noise", labels.astype(str))
else:
    # Fallback: k-means
    k = 4
    km = KMeans(n_clusters=k, n_init="auto", random_state=42)
    labels = km.fit_predict(emb)
    probs  = np.ones_like(labels, dtype=float)  # no probability in k-means
    label_names = labels.astype(str)

# ----- 4) DataFrame for plotting
# Add the new columns to the article table instead of replacing it: the
# original columns (title, text, ...) stay available, earlier cells can still
# be re-run, and this cell gives the same result when executed twice.
df = df.assign(
    x=emb_scaled[:, 0],
    y=emb_scaled[:, 1],
    cluster=label_names,
    strength=probs,
)
# Only the columns used by the chart are handed to Altair, which keeps the
# full article text out of the chart specification.
plot_df = df[["title", "x", "y", "cluster", "strength"]]

# Optional: make noise points semi-transparent in HDBSCAN
opacity = alt.condition(
    alt.datum.cluster == "noise",
    alt.value(0.25),
    alt.value(0.9)
) if HDBSCAN_AVAILABLE else alt.value(0.9)

# ----- 5) Altair interactive scatter
chart = alt.Chart(plot_df).mark_circle(size=60).encode(
    x=alt.X("x", title="UMAP-1"),
    y=alt.Y("y", title="UMAP-2"),
    color=alt.Color("cluster:N", title="Cluster"),
    opacity=opacity,
    # the title lets the reader see which article a point is
    tooltip=["title", "cluster", alt.Tooltip("strength:Q", format=".2f"), "x", "y"]
).properties(
    width=700, height=480, title="UMAP + clustering"
).interactive()

chart