In [None]:
from pathlib import Path
import numpy as np
import pandas as pd

PROJECT_ROOT = Path.cwd().parent
EMB_DIR = PROJECT_ROOT / "models" / "embeddings"

X = np.load(EMB_DIR / "job_embeddings.npy")
meta = pd.read_parquet(EMB_DIR / "job_embeddings_meta.parquet")

X.shape, meta.shape

In [None]:
import numpy as np

# take a random subset for speed
rng = np.random.default_rng(42)
idx = rng.choice(len(X), size=20000, replace=False)
X_sub = X[idx]

In [None]:
from sklearn.cluster import KMeans

Ks = list(range(20, 51, 5))
inertias = []

for k in Ks:
    km = KMeans(n_clusters=k, n_init=5, random_state=42)
    km.fit(X_sub)
    inertias.append(km.inertia_)

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(7,4))
plt.plot(Ks, inertias, marker="o")
plt.title("KMeans Elbow (Inertia vs K)")
plt.xlabel("K")
plt.ylabel("Inertia")
plt.tight_layout()
plt.show()

Based on the elbow method computed on a random subset of embeddings, K=30 provides a good trade-off between cluster granularity and compactness.

In [None]:
from sklearn.cluster import KMeans

K = 30  # adjust only if your elbow clearly suggests another value

kmeans = KMeans(
    n_clusters=K,
    n_init=5,
    random_state=42
)

labels = kmeans.fit_predict(X)

In [None]:
meta = meta.copy()
meta["cluster"] = labels

meta["cluster"].value_counts().head(10)

In [None]:
meta["cluster"].value_counts().describe()

In [None]:
top_titles_per_cluster = (
    meta.groupby("cluster")["title"]
    .apply(lambda s: s.value_counts().head(5))
)

top_titles_per_cluster.head()

In [None]:
cluster_labels = (
    top_titles_per_cluster
    .groupby(level=0)
    .apply(lambda s: " | ".join(s.index.get_level_values(1)[:2]))
)

cluster_labels.head()

In [None]:
meta["cluster_label"] = meta["cluster"].map(cluster_labels)

meta[["cluster", "cluster_label"]].drop_duplicates().sort_values("cluster").head(10)

In [None]:
OUT_DIR = PROJECT_ROOT / "models" / "clusters"
OUT_DIR.mkdir(parents=True, exist_ok=True)

meta.to_parquet(OUT_DIR / "job_postings_with_clusters.parquet", index=False)

In [None]:
if "period" in meta.columns:
    cluster_share = (
        meta.groupby(["period", "cluster"])
        .size()
        .groupby(level=0)
        .apply(lambda x: x / x.sum())
        .rename("share")
        .reset_index()
    )

    cluster_share.head()

In [None]:
if "period" in meta.columns:
    pivot = cluster_share.pivot(
        index="cluster", columns="period", values="share"
    ).fillna(0)

    pivot["delta"] = pivot.get("late_period", 0) - pivot.get("early_period", 0)
    pivot.sort_values("delta", ascending=False).head(10)

In [None]:
cluster_sizes = meta["cluster"].value_counts().sort_index()

top_titles = (
    meta.groupby("cluster")["title"]
    .apply(lambda s: s.value_counts().head(8).index.tolist())
)

summary = pd.DataFrame({
    "size": cluster_sizes,
    "top_titles": top_titles
}).sort_values("size", ascending=False)

summary.head(10)

In [None]:
summary.head(30)

In [None]:
import pandas as pd

cluster_summary = (
    meta
    .groupby("cluster")
    .agg(
        size=("cluster", "size"),
        top_titles=("title", lambda s: s.value_counts().head(5).index.tolist())
    )
    .sort_values("size", ascending=False)
)

cluster_summary.head(10)

In [None]:
cluster_summary["cluster_name"] = cluster_summary["top_titles"].apply(
    lambda lst: " | ".join(lst[:2])
)

cluster_summary[["size", "cluster_name"]].head(10)

In [None]:
if "period" in meta.columns:
    period_share = (
        meta.groupby(["cluster", "period"])
        .size()
        .groupby(level=0)
        .apply(lambda x: x / x.sum())
        .rename("share")
        .reset_index()
    )

    period_pivot = period_share.pivot(
        index="cluster", columns="period", values="share"
    ).fillna(0)

    cluster_summary = cluster_summary.join(period_pivot)

cluster_summary.head(10)

In [None]:
c = 1  # change this
cols = ["title", "location", "date"]
if "period" in meta.columns:
    cols.append("period")

meta.loc[meta["cluster"] == c, cols].head(15)

In [None]:
c = 25  # change this
cols = ["title", "location", "date"]
if "period" in meta.columns:
    cols.append("period")

meta.loc[meta["cluster"] == c, cols].head(15)

In [47]:
# Manual, human-readable cluster labels
cluster_label_map = {
    # meaningful cluster
    25: "Engineering Roles (Electrical & Mechanical)",

    # unclear but accepted cluster
    1: "Clinical Support & Medical Assistant Roles"
}

In [48]:
meta = meta.copy()

meta["cluster_label_manual"] = meta["cluster"].map(cluster_label_map)

# Fallback to automatic label if no manual label exists
meta["cluster_label_final"] = meta["cluster_label_manual"].fillna(meta["cluster_label"])

In [49]:
meta[meta["cluster_label_manual"].notna()][
    ["cluster", "cluster_label", "cluster_label_final"]
].drop_duplicates()

Unnamed: 0,cluster,cluster_label,cluster_label_final
7,25,Electrical Engineer | Mechanical Engineer,Engineering Roles (Electrical & Mechanical)
8,1,Patient Care Technician | Medical Assistant,Clinical Support & Medical Assistant Roles


In [50]:
OUT_DIR = PROJECT_ROOT / "models" / "clusters"
OUT_DIR.mkdir(parents=True, exist_ok=True)

meta.to_parquet(
    OUT_DIR / "job_postings_with_clusters.parquet",
    index=False
)

print("Saved clustered data with manual labels.")

Saved clustered data with manual labels.


Some clusters exhibit strong semantic coherence and map cleanly to well-defined job families (e.g. Engineering Roles – Electrical & Mechanical).
Other clusters aggregate adjacent roles with overlapping terminology and responsibilities, such as Patient Care Technician and Medical Assistant.
Rather than forcing artificial separation, these clusters are retained and explicitly labeled to reflect real-world job market ambiguity.
This approach prioritizes interpretability and realism over overly granular partitioning.

In [66]:
meta.loc[
    meta["cluster"].isin(cluster_label_map.keys()),
    ["cluster", "cluster_label", "cluster_label_final"]
].drop_duplicates()

Unnamed: 0,cluster,cluster_label,cluster_label_final
7,25,Electrical Engineer | Mechanical Engineer,Engineering Roles (Electrical & Mechanical)
8,1,Patient Care Technician | Medical Assistant,Clinical Support & Medical Assistant Roles


In [67]:
cluster_stats = (
    meta
    .groupby("cluster_label_final")
    .agg(
        size=("cluster", "size")
    )
    .sort_values("size", ascending=False)
)

cluster_stats.head(10)

Unnamed: 0_level_0,size
cluster_label_final,Unnamed: 1_level_1
Clinical Support & Medical Assistant Roles,7811
Maintenance Technician | Service Technician,6189
Software Engineer | Senior Software Engineer,6167
Receptionist | Package Handler - Part Time (Warehouse like),5913
Sales Specialist | Outside Sales Representative,5743
Product Manager | Account Executive,5650
Engineering Roles (Electrical & Mechanical),5627
Registered Nurse | Certified Nursing Assistant (CNA),5264
Material Handler | Warehouse Associate,5130
Administrative Assistant | Executive Assistant,4519


In [68]:
if "period" in meta.columns:
    cluster_period = (
        meta
        .groupby(["cluster_label_final", "period"])
        .size()
        .groupby(level=0)
        .apply(lambda x: x / x.sum())
        .rename("share")
        .reset_index()
    )

    cluster_period.head()

In [69]:
if "period" in meta.columns:
    cluster_period = (
        meta
        .groupby(["cluster_label_final", "period"])
        .size()
        .groupby(level=0)
        .apply(lambda x: x / x.sum())
        .rename("share")
        .reset_index()
    )

    pivot = cluster_period.pivot(
        index="cluster_label_final",
        columns="period",
        values="share"
    ).fillna(0)

    pivot["delta_share"] = (
        pivot.get("late_period", 0) - pivot.get("early_period", 0)
    )

    pivot = pivot.sort_values("delta_share", ascending=False)

In [71]:
meta.columns.tolist()

['title',
 'location',
 'date',
 'cluster',
 'cluster_label',
 'cluster_label_manual',
 'cluster_label_final']

In [72]:
import pandas as pd

# Ensure date is datetime
meta = meta.copy()
meta["date"] = pd.to_datetime(meta["date"], errors="coerce")

# Drop rows without a valid date (rare but possible)
meta = meta.dropna(subset=["date"])

# Split by midpoint date
split_date = meta["date"].min() + (meta["date"].max() - meta["date"].min()) / 2
meta["period"] = meta["date"].apply(lambda d: "early_period" if d <= split_date else "late_period")

print("Split date:", split_date)
meta["period"].value_counts()

Split date: 2024-04-06 23:08:35


period
late_period     110888
early_period     12812
Name: count, dtype: int64

In [74]:
# cluster share within each period (normalized)
cluster_period = (
    meta.groupby("period")["cluster_label_final"]
    .value_counts(normalize=True)
    .rename("share")
    .reset_index()
)

cluster_period.head()

Unnamed: 0,period,cluster_label_final,share
0,early_period,Registered Nurse | Certified Nursing Assistant...,0.081252
1,early_period,Clinical Support & Medical Assistant Roles,0.062832
2,early_period,Registered Nurse - RN - LTAC | Nurse Practitioner,0.057524
3,early_period,Maintenance Technician | Service Technician,0.048938
4,early_period,ASSISTANT STORE MANAGER | Customer Service Rep...,0.046285


In [75]:
pivot = (
    cluster_period
    .pivot(index="cluster_label_final", columns="period", values="share")
    .fillna(0)
)

pivot["delta_share"] = pivot["late_period"] - pivot["early_period"]
pivot = pivot.sort_values("delta_share", ascending=False)

pivot.head(10)

period,early_period,late_period,delta_share
cluster_label_final,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Software Engineer | Senior Software Engineer,0.031767,0.051944,0.020177
Engineering Roles (Electrical & Mechanical),0.036606,0.046515,0.009909
Project Manager | Senior Project Manager,0.02443,0.033448,0.009018
Network Engineer | Technical Support Specialist,0.022323,0.030725,0.008402
Sales Specialist | Outside Sales Representative,0.03965,0.04721,0.007559
Product Manager | Account Executive,0.039416,0.046398,0.006982
Data Analyst | Business Analyst,0.024742,0.031572,0.00683
Cyber Security Engineer | Security Officer,0.015845,0.022581,0.006737
Marketing Manager | Graphic Designer,0.022245,0.027983,0.005738
Financial Advisor | Mortgage Loan Officer,0.030674,0.035919,0.005245


In [76]:
top_rising = pivot.head(5)
top_declining = pivot.tail(5)

top_rising, top_declining

(period                                           early_period  late_period  \
 cluster_label_final                                                          
 Software Engineer | Senior Software Engineer         0.031767     0.051944   
 Engineering Roles (Electrical & Mechanical)          0.036606     0.046515   
 Project Manager | Senior Project Manager             0.024430     0.033448   
 Network Engineer | Technical Support Specialist      0.022323     0.030725   
 Sales Specialist | Outside Sales Representative      0.039650     0.047210   
 
 period                                           delta_share  
 cluster_label_final                                           
 Software Engineer | Senior Software Engineer        0.020177  
 Engineering Roles (Electrical & Mechanical)         0.009909  
 Project Manager | Senior Project Manager            0.009018  
 Network Engineer | Technical Support Specialist     0.008402  
 Sales Specialist | Outside Sales Representative     0.007559

In [77]:
cluster_period.columns.tolist()

['period', 'cluster_label_final', 'share']

Over the observed two-month window, cluster share shifted noticeably across job families. Software engineering roles increased the most in relative share (+2.02pp), followed by electrical/mechanical engineering (+0.99pp), project management (+0.90pp), and network/technical support (+0.84pp). Commercial roles also showed a modest rise, with outside sales increasing by +0.76pp. On the declining side, healthcare support clusters dropped sharply, with RN/CNA-related postings decreasing by −4.32pp and RN/LTAC–NP roles by −2.97pp. Several retail and business development clusters also decreased by around one percentage point. Given the short time horizon, these movements should be interpreted as short-window shifts in posting composition rather than long-term labor market trends, but they demonstrate how semantic clustering enables interpretable monitoring at the job-family level.

In [78]:
for label in list(top_rising.index[:2]) + list(top_declining.index[:2]):
    print("\nCLUSTER:", label)
    print(meta.loc[meta["cluster_label_final"] == label, "title"].value_counts().head(10))


CLUSTER: Software Engineer | Senior Software Engineer
title
Software Engineer           150
Senior Software Engineer    122
Software Developer           55
DevOps Engineer              55
Frontend Developer           54
Back End Developer           51
Full Stack Engineer          50
Web Developer                39
Python Developer             36
Java Developer               34
Name: count, dtype: int64

CLUSTER: Engineering Roles (Electrical & Mechanical)
title
Electrical Engineer           121
Mechanical Engineer            78
Manufacturing Engineer         77
Process Engineer               56
Quality Engineer               43
Project Engineer               41
Structural Engineer            40
Senior Mechanical Engineer     38
Design Engineer                36
Senior Electrical Engineer     36
Name: count, dtype: int64

CLUSTER: Sales Associate | Junior Groomer
title
Sales Associate               59
Junior Groomer                28
Groomer                       27
Property Manager   