In [None]:
from pathlib import Path
import pandas as pd

# The project root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the notebook is launched from a folder one level
# below the root (e.g. notebooks/) — confirm before running from elsewhere.
PROJECT_ROOT = Path.cwd().parent
data_path = PROJECT_ROOT.joinpath("data", "processed", "job_postings_clean.parquet")

# Load the cleaned postings and show their dimensions and column names as a sanity check.
df = pd.read_parquet(data_path)
(df.shape, df.columns.tolist())

In [None]:
# Build one string per posting: "<title> — <description>".
# Missing titles/descriptions are replaced by "" so the concatenation never yields NaN.
title_part = df["title_clean"].fillna("")
description_part = df["description_clean"].fillna("")
text = (title_part + " — " + description_part).tolist()

# Number of texts and a 200-character preview of the first one.
(len(text), text[0][:200])

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Checkpoint used to embed every posting text.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

# Encoding options: batches of 64 with a progress bar, NumPy output, and
# normalize_embeddings=True so the encoder is asked for normalized vectors.
encode_options = dict(
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)
embeddings = model.encode(text, **encode_options)

# Shape of the resulting embedding matrix.
embeddings.shape

In [None]:
# Output folder for the embedding matrix and its companion metadata; created if absent.
EMB_DIR = PROJECT_ROOT.joinpath("models", "embeddings")
EMB_DIR.mkdir(parents=True, exist_ok=True)

# Persist the embedding matrix as a .npy file.
np.save(EMB_DIR / "job_embeddings.npy", embeddings)

# Metadata columns stored next to the embeddings; "period" is included only when present.
meta_cols = ["title", "location", "date"] + (["period"] if "period" in df.columns else [])

# Written without the index, so rows are identified by position only.
df_meta = df[meta_cols].copy()
df_meta.to_parquet(EMB_DIR / "job_embeddings_meta.parquet", index=False)

print("Saved:", EMB_DIR)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Spot-check the embedding space: sample one posting and list its nearest neighbours.
# A seeded generator keeps the sampled query (and the printed neighbours) identical
# across Restart & Run All; change the seed to inspect a different posting.
rng = np.random.default_rng(42)
idx = int(rng.integers(0, len(df)))  # upper bound is exclusive, like np.random.randint

query = embeddings[idx:idx+1]  # slicing keeps the 2-D shape cosine_similarity expects
sims = cosine_similarity(query, embeddings).flatten()

topk = sims.argsort()[-6:][::-1]  # six highest scores, best first; includes the query itself

# Rows of `embeddings` are addressed by position, so titles are looked up with .iloc.
# Label-based .loc would return the wrong rows (or raise KeyError) whenever `df`
# does not carry a default 0..n-1 index.
print("QUERY TITLE:", df["title"].iloc[idx])
print("\nTop similar titles:")
for j in topk:
    print(f"{sims[j]:.3f}  |  {df['title'].iloc[j]}")

In [None]:
# Same neighbour inspection as before, but hiding the query itself and repeated titles.
idx = int(idx)  # reuse the query row sampled earlier in the notebook
sims = cosine_similarity(embeddings[idx:idx+1], embeddings).flatten()

# Exclude the query row. -inf is used rather than -1 because -1 is a value
# cosine similarity can legitimately take, so it would not be a safe sentinel.
sims[idx] = -np.inf

# Positional lookup (.iloc): `idx` and `j` are row positions in `embeddings`,
# not index labels of `df`.
query_title = df["title"].iloc[idx]

print("QUERY TITLE:", query_title)
print("\nTop similar titles (deduped):")

# Walk the whole ranking from most to least similar instead of a fixed top-10 window,
# so duplicates near the top cannot leave fewer than six distinct titles when more exist.
ranked = np.argsort(sims)[::-1]
seen = set()
for j in ranked:
    t = df["title"].iloc[j]
    # Skip postings that share the query's title or a title already printed.
    if t == query_title or t in seen:
        continue
    seen.add(t)
    print(f"{sims[j]:.3f}  |  {t}")
    if len(seen) >= 6:
        break

Nearest-neighbor inspection confirms that semantically related roles are embedded close to one another.