In [None]:
from pathlib import Path
import pandas as pd

PROJECT_ROOT = Path.cwd().parent
data_path = PROJECT_ROOT / "data" / "processed" / "job_postings_clean.parquet"

df = pd.read_parquet(data_path)
df.shape, df.columns.tolist()

((123700, 6),
 ['title',
  'description',
  'location',
  'date',
  'title_clean',
  'description_clean'])

In [2]:
text = (df["title_clean"].fillna("") + " — " + df["description_clean"].fillna("")).tolist()
len(text), text[0][:200]

(123700,
 'marketing coordinator — job descriptiona leading real estate firm in new jersey is seeking an administrative marketing coordinator with some experience in graphic design. you will be working closely w')

In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np

model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

embeddings = model.encode(
    text,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
)

embeddings.shape

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1933 [00:00<?, ?it/s]

(123700, 384)

In [4]:
EMB_DIR = PROJECT_ROOT / "models" / "embeddings"
EMB_DIR.mkdir(parents=True, exist_ok=True)

np.save(EMB_DIR / "job_embeddings.npy", embeddings)

meta_cols = ["title", "location", "date"]
if "period" in df.columns:
    meta_cols.append("period")

df_meta = df[meta_cols].copy()
df_meta.to_parquet(EMB_DIR / "job_embeddings_meta.parquet", index=False)

print("Saved:", EMB_DIR)

Saved: /Users/teodoro.bevilacqua/Desktop/Job Market Signal Monitor/models/embeddings


In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

idx = np.random.randint(0, len(df))
query = embeddings[idx:idx+1]
sims = cosine_similarity(query, embeddings).flatten()

topk = sims.argsort()[-6:][::-1]  # includes itself
print("QUERY TITLE:", df.loc[idx, "title"])
print("\nTop similar titles:")
for j in topk:
    print(f"{sims[j]:.3f}  |  {df.loc[j, 'title']}")

QUERY TITLE: Landfill Heavy Equipment Operator

Top similar titles:
1.000  |  Landfill Heavy Equipment Operator
1.000  |  Landfill Heavy Equipment Operator
1.000  |  Landfill Heavy Equipment Operator
0.988  |  Landfill laborer
0.988  |  Landfill Laborer
0.988  |  Landfill Laborer


In [8]:
idx = int(idx)  # keep your existing idx
sims = cosine_similarity(embeddings[idx:idx+1], embeddings).flatten()

# exclude itself
sims[idx] = -1

topk = sims.argsort()[-10:][::-1]
seen = set()

query_title = df.loc[idx, "title"]

print("QUERY TITLE:", df.loc[idx, "title"])
print("\nTop similar titles (deduped):")
for j in topk:
    t = df.loc[j, "title"]
    if t == query_title:   # ← tiny tweak
        continue
    if t in seen:
        continue
    seen.add(t)
    print(f"{sims[j]:.3f}  |  {t}")
    if len(seen) >= 6:
        break

QUERY TITLE: Landfill Heavy Equipment Operator

Top similar titles (deduped):
0.988  |  Landfill laborer
0.988  |  Landfill Laborer
0.986  |  Heavy Equipment Operator


Nearest-neighbor inspection confirms that semantically related roles are embedded closely.