In [10]:
import os
import numpy as np
import glob
from sklearn.metrics.pairwise import cosine_similarity
import itertools
import csv

# Root directory that is searched recursively for ``.npy`` embedding files;
# department names are derived from paths relative to this directory.
EMBED_DIR = "policy_embeddings"
# A pair of embedding rows counts as overlapping when their cosine similarity
# is strictly greater than this value.
THRESHOLD = 0.95

# Helper to extract department from path
def get_department(filepath, root=None):
    """Return the department an embedding file belongs to.

    The department is the first directory component of ``filepath`` relative
    to the embeddings root.

    Args:
        filepath: Path to an embedding file.
        root: Directory the path is resolved against. Defaults to the
            module-level ``EMBED_DIR`` when omitted.

    Returns:
        The top-level sub-directory name, or ``"Unknown"`` when the file sits
        directly in the root (no department directory) or lies outside it.
    """
    base = EMBED_DIR if root is None else root
    parts = os.path.relpath(filepath, base).split(os.sep)
    # A single component is the file name itself, not a department; returning
    # it would make every root-level file look like its own department.
    # A leading ".." means the file is not under the root at all.
    if len(parts) < 2 or parts[0] == os.pardir:
        return "Unknown"
    return parts[0]

# Load all .npy embeddings
# glob returns matches in arbitrary order; sorting keeps the pair order (and
# therefore the row order of the resulting CSV) reproducible between runs.
files = sorted(glob.glob(os.path.join(EMBED_DIR, "**/*.npy"), recursive=True))
embeddings = {}  # file path -> embedding matrix
departments = {}  # file path -> department name

for f in files:
    emb = np.load(f)
    # cosine_similarity needs 2-D input, so 0-D / 1-D arrays are skipped here
    # instead of crashing (shape[0] on a 0-D array, or later in the pairwise
    # comparison). Presumably each row is one text chunk -- TODO confirm.
    if emb.ndim != 2 or emb.shape[0] < 2:
        continue  # skip tiny ones
    embeddings[f] = emb
    departments[f] = get_department(f)

# Compare only across departments
# Each result row: (policy 1 file name, policy 2 file name, dept 1, dept 2,
# number of row pairs whose cosine similarity exceeds THRESHOLD).
results = []

# Walk the combinations lazily: the number of pairs grows quadratically with
# the number of files, so building the full list first only wastes memory.
for (f1, emb1), (f2, emb2) in itertools.combinations(embeddings.items(), 2):
    dept1, dept2 = departments[f1], departments[f2]
    if dept1 == dept2:
        continue  # Skip same-department pairs

    # sims[i, j] is the cosine similarity between row i of emb1 and row j of emb2.
    sims = cosine_similarity(emb1, emb2)
    # Stored as a plain int so the rows hold no NumPy scalar types.
    overlap_count = int(np.count_nonzero(sims > THRESHOLD))

    if overlap_count > 0:
        results.append(
            (os.path.basename(f1), os.path.basename(f2), dept1, dept2, overlap_count)
        )

# Save result
# newline="" lets the csv module control line endings itself (otherwise an
# extra blank line appears after every row on Windows); the explicit encoding
# keeps non-ASCII file or department names independent of the system locale.
with open("cross_department_overlap.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Policy 1", "Policy 2", "Dept 1", "Dept 2", f"Overlap (sim>{THRESHOLD})"])
    writer.writerows(results)

print(f"✓ Done. Found {len(results)} cross-departmental overlaps.")

✓ Done. Found 110708 cross-departmental overlaps.
