# Embedding Generation for BERTopic

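This notebook generates embeddings for each subject dataset using multiple transformer models. The available subjects are cs, eess, math, physics, and stat; only those listed in `LIST_SUBJECT` (currently cs, math, physics) are processed.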

In [1]:
import gc
import os
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
import torch
from peft import PeftModel
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

In [2]:
# --- Run configuration -------------------------------------------------------

# Dataset version tag; it selects the input CSV name and is embedded in every
# output file name.
VERSION = "v1"

# Number of texts handed to a model per encode call.
BATCH_SIZE = 128

# Root folder of the per-subject input data and folder receiving the outputs.
BASE_DIR = Path("../../dataset")
OUTPUT_DIR = Path("./embedding")

# Subjects to process in this run.
LIST_SUBJECT = [
    "cs",
    "math",
    "physics",
]

# Embedding models, processed in this order for every subject.
TRANSFORMERS = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "all-distilroberta-v1",
    "intfloat/e5-base-v2",
    "all-mpnet-base-v2",
    "BAAI/bge-base-en-v1.5",
    "allenai/specter2",
]

# Make sure the output root exists before any cell tries to write into it.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
from adapters import AutoAdapterModel

def load_specter2():
    """Load the SPECTER2 base model with the ``allenai/specter2`` adapter active.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is in eval mode and has been
        moved to the GPU when CUDA is available.
    """
    tokenizer = AutoTokenizer.from_pretrained("allenai/specter2_base")
    model = AutoAdapterModel.from_pretrained("allenai/specter2_base")

    # Load the proximity adapter. ``load_adapter`` returns the name under which
    # the adapter was registered, which is needed to activate it below.
    adapter_name = model.load_adapter("allenai/specter2", source="hf", set_active=True)

    # Activate the adapter explicitly instead of relying on ``set_active=True``
    # alone: the run recorded in this notebook logged "There are adapters
    # available but none are activated for the forward pass.", which would mean
    # the embeddings come from the bare base model.
    model.set_active_adapters(adapter_name)

    model.eval()
    if torch.cuda.is_available():
        model = model.cuda()
    return model, tokenizer


def encode_specter2(texts: List[str], model, tokenizer, batch_size: int = 32) -> np.ndarray:
    """Embed ``texts`` in batches using the first-token hidden state of ``model``.

    Args:
        texts: Documents to embed.
        model: Model whose output exposes ``last_hidden_state``; inputs are
            moved to the device of its first parameter.
        tokenizer: Callable tokenizer returning a batch with a ``.to(device)``
            method. Inputs are padded and truncated to 512 tokens.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        Array with one row per input text. For an empty ``texts`` an empty
        ``(0, hidden_size)`` float32 array is returned (``hidden_size`` is read
        from ``model.config`` and falls back to 0 when unavailable).
    """
    device = next(model.parameters()).device

    if len(texts) == 0:
        # np.vstack([]) raises ValueError, so handle the empty case explicitly.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="  Generating"):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(device)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            outputs = model(**inputs)
            # Position 0 of the sequence is used as the document embedding.
            batch_emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()

        embeddings.append(batch_emb)

    return np.vstack(embeddings)


In [4]:
def load_dataset(subject: str) -> pd.DataFrame:
    """Read the CSV for ``subject`` at ``BASE_DIR/<subject>/emb/<VERSION>.csv``.

    Returns the loaded DataFrame, or ``None`` (after printing a notice) when
    the file does not exist.
    """
    csv_path = BASE_DIR / subject / "emb" / f"{VERSION}.csv"
    if csv_path.exists():
        return pd.read_csv(csv_path)

    print(f"File not found: {csv_path}")
    return None


def get_model_safe_name(model_name: str) -> str:
    """Return ``model_name`` with every ``/`` and ``-`` replaced by ``_``."""
    return model_name.translate(str.maketrans("/-", "__"))


def generate_mmap_embeddings(
    texts: List[str],
    embedding_model: SentenceTransformer,
    mmap_path: str,
    batch_size: int = 1024
) -> np.memmap:
    """Encode ``texts`` batch by batch into a float32 memory-mapped file.

    The embeddings are first written to ``<mmap_path>.partial`` and moved to
    ``mmap_path`` only once every batch has been written and flushed. A file at
    ``mmap_path`` is therefore always complete, so callers that skip existing
    outputs never pick up a half-written file after an interrupted run.

    A metadata dict (``n_samples``, ``emb_dim``, ``dtype``) is saved with
    ``np.save`` next to the data, at ``mmap_path`` with ``.mmap`` replaced by
    ``_meta.npy``. Because it is a dict, it is stored as a pickled object array
    and readers need ``np.load(..., allow_pickle=True)``.

    Args:
        texts: Documents to embed.
        embedding_model: Model providing ``get_sentence_embedding_dimension()``
            and ``encode()``.
        mmap_path: Destination of the memory-mapped file (str or path-like).
        batch_size: Number of texts passed to ``encode`` per call.

    Returns:
        The finished embeddings as an ``np.memmap`` of shape ``(N, emb_dim)``,
        or ``None`` when ``texts`` is empty.

    Raises:
        Any exception raised while encoding or writing is re-raised after the
        partial file has been removed (best effort).
    """
    N = len(texts)
    emb_dim = embedding_model.get_sentence_embedding_dimension()

    if N == 0:
        print("Error: Empty text list.")
        return None

    # Accept pathlib.Path as well; the str methods used below need a plain str.
    mmap_path = os.fspath(mmap_path)
    partial_path = f"{mmap_path}.partial"
    meta_path = mmap_path.replace(".mmap", "_meta.npy")

    print(f"  Total documents: {N:,}")
    print(f"  Embedding dimension: {emb_dim}")
    print(f"  Batch size: {batch_size}")
    print(f"  Output path: {mmap_path}")

    # mode="w+" creates the full-size file immediately, hence the separate
    # partial path: the final path must not exist until the data is complete.
    embs = np.memmap(
        partial_path,
        dtype="float32",
        mode="w+",
        shape=(N, emb_dim)
    )

    try:
        for i in tqdm(range(0, N, batch_size), desc="  Generating"):
            batch_texts = texts[i:i + batch_size]

            batch_embeddings = embedding_model.encode(
                batch_texts,
                show_progress_bar=False,
                convert_to_numpy=True
            )

            embs[i:i + len(batch_texts)] = batch_embeddings

        embs.flush()
    except BaseException:
        # Also covers KeyboardInterrupt: drop the mapping, then the partial file.
        del embs
        try:
            os.remove(partial_path)
        except OSError:
            # Best-effort cleanup; a leftover partial file is overwritten by the
            # next run and is never mistaken for a finished output.
            pass
        raise

    # Release the mapping before renaming (required on some platforms).
    del embs

    metadata = {
        "n_samples": N,
        "emb_dim": emb_dim,
        "dtype": "float32"
    }
    np.save(meta_path, metadata)

    # Publishing the data file is the last step, so its existence implies that
    # both the embeddings and the metadata were written.
    os.replace(partial_path, mmap_path)

    return np.memmap(mmap_path, dtype="float32", mode="r+", shape=(N, emb_dim))

In [5]:
# Load every configured subject; subjects whose CSV is missing are left out
# (load_dataset returns None for them).
all_data = {}

for subject in LIST_SUBJECT:
    df = load_dataset(subject)
    if df is None:
        continue
    all_data[subject] = df
    print(f"{subject}: {len(df):,} documents loaded")

print(f"\nTotal subjects loaded: {len(all_data)}")

cs: 165,756 documents loaded
math: 126,192 documents loaded
physics: 146,311 documents loaded

Total subjects loaded: 3


In [6]:
# Generate one embedding file per (subject, model) pair. Pairs whose output
# file already exists are skipped, so the cell can be re-run to resume work.
#
# NOTE(review): some of the listed models (e.g. intfloat/e5-base-v2) reportedly
# expect an instruction prefix such as "passage: " on every input; confirm
# against the model cards whether raw texts are intended here.

failed_runs = []  # (subject, model name, error message) for every pair that raised

for subject, df in all_data.items():
    print(f"\n{'='*60}")
    print(f"Processing: {subject.upper()}")
    print(f"{'='*60}")

    subject_output_dir = OUTPUT_DIR / subject
    subject_output_dir.mkdir(parents=True, exist_ok=True)

    # Missing texts are embedded as empty strings so row positions stay
    # aligned with the DataFrame.
    texts = df["text"].fillna("").tolist()

    for model_name in TRANSFORMERS:
        print(f"\n[Model: {model_name}]")

        model_safe_name = get_model_safe_name(model_name)
        output_path = str(subject_output_dir / f"{model_safe_name}_{VERSION}.mmap")
        partial_path = f"{output_path}.partial"

        if os.path.exists(output_path):
            print(f"  Skipping (already exists): {output_path}")
            continue

        model = tokenizer = embeddings = embs = None
        succeeded = False
        try:
            if model_name == "allenai/specter2":
                model, tokenizer = load_specter2()
                N = len(texts)
                # Read the width from the model instead of hard-coding 768.
                emb_dim = model.config.hidden_size

                print(f"  Total documents: {N:,}")
                print(f"  Embedding dimension: {emb_dim}")
                print(f"  Batch size: {BATCH_SIZE}")
                print(f"  Output path: {output_path}")

                embeddings = encode_specter2(texts, model, tokenizer, batch_size=BATCH_SIZE)
                emb_dim = embeddings.shape[1]

                # Write to a partial file and rename at the end, so a file at
                # output_path is always complete.
                embs = np.memmap(partial_path, dtype="float32", mode="w+", shape=embeddings.shape)
                embs[:] = embeddings
                embs.flush()
                embs = None  # release the mapping before renaming

                np.save(output_path.replace(".mmap", "_meta.npy"), {"n_samples": N, "emb_dim": emb_dim, "dtype": "float32"})
                os.replace(partial_path, output_path)
            else:
                # nomic-embed ships custom modelling code and must opt in to it;
                # every other model uses the default (False).
                model = SentenceTransformer(
                    model_name,
                    trust_remote_code=(model_name == "nomic-ai/nomic-embed-text-v1.5"),
                )
                generate_mmap_embeddings(texts=texts, embedding_model=model, mmap_path=output_path, batch_size=BATCH_SIZE)

            succeeded = True
            print("  ✓ Done")

        except Exception as e:
            # Keep going with the remaining models, but remember the failure so
            # the final summary does not claim success.
            print(f"  ✗ Error: {e}")
            failed_runs.append((subject, model_name, str(e)))

        finally:
            # Free the model and any buffers on failure too; otherwise GPU
            # memory accumulates across iterations.
            model = tokenizer = embeddings = embs = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            if not succeeded:
                # output_path did not exist before this attempt (see the skip
                # above), so anything there now is incomplete. Remove it so the
                # next run regenerates it instead of skipping it.
                for stale_path in (partial_path, output_path):
                    if os.path.exists(stale_path):
                        try:
                            os.remove(stale_path)
                        except OSError as cleanup_error:
                            print(f"  ! Could not remove incomplete file {stale_path}: {cleanup_error}")


print("\n" + "="*60)
if failed_runs:
    print(f"Finished with {len(failed_runs)} failed run(s):")
    for failed_subject, failed_model, failed_message in failed_runs:
        print(f"  - {failed_subject} / {failed_model}: {failed_message}")
else:
    print("All embeddings generated!")
print("="*60)


Processing: CS

[Model: sentence-transformers/all-MiniLM-L6-v2]
  Skipping (already exists): embedding/cs/sentence_transformers_all_MiniLM_L6_v2_v1.mmap

[Model: all-distilroberta-v1]
  Skipping (already exists): embedding/cs/all_distilroberta_v1_v1.mmap

[Model: intfloat/e5-base-v2]
  Skipping (already exists): embedding/cs/intfloat_e5_base_v2_v1.mmap

[Model: all-mpnet-base-v2]
  Skipping (already exists): embedding/cs/all_mpnet_base_v2_v1.mmap

[Model: BAAI/bge-base-en-v1.5]
  Skipping (already exists): embedding/cs/BAAI_bge_base_en_v1.5_v1.mmap

[Model: allenai/specter2]
  Skipping (already exists): embedding/cs/allenai_specter2_v1.mmap

Processing: MATH

[Model: sentence-transformers/all-MiniLM-L6-v2]
  Skipping (already exists): embedding/math/sentence_transformers_all_MiniLM_L6_v2_v1.mmap

[Model: all-distilroberta-v1]
  Skipping (already exists): embedding/math/all_distilroberta_v1_v1.mmap

[Model: intfloat/e5-base-v2]
  Skipping (already exists): embedding/math/intfloat_e5_ba

  Generating: 100%|██████████| 1144/1144 [03:20<00:00,  5.71it/s]


  ✓ Done

[Model: all-distilroberta-v1]
  Total documents: 146,311
  Embedding dimension: 768
  Batch size: 128
  Output path: embedding/physics/all_distilroberta_v1_v1.mmap


  Generating: 100%|██████████| 1144/1144 [11:25<00:00,  1.67it/s]


  ✓ Done

[Model: intfloat/e5-base-v2]
  Total documents: 146,311
  Embedding dimension: 768
  Batch size: 128
  Output path: embedding/physics/intfloat_e5_base_v2_v1.mmap


  Generating: 100%|██████████| 1144/1144 [22:40<00:00,  1.19s/it]


  ✓ Done

[Model: all-mpnet-base-v2]
  Total documents: 146,311
  Embedding dimension: 768
  Batch size: 128
  Output path: embedding/physics/all_mpnet_base_v2_v1.mmap


  Generating: 100%|██████████| 1144/1144 [26:51<00:00,  1.41s/it]


  ✓ Done

[Model: BAAI/bge-base-en-v1.5]
  Total documents: 146,311
  Embedding dimension: 768
  Batch size: 128
  Output path: embedding/physics/BAAI_bge_base_en_v1.5_v1.mmap


  Generating: 100%|██████████| 1144/1144 [22:43<00:00,  1.19s/it]


  ✓ Done

[Model: allenai/specter2]


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

There are adapters available but none are activated for the forward pass.


  Total documents: 146,311
  Embedding dimension: 768
  Batch size: 128
  Output path: embedding/physics/allenai_specter2_v1.mmap


  Generating: 100%|██████████| 1144/1144 [34:09<00:00,  1.79s/it]


  ✓ Done

All embeddings generated!


In [7]:
# Summarise what is on disk: one section per subject, one line per file with
# its size in MB. Subjects without an output folder are left out.
print("Generated embedding files:\n")

for subject in LIST_SUBJECT:
    subject_dir = OUTPUT_DIR / subject
    if not subject_dir.exists():
        continue

    print(f"{subject}/")
    for entry in sorted(subject_dir.iterdir()):
        size_mb = entry.stat().st_size / 2 ** 20
        print(f"  └── {entry.name} ({size_mb:.1f} MB)")

Generated embedding files:

cs/
  └── BAAI_bge_base_en_v1.5_meta_v1.npy (0.0 MB)
  └── BAAI_bge_base_en_v1.5_v1.mmap (485.6 MB)
  └── BAAI_bge_base_en_v1.5_v2.mmap (484.9 MB)
  └── BAAI_bge_base_en_v1.5_v2_meta.npy (0.0 MB)
  └── all_distilroberta_v1_meta_v1.npy (0.0 MB)
  └── all_distilroberta_v1_v1.mmap (485.6 MB)
  └── all_distilroberta_v1_v2.mmap (484.9 MB)
  └── all_distilroberta_v1_v2_meta.npy (0.0 MB)
  └── all_mpnet_base_v2_meta_v1.npy (0.0 MB)
  └── all_mpnet_base_v2_v1.mmap (485.6 MB)
  └── all_mpnet_base_v2_v2.mmap (484.9 MB)
  └── all_mpnet_base_v2_v2_meta.npy (0.0 MB)
  └── allenai_specter2_v1.mmap (485.6 MB)
  └── allenai_specter2_v1_meta.npy (0.0 MB)
  └── allenai_specter2_v2.mmap (484.9 MB)
  └── allenai_specter2_v2_meta.npy (0.0 MB)
  └── intfloat_e5_base_v2_meta_v1.npy (0.0 MB)
  └── intfloat_e5_base_v2_v1.mmap (485.6 MB)
  └── intfloat_e5_base_v2_v2.mmap (484.9 MB)
  └── intfloat_e5_base_v2_v2_meta.npy (0.0 MB)
  └── sentence_transformers_all_MiniLM_L6_v2_meta_v1.npy