In [1]:
import json
import numpy as np
from tqdm import tqdm
from pathlib import Path
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import re

# =============================================================================
# CONFIGURATION
# =============================================================================
# Model name handed to SentenceTransformer further down.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Every artifact produced by this script is written under this directory.
# It is created up front; an already-existing directory is not an error.
OUTPUT_DIR = Path("precomputed_embeddings")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# =============================================================================
# LOAD GSM8K DATASET
# =============================================================================
# Only the "test" split of the "socratic" configuration is loaded.
print("📥 Loading GSM8K dataset...")
dataset = load_dataset(path="gsm8k", name="socratic", split="test")
print(f"✅ Loaded {len(dataset)} test questions")

# =============================================================================
# LOAD EMBEDDING MODEL
# =============================================================================
print(f"\n📥 Loading embedding model: {EMBEDDING_MODEL}")
embedding_model = SentenceTransformer(model_name_or_path=EMBEDDING_MODEL)

# The vector width is queried from the model itself rather than hard-coded;
# it is recorded in the summary file written at the end of the script.
embedding_dim = embedding_model.get_sentence_embedding_dimension()
print(f"✅ Embedding dimension: {embedding_dim}")

# =============================================================================
# EXTRACT QUESTIONS
# =============================================================================
print("\n🔍 Extracting questions from dataset...")
questions = []
metadata = []

# GSM8K answer format: "Step 1\n...\n#### 42".
# The final number may carry thousands separators (e.g. "#### 1,000"), so
# commas are accepted inside the match and stripped afterwards; a pattern
# that only allows digits would silently truncate "1,000" to "1".
# Compiled once here instead of being re-looked-up on every iteration.
FINAL_ANSWER_RE = re.compile(r"####\s*(-?\d[\d,]*\.?\d*)")

for idx, item in enumerate(tqdm(dataset, desc="Processing dataset")):
    question = item["question"]
    answer_field = item["answer"]

    # Ground truth is kept as a string (or None when no "#### <number>"
    # marker is found) so the JSON metadata stays lossless.
    match = FINAL_ANSWER_RE.search(answer_field)
    ground_truth = match.group(1).replace(",", "") if match else None

    questions.append(question)
    metadata.append({
        "index": idx,
        "question": question,
        "answer_field": answer_field,
        "ground_truth": ground_truth
    })

print(f"✅ Extracted {len(questions)} questions")

# Surface unparsable answers instead of letting None values pass unnoticed.
missing_ground_truth = sum(
    1 for entry in metadata if entry["ground_truth"] is None
)
if missing_ground_truth:
    print(f"⚠️ {missing_ground_truth} questions have no parsable '#### <number>' answer")

# =============================================================================
# COMPUTE EMBEDDINGS
# =============================================================================
print("\n🔢 Computing embeddings for all questions...")

# Questions are encoded in fixed-size slices; each slice yields one array.
batch_size = 32
batch_starts = range(0, len(questions), batch_size)

embeddings_list = [
    embedding_model.encode(
        questions[start:start + batch_size],
        convert_to_numpy=True,
        normalize_embeddings=True,  # Important for cosine similarity
        show_progress_bar=False,  # the outer tqdm bar already reports progress
    )
    for start in tqdm(batch_starts, desc="Embedding batches")
]

# Stack the per-batch arrays row-wise into a single float32 matrix
stacked = np.vstack(embeddings_list)
embeddings = stacked.astype(np.float32)
print(f"✅ Computed embeddings shape: {embeddings.shape}")

# =============================================================================
# SAVE EMBEDDINGS AND METADATA
# =============================================================================
print("\n💾 Saving embeddings and metadata...")

# The embedding matrix is written as a binary .npy file
embeddings_path = OUTPUT_DIR / "gsm8k_socratic_embeddings.npy"
np.save(embeddings_path, embeddings)
print(f"✅ Embeddings saved to: {embeddings_path}")

# Per-question metadata is written as indented JSON; ensure_ascii=False keeps
# any non-ASCII characters in the questions readable instead of \u-escaped
metadata_path = OUTPUT_DIR / "gsm8k_socratic_metadata.json"
metadata_json = json.dumps(metadata, indent=2, ensure_ascii=False)
metadata_path.write_text(metadata_json, encoding="utf-8")
print(f"✅ Metadata saved to: {metadata_path}")

# Write a small JSON summary describing this run: which dataset and model were
# used, how many samples were embedded, and where the artifacts were saved.
summary_path = OUTPUT_DIR / "summary.json"

summary = {
    "dataset": "gsm8k-socratic",
    "total_samples": len(questions),
    "embedding_model": EMBEDDING_MODEL,
    # int(...) so the value is a plain Python int for json serialization
    "embedding_dim": int(embedding_dim),
    # Paths are stored as strings because Path objects are not JSON-serializable
    "embeddings_file": str(embeddings_path),
    "metadata_file": str(metadata_path),
}

summary_json = json.dumps(summary, indent=2, ensure_ascii=False)
summary_path.write_text(summary_json, encoding="utf-8")
print(f"✅ Summary saved to: {summary_path}")

# Closing banner. The reported size is the in-memory byte count of the
# embedding array (nbytes) converted to MiB, not the size of the file on disk.
banner = "=" * 80
size_mb = embeddings.nbytes / 2**20

print("\n" + banner)
print("✅ EMBEDDING PRECOMPUTATION COMPLETE")
print(f"📂 Output directory: {OUTPUT_DIR}")
print(f"💾 Total size: {size_mb:.2f} MB")
print(banner)

📥 Loading GSM8K dataset...


README.md: 0.00B [00:00, ?B/s]

socratic/train-00000-of-00001.parquet:   0%|          | 0.00/2.68M [00:00<?, ?B/s]

socratic/test-00000-of-00001.parquet:   0%|          | 0.00/487k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

✅ Loaded 1319 test questions

📥 Loading embedding model: sentence-transformers/all-MiniLM-L6-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Embedding dimension: 384

🔍 Extracting questions from dataset...


Processing dataset: 100%|██████████| 1319/1319 [00:00<00:00, 33433.37it/s]


✅ Extracted 1319 questions

🔢 Computing embeddings for all questions...


Embedding batches: 100%|██████████| 42/42 [00:01<00:00, 28.70it/s]

✅ Computed embeddings shape: (1319, 384)

💾 Saving embeddings and metadata...
✅ Embeddings saved to: precomputed_embeddings/gsm8k_socratic_embeddings.npy
✅ Metadata saved to: precomputed_embeddings/gsm8k_socratic_metadata.json
✅ Summary saved to: precomputed_embeddings/summary.json

✅ EMBEDDING PRECOMPUTATION COMPLETE
📂 Output directory: precomputed_embeddings
💾 Total size: 1.93 MB



