In [1]:
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import os

# 1. Configuration
# JSON file loaded by main(); entries carrying a 'raw_text' key supply the texts to embed.
INPUT_FILE = 'wiki_math_knowledge_base_api.json'
# Model identifier handed to SentenceTransformer (weights are downloaded from Hugging Face).
MODEL_NAME = 'mixedbread-ai/mxbai-embed-large-v1'
# Destination path for the np.save() call that writes the embedding matrix.
OUTPUT_MATRIX_FILE = 'embeddings_matrix.npy'
# Target directory for exported model weights; currently disabled.
# OUTPUT_MODEL_DIR = 'saved_mxbai_model_weights'

def main():
    """Embed every ``raw_text`` chunk from INPUT_FILE and save the matrix.

    Reads the JSON knowledge base at ``INPUT_FILE``, collects the ``raw_text``
    field of each entry, encodes the texts with the ``MODEL_NAME``
    SentenceTransformer model, and writes the resulting array to
    ``OUTPUT_MATRIX_FILE`` in NumPy ``.npy`` format.

    The function reports problems on stdout and returns early (without
    raising) when the input file is missing, is not valid JSON, or contains
    no ``raw_text`` entries. Model weights are not saved by this function.
    """
    # 2. Load the content from the JSON file
    print(f"Loading data from {INPUT_FILE}...")
    try:
        with open(INPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File {INPUT_FILE} not found. Please ensure the file is in the same directory.")
        return
    except json.JSONDecodeError as exc:
        print(f"Error: File {INPUT_FILE} is not valid JSON ({exc}).")
        return

    # Extract the raw text from each chunk.
    # Entries that are not dicts or lack 'raw_text' are skipped, so row i of
    # the saved matrix corresponds to the i-th entry that HAS 'raw_text',
    # not necessarily to data[i].
    texts = [
        item['raw_text']
        for item in data
        if isinstance(item, dict) and 'raw_text' in item
    ]
    print(f"Found {len(texts)} text chunks to embed.")
    if not texts:
        # Bail out before the (large) model download when there is no work.
        print("Error: No 'raw_text' entries found; nothing to embed.")
        return

    # 3. Load the Embedding Model
    print(f"Loading model: {MODEL_NAME}...")
    # This automatically downloads the model weights from Hugging Face
    model = SentenceTransformer(MODEL_NAME)

    # 4. Generate Embeddings
    print("Generating embeddings (this may take a moment depending on your hardware)...")
    # For mxbai-embed-large-v1, documents (which these are) generally do not require
    # the specific query prompt ("Represent this sentence...") that user queries do.
    embeddings = model.encode(texts, show_progress_bar=True)

    # 5. Save the Embedding Matrix
    print(f"Saving embedding matrix to {OUTPUT_MATRIX_FILE}...")
    # Saves as a binary NumPy file (.npy)
    np.save(OUTPUT_MATRIX_FILE, embeddings)
    print(f"Matrix shape: {embeddings.shape}")

    # Only report artifacts that were actually written. The previous summary
    # also printed OUTPUT_MODEL_DIR, which is not defined (its assignment is
    # commented out), so every successful run ended in a NameError.
    print("\nDone! You now have:")
    print(f"1. The embedding matrix: {os.path.abspath(OUTPUT_MATRIX_FILE)}")

# Run the embedding pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Loading data from wiki_math_knowledge_base_api.json...
Found 2041 text chunks to embed.
Loading model: mixedbread-ai/mxbai-embed-large-v1...


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

Generating embeddings (this may take a moment depending on your hardware)...


Batches:   0%|          | 0/64 [00:00<?, ?it/s]

KeyboardInterrupt: 