In [55]:
import json
import pandas as pd
from sentence_transformers import SentenceTransformer

# Name of the model handed to SentenceTransformer on the next line.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
# Encoder instance created once at module level; generate_embeddings_with_transformers()
# calls its encode() method for every text it embeds.
embedder = SentenceTransformer(EMBEDDING_MODEL)

In [56]:
# Path of the JSON file whose entries are loaded and embedded.
json_input_file = "raw_data/hsl_seq_daily_tools_docs.json"
# Path the records with embeddings are written to by the final json.dump call.
json_output_file = "raw_data/hsl_seq_daily_tools_docs_with_embeddings.json"


In [57]:
# Load the documentation entries that the embedding loop iterates over.
# The encoding is stated explicitly so decoding does not depend on the
# platform's locale default; the output file is likewise written as UTF-8.
with open(json_input_file, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

In [58]:
# Show the first entry to inspect the keys that prepare_embedding_input() reads.
json_data[0]

{'type': 'library',
 'metadata': {'title': 'HSL_SeqDailyTools',
  'description': 'The HSL_SeqDailyTools library adds four functions aimed at making some parts of sequence handling slightly easier.',
  'tags': ['library'],
  'documentation_url': 'https://venuslibrarydocumentation.readthedocs.io/en/latest/HSL_SeqDailyTools.html'},
 'details': {'library': {'name': 'HSL_SeqDailyTools',
   'url': 'https://github.com/theonetruenerd/VenusPackages/blob/main/HSL_SeqDailyTools.pkg',
   'description': 'The HSL_SeqDailyTools library adds four functions aimed at making some parts of sequence handling slightly easier.',
   'functions': ['CopyPlatePattern96ToTipRack',
    'CopyPlatePatternToPlate',
    'GetNumberOfPositionsLeft',
    'SeqEasyEdit']}}}

In [59]:
def prepare_embedding_input(entry):
    """Build the text that gets embedded for one documentation entry.

    The text has four labelled lines (Title, Description, Tags, Details).
    Title, description and tags come from ``entry['metadata']``; Details is the
    description under ``entry['details']['function']`` and is left empty for
    entries that have no ``function`` details (for example ``library`` entries).

    Missing keys are rendered as empty values instead of raising ``KeyError``,
    in line with the ``.get("title", "unknown_id")`` fallback used when the
    output records are built.

    Args:
        entry: Mapping with optional ``metadata`` and ``details`` sub-mappings.

    Returns:
        str: The assembled text with surrounding whitespace stripped.
    """
    metadata = entry.get('metadata') or {}
    details = entry.get('details') or {}
    function_details = details.get('function') or {}

    title = metadata.get('title', '')
    description = metadata.get('description', '')
    tags = ', '.join(metadata.get('tags') or [])
    detail_text = function_details.get('description', '')

    labelled_lines = [
        f"Title: {title}",
        f"Description: {description}",
        f"Tags: {tags}",
        f"Details: {detail_text}",
    ]
    # Continuation lines keep the four-space indent of the original template so
    # the text produced for well-formed entries is unchanged.
    return "\n    ".join(labelled_lines).strip()

def generate_embeddings_with_transformers(text):
    """Encode ``text`` with the module-level ``embedder`` and return plain floats.

    ``embedder.encode`` is called with ``convert_to_tensor=False``; each value of
    the result is converted with ``float`` so the list can be serialised by
    ``json``.

    Args:
        text: The text to embed.

    Returns:
        list[float]: The embedding values as built-in Python floats.
    """
    vector = embedder.encode(text, convert_to_tensor=False)
    return list(map(float, vector))



In [60]:
# Load any previously saved output, falling back to an empty structure.
# The save step at the end of this notebook writes a JSON *list* of records, so
# on a re-run the loaded value may be a list; indexing it with a string key
# would raise TypeError. Only a dict is reused, and its expected keys are filled in.
try:
    with open(json_output_file, 'r', encoding='utf-8') as f:
        _existing_output = json.load(f)
except FileNotFoundError:
    _existing_output = None

if isinstance(_existing_output, dict):
    json_data_with_embeddings = _existing_output
    json_data_with_embeddings.setdefault("metadata", [])
    json_data_with_embeddings.setdefault("embeddings", [])
else:
    # No file yet, or the file holds the list-shaped output: start empty.
    json_data_with_embeddings = {"metadata": [], "embeddings": []}

In [61]:
# Build one JSON-serialisable record per documentation entry.
output_data = []

for entry in json_data:
    metadata = entry["metadata"]
    vector = generate_embeddings_with_transformers(prepare_embedding_input(entry))
    output_data.append({
        # The title doubles as the record id; "unknown_id" is used when it is absent.
        "id": metadata.get("title", "unknown_id"),
        # The metadata mapping is stored as-is (same object, not a copy).
        "metadata": metadata,
        # Plain list of floats so json.dump can serialise it.
        "embedding": list(vector),
    })



In [62]:
# Write the list of records to the output path as indented JSON, then report
# where it went.
with open(json_output_file, mode='w', encoding='utf-8') as out_file:
    json.dump(output_data, out_file, indent=4)

print(f"Embeddings saved to {json_output_file}")

Embeddings saved to raw_data/hsl_seq_daily_tools_docs_with_embeddings.json
