In [182]:
import json
from sentence_transformers import SentenceTransformer

# Identifier of the sentence-transformers model used for every embedding in this notebook.
EMBEDDING_MODEL = "all-MiniLM-L6-v2"

# Instantiate the model once; generate_embeddings_with_transformers() reuses this shared object.
embedder = SentenceTransformer(model_name_or_path=EMBEDDING_MODEL)

In [183]:
# Raw records to embed; this file is only read.
json_input_file = "raw_data/reformatted_errors.json"
# Store of already-embedded records; read if it exists, then rewritten by the final cell.
json_output_file = "hamilton_processed.json"

In [184]:
# Read the raw records. The encoding is pinned to UTF-8 (the standard JSON
# interchange encoding) so decoding does not depend on the platform's locale
# default, which differs between operating systems.
with open(json_input_file, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

In [185]:
# Display the first record to check the schema the cells below rely on.
json_data[0]

{'type': 'error_handling',
 'metadata': {'title': 'Error Code 0xa0220001: No memory',
  'description': 'Details about the Venus error code 0xa0220001 (No memory).',
  'tags': ['runtime_errors', 'Venus', 'error_handling', '0xa0220001'],
  'documentation_url': 'https://venuslibrarydocumentation.readthedocs.io/en/latest/RuntimeErrors.html#runtime-errors'},
 'details': {'error_handling': [{'error_code': '0xa0220001',
    'error_title': 'No memory',
    'description': 'The system cannot allocate or access enough memory or disk space for the given operation.',
    'user_action': "Close other RAM-intensive programs, increase Venus's allocated memory, or assign higher CPU priority in Task Manager.",
    'developer_action': "Optimize the method's memory usage and review memory allocation in the program."}]}}

In [186]:
def prepare_embedding_input(entry):
    """Build the text that gets embedded for one record.

    The result is a four-line block (Title / Description / Tags / Details)
    with surrounding whitespace stripped. The layout, including the
    indentation of the continuation lines, is kept exactly as it has always
    been so new embeddings stay comparable with ones already stored.

    Args:
        entry: A record dict with a ``'metadata'`` dict and a ``'details'``
            dict. ``metadata['title']``, ``metadata['description']`` and
            ``metadata['tags']`` are optional; a missing field yields an
            empty value instead of raising, matching the caller, which
            also treats the title as optional.

    Returns:
        The combined text as a ``str``.

    Raises:
        KeyError: If ``'metadata'`` or ``'details'`` is missing, or if
            ``details['function']`` exists but has no ``'description'``.
    """
    metadata = entry['metadata']
    details = entry['details']

    title = metadata.get('title', '')
    description = metadata.get('description', '')
    tags = ', '.join(metadata['tags']) if 'tags' in metadata else ''
    # NOTE(review): only `details['function']` feeds the Details line. Records
    # whose details use another key (e.g. 'error_handling') get an empty
    # Details line -- confirm this is intended before changing it, because
    # altering the text changes every embedding derived from it.
    function_description = details['function']['description'] if 'function' in details else ''

    input_text = f"""
    Title: {title}
    Description: {description}
    Tags: {tags}
    Details: {function_description}
    """
    return input_text.strip()

def generate_embeddings_with_transformers(text):
    """Encode ``text`` with the shared ``embedder`` and return built-in floats.

    The encoder is asked for a non-tensor result, and each element is then
    converted with ``float`` so the returned list holds plain Python numbers.

    Args:
        text: Input passed straight to ``embedder.encode`` (presumably a
            single string -- verify against callers).

    Returns:
        A ``list`` of ``float`` values, one per element of the encoder output.
    """
    vector = embedder.encode(text, convert_to_tensor=False)
    return list(map(float, vector))



In [187]:
# Load the previously embedded records so they are not recomputed.
# When the output file does not exist yet (first run), start from an empty
# list: the following cell iterates over this object as a list of
# {"id", "metadata", "embedding"} records and calls .append() on it, so a
# dict placeholder would fail on both operations.
try:
    with open(json_output_file, 'r', encoding='utf-8') as f:
        json_data_with_embeddings = json.load(f)
except FileNotFoundError:
    json_data_with_embeddings = []

In [188]:
# Create a dictionary from existing data for quick lookup based on `id`
existing_ids = {entry['id']: entry for entry in json_data_with_embeddings}
print(f"Existing entries: {len(existing_ids)}")

for entry in json_data:
    input_text = prepare_embedding_input(entry)
    entry_id = entry["metadata"].get("title", "unknown_id")  # Use title or provide fallback value

    # Check if the current entry is already in the existing dataset
    if entry_id in existing_ids:
        print(f"Entry {entry_id} already exists in the dataset.")
        # If the entry exists, you can optionally update it (e.g., refresh metadata or embeddings)
        continue  # Skip if no updating is required; remove this line to allow updates
    else:
        # Create a new entry and add it to `json_data_with_embeddings`
        embeddings = generate_embeddings_with_transformers(input_text)
        data_entry = {
            "id": entry_id,
            "metadata": entry["metadata"],  # Copy all metadata
            "embedding": list(embeddings)  # Convert embedding array to list for JSON serialization
        }
        json_data_with_embeddings.append(data_entry)

# Save the updated data
output_data = json_data_with_embeddings

Existing entries: 71
Entry Error Code 0xa0220001: No memory already exists in the dataset.
Entry Error Code 0xa1230002: Inserting identifier failed already exists in the dataset.
Entry Error Code 0xa1230003: Identifier not found already exists in the dataset.
Entry Error Code 0xa2230004: L-value not a number already exists in the dataset.
Entry Error Code 0xa2230005: R-value not a number already exists in the dataset.
Entry Error Code 0xa1230006: Not an identifier already exists in the dataset.
Entry Error Code 0xa1220007: Unrecognized token already exists in the dataset.
Entry Error Code 0xa1230008: R-value not bound already exists in the dataset.
Entry Error Code 0xa1230009: Bad number already exists in the dataset.
Entry Error Code 0xa123000a: Bad tree already exists in the dataset.
Entry Error Code 0xa123000b: Invalid entry already exists in the dataset.
Entry Error Code 0xa122000c: Function identifier is protected already exists in the dataset.
Entry Error Code 0xa223000d: Undersp

In [189]:
# Serialize completely before opening the file. Opening with 'w' truncates the
# file at once, so an error raised part-way through json.dump would leave a
# partial file and discard the embeddings that were already stored.
serialized = json.dumps(output_data, indent=4)
with open(json_output_file, 'w', encoding='utf-8') as f:
    f.write(serialized)