 <h3>Git Clone</h3>

In [1]:
import os
import subprocess
from urllib.parse import urlparse

def clone_repo(repo_url: str, destination_dir: str = "./cloned_repo"):
    try:
        if os.path.exists(destination_dir):
            print(f"[INFO] Destination '{destination_dir}' already exists. Removing it first...")
            subprocess.run(["rm", "-rf", destination_dir], check=True)

        print(f"[INFO] Cloning repo from {repo_url} to {destination_dir}...")
        subprocess.run(["git", "clone", repo_url, destination_dir], check=True)
        print("[SUCCESS] Repo cloned successfully.")
        return destination_dir
    except subprocess.CalledProcessError as e:
        print(f"[ERROR] Failed to clone repository: {e}")
        return None

# Example usage
if __name__ == "__main__":
    repo_url = "https://github.com/sivaprasadreddy/spring-modular-monolith.git"  # replace this
    destination = "./monolith_code"  # change if you want a different dir
    clone_repo(repo_url, destination)


[INFO] Cloning repo from https://github.com/sivaprasadreddy/spring-modular-monolith.git to ./monolith_code...
[SUCCESS] Repo cloned successfully.


 <h3>File Analysis</h3>

In [2]:
import os
import json
import threading
import openai
from pathlib import Path
from typing import List
from queue import Queue

openai.api_key = os.environ.get("OPEN_AI_API_KEY")

MAX_CHARS = 8000
max_threads = 8
skip_files = ('package-info.java', 'module-info.java', 'pom.xml', 'Dockerfile', '.gitignore', 'README.md', '.git', "Test.java", "Tests.java")

def read_code_file(file_path: Path) -> List[str]:
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        code = f.read()
    return [code[i:i + MAX_CHARS] for i in range(0, len(code), MAX_CHARS)]

def analyze_chunk(chunk: str, file_name: str) -> str:
    prompt = f"""
        You are a software architect assistant. Analyze the following code chunk from a file named '{file_name}'.
        Return a JSON object with the following fields:
        - "internal_dependencies": list of filenames this code depends on
        - "external_dependencies": list of libraries or packages it uses
        - "functions": an array of descriptions of the functions in this code

        Code:
        ```
        {chunk}
        ```

        Only return the JSON, nothing else.
        """
    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2
    )
    return response.choices[0].message.content

def worker(queue: Queue, results: List[dict], lock: threading.Lock):
    while not queue.empty():
        file_path = queue.get()
        file = file_path.name

        print(f"[INFO] Analyzing {file_path}")
        try:
            file_chunks = read_code_file(file_path)
            combined_analysis = {
                "file_name": file,
                "file_path": str(file_path),
                "internal_dependencies": [],
                "external_dependencies": [],
                "functions": []
            }

            for chunk in file_chunks:
                try:
                    analysis = analyze_chunk(chunk, file)
                    data = json.loads(analysis)
                    combined_analysis["internal_dependencies"].extend(data.get("internal_dependencies", []))
                    combined_analysis["external_dependencies"].extend(data.get("external_dependencies", []))
                    combined_analysis["functions"].extend(data.get("functions", []))
                except Exception as e:
                    print(f"[ERROR] Failed analyzing chunk from {file}: {e}")

            # Remove duplicates
            combined_analysis["internal_dependencies"] = list(set(combined_analysis["internal_dependencies"]))
            combined_analysis["external_dependencies"] = list(set(combined_analysis["external_dependencies"]))

            with lock:
                results.append(combined_analysis)

        except Exception as e:
            print(f"[ERROR] Failed analyzing file {file_path}: {e}")
        finally:
            queue.task_done()

def analyze_repo_code(repo_path: str, output_json_path: str):
    file_queue = Queue()
    results = []
    lock = threading.Lock()

    # Populate queue with all valid code files
    for root, _, files in os.walk(repo_path):
        for file in files:
            if file.endswith(('.py', '.js', '.ts', '.java', '.go', '.rb')) and not file.endswith(skip_files):
                file_queue.put(Path(root) / file)

    # Start worker threads
    threads = []
    for _ in range(min(max_threads, file_queue.qsize())):
        t = threading.Thread(target=worker, args=(file_queue, results, lock))
        t.start()
        threads.append(t)

    # Wait for all threads to finish
    for t in threads:
        t.join()

    # Save output
    with open(output_json_path, 'w') as out_f:
        json.dump(results, out_f, indent=2)
    print(f"[SUCCESS] Analysis saved to {output_json_path}")

# Example usage
if __name__ == "__main__":
    analyze_repo_code("./monolith_code", "file_analysis.json")


[INFO] Analyzing monolith_code\src\main\java\com\sivalabs\bookstore\BookStoreApplication.java[INFO] Analyzing monolith_code\src\main\java\com\sivalabs\bookstore\catalog\ProductApi.java

[INFO] Analyzing monolith_code\src\main\java\com\sivalabs\bookstore\catalog\ProductDto.java
[INFO] Analyzing monolith_code\src\main\java\com\sivalabs\bookstore\catalog\domain\ProductEntity.java
[INFO] Analyzing monolith_code\src\main\java\com\sivalabs\bookstore\catalog\domain\ProductNotFoundException.java
[INFO] Analyzing monolith_code\src\main\java\com\sivalabs\bookstore\catalog\domain\ProductRepository.java
[INFO] Analyzing monolith_code\src\main\java\com\sivalabs\bookstore\catalog\domain\ProductService.java
[INFO] Analyzing monolith_code\src\main\java\com\sivalabs\bookstore\catalog\mappers\ProductMapper.java
[INFO] Analyzing monolith_code\src\main\java\com\sivalabs\bookstore\catalog\web\CatalogExceptionHandler.java
[INFO] Analyzing monolith_code\src\main\java\com\sivalabs\bookstore\catalog\web\Produc

In [3]:
def clean_code(response_text: str):
    response_text = response_text.strip()
    
    parts = response_text.split("```")
    if len(parts) > 1:
        response_text = parts[1].strip()
    
    if response_text.lower().startswith("python"):
        response_text = response_text.split("\n", 1)[-1].strip()
        
    if response_text.lower().startswith("JSON"):
        response_text = response_text.split("\n", 1)[-1].strip()
    
    if "Note that" in response_text:
        response_text = response_text.split("Note that", 1)[0].strip()
    
    response_text = response_text.replace("```", "").strip()
    
    return response_text


 <h3>Micro-service Identification</h3>

In [5]:
# pip install "autogen-agentchat[teachable]~=0.2"

In [14]:
import os
from autogen import UserProxyAgent, config_list_from_json
from autogen.agentchat.contrib.capabilities.teachability import Teachability
from autogen import ConversableAgent 

filter_dict = {"model": ["gpt-4"]}
llm_config = {
    "config_list": [
        {
            "model": "gpt-4",
            "api_key": os.getenv("OPEN_AI_API_KEY")
        }
    ],
    "timeout": 120
}

teachable_agent = ConversableAgent(
    name="teachable_agent", 
    llm_config=llm_config
)

teachability = Teachability(
    reset_db=False, 
    path_to_db_dir="./tmp/interactive/teachability_db" 
)

teachability.add_to_agent(teachable_agent)

user = UserProxyAgent("user", human_input_mode="ALWAYS")

teachable_agent.initiate_chat(user, message="Hi, I'm a teachable user assistant! What's on your mind?")

[92m
LOADING MEMORY FROM DISK[0m
[92m    Location = ./tmp/interactive/teachability_db\uid_text_dict.pkl[0m
[33mteachable_agent[0m (to user):

Hi, I'm a teachable user assistant! What's on your mind?

--------------------------------------------------------------------------------
[33muser[0m (to teachable_agent):

i am shubham

--------------------------------------------------------------------------------
[31m
>>>>>>>> USING AUTO REPLY...[0m
[33mteachable_agent[0m (to user):

Nice to meet you, Shubham! How can I assist you today?

--------------------------------------------------------------------------------
[31m
>>>>>>>> TERMINATING RUN (4ce2a270-63d1-4915-8f92-094e59b74955): User requested to end the conversation[0m


ChatResult(chat_id=None, chat_history=[{'content': "Hi, I'm a teachable user assistant! What's on your mind?", 'role': 'assistant', 'name': 'teachable_agent'}, {'content': 'i am shubham', 'role': 'user', 'name': 'user'}, {'content': 'Nice to meet you, Shubham! How can I assist you today?', 'role': 'assistant', 'name': 'teachable_agent'}], summary='Nice to meet you, Shubham! How can I assist you today?', cost={'usage_including_cached_inference': {'total_cost': 0.0029100000000000003, 'gpt-4-0613': {'cost': 0.0029100000000000003, 'prompt_tokens': 63, 'completion_tokens': 17, 'total_tokens': 80}}, 'usage_excluding_cached_inference': {'total_cost': 0.0029100000000000003, 'gpt-4-0613': {'cost': 0.0029100000000000003, 'prompt_tokens': 63, 'completion_tokens': 17, 'total_tokens': 80}}}, human_input=[])

In [23]:
import os
import json
from autogen import  UserProxyAgent, AssistantAgent, GroupChatManager
from autogen.agentchat.contrib.capabilities.teachability import Teachability

filter_dict = {"model": ["gpt-4"]}
llm_config = {
    "config_list": [
        {
            "model": "gpt-4",
            "api_key": os.getenv("OPEN_AI_API_KEY")
        }
    ],
    "timeout": 120
}

# teachable_agent = ConversableAgent(
#     name="teachable_agent", 
#     llm_config=llm_config,
#     human_input_mode="NEVER"
# )
def process_json_chunks(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    for chunk in data:
        chunk_text = json.dumps(chunk)
        teachable_agent.initiate_chat(
            recipient=teachable_agent,
            message=f"Please learn and summarize this information: {chunk_text}",
            silent=True
        )
    
    final_summary = teachable_agent.initiate_chat(
        recipient=teachable_agent,
        message="Please provide a comprehensive summary of all the information you've learned.",
        silent=True
    )
    
    return final_summary

summary_agent = AssistantAgent(
    name="teachable_agent", 
    llm_config=llm_config,
    human_input_mode="NEVER",
    system_message="""You are an assistant agent whose only purpose is to summarize data""",
    function_map={"process_json_chunks": process_json_chunks}
)

# teachability = Teachability(
#     reset_db=False, 
#     path_to_db_dir="./tmp/interactive/teachability_db" 
# )
# teachability.add_to_agent(teachable_agent)

# def process_json_chunks(file_path):
#     with open(file_path, 'r') as f:
#         data = json.load(f)
    
#     for chunk in data:
#         chunk_text = json.dumps(chunk)
#         teachable_agent.initiate_chat(
#             recipient=teachable_agent,
#             message=f"Please learn and summarize this information: {chunk_text}",
#             silent=True
#         )
    
#     final_summary = teachable_agent.initiate_chat(
#         recipient=teachable_agent,
#         message="Please provide a comprehensive summary of all the information you've learned.",
#         silent=True
#     )
    
#     return final_summary

# summary = process_json_chunks("file_analysis.json")
# print("\nFinal Summary:")
# print(summary)

In [17]:
import os
import json
import time
from tqdm import tqdm
from autogen import ConversableAgent
from autogen.agentchat.contrib.capabilities.teachability import Teachability

# Set up agent and teachability
llm_config = {
    "config_list": [
        {
            "model": "gpt-4",
            "api_key": os.getenv("OPEN_AI_API_KEY")
        }
    ],
    "timeout": 120
}

teachable_agent = ConversableAgent(
    name="teachable_agent",
    llm_config=llm_config,
    human_input_mode="NEVER"
)

teachability = Teachability(
    reset_db=False,
    path_to_db_dir="./tmp/interactive/teachability_db"
)
teachability.add_to_agent(teachable_agent)

# Load file analysis data
with open("file_analysis.json", "r") as f:
    analysis_data = json.load(f)

# Microservice classification
microservices = []
unique_services = set()

print("[INFO] Processing", len(analysis_data), "files...")

for idx, file in enumerate(tqdm(analysis_data, desc="Processing Files")):
    file_name = file.get("file_name", "UnknownFile")
    functions = str(file.get("functions", ""))[:1000]
    internal_deps = file.get("internal_dependencies", [])[:10]

    prompt = f"""
You are designing a microservices architecture. Below is a summary of a code file:
- File: {file_name}
- Internal Dependencies: {internal_deps}
- Functions: {functions}

Based on this, decide how to group this file into a microservice. Consider cohesion, dependencies, and overall design.
Only generate services that are necessary — keep total count between 6 to 9 services.

Return a JSON in the format:
{{
  "suggested_microservice": "name",
  "reason": "short explanation why this file belongs there"
}}
"""

    try:
        start_time = time.time()
        chat_result = teachable_agent.generate_reply(sender=teachable_agent, prompt=prompt)
        message = chat_result.message.content.strip()

        # Clean markdown wrapping
        if message.startswith("```json"):
            message = message.split("```json")[-1].split("```")[0].strip()

        result = json.loads(message)

        if not isinstance(result, dict) or "suggested_microservice" not in result:
            raise ValueError("Invalid format received from agent")

        microservices.append({
            "file_name": file_name,
            "microservice": result["suggested_microservice"],
            "reason": result["reason"]
        })

        unique_services.add(result["suggested_microservice"])
        print(f"[{idx+1}] ✅ {file_name} → {result['suggested_microservice']} ({time.time() - start_time:.1f}s)")

    except Exception as e:
        print(f"[{idx+1}] ❌ {file_name} -> Error: {e}")

# Save result
with open("microservices_list.json", "w") as f:
    json.dump(microservices, f, indent=2)

print(f"\n[SUCCESS] Microservice suggestions saved to microservices_list.json")
print(f"Unique microservices generated: {len(unique_services)} → {list(unique_services)}")


[92m
LOADING MEMORY FROM DISK[0m
[92m    Location = ./tmp/interactive/teachability_db\uid_text_dict.pkl[0m
[INFO] Processing 43 files...


Processing Files: 100%|██████████| 43/43 [00:00<00:00, 14337.79it/s]

[1] ❌ ProductRepository.java -> Error: list index out of range
[2] ❌ BookStoreApplication.java -> Error: list index out of range
[3] ❌ ProductNotFoundException.java -> Error: list index out of range
[4] ❌ ProductDto.java -> Error: list index out of range
[5] ❌ ProductMapper.java -> Error: list index out of range
[6] ❌ ProductApi.java -> Error: list index out of range
[7] ❌ ProductService.java -> Error: list index out of range
[8] ❌ CatalogExceptionHandler.java -> Error: list index out of range
[9] ❌ PagedResult.java -> Error: list index out of range
[10] ❌ InventoryRepository.java -> Error: list index out of range
[11] ❌ ProductEntity.java -> Error: list index out of range
[12] ❌ ProductWebController.java -> Error: list index out of range
[13] ❌ ProductRestController.java -> Error: list index out of range
[14] ❌ InventoryEntity.java -> Error: list index out of range
[15] ❌ OrderEventNotificationHandler.java -> Error: list index out of range
[16] ❌ CreateOrderResponse.java -> Error: lis




In [24]:
pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-4.0.2-py3-none-any.whl.metadata (13 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence_transformers)
  Downloading transformers-4.51.2-py3-none-any.whl.metadata (38 kB)
Collecting torch>=1.11.0 (from sentence_transformers)
  Using cached torch-2.6.0-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting scikit-learn (from sentence_transformers)
  Using cached scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting scipy (from sentence_transformers)
  Using cached scipy-1.15.2-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting networkx (from torch>=1.11.0->sentence_transformers)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch>=1.11.0->sentence_transformers)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting sympy==1.13.1 (from torch>=1.11.0->sentence_transformers)
  Using cached sympy-1.13.1-py3-none-any.whl.metadata (12 kB)


[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [25]:
import os
import json
import chromadb
import requests
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from chromadb.utils.embedding_functions import DefaultEmbeddingFunction
from autogen import ConversableAgent

# === Load environment variables ===
load_dotenv()
OPEN_API_KEY = os.getenv("OPEN_AI_API_KEY")

# === Config ===
OPEN_API_URL = "https://api.openai.com/v1/chat/completions"
GROQ_MODEL = "gpt-4"
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
CHUNK_SIZE = 2000  # characters per chunk

# === SentenceTransformer Embedding Function ===
class LocalEmbeddingFunction(DefaultEmbeddingFunction):
    def __init__(self, model_name=EMBED_MODEL_NAME):
        self.model = SentenceTransformer(model_name)

    def __call__(self, texts):
        return self.model.encode(texts).tolist()

embedding_func = LocalEmbeddingFunction()

# === Setup ChromaDB ===
client = chromadb.Client()
collection = client.get_or_create_collection("summaries", embedding_function=embedding_func)

# === Use AutoGen Agent to call Groq ===
class GroqAgent(ConversableAgent):
    def summarize(self, content: str) -> str:
        payload = {
            "model": GROQ_MODEL,
            "messages": [
                {"role": "system", "content": "You are a helpful assistant that summarizes text."},
                {"role": "user", "content": f"Summarize the following:\n\n{content}"}
            ],
            "temperature": 0.5,
            "max_tokens": 300
        }
        headers = {
            "Authorization": f"Bearer {OPEN_API_KEY}",
            "Content-Type": "application/json"
        }
        response = requests.post(OPEN_API_URL, headers=headers, json=payload)
        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"].strip()
        else:
            raise Exception(f"Groq API Error: {response.status_code} - {response.text}")

agent = GroqAgent(name="groq-agent")

# === Chunking Function ===
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

# === Store in Chroma ===
def store_summary(original: str, summary: str, metadata: dict = {}):
    doc_id = str(abs(hash(original)))[:16]
    collection.add(
        documents=[original],
        metadatas=[{"summary": summary, **metadata}],
        ids=[doc_id]
    )
    print(f"\n✅ Stored in ChromaDB with ID: {doc_id}")

# === Query Chroma ===
def query_summary(query: str, top_k: int = 3):
    results = collection.query(query_texts=[query], n_results=top_k)
    print("\n📚 Top Results:\n")
    for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
        print(f"Document:\n{doc}\nSummary: {meta['summary']}\n---\n")

# === Main Flow ===
def main():
    mode = input("Enter mode (summarize / retrieve): ").strip().lower()

    if mode == "summarize":
        path = input("Enter path to .json file: ").strip()
        if not os.path.isfile(path) or not path.endswith(".json"):
            print("❌ Invalid file path.")
            return

        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
                full_text = json.dumps(data, indent=2)
        except Exception as e:
            print(f"❌ Error reading JSON: {e}")
            return

        chunks = chunk_text(full_text)
        print(f"\n🔍 JSON split into {len(chunks)} chunks. Generating summaries...\n")
        chunk_summaries = []

        for i, chunk in enumerate(chunks):
            print(f"🧠 Summarizing chunk {i+1}/{len(chunks)}...")
            summary = agent.summarize(chunk)
            chunk_summaries.append(summary)

        final_input = "\n\n".join([f"Chunk {i+1} Summary: {s}" for i, s in enumerate(chunk_summaries)])
        print("\n📦 Generating final summary from chunk summaries...\n")
        final_summary = agent.summarize(final_input)

        print(f"\n📝 Final Summary:\n{final_summary}")
        store_summary(full_text, final_summary, metadata={"source_file": path})

    elif mode == "retrieve":
        query = input("Enter your search query: ")
        query_summary(query)

    else:
        print("❌ Invalid mode. Choose 'summarize' or 'retrieve'.")

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`



🔍 JSON split into 23 chunks. Generating summaries...

🧠 Summarizing chunk 1/23...
🧠 Summarizing chunk 2/23...
🧠 Summarizing chunk 3/23...
🧠 Summarizing chunk 4/23...
🧠 Summarizing chunk 5/23...
🧠 Summarizing chunk 6/23...
🧠 Summarizing chunk 7/23...
🧠 Summarizing chunk 8/23...
🧠 Summarizing chunk 9/23...
🧠 Summarizing chunk 10/23...
🧠 Summarizing chunk 11/23...
🧠 Summarizing chunk 12/23...
🧠 Summarizing chunk 13/23...
🧠 Summarizing chunk 14/23...
🧠 Summarizing chunk 15/23...
🧠 Summarizing chunk 16/23...
🧠 Summarizing chunk 17/23...
🧠 Summarizing chunk 18/23...
🧠 Summarizing chunk 19/23...
🧠 Summarizing chunk 20/23...
🧠 Summarizing chunk 21/23...
🧠 Summarizing chunk 22/23...
🧠 Summarizing chunk 23/23...

📦 Generating final summary from chunk summaries...


📝 Final Summary:
The text discusses various Java files and their functionalities in a bookstore application. These include files for product and order management, exception handling, and dependencies. Some of the notable files includ

In [None]:
import os
import json
import chromadb
import requests
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from chromadb.utils.embedding_functions import DefaultEmbeddingFunction
from autogen import ConversableAgent

# === Load environment variables ===
load_dotenv()
OPEN_API_KEY = os.getenv("OPEN_AI_API_KEY")

# === Config ===
OPEN_API_URL = "https://api.openai.com/v1/chat/completions"
GROQ_MODEL = "gpt-4"
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
CHUNK_SIZE = 18000  # characters per chunk

# === SentenceTransformer Embedding Function ===
class LocalEmbeddingFunction(DefaultEmbeddingFunction):
    def __init__(self, model_name=EMBED_MODEL_NAME):
        self.model = SentenceTransformer(model_name)

    def __call__(self, texts):
        return self.model.encode(texts).tolist()

embedding_func = LocalEmbeddingFunction()

# === Setup ChromaDB ===
client = chromadb.Client()
collection = client.get_or_create_collection("microservices", embedding_function=embedding_func)

# === Use AutoGen Agent to call Groq ===
class GroqAgent(ConversableAgent):
    def generate_microservices(self, content: str) -> str:
        payload = {
            "model": GROQ_MODEL,
            "messages": [
                {
                    "role": "system",
                    "content": (
                        "You are a software architect helping to migrate a monolith to microservices. "
                        "Given code and related analysis, suggest which microservice this content belongs to, "
                        "what its role would be, and what service boundaries it should respect."
                    )
                },
                {"role": "user", "content": content}
            ],
            "temperature": 0.3,
            "max_tokens": 500
        }
        headers = {
            "Authorization": f"Bearer {OPEN_API_KEY}",
            "Content-Type": "application/json"
        }
        response = requests.post(OPEN_API_URL, headers=headers, json=payload)
        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"].strip()
        else:
            raise Exception(f"Groq API Error: {response.status_code} - {response.text}")

agent = GroqAgent(name="microservice-agent")

# === Chunking Function ===
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

# === Store in ChromaDB ===
def store_microservice_mapping(original: str, mapping: str, metadata: dict = {}):
    doc_id = str(abs(hash(original)))[:16]
    collection.add(
        documents=[original],
        metadatas=[{"microservice_mapping": mapping, **metadata}],
        ids=[doc_id]
    )
    print(f"\n✅ Stored microservice mapping in ChromaDB with ID: {doc_id}")

# === Query ChromaDB ===
def query_microservices(query: str, top_k: int = 3):
    results = collection.query(query_texts=[query], n_results=top_k)
    print("\n🔍 Top Microservice Mapping Results:\n")
    for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
        print(f"Document Snippet:\n{doc[:300]}...\nMapping: {meta['microservice_mapping']}\n---\n")

# === Main Flow for Step 3 ===
def main():
    mode = input("Enter mode (generate / retrieve): ").strip().lower()

    if mode == "generate":
        path = input("Enter path to .json file (file analysis): ").strip()
        if not os.path.isfile(path) or not path.endswith(".json"):
            print("❌ Invalid file path.")
            return

        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
                full_text = json.dumps(data, indent=2)
        except Exception as e:
            print(f"❌ Error reading JSON: {e}")
            return

        chunks = chunk_text(full_text)
        print(f"\n📦 File analysis split into {len(chunks)} chunks. Generating microservice mappings...\n")
        microservice_mappings = []

        for i, chunk in enumerate(chunks):
            print(f"🔧 Processing chunk {i+1}/{len(chunks)}...")
            mapping = agent.generate_microservices(chunk)
            microservice_mappings.append(mapping)

        all_mappings_text = "\n\n".join([f"Chunk {i+1} Mapping: {m}" for i, m in enumerate(microservice_mappings)])
        print("\n📦 Generating final microservice organization...\n")
        final_mapping = agent.generate_microservices(all_mappings_text)

        print(f"\n🧭 Final Microservice Architecture:\n{final_mapping}")
        store_microservice_mapping(full_text, final_mapping, metadata={"source_file": path})

    elif mode == "retrieve":
        query = input("Enter your search query (e.g., 'order service'): ")
        query_microservices(query)

    else:
        print("❌ Invalid mode. Choose 'generate' or 'retrieve'.")

if __name__ == "__main__":
    main()


ValueError: An instance of Chroma already exists for ephemeral with different settings