In [1]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import time
import json
import random
import requests
import psutil
import chromadb
from dotenv import load_dotenv

load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
url = "https://api.groq.com/openai/v1/chat/completions"
headers = {
    "Authorization": f"Bearer {GROQ_API_KEY}",
    "Content-Type": "application/json"
}

client = chromadb.PersistentClient(path="chroma_db_2")
collection = client.get_or_create_collection(name="doctor_profiles_2")

existing_names = set()

PROMPT_TEMPLATE = """
Generate exactly 10 unique doctor profiles in valid JSON format.
Do not include any introductory or explanatory text—only return a valid JSON array.
Each profile should have the following fields:
- Name
- Specialization
- Sub-Specialization
- Years of Experience
- Hospital/Clinic Affiliation
- Certification
- Research
"""

def check_memory():
    mem = psutil.virtual_memory()
    if mem.percent > 95:
        print("High memory usage detected. Pausing for 30 seconds...")
        time.sleep(30)

def generate_profiles():
    payload = {
        "model": "llama3-70b-8192",
        "messages": [{"role": "user", "content": PROMPT_TEMPLATE}],
        "temperature": 0.7,
        "max_tokens": 2000
    }

    max_retries = 5
    base_delay = 10

    for attempt in range(max_retries):
        try:
            response = requests.post(url, headers=headers, json=payload)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError as e:
            if response.status_code == 429:
                delay = base_delay * (2 ** attempt)
                delay += random.uniform(1, 5)
                print(f"Rate limit hit. Retrying in {delay:.2f} seconds...")
                time.sleep(delay)
            else:
                print(f"Request failed (attempt {attempt + 1}): {e}")
                time.sleep(30)

    print("Max retries reached. API is not responding.")
    return None

def extract_profiles(response):
    if not response or "choices" not in response:
        return []

    try:
        content = response["choices"][0]["message"]["content"].strip()

        json_start = content.find("[")
        json_end = content.rfind("]") + 1

        if json_start != -1 and json_end != -1:
            content = content[json_start:json_end]

        return json.loads(content)
    except json.JSONDecodeError:
        print("Error decoding JSON response. Raw response:")
        print(content)
        return []

def load_existing_names():
    global existing_names
    existing_names.clear()

    try:
        batch_size, offset = 100, 0
        while True:
            existing_docs = collection.get(include=["metadatas"], limit=batch_size, offset=offset)
            if not existing_docs or "metadatas" not in existing_docs or not existing_docs["metadatas"]:
                break

            for metadata in existing_docs["metadatas"]:
                if isinstance(metadata, dict) and "Name" in metadata:
                    existing_names.add(metadata["Name"])

            offset += batch_size
    except Exception as e:
        print(f"Error fetching ChromaDB metadata: {e}")

def save_profiles_to_chroma(profiles):
    load_existing_names()

    new_profiles = [p for p in profiles if p["Name"] not in existing_names]

    if new_profiles:
        try:
            doc_contents = [json.dumps(p) for p in new_profiles]
            metadata_list = new_profiles
            doc_ids = [p["Name"] for p in new_profiles]

            # Add documents in batch.
            collection.add(documents=doc_contents, metadatas=metadata_list, ids=doc_ids)
            print(f"{len(new_profiles)} profiles added to ChromaDB.")
        except Exception as e:
            print(f"Error adding profiles to ChromaDB: {e}")
    else:
        print("No new profiles to add.")

def count_profiles_in_chroma():
    try:
        stored_profiles = collection.get(include=["documents"])
        return len(stored_profiles["documents"]) if stored_profiles and "documents" in stored_profiles else 0
    except Exception as e:
        print(f"Error counting ChromaDB profiles: {e}")
        return 0

def main():
    total_profiles, batch_size, created_profiles = 500, 10, 0
    load_existing_names()

    while created_profiles < total_profiles:
        doctor_profiles = []

        while len(doctor_profiles) < batch_size:
            check_memory()

            response = generate_profiles()
            if response is None:
                print("API did not return a response. Retrying...")
                continue

            new_profiles = extract_profiles(response)
            for p in new_profiles:
                if p["Name"] not in existing_names:
                    doctor_profiles.append(p)
                    existing_names.add(p["Name"])

        if doctor_profiles:
            save_profiles_to_chroma(doctor_profiles)
            print(f"Batch Processed: {len(doctor_profiles)} profiles")
            print(f"Total Profiles in ChromaDB: {count_profiles_in_chroma()}")

            created_profiles += len(doctor_profiles)

        existing_names.clear()

        sleep_time = random.uniform(15, 30)
        print(f"Sleeping for {sleep_time:.2f} seconds...\n")
        time.sleep(sleep_time)

if __name__ == "__main__":
    main()


10 profiles added to ChromaDB.
Batch Processed: 10 profiles
Total Profiles in ChromaDB: 10
Sleeping for 22.03 seconds...

9 profiles added to ChromaDB.
Batch Processed: 10 profiles
Total Profiles in ChromaDB: 19
Sleeping for 15.64 seconds...

6 profiles added to ChromaDB.
Batch Processed: 10 profiles
Total Profiles in ChromaDB: 25
Sleeping for 27.83 seconds...

4 profiles added to ChromaDB.
Batch Processed: 10 profiles
Total Profiles in ChromaDB: 29
Sleeping for 15.72 seconds...

1 profiles added to ChromaDB.
Batch Processed: 10 profiles
Total Profiles in ChromaDB: 30
Sleeping for 17.26 seconds...

1 profiles added to ChromaDB.
Batch Processed: 10 profiles
Total Profiles in ChromaDB: 31
Sleeping for 16.41 seconds...

2 profiles added to ChromaDB.
Batch Processed: 10 profiles
Total Profiles in ChromaDB: 33
Sleeping for 28.06 seconds...

4 profiles added to ChromaDB.
Batch Processed: 10 profiles
Total Profiles in ChromaDB: 37
Sleeping for 23.64 seconds...

4 profiles added to ChromaDB.
B