# Query Data and Database

In [1]:
from pathlib import Path
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings


## Analyze Available Data

In [2]:
# Path to your data_processed folder, one level up from notebooks
data_processed_path = Path("..") / "data_processed"

# Function to print file lengths for .md and .txt files recursively
def print_file_lengths(folder_path):
    for file_path in folder_path.rglob("*"):
        if file_path.suffix in [".md", ".txt"]:
            try:
                content = file_path.read_text(encoding='utf-8')
                # Length in characters
                length_chars = len(content)
                # Optional: length in lines
                length_lines = content.count('\n') + 1
                print(f"{file_path.relative_to(folder_path)}: {length_chars} chars, {length_lines} lines")
            except Exception as e:
                print(f"Could not read {file_path}: {e}")

print_file_lengths(data_processed_path)


github_work_readmes/robotic-arm-training_README.md: 1150 chars, 5 lines
github_work_readmes/transformer-translation_README.md: 1112 chars, 5 lines
github_work_readmes/tts-agent_README.md: 1126 chars, 5 lines
github_work_readmes/langgraph-two-tool-demo_README.md: 1044 chars, 5 lines
github_work_readmes/spam-detection_README.md: 1115 chars, 5 lines
github_work_readmes/shopping-basket_README.md: 1039 chars, 5 lines
medium_articles/2023-11-10 Visualizing the Q-Learning Algorithm.md: 1045 chars, 6 lines
medium_articles/2025-08-22 Security Agent in Action Real-Time Drone Detection and Tracking with Python.md: 1158 chars, 6 lines
medium_articles/2025-09-18 Drone Live Detection FastAPI vs Spring Boot.md: 1050 chars, 6 lines
medium_articles/2025-08-04 Quantum Machine Learning QML A Practical Introduction with PennyLane and Qiskit.md: 1186 chars, 6 lines
medium_articles/2023-11-08 Solving the Lunar Lander with Genetic Algorithms.md: 1081 chars, 6 lines
medium_articles/2025-05-25 Retrieval Augmen

## Load Vector Store

In [3]:
def load_vector_store():
    """Load existing vector store."""
    persist_dir = Path("..") / "chroma_db"
    if not persist_dir.exists() or not any(persist_dir.iterdir()):
        raise FileNotFoundError(
            f"Vector store not found at {persist_dir}\n"
            "Please run: python -m src.create_database"
        )
    print("Loading vector store...")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    vector_store = Chroma(
        persist_directory=str(persist_dir),
        embedding_function=embeddings,
        collection_name="digital_twin"
    )
    print(f"✓ Loaded vector store from {persist_dir}\n")
    return vector_store

vector_store = load_vector_store()
collection = vector_store

Loading vector store...
✓ Loaded vector store from ../chroma_db



## Analyze Metadata

In [4]:

def show_chunk_metadata(collection, num_chunks=605):
    """Prints out chunk IDs and metadata for the specified number of chunks."""
    try:
        data = collection.get(limit=num_chunks)  # use limit for performance if supported
    except Exception as e:
        print(f"Error fetching data: {e}")
        return
    total = min(num_chunks, len(data["ids"]))
    print(f"Showing metadata for {total} chunks:\n")
    for idx in range(total):
        chunk_id = data["ids"][idx]
        metadata = data["metadatas"][idx]
        print(f"Chunk {idx+1}: ID={chunk_id}, Metadata={metadata}")


show_chunk_metadata(collection, num_chunks=605)

Showing metadata for 55 chunks:

Chunk 1: ID=c2f0297d-a21d-4f55-abb6-6673880c7bb0, Metadata={'type': 'profile', 'source': 'linkedin'}
Chunk 2: ID=7ad5eb24-2a0e-43de-b37f-8f5f8050d0c2, Metadata={'file': 'java-ai-assistant_README.md', 'type': 'project', 'source': 'github'}
Chunk 3: ID=9c2f777d-612a-4959-951b-897dbf770b83, Metadata={'source': 'github', 'file': 'planning-agent_README.md', 'type': 'project'}
Chunk 4: ID=71cf6106-d913-44e1-a8a5-6a55f2125c61, Metadata={'type': 'project', 'file': 'quantum-computing-agent_README.md', 'source': 'github'}
Chunk 5: ID=f7ed4073-9a4d-4add-811f-fcd15c0b2689, Metadata={'type': 'project', 'source': 'github', 'file': 'Timer_README.md'}
Chunk 6: ID=fc328fb9-6ee1-4ed4-935d-b0ad48b123f5, Metadata={'type': 'project', 'source': 'github', 'file': 'drone-guard_README.md'}
Chunk 7: ID=c5b25e2f-7d43-419e-a7fe-d13311e96fa6, Metadata={'file': 'python-ai-assistant_README.md', 'source': 'github', 'type': 'project'}
Chunk 8: ID=020c131c-fdc6-423d-8469-3d2bba073732, M

## Analyze Chunks

In [5]:
def show_selected_chunks(collection, selected_indices):
    data = collection.get()
    total_chunks = len(data["ids"])
    
    for idx in selected_indices:
        if 0 <= idx < total_chunks:
            chunk_id = data["ids"][idx]
            metadata = data["metadatas"][idx]
            content = data["documents"][idx]
            print(f"Chunk {idx}: ID={chunk_id}")
            print(f"Metadata: {metadata}")
            print(f"Content:\n{content}\n{'-'*40}\n")
        else:
            print(f"Index {idx} out of range (0 to {total_chunks - 1})")


show_selected_chunks(collection, selected_indices=[0, 1, 2, 3, 4, 55])


Chunk 0: ID=c2f0297d-a21d-4f55-abb6-6673880c7bb0
Metadata: {'type': 'profile', 'source': 'linkedin'}
Content:
{
  "profile": {
    "link": [
      "https://de.linkedin.com/in/yauheniya-varabyova"
    ],
    "tagline": [
      "AI",
      "BI",
      "Data Science",
      "LLM",
      "ML"
    ],
    "experience": [
      {
        "company": "adesso SE",
        "employment_type": "Full-time",
        "start_date": "Oct 2025",
        "end_date": "Present",
        "location": "Berlin, Germany",
        "title": "Senior Consultant",
        "skills": [
          "AI",
          "Agents",
          "ML"
        ]
      },
      {
        "company": "Self-employed",
        "employment_type": "Full-time",
        "start_date": "Oct 2023",
        "end_date": "Sep 2025",
        "location": "Berlin, Germany",
        "title": "AI Engineer",
        "skills": [
          "AI",
          "Data Analysis",
          "Large Language Models (LLM)",
          "ML",
          "Software Developmen