In [1]:
!pip install --upgrade openai  > /dev/null 2>&1
!pip install faiss-cpu==1.7.4  > /dev/null 2>&1
!pip install tiktoken==0.5.2  > /dev/null 2>&1
!pip install numpy==1.24.3 > /dev/null 2>&1

In [2]:
!cp /kaggle/input/creating-a-knowledge-base-paid-models/*.md /kaggle/working/
!cp /kaggle/input/openai-to-z-challenge-deep-research-reports/*.md /kaggle/working/

In [3]:
!rm __notebook__.ipynb
!rm __output__.json
!rm __results__.html
!rm custom.css

rm: cannot remove '__output__.json': No such file or directory
rm: cannot remove '__results__.html': No such file or directory
rm: cannot remove 'custom.css': No such file or directory


In [4]:
!ls /kaggle/working/

deep_research_Amazonian_Dark_Earths.md
deep_research_competition_strategies_ideas.md
deep_research_Detecting_Archaeological_Sites_SOTA.md
deep_research_Effectiveness_of_LIDAR.md
deep_research_Known_Archaeological_Sites_2.md
deep_research_Known_Archaeological_Sites_3.md
deep_research_Known_Archaeological_Sites.md
deep_research_Kuhikugu_Archaeological.md
deep_research_Likely_Locations_of_Archaeological.md
deep_research_Machine_Learning_Models_2.md
deep_research_Machine_Learning_Models.md
deep_research_Open_Access_LiDAR_Datasets.md
deep_research_Suspected_Unconfirmed_Sites.md
__notebook__.ipynb
question_10_anthropic_claude-3-haiku_online_20250522190620.md
question_10_meta-llama_llama-4-scout_online_20250522190644.md
question_10_openai_gpt-4.1-mini_online_20250522190606.md
question_11_anthropic_claude-3-haiku_online_20250522190717.md
question_11_meta-llama_llama-4-scout_online_20250522190732.md
question_11_openai_gpt-4.1-mini_online_20250522190706.md
question_12_anthrop

# RAG-LLM Using OpenAI Vector Store
I tried the LangChain approach, but I encountered dependency issues.

In [5]:
import os
import glob
import itertools
import time
import httpx
from openai import OpenAI, BadRequestError
from datetime import datetime
from IPython.display import display, Markdown
from typing import Optional, List

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
os.environ["OPENAI_API_KEY"] = user_secrets.get_secret("openai_key")

In [6]:
# Configuration
DIR = "/kaggle/working/"
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
VECTOR_STORE_NAME = f"OpenAI_challenge_{timestamp}"
print(VECTOR_STORE_NAME)
SEARCH_K = 5
CHAT_MODEL = "gpt-4.1" #gpt-4o o3-mini
TEMPERATURE = 0.2
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Initialize OpenAI client
client = OpenAI()

def get_or_create_vector_store(name: str) -> str:
    """
    Retrieves an existing vector store by name or creates a new one.
    Returns the vector_store_id.
    """
    stores = client.vector_stores.list()
    for store in stores.data:
        if store.name == name:
            print(f"Found existing vector store '{name}' (id={store.id})")
            return store.id
    vs = client.vector_stores.create(name=name)
    print(f"Created new vector store '{name}' (id={vs.id})")
    return vs.id

def upload_to_vector_store(
    dir: str,
    vector_store_id: str,
    extensions: tuple[str, ...] = ("pdf", "md", "markdown", "txt"),
    min_bytes: int = 1,  # skip files smaller than this
):
    """
    Upload every file in *dir* whose extension appears in *extensions* to OpenAI
    and index it into the given vector store, skipping empty files.

    Parameters
    ----------
    dir : str
        Folder to scan.
    vector_store_id : str
        ID of the target vector store.
    extensions : tuple[str, ...], optional
        Allowed file extensions (case-insensitive, *without* the leading dot).
    min_bytes : int, optional
        Minimum file size to upload; defaults to 1 byte (skip blank files).
    """
    exts = [ext.lower().lstrip(".") for ext in extensions]
    patterns = (os.path.join(dir, f"*.{ext}") for ext in exts)
    file_paths = list(
        itertools.chain.from_iterable(glob.glob(p, recursive=False) for p in patterns)
    )

    if not file_paths:
        print(f"No files with extensions {exts} found in {dir}")
        return

    for path in file_paths:
        filename = os.path.basename(path)

        # ── Skip zero-byte (or very small) files ──────────────────────────────
        size = os.path.getsize(path)
        if size < min_bytes:
            print(f"⚠️  Skipping {filename} (size {size} bytes)")
            continue

        print(f"Uploading {filename} ({size} bytes)…")
        try:
            with open(path, "rb") as fp:
                file_resp = client.files.create(file=fp, purpose="assistants")

            client.vector_stores.files.create(
                vector_store_id=vector_store_id,
                file_id=file_resp.id,
            )
            print(f"Indexed {filename} (file_id={file_resp.id})")
            time.sleep(0.2)  # gentle rate limit

        except BadRequestError as e:
            # Catch other per-file errors so the loop keeps going
            print(f"❌  OpenAI rejected {filename}: {e}")

def search_vector_store(vector_store_id: str, query: str) -> list:
    """
    Queries the OpenAI Vector Store via HTTP and returns up to SEARCH_K items.
    """
    if not OPENAI_API_KEY:
        raise ValueError("OPENAI_API_KEY environment variable not set")
    url = f"https://api.openai.com/v1/vector_stores/{vector_store_id}/search"
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json",
        "OpenAI-Beta": "assistants=v2"
    }
    payload = {"query": query}
    resp = httpx.post(url, headers=headers, json=payload)
    resp.raise_for_status()
    data = resp.json().get("data", [])
    # limit to top SEARCH_K
    return data[:SEARCH_K]


def answer_question(vector_store_id: str, question: str) -> str:
    """
    Retrieves relevant passages and asks the chat model to synthesize an answer.
    """
    # 1. Retrieve top passages
    items = search_vector_store(vector_store_id, question)
    # 2. Build context string
    context_parts = []
    for item in items:
        filename = item.get("filename", "unknown")
        for passage in item.get("content", []):
            context_parts.append(f"Source: {filename}\n{passage.get('text', '')}")
    context = "\n---\n".join(context_parts)

    # 3. Create chat messages
    system_msg = {
        "role": "system",
        "content": (
            "You are an expert in Maya archaeology and remote sensing. "
            "Use the provided context to answer precisely, citing sources when relevant."
        )
    }
    user_msg = {
        "role": "user",
        "content": (
            f"Context passages:\n{context}\n\n"
            f"Answer the following question: {question}"
        )
    }

    # 4. Call chat completion
    response = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=[system_msg, user_msg],
        temperature=TEMPERATURE,
        max_tokens=800
    )

    return response.choices[0].message.content

def delete_vector_store_files(vector_store_id: str) -> int:
    """
    Delete all files associated with a vector store.
    
    Parameters
    ----------
    vector_store_id : str
        ID of the vector store whose files should be deleted.
    
    Returns
    -------
    int
        Number of files deleted.
    """
    deleted_count = 0
    
    try:
        # List all files in the vector store
        files = client.vector_stores.files.list(vector_store_id=vector_store_id)
        
        for file in files.data:
            try:
                # First, remove the file from the vector store
                client.vector_stores.files.delete(
                    vector_store_id=vector_store_id,
                    file_id=file.id
                )
                
                # Then delete the actual file
                client.files.delete(file.id)
                print(f"✓ Deleted file {file.id}")
                deleted_count += 1
                time.sleep(0.1)  # Rate limiting
                
            except Exception as e:
                print(f"❌ Error deleting file {file.id}: {e}")
                
    except Exception as e:
        print(f"❌ Error listing files for vector store {vector_store_id}: {e}")
    
    return deleted_count

def delete_vector_store(vector_store_id: str) -> bool:
    """
    Delete a vector store.
    
    Parameters
    ----------
    vector_store_id : str
        ID of the vector store to delete.
    
    Returns
    -------
    bool
        True if successfully deleted, False otherwise.
    """
    try:
        client.vector_stores.delete(vector_store_id)
        print(f"✓ Deleted vector store {vector_store_id}")
        return True
    except Exception as e:
        print(f"❌ Error deleting vector store {vector_store_id}: {e}")
        return False

def cleanup_vector_store(vector_store_id: str) -> None:
    """
    Complete cleanup: delete all files and the vector store.
    
    Parameters
    ----------
    vector_store_id : str
        ID of the vector store to completely remove.
    """
    print(f"\n🧹 Starting cleanup for vector store {vector_store_id}")
    
    # Step 1: Delete all files
    print("\n📄 Deleting files...")
    files_deleted = delete_vector_store_files(vector_store_id)
    print(f"Deleted {files_deleted} files")
    
    # Step 2: Delete the vector store
    print("\n🗑️  Deleting vector store...")
    if delete_vector_store(vector_store_id):
        print("\n✅ Cleanup complete!")
    else:
        print("\n⚠️  Cleanup completed with errors")

def cleanup_all_vector_stores_by_pattern(name_pattern: str = "OpenAI_challenge_") -> None:
    """
    Delete all vector stores whose names start with the given pattern.
    Useful for cleaning up multiple test runs.
    
    Parameters
    ----------
    name_pattern : str
        Pattern to match vector store names (default: "OpenAI_challenge_")
    """
    stores = client.vector_stores.list()
    matching_stores = [s for s in stores.data if s.name.startswith(name_pattern)]
    
    if not matching_stores:
        print(f"No vector stores found matching pattern '{name_pattern}'")
        return
    
    print(f"Found {len(matching_stores)} vector stores matching '{name_pattern}'")
    for store in matching_stores:
        print(f"\nProcessing: {store.name} (id={store.id})")
        cleanup_vector_store(store.id)

def list_all_files(purpose: Optional[str] = None) -> List:
    """
    List all files in OpenAI storage.
    
    Parameters
    ----------
    purpose : str, optional
        Filter by purpose ('assistants', 'fine-tune', etc.)
        If None, returns all files.
    
    Returns
    -------
    List
        List of file objects
    """
    all_files = []
    
    try:
        # OpenAI's list method might paginate, so we need to handle that
        has_more = True
        after = None
        
        while has_more:
            if after:
                files = client.files.list(purpose=purpose, after=after)
            else:
                files = client.files.list(purpose=purpose) if purpose else client.files.list()
            
            all_files.extend(files.data)
            
            # Check if there are more files
            has_more = files.has_more if hasattr(files, 'has_more') else False
            if has_more and files.data:
                after = files.data[-1].id
            
        return all_files
        
    except Exception as e:
        print(f"❌ Error listing files: {e}")
        return []

def delete_all_files(purpose: Optional[str] = None, dry_run: bool = True) -> dict:
    """
    Delete all files in OpenAI storage.
    
    Parameters
    ----------
    purpose : str, optional
        Only delete files with this purpose ('assistants', 'fine-tune', etc.)
        If None, deletes all files.
    dry_run : bool
        If True, only shows what would be deleted without actually deleting.
    
    Returns
    -------
    dict
        Statistics about the deletion process
    """
    files = list_all_files(purpose=purpose)
    
    if not files:
        print("No files found to delete.")
        return {"total": 0, "deleted": 0, "failed": 0}
    
    stats = {
        "total": len(files),
        "deleted": 0,
        "failed": 0,
        "total_bytes": sum(f.bytes for f in files if hasattr(f, 'bytes') and f.bytes)
    }
    
    print(f"\n{'🔍 DRY RUN - ' if dry_run else ''}Found {stats['total']} files")
    print(f"Total size: {stats['total_bytes'] / (1024*1024):.2f} MB")
    
    if dry_run:
        print("\nFiles that would be deleted:")
        for f in files[:10]:  # Show first 10 files
            created_date = datetime.fromtimestamp(f.created_at).strftime('%Y-%m-%d %H:%M:%S')
            print(f"  - {f.filename} (ID: {f.id}, Purpose: {f.purpose}, Created: {created_date})")
        if len(files) > 10:
            print(f"  ... and {len(files) - 10} more files")
        print("\n⚠️  Run with dry_run=False to actually delete these files")
        return stats
    
    # Actual deletion
    print("\n🗑️  Deleting files...")
    for i, file in enumerate(files):
        try:
            client.files.delete(file.id)
            stats["deleted"] += 1
            print(f"✓ Deleted {file.filename} ({i+1}/{stats['total']})")
            time.sleep(0.1)  # Rate limiting
            
        except Exception as e:
            stats["failed"] += 1
            print(f"❌ Failed to delete {file.filename}: {e}")
    
    print(f"\n✅ Deletion complete!")
    print(f"   Deleted: {stats['deleted']}")
    print(f"   Failed: {stats['failed']}")
    
    return stats

def delete_files_by_name_pattern(pattern: str, dry_run: bool = True) -> dict:
    """
    Delete files whose names contain a specific pattern.
    
    Parameters
    ----------
    pattern : str
        Delete files whose names contain this pattern
    dry_run : bool
        If True, only shows what would be deleted
    
    Returns
    -------
    dict
        Statistics about the deletion process
    """
    all_files = list_all_files()
    matching_files = [f for f in all_files if pattern in f.filename]
    
    if not matching_files:
        print(f"No files found matching pattern '{pattern}'")
        return {"total": 0, "deleted": 0, "failed": 0}
    
    stats = {
        "total": len(matching_files),
        "deleted": 0,
        "failed": 0
    }
    
    print(f"\n{'🔍 DRY RUN - ' if dry_run else ''}Found {stats['total']} files matching '{pattern}'")
    
    if dry_run:
        print("\nFiles that would be deleted:")
        for f in matching_files[:10]:
            print(f"  - {f.filename} (ID: {f.id})")
        if len(matching_files) > 10:
            print(f"  ... and {len(matching_files) - 10} more files")
        print("\n⚠️  Run with dry_run=False to actually delete these files")
        return stats
    
    # Actual deletion
    print("\n🗑️  Deleting matching files...")
    for i, file in enumerate(matching_files):
        try:
            client.files.delete(file.id)
            stats["deleted"] += 1
            print(f"✓ Deleted {file.filename} ({i+1}/{stats['total']})")
            time.sleep(0.1)
            
        except Exception as e:
            stats["failed"] += 1
            print(f"❌ Failed to delete {file.filename}: {e}")
    
    return stats

def show_storage_summary():
    """
    Display a summary of all files in storage grouped by purpose.
    """
    all_files = list_all_files()
    
    if not all_files:
        print("No files in storage.")
        return
    
    # Group by purpose
    by_purpose = {}
    total_size = 0
    
    for f in all_files:
        purpose = f.purpose
        if purpose not in by_purpose:
            by_purpose[purpose] = {"count": 0, "size": 0, "files": []}
        
        by_purpose[purpose]["count"] += 1
        if hasattr(f, 'bytes') and f.bytes:
            by_purpose[purpose]["size"] += f.bytes
            total_size += f.bytes
        by_purpose[purpose]["files"].append(f.filename)
    
    print("\n📊 Storage Summary")
    print("=" * 50)
    print(f"Total files: {len(all_files)}")
    print(f"Total size: {total_size / (1024*1024):.2f} MB")
    print("\nBy purpose:")
    
    for purpose, info in by_purpose.items():
        print(f"\n  {purpose}:")
        print(f"    Count: {info['count']}")
        print(f"    Size: {info['size'] / (1024*1024):.2f} MB")
        print(f"    Files: {', '.join(info['files'][:3])}" + 
              (f" ... and {info['count'] - 3} more" if info['count'] > 3 else ""))

OpenAI_challenge_20250523-082311


In [7]:
# 1. Get or create vector store
vs_id = get_or_create_vector_store(VECTOR_STORE_NAME)

# 2. Upload and index Documents (skip if already done)
upload_to_vector_store(DIR, vs_id)

Created new vector store 'OpenAI_challenge_20250523-082311' (id=vs_68303070059881918b3f51104fa0bcc0)
Uploading question_28_openai_gpt-4.1-mini_online_20250522192331.md (5699 bytes)…
Indexed question_28_openai_gpt-4.1-mini_online_20250522192331.md (file_id=file-G7tR39znZUHi4NEbRkeiCU)
Uploading question_33_anthropic_claude-3-haiku_online_20250522192802.md (5677 bytes)…
Indexed question_33_anthropic_claude-3-haiku_online_20250522192802.md (file_id=file-UBR3YzUWpdYLcCgKkaTdvW)
Uploading question_30_openai_gpt-4.1-mini_online_20250522192511.md (6266 bytes)…
Indexed question_30_openai_gpt-4.1-mini_online_20250522192511.md (file_id=file-5FwwZHK64yFo5dc46JzSwC)
Uploading question_5_openai_gpt-4.1-mini_online_20250522190010.md (8708 bytes)…
Indexed question_5_openai_gpt-4.1-mini_online_20250522190010.md (file_id=file-DieVKu6QKR6SjzaFBsbE9x)
Uploading question_61_openai_gpt-4.1-mini_online_20250522195723.md (6160 bytes)…
Indexed question_61_openai_gpt-4.1-mini_online_20250522195723.md (file_id=

# What are the locations of known archaeological sites in the Amazon Rainforest?

In [8]:
# 3. Answer Questions

question = """
What are the locations of known archaeological sites in the Amazon Rainforest?
"""

display(Markdown(answer_question(vs_id, question)))

Here is a list of **known archaeological sites in the Amazon Rainforest**, including their names, geographic coordinates, countries, and brief descriptions. This list covers sites from various cultures and time periods (pre-Columbian, colonial, etc.), both well-documented and recently discovered, across the Amazon Basin (Brazil, Peru, Ecuador, Bolivia, and more).

---

### 1. **Upano Valley Sites**
- **Coordinates:** 2.1317°S, 78.1054°W
- **Country:** Ecuador (Morona-Santiago)
- **Description:** A cluster of at least 15 ancient settlements in the Upano River Valley, revealed by LiDAR. These “lost cities” date to c. 500 BC–AD 300/600 (Upano/Kilamope culture). Features include earthen platform mounds, dug roadways, and extensive agriculture—among the earliest known complex societies in Amazonia.  
  *[Source: en.wikipedia.org]*

---

### 2. **Jacó Sá Geoglyph Site**
- **Coordinates:** 9°57′38″S, 67°29′51″W (Decimal: -9.9606, -67.4975)
- **Country:** Brazil (Acre)
- **Description:** One of over 450 pre-Columbian geometric earthworks in Acre, southwestern Amazon. Jacó Sá consists of large square ditches with a circular moat, built ~2,000 years ago. These geoglyphs indicate ancient land use and forest clearing for agriculture.  
  *[Source: agencia.fapesp.br]*

---

### 3. **Monte de Teso dos Bichos (Camutins)**
- **Coordinates:** 0.983°S, 49.583°W
- **Country:** Brazil (Marajó Island, Pará)
- **Description:** A major mound complex built by the Marajoara culture (AD 400–1300). Teso dos Bichos is a 2.5 ha artificial mound, part of a network of ~30 mounds used for habitation above seasonal floods. Marajoara society is known for elaborate pottery and large-scale fish-farming/agriculture.  
  *[Source: en.wikipedia.org]*

---

### 4. **Caverna da Pedra Pintada**
- **Coordinates:** 1.996°S, 54.071°W
- **Country:** Brazil (Monte Alegre, Pará)
- **Description:** Also called “Painted Rock Cave.” Contains rock art and evidence of human habitation ~11,200 years ago, making it one of the oldest sites in Amazonia. Excavated by Anna Roosevelt; features pictographs, Paleoindian tools, and plant remains.  
  *[Source: en.wikipedia.org]*

---

### 5. **Taperinha Shell Midden**
- **Coordinates:** 2.443°S, 54.280°W
- **Country:** Brazil (Santarém, Pará)
- **Description:** An ancient shell mound on a terra firme bluff near the Amazon River. Famous for yielding some of the earliest ceramics in the Americas.  
  *[Source: en.wikipedia.org]*

---

### 6. **Paratoari (Pyramids of Paratoari)**
- **Coordinates:** -12.6706, -71.4567
- **Country:** Peru (Manu region)
- **Description:** Natural pyramid-shaped formations in the southeastern Peruvian Amazon. Attracted archaeological interest due to their unusual shape and possible cultural significance.  
  *[Source: latitude.to]*

---

### 7. **Fonte Boa Site (approximate)**
- **Coordinates:** ~ -7.1, ~ -66.88
- **Country:** Brazil (near Acre)
- **Description:** Mounded ring village and geometric enclosures typical of

# What is the location of Kuhikugu (GPS coordinate)? 

In [9]:
question = """
What is the location of Kuhikugu (GPS coordinate)?
"""

display(Markdown(answer_question(vs_id, question)))

The central GPS coordinates of the Kuhikugu archaeological complex are approximately **12°33′30″S, 53°6′40″W** (decimal: **-12.5583, -53.1111**).

**References:**  
- Wikipedia: Kuhikugu – Archaeological site in Brazil’s Upper Xingu (coordinates, discovery and description) ([en.wikipedia.org](https://en.wikipedia.org/wiki/Kuhikugu))  
- Source: deep_research_Kuhikugu_Archaeological.md  
- Source: deep_research_Known_Archaeological_Sites_2.md

# What are the locations of suspected archaeological sites in the Brazilian Amazon Rainforest from the literature that have not yet been confirmed?

In [10]:
question = """
What are the locations of suspected archaeological sites
in the Brazilian Amazon Rainforest from the
literature that have not yet been confirmed?"
"""

display(Markdown(answer_question(vs_id, question)))

**Locations of Suspected but Unconfirmed Archaeological Sites in the Brazilian Amazon Rainforest (from the literature):**

---

### 1. **Belterra Plateau & Tapajós National Forest (Flona-Tapajós), near Santarém (Pará)**
- **Description:** LiDAR surveys have revealed non-randomly distributed depressions ("Poços de Água") and possible settlement patterns extending deep into the forest, suggesting previously unknown Pre-Columbian habitation.
- **Status:** Identified via remote sensing; not yet confirmed by fieldwork.
- **Reference:** Stenborg et al. 2018 ([tandfonline.com](https://www.tandfonline.com/doi/full/10.1080/00934690.2017.1417198))

---

### 2. **Southern Amazonia – Headwaters of the Tapajós River (Pará/Mato Grosso)**
- **Description:** Extensive clusters of ditched enclosures and mound villages (fortified settlements) have been detected by LiDAR and predictive modeling across an 1,800 km stretch, especially in the Upper Tapajós Basin.
- **Status:** Most sites remain unexcavated and unconfirmed; only a subset has been ground-truthed.
- **References:**  
  - Gregorio de Souza et al. 2018 ([pubmed.ncbi.nlm.nih.gov](https://pubmed.ncbi.nlm.nih.gov/29588444/))  
  - Peripato et al. 2023 ([Science](https://www.science.org/doi/10.1126/science.adg7700))

---

### 3. **Upper Xingu Basin (Mato Grosso)**
- **Description:** LiDAR has revealed a large, ring-ditch fortified village (plaza-town) beneath the forest canopy, similar to known pre-Columbian settlements, but this site remains unexcavated.
- **Status:** Detected remotely; awaiting field verification.
- **Reference:** [rainfor.org](https://www.rainfor.org/)

---

### 4. **Upper Purús River Region (Acre/Western Amazonia)**
- **Description:** Numerous geometric earthworks and geoglyphs (rectangular/circular ditches) have been identified, with many more suspected based on landscape patterns and remote sensing.
- **Status:** Several sites documented remotely; many remain unconfirmed by excavation.
- **References:**  
  - Pärssinen et al. 2009 ([cambridge.org](https://www.cambridge.org/core/journals/antiquity/article/abs/precolumbian-geometric-earthworks-in-the-upper-purus-a-complex-society-in-western-amazonia/CF5DB06F57758F24C32449BD4B2C5AFB))  
  - Peripato et al. 2023

---

### 5. **Acre State (Southwestern Amazon) – Purus-Madeira Interfluve**
- **Description:** At least 13 new geoglyph sites (large geometric earthen enclosures) have been predicted by satellite and modeling studies, expanding the known distribution of Acre’s geoglyph complexes.
- **Status:** Identified by remote sensing/modeling; not yet confirmed by excavation.
- **Reference:** Peripato et al. 2023; [researchgate.net](https://www.researchgate.net/publication/374567890)

---

### 6. **Central Amazon Floodplain (Amazon River Basin)**
- **Description:** Two large geometric earthwork features detected on the floodplain, possibly anthropogenic (e.g., ancient fish weirs or water management structures).
- **Status:** Detected by remote sensing; function and origin unconfirmed.
- **Reference:** [rainfor.org](https://www.rainfor.org/)

---

### 7

# What locations in the Amazon Rainforest match descriptions from historical accounts but haven't been verified on the ground?

In [11]:
question = """
What locations in the Amazon Rainforest match descriptions from historical accounts but haven't been verified on the ground?
"""

display(Markdown(answer_question(vs_id, question)))

Several locations in the Brazilian Amazon Rainforest match descriptions from historical accounts—such as early explorer reports or indigenous oral histories—but have not yet been verified archaeologically on the ground. Recent advances in remote sensing (LiDAR, satellite imagery) and predictive modeling have highlighted these areas as high-probability candidates for ancient settlements, earthworks, or engineered landscapes. Below are the main regions identified in the literature:

---

### 1. **Upper Xingu, Mato Grosso (Southern Amazonia)**
- **Description:** Historical accounts and indigenous oral traditions describe large, organized settlements with plazas and defensive ditches, similar to those reported by early explorers (e.g., Karl von den Steinen, 19th century).
- **Modern Evidence:** LiDAR has revealed a fortified village site (ancient plaza-town) with a central plaza and ditched enclosure beneath the forest canopy, matching these descriptions.
- **Status:** The site remains unexcavated and unverified on the ground.
- **Source:** *deep_research_Suspected_Unconfirmed_Sites.md*; [rainfor.org](https://www.rainfor.org/)

---

### 2. **Southwestern Amazon (Acre/Purus-Madeira Region)**
- **Description:** Early travelers and rubber tappers reported geometric earthworks and mounds in the forest, which were later echoed in indigenous narratives.
- **Modern Evidence:** LiDAR and satellite imagery have revealed multiple geoglyph earthworks (rectangular and circular ditched enclosures) on interfluvial plateaus, consistent with these historical accounts.
- **Status:** These features have not been confirmed by ground-truthing or excavation.
- **Source:** *deep_research_Suspected_Unconfirmed_Sites.md*; [rainfor.org](https://www.rainfor.org/)

---

### 3. **Upper Tapajós Basin (Pará/Mato Grosso)**
- **Description:** Colonial-era reports and indigenous oral histories reference large, fortified villages and mound complexes in the Tapajós headwaters.
- **Modern Evidence:** Remote surveys have identified an extensive cluster of 81 prehistoric sites (ditched enclosures and mound villages) forming a network over ~1,800 km, matching the scale and layout described historically.
- **Status:** Most sites remain unexcavated and unverified on the ground.
- **Source:** *deep_research_Suspected_Unconfirmed_Sites.md*; [nature.com](https://www.nature.com/articles/s41597-021-01067-7.pdf)

---

### 4. **Central Amazon Floodplain (Amazon River Basin)**
- **Description:** Historical accounts mention large engineered features (possibly fish weirs or water management structures) along the floodplains.
- **Modern Evidence:** Remote sensing has detected two large geometric earthwork features with straight edges, consistent with these descriptions.
- **Status:** Awaiting on-site verification; their function is hypothesized from context.
- **Source:** *deep_research_Suspected_Unconfirmed_Sites.md*; [rainfor.org](https://www.rainfor.org/)

---

### 5. **Acre State, Western Amazon**
- **Description:** Early 20th-century rubber tappers and explorers described mysterious geometric clearings and mounds.
- **Modern Evidence:** At least 13 new geoglyph sites have been identified via predictive modeling and satellite analysis, matching these historical reports.
- **Status:** Remain unconfirmed by excavation.
- **Source:** *deep_research_Suspected_Unconfirmed_Sites.md*; [researchgate.net](https://www.researchgate.net/)

---

### 6. **Eastern Amazon (Along Major Rivers)**
- **Description:** Historical sources and indigenous knowledge refer to fertile “dark earth” (terra preta) patches associated with ancient settlements.
- **Modern Evidence:** Modeling studies predict numerous high-probability terra preta sites along the Amazon

# What techniques have been used to discover new archaeological sites in the Amazon rainforest in recent years?

In [12]:
question = """
What techniques have been used to discover new archaeological sites in the Amazon rainforest in recent years?
"""

display(Markdown(answer_question(vs_id, question)))

**Recent Techniques for Discovering New Archaeological Sites in the Amazon Rainforest (2019–2024):**

Recent years have seen a dramatic expansion in the discovery of hidden archaeological sites in the Amazon, driven by a combination of advanced remote sensing technologies and machine learning (ML) models. The main techniques include:

---

### 1. **Airborne LiDAR (Light Detection and Ranging)**
- **How it works:** LiDAR emits laser pulses from aircraft or drones to map the ground surface in 3D, penetrating dense forest canopies.
- **Impact:** LiDAR has “digitally deforested” the Amazon, revealing earthworks, mounds, ditches, roads, canals, and even entire lost towns invisible to the naked eye or conventional aerial photography.
- **Recent discoveries:**
  - In Acre and Mato Grosso (Brazil), LiDAR revealed dozens of previously unknown circular and rectangular villages, geometric geoglyphs, and complex engineered landscapes ([Smithsonianmag.com](https://www.smithsonianmag.com/), [news.mongabay.com](https://news.mongabay.com/)).
  - In 2023, a large-scale LiDAR study covering just 0.1% of the Amazon identified 24 new sites, suggesting over 10,000 earthworks remain undiscovered ([Peripato et al., 2023, Science](https://www.science.org/doi/10.1126/science.adh3771)).
  - In 2024, drone-mounted LiDAR rediscovered a lost 18th-century Portuguese colonial town in Rondônia, also exposing much older indigenous features ([Washington Post](https://www.washingtonpost.com/)).
- **Advantages:** High-resolution, can see through vegetation, reveals subtle anthropogenic features.
- **Limitations:** Expensive, limited coverage so far, requires expert interpretation (though ML is automating this).

---

### 2. **Machine Learning Models Applied to Remote Sensing Data**
- **Deep Learning on LiDAR Data:**  
  - **Convolutional Neural Networks (CNNs):** Trained to automatically detect archaeological features (e.g., mounds, ditches, geometric enclosures) in LiDAR-derived elevation models.
  - **Effectiveness:** High accuracy (often 80–90%+ in tests), greatly speeds up site detection compared to manual analysis ([Fiorucci et al., 2022](https://www.livescience.com/)).
  - **Use cases:** Automated detection of “clock face” villages, earthworks, and causeways in Acre and other regions.
- **Predictive Spatial Modeling:**  
  - **Random Forests and Other Classifiers:** Used to predict the probability of archaeological site occurrence based on environmental variables (soil, hydrology, topography, vegetation).
  - **Example:** Walker et al. (2023) used a random forest model with 65 variables to predict earthwork and ADE (Amazonian Dark Earth) sites, achieving AUC ~0.91 and successfully guiding field surveys to new discoveries ([Walker et al., 2023, PeerJ](https://peerj.com/articles/15137/)).
  - **Industry Application:** TechnoLynx’s AI system identified multiple ancient settlement sites, later confirmed by ground truthing ([technolynx.com](https://technolynx.com/)).
- **Spectral Classification:**  
  - ML models analyze multispectral/hyperspectral satellite data to detect vegetation or soil anomalies indicative of buried features.

---

### 3. **Satellite Multispectral and Hyperspectral Imagery**
- **How it works:** Satellite sensors capture data in multiple wavelengths, revealing subtle differences in vegetation health or soil chemistry that may signal buried archaeological remains.
- **Applications:** Used to identify anthropogenic soils (ADEs), geometric earthworks, and settlement patterns, especially in deforested or partially cleared areas ([

# Were historical and indigenous texts analyzed using machine learning or a large language model to identify potential archaeological site locations?

In [13]:
question = """
Were historical and indigenous texts analyzed using machine
learning or a large language model to identify potential archaeological site locations?
"""

display(Markdown(answer_question(vs_id, question)))

Based on the provided context, **historical and indigenous texts have not yet been systematically analyzed using machine learning (ML) or large language models (LLMs) to directly identify potential archaeological site locations in the Amazon**. The main ML applications described in the sources focus on remote sensing data (LiDAR, satellite imagery, environmental predictors) and not on textual analysis of historical or indigenous documents.

However, the context does discuss the **potential and strategies for using LLMs** (such as GPT-4) to analyze historical texts, colonial diaries, and indigenous accounts for archaeological prospecting:

- **LLMs can be used to extract geographic clues from old exploration narratives**, such as mentions of rivers, directions, and distances, which could then be geocoded to approximate site locations. For example, prompts like “Read this expedition diary text and extract every sentence that mentions a river, compass direction, or distance traveled” are suggested as ways to mine texts for relevant information (deep_research_competition_strategies_ideas.md).
- **LLMs can translate and summarize non-English documents** (e.g., Portuguese or Spanish reports), making them accessible for archaeological research.
- **LLMs can help identify patterns in site descriptions** by summarizing and comparing environmental and locational features across known sites.

These are described as **recommended or emerging strategies**, not as methods that have already been systematically implemented in published Amazonian archaeological research. The context emphasizes that LLMs are valuable tools for augmenting human research in literature review, data extraction, and hypothesis generation, but it does not cite any studies where ML or LLMs have directly led to the discovery of archaeological sites through automated analysis of historical or indigenous texts.

**In summary:**  
- **No published Amazonian archaeological discoveries have yet resulted from ML or LLM analysis of historical or indigenous texts.**
- **LLMs are recognized as promising tools for this purpose, and workflows are being developed, but their use is currently at the stage of recommendation and experimentation, not established practice.**

**References:**  
- deep_research_competition_strategies_ideas.md (sections 3.1 and 3.2)
- No mention of such text-based ML/LLM discovery in the main archaeological ML literature cited (Walker et al. 2023; Peripato et al. 2023; Iriarte et al. 2020, etc.).

# What is the impact of the presence of Amazonian dark earth on vegetation? How could this be used to detect potential unexplored archaeological sites using satellite imagery and machine learning? 

In [14]:
question = """
What is the impact of the presence of Amazonian dark earth on vegetation?
How could this be used to detect potential
unexplored archaeological sites using satellite imagery and machine learning? 
"""

display(Markdown(answer_question(vs_id, question)))

### Impact of Amazonian Dark Earth (ADE) on Vegetation

**Amazonian Dark Earths (ADEs)**, or *terra preta*, are anthropogenic soils created by ancient human activity. Their presence has a marked impact on the vegetation growing above them:

- **Altered Vegetation Structure and Composition:** Forests over ADEs often differ from those on typical Amazonian soils. They tend to have more palms, shorter trees, and a higher proportion of secondary growth species. This is likely because ADEs mark the sites of former settlements, gardens, or agricultural fields, where centuries of human activity and later regrowth have shaped the plant community ([eos.org](https://eos.org)).
- **Vegetation Density and Moisture:** Satellite studies have shown that canopy vegetation over ADEs is often **less dense** and has **lower moisture content** than surrounding primary forest. This is somewhat counterintuitive, as one might expect richer soils to support lusher growth. However, the explanation lies in the legacy of disturbance: ADE forests are typically regrowth on old habitation sites, so the canopy is lower and less continuous ([eos.org](https://eos.org)).
- **Vegetation Indices:** These differences are reflected in satellite-derived indices. For example, **Normalized Difference Vegetation Index (NDVI)** values are often **lower** over ADEs, and canopy water content is reduced compared to adjacent undisturbed forests. These contrasts become especially pronounced during drought years, when ADE forests brown out more than others ([eos.org](https://eos.org)).

### Using These Effects to Detect Unexplored Archaeological Sites

The unique impact of ADEs on vegetation provides a **remote sensing signature** that can be exploited to locate potential archaeological sites:

1. **Spectral and Vegetation Anomalies:**  
   - **Multispectral and Hyperspectral Satellite Imagery:** Satellites like Landsat, Sentinel-2, and hyperspectral sensors can detect subtle differences in canopy reflectance, NDVI, and moisture indices over ADEs. These anomalies can be mapped even under dense forest cover ([eos.org](https://eos.org); [wired.com](https://www.wired.com/2012/12/satellite-sensing-black-earth-amazon/?cid=4800984)).
   - **Soil Exposure in Cleared Areas:** In agricultural or deforested zones, ADEs may be visible directly as unusually dark, nutrient-rich soil patches, which stand out from the typical red or yellow Amazonian soils ([daneshyari.com](https://daneshyari.com)).

2. **Machine Learning and Predictive Modeling:**  
   - **Training on Known Sites:** Machine learning (ML) models (e.g., random forests, convolutional neural networks) can be trained on the spectral and environmental characteristics of known ADE sites—including vegetation indices, soil reflectance, proximity to water, and topography ([Walker et al. 2023](https://pubmed.ncbi.nlm.nih.gov/37020851)).
   - **Automated Detection:** Once trained, these models can scan vast satellite datasets to flag areas with similar spectral “fingerprints,” identifying high-probability targets for unexplored archaeological sites. For example, ML models have achieved high accuracy (AUC ~0.91) in predicting ADE and earthwork locations ([Walker et al. 2023](https://pubmed.ncbi.nlm.nih.gov/37020851)).
   - **Integration with LiDAR:** In areas where LiDAR is available, ML can combine canopy and elevation data to further refine site predictions, revealing earthworks and settlement layouts often associated with ADEs ([deep_research_Machine_Learning_Models_2.md](#)).

3. **Ground Verification and Discovery:**  
   - **Guiding Fieldwork:** These remote predictions can direct archaeologists to previously unknown sites for ground-truthing, dramatically increasing the efficiency

# What open-access LIDAR datasets cover portions of the Brazilian Amazon?

In [15]:
question = """
What open-access LIDAR datasets cover portions of the Brazilian Amazon?
"""

display(Markdown(answer_question(vs_id, question)))

Several **open-access LIDAR datasets** cover portions of the Brazilian Amazon, providing valuable resources for research on forest structure, biomass, disturbance, and even archaeological features. The most significant datasets are:

---

### 1. **EBA Project Airborne LIDAR Transects (2016–2018)**
- **Description:** Over 900 airborne LIDAR transects (each 12.5 km × 0.3 km, ~375 ha) sampled across the Brazilian Amazon as part of the Estimating Biomass in Amazonia (EBA) project.
- **Data:** LAS point clouds (~4 pts/m²), suitable for canopy structure, gap dynamics, biomass, and detection of earthworks.
- **Access:** [Zenodo repository](https://zenodo.org/record/4288474) and [Nature article](https://www.nature.com/articles/s41598-020-80809-w)
- **Reference:** Aragão et al., 2021, *Scientific Reports*.

---

### 2. **NASA/Embrapa Sustainable Landscapes LIDAR (2008–2018)**
- **Description:** High-resolution airborne LIDAR surveys over select sites in Acre, Amazonas, Pará, Rondônia, and other states, collected for forest biomass and carbon monitoring.
- **Data:** Point clouds (~10 pts/m²), 1 km² tiles, with Digital Terrain Models (DTMs) at ~1 m resolution.
- **Access:** [ORNL DAAC](https://daac.ornl.gov/cgi-bin/dsviewer.pl?ds_id=2007) (free EarthData login required).

---

### 3. **Manaus Region Forest LIDAR (2008)**
- **Description:** Airborne LIDAR over forest research sites near Manaus (Cuieiras Biological Reserve, Adolpho Ducke Reserve, BDFFP).
- **Data:** Raw point clouds and DTMs (~1 m resolution).
- **Access:** [ORNL DAAC](https://daac.ornl.gov/cgi-bin/dsviewer.pl?ds_id=2007).

---

### 4. **Tapajós National Forest LIDAR (2008)**
- **Description:** Airborne LIDAR surveys in Tapajós (Pará state), covering flux tower areas and selectively logged sites.
- **Data:** Point clouds and 1 m DTM grids.
- **Access:** [ORNL DAAC](https://daac.ornl.gov/cgi-bin/dsviewer.pl?ds_id=1284).

---

### 5. **Monitoring Selective Logging in Western Amazonia (Acre, 2010–2011)**
- **Description:** Repeat LIDAR flights to monitor selective logging impacts on canopy structure and biomass in Acre.
- **Data:** Canopy height models, biomass change, logging disturbance.
- **Access:** [ScienceDirect article](https://www.sciencedirect.com/science/article/pii/S0034425713003441).

---

### 6. **Antimary State Forest LIDAR (Acre State)**
- **Description:** Airborne LIDAR and ground plots for biomass estimation and low-intensity logging detection.
- **Data:** LIDAR metrics, canopy height, biomass models.
- **Access:** [ScienceDirect article](https://www.sciencedirect.com/science/article/pii/S0034425712002179).

---

### 7. **NASA GEDI Satellite LIDAR (2019–present)**
- **Description:** Spaceborne LIDAR providing billions of footprints (~25 m diameter, ~60 m spacing) across the Amazon.
- **Data:** Canopy height and ground elevation; not continuous but dense sampling.
- **Access:** [NASA Earthdata GEDI Portal](https://lpdaac.usgs.gov/products/gedi02_av002/).

---



# What machine learning models have successfully identified archaeological features beneath forest canopy?

In [16]:
question = """
What machine learning models have successfully identified archaeological features beneath forest canopy?
"""

display(Markdown(answer_question(vs_id, question)))

Several machine learning (ML) models have **successfully identified archaeological features beneath forest canopy**, especially when paired with remote sensing data such as LiDAR, multispectral, and hyperspectral imagery. The most prominent and effective models and approaches include:

---

## 1. **Random Forest Classifiers**
- **Description:** Ensemble tree-based models that classify regions as likely or unlikely to contain archaeological features based on environmental and remote sensing variables.
- **Successes:**  
  - **Walker et al. (2023)** used a random forest model trained on 65 environmental variables (including LiDAR, climate, and soil data) to predict the presence of earthworks and Amazonian Dark Earth (ADE) sites in the Brazilian Amazon. The model achieved high accuracy (AUC ≈ 0.91) and led to the discovery of 13 new geoglyph sites in areas flagged as high-probability by the model ([Walker et al., 2023, PeerJ](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10496461/)).
  - Random forests have outperformed neural networks, support vector machines, and gradient-boosted trees in some Amazonian site prediction tasks ([deep_research_Machine_Learning_Models.md](#)).

---

## 2. **Convolutional Neural Networks (CNNs)**
- **Description:** Deep learning models adept at recognizing spatial patterns in imagery, including subtle geometric or textural anomalies indicative of archaeological features.
- **Successes:**  
  - CNNs have been used to analyze LiDAR-derived digital elevation models (DEMs) and high-resolution aerial/satellite images to automatically detect features such as circular mounds, linear embankments, and geometric earthworks hidden beneath forest canopy ([deep_research_Machine_Learning_Models.md](#); [Fiorucci et al., 2022](https://www.livescience.com/archaeology/)).
  - In Amazonia, CNNs have been trained to flag anthropogenic relief anomalies in large LiDAR datasets, accelerating the identification of hidden sites ([deep_research_Machine_Learning_Models_2.md](#)).

---

## 3. **U-Net (Deep Learning Segmentation Networks)**
- **Description:** A specialized CNN architecture for image segmentation, capable of delineating individual objects or features within complex remote sensing data.
- **Successes:**  
  - **Wagner et al. (2020)** used U-Net to segment and map individual palm crowns in Amazonian forests from very-high-resolution multispectral imagery, achieving ~95% mapping accuracy (F1 ≈ 0.7). Certain palm species serve as proxies for past human activity ([Wagner et al., 2020, MDPI](https://www.mdpi.com/)).
  - U-Net has also been applied to airborne LiDAR data to detect over a million palm tree segments, which can indicate anthropogenic landscapes ([Dal’Agnol da Silva et al., 2022](https://www.researchgate.net/)).

---

## 4. **Mask R-CNN (Region-based Convolutional Neural Networks)**
- **Description:** An advanced deep learning model for object detection and segmentation, particularly effective on LiDAR-derived DEMs.
- **Successes:**  
  - Modified Mask R-CNN models have been used to automate the detection of archaeological sites in high-resolution LiDAR data, successfully identifying features in forested regions of Europe ([Wiley, 2022](https://onlinelibrary.wiley.com/doi/10.1002/arp.1806)).

---

## 5. **Hybrid Approaches (LiDAR + Multispectral/Hyperspectral + ML)**
- **Description:** Combining LiDAR with multispectral or hyperspectral satellite imagery, then applying ML classifiers (e.g., random forest, SVM, CNN) to detect features based on both elevation

#  How are radar (e.g., Sentinel-1 SAR) and hyperspectral data being combined with LIDAR for sub-canopy detection?

In [17]:
question = """
How are radar (e.g., Sentinel-1 SAR) and hyperspectral data being combined with LIDAR for sub-canopy detection?
"""

display(Markdown(answer_question(vs_id, question)))

Radar (e.g., Sentinel-1 SAR), hyperspectral, and LIDAR data are being **combined through multi-level data fusion approaches** to significantly improve sub-canopy detection in forested and complex vegetated environments. Here’s how each data type contributes and how their integration is achieved:

---

### **1. Complementary Strengths**

- **LIDAR** provides high-resolution 3D structural information, mapping vertical vegetation layers and ground elevation—even through canopy gaps.
- **Hyperspectral data** offers detailed spectral signatures, enabling discrimination of vegetation types, health, and biochemical properties, but struggles with spectral mixing in dense or heterogeneous canopies.
- **Radar (SAR)**, such as Sentinel-1, uses longer wavelengths that partially penetrate the canopy, providing information on sub-canopy structure, surface roughness, and moisture content.

---

### **2. Fusion Approaches**

**Data fusion** can occur at several levels ([Li, 2021](https://www.sciencedirect.com/science/article/pii/S1367578821000110); [Tusa et al., 2019](https://www.sciencedirect.com/science/article/pii/B9780444639776000134)):

- **Low-level (Observation-level) Fusion:**  
  Directly combines raw data (e.g., stacking LIDAR point clouds, hyperspectral bands, and radar backscatter) to create a unified dataset for analysis.

- **Medium-level (Feature-level) Fusion:**  
  Extracts features from each source (e.g., LIDAR-derived canopy height, hyperspectral vegetation indices, radar texture/backscatter statistics) and merges them for classification or regression tasks.

- **High-level (Decision-level) Fusion:**  
  Independently analyzes each data type, then combines their classification or detection outputs using ensemble or voting methods.

---

### **3. Addressing Sub-Canopy Complexity**

- **Spectral Mixing:**  
  Hyperspectral data alone can be confounded by mixed signals from soil, litter, and multiple vegetation layers. LIDAR’s structural data and radar’s partial penetration help disentangle these signals, allowing for more accurate sub-canopy class identification ([Mitchell, 2019](https://www.mdpi.com/2072-4292/11/18/2141/htm)).

- **Vertical Layer Separation:**  
  LIDAR’s vertical profiles, when combined with radar’s sensitivity to structure and hyperspectral’s species-level detail, enable the detection and mapping of understory vegetation and ground features that would otherwise be obscured by the upper canopy.

---

### **4. Practical Applications**

- **Tree Species Classification:**  
  UAV-based hyperspectral and LIDAR fusion improves the distinction of canopy and sub-canopy species, especially in complex tropical forests ([David et al., 2023](https://www.mdpi.com/1999-4907/14/5/945)).

- **Forest Monitoring:**  
  Integrated LIDAR, hyperspectral, and radar data support large-scale monitoring of forest structure, composition, and sub-canopy vegetation dynamics ([Sankey et al., 2017](https://www.sciencedirect.com/science/article/pii/S0034425717301578)).

---

### **Summary Table**

| Data Type      | Key Contribution                  | Limitation Alone           | Benefit in Fusion                  |
|----------------|-----------------------------------|----------------------------|------------------------------------|
| LIDAR          | 3D structure, vertical profiles   | Limited spectral info      | Separates canopy/sub-canopy layers |
| Hyperspectral  | Species/biochemical discrimination| Spectral mixing, no structure | Enhanced species mapping           |
| Radar (SAR)    | Structure, moisture, penetration  | Lower spatial resolution   | Reveals sub-canopy structure       |

---

### **References**

1. Li, S. (2021). [A comprehensive review of

#  What benchmark datasets and evaluation metrics are commonly used to compare canopy-penetrating algorithms?

In [18]:
question = """
What benchmark datasets and evaluation metrics are commonly used to compare canopy-penetrating algorithms?
"""

display(Markdown(answer_question(vs_id, question)))

**Benchmark datasets and evaluation metrics are essential for objectively comparing canopy-penetrating algorithms, such as those used for individual tree crown detection and delineation from remote sensing data.**

---

## Common Benchmark Datasets

1. **NeonTreeEvaluation**
   - *Description*: Provides airborne RGB, hyperspectral, and LiDAR imagery with manual tree annotations.
   - *Purpose*: Benchmark for tree detection algorithms using multimodal data.
   - *Reference*: [GitHub - NeonTreeEvaluation](https://github.com/weecology/NeonTreeEvaluation)

2. **National Ecological Observation Network (NEON) Dataset**
   - *Description*: Large-scale, co-registered airborne RGB, LiDAR, and hyperspectral imagery with annotated tree crowns.
   - *Purpose*: Benchmark for canopy crown detection and delineation; supports reproducible and transparent algorithm evaluation.
   - *Reference*: [Weinstein et al., 2021, PLOS Comp Biol](https://journals.plos.org/ploscompbiol/article/file?id=10.1371/journal.pcbi.1009180&type=printable)

3. **Open-Canopy**
   - *Description*: Country-scale (France) benchmark for canopy height estimation at 1.5 m resolution, using SPOT satellite imagery and high-resolution aerial LiDAR.
   - *Variants*: Includes Open-Canopy-Δ for canopy height change detection between years.
   - *Reference*: [Fogel et al., 2024, arXiv](https://arxiv.org/abs/2407.09392)

4. **ISPRS International Contest of Individual Tree Crown (ITC) Segmentation**
   - *Description*: Benchmark for individual tree crown segmentation using various data types and sensors.
   - *Reference*: [ISPRS ITC Segmentation Contest](https://www2.isprs.org/commissions/comm3/wg1/itc-segmentation-contest/)

5. **Heterogeneous Forest Data from the Alpine Space**
   - *Description*: LiDAR-based dataset covering diverse forest types (coniferous, deciduous, mixed) in the Alps, with varying stand structures and densities.
   - *Purpose*: Evaluates single tree detection methods across heterogeneous conditions.
   - *Reference*: [Eysn et al., 2015, MDPI Forests](https://www.mdpi.com/1999-4907/6/5/1721)

---

## Common Evaluation Metrics

1. **Detection Accuracy**
   - Measures the proportion of correctly identified tree crowns.

2. **Precision**
   - The proportion of true positives among all positive predictions (how many detected crowns are correct).

3. **Recall**
   - The proportion of true positives among all actual positives (how many actual crowns were detected).

4. **F1-score**
   - The harmonic mean of precision and recall, balancing both metrics.

5. **Intersection over Union (IoU)**
   - The ratio of the intersection area to the union area between predicted and actual tree crowns; used for evaluating spatial overlap in delineation.

6. **Root Mean Square Error (RMSE)**
   - Used for canopy height estimation, measuring the average error between predicted and actual tree heights.

7. **Change Detection Accuracy**
   - For datasets like Open-Canopy-Δ, evaluates the ability to detect changes in canopy height over time.

---

**Summary:**  
The most commonly used benchmark datasets for canopy-penetrating algorithms include NeonTreeEvaluation, the NEON dataset, Open-Canopy, the ISPRS ITC Segmentation Contest, and the Alpine Space dataset. Standard evaluation metrics are precision, recall, F1-score, IoU, RMSE (for height), and change detection accuracy. These resources and metrics enable robust, reproducible, and comparable assessments of algorithm performance across different forest types and remote sensing modalities.

**References:**
-

# What common geographical features are shared among verified archaeological sites in the Amazon?

In [19]:
question = """
What common geographical features are shared among verified archaeological sites in the Amazon?
"""

display(Markdown(answer_question(vs_id, question)))

Verified archaeological sites in the Amazon consistently share several key geographical features, reflecting both environmental opportunities and ancient human adaptation strategies. The most commonly identified features are:

---

### 1. **Proximity to Water Sources**
Most sites are located near rivers, streams, lakes, or wetlands. Access to water was crucial for transportation, fishing, agriculture, and daily sustenance. This pattern is especially evident along the southern rim of the Amazon and in the Peruvian Amazon, where clusters of sites follow major waterways ([Iriarte et al., 2018](https://www.nature.com/articles/s41467-018-03510-7); [Walker, 2023](https://peerj.com/articles/15137/)).

---

### 2. **Elevated or Well-Drained Terrain (Terra Firme)**
Sites are frequently found on slightly elevated ground or terra firme—areas above the seasonal floodplain. This placement protects settlements and earthworks from flooding while still allowing access to fertile alluvial soils ([Science, 2023](https://www.science.org/content/article/laser-mapping-reveals-hidden-structures-in-amazon-hints-thousands-more); [Elsevier, 2023](https://linkinghub.elsevier.com/retrieve/pii/S0277379116300919)).

---

### 3. **Fertile and Anthropogenic Soils (Terra Preta/ADE)**
Many sites are associated with patches of highly fertile, human-created soils known as Terra Preta or Amazonian Dark Earths (ADE). These soils result from long-term deposition of organic matter and char, enabling intensive agriculture and supporting larger populations ([Walker, 2023](https://peerj.com/articles/15137/)).

---

### 4. **Forested Environments and Landscape Modification**
Sites are typically embedded within forested landscapes, but often show evidence of deliberate landscape modification—such as raised fields, causeways, ditches, and mounds. These modifications improved drainage, facilitated agriculture, and connected settlements, indicating sophisticated environmental management ([Science, 2023](https://www.science.org/content/article/laser-mapping-reveals-hidden-structures-in-amazon-hints-thousands-more); [Elsevier, 2023](https://linkinghub.elsevier.com/retrieve/pii/S0277379116300919)).

---

### 5. **Strategic Placement for Social and Trade Networks**
Many sites are found along ecological boundaries (ecotones) or in linear arrangements along floodplain corridors. This facilitated inter-community exchange and integration, especially among culturally related groups such as the Arawak-speaking populations ([Iriarte et al., 2018](https://www.nature.com/articles/s41467-018-03510-7)).

---

**Summary Table of Common Features:**

| Feature                              | Description/Function                                         | Key References                                                                                   |
|---------------------------------------|-------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
| Proximity to water                   | Resource access, transport, fertile soils                   | [Nature, 2018](https://www.nature.com/articles/s41467-018-03510-7); [PeerJ, 2023](https://peerj.com/articles/15137/) |
| Elevated/well-drained terrain        | Flood avoidance, agriculture                                | [Science, 2023](https://www.science.org/content/article/laser-mapping-reveals-hidden-structures-in-amazon-hints-thousands-more) |
| Fertile/anthropogenic soils (ADE)    | Intensive agriculture, population support                   | [PeerJ, 2023](https://peerj.com/articles/15137/)                                                 |
| Forested environment & modification  | Timber, game, landscape engineering                         | [Elsevier, 2023](https://linkinghub.elsevier.com/retrieve/pii/S0277379116300919)

In [20]:
# Clean up
# Option 1: Clean up specific vector store
cleanup_vector_store(vs_id)
    
# Option 2: Clean up all test vector stores (uncomment to use)
#cleanup_all_vector_stores_by_pattern("OpenAI_challenge_")


🧹 Starting cleanup for vector store vs_68303070059881918b3f51104fa0bcc0

📄 Deleting files...
✓ Deleted file file-VTLwc9sLgTFV8Uc43hxt8C
✓ Deleted file file-33q3VhUEjjzkXJSNJb9CMv
✓ Deleted file file-GK6JpjbX2aJNPTePRb9qry
✓ Deleted file file-LXEXxjgvArfsvDL7WqdRe4
✓ Deleted file file-4cHcKBrAdX87eHVFdvpDhz
✓ Deleted file file-BPTUPADcy3Sd1giJjmvcM6
✓ Deleted file file-WJ7AFiV2qMgu88HiCTPnja
✓ Deleted file file-GzSzWKiQKQ1Z4kPe2EdQtc
✓ Deleted file file-1QeYfX36y8CgDg6zwMNmqh
✓ Deleted file file-FxYM4Jx4vvgKAGUfxLxoHh
✓ Deleted file file-PCm2mLGBiLBFopFW9EDt9N
✓ Deleted file file-TYMxPJVCfhimxFgtNdKFBF
✓ Deleted file file-8DdGyCNvK5qoE9kCSChaFm
✓ Deleted file file-5KCPiVc5sQtxkvb9LLYmQn
✓ Deleted file file-P9CegFPVceKGRtd3Q81StH
✓ Deleted file file-She4nzyt2VVpkDkncHLrSV
✓ Deleted file file-Tu77kDMjXAaTRmTjFXaBMP
✓ Deleted file file-RbT29neLsHNNJDQqgQAgKc
✓ Deleted file file-3L6s6TuqEqnGdcc9phXrW5
✓ Deleted file file-9KUysgQZt2QSE4j28DfXn1
Deleted 20 files

🗑️  Deleting vector store...

In [21]:
#print("Current storage status:")
#show_storage_summary()

In [22]:
# Example 1: Delete ALL files (dry run first for safety)
#print("\n" + "="*50)
#print("Preview of deleting ALL files:")
#delete_all_files(dry_run=True)

In [23]:
delete_all_files(dry_run=False)


Found 221 files
Total size: 1.33 MB

🗑️  Deleting files...
✓ Deleted question_16_meta-llama_llama-4-scout_online_20250522191209.md (1/221)
✓ Deleted question_41_anthropic_claude-3-haiku_online_20250522193647.md (2/221)
✓ Deleted deep_research_competition_strategies_ideas.md (3/221)
✓ Deleted question_41_meta-llama_llama-4-scout_online_20250522193701.md (4/221)
✓ Deleted question_73_meta-llama_llama-4-scout_online_20250522201158.md (5/221)
✓ Deleted deep_research_Kuhikugu_Archaeological.md (6/221)
✓ Deleted question_15_openai_gpt-4.1-mini_online_20250522191038.md (7/221)
✓ Deleted question_71_anthropic_claude-3-haiku_online_20250522200901.md (8/221)
✓ Deleted question_31_openai_gpt-4.1-mini_online_20250522192558.md (9/221)
✓ Deleted question_70_meta-llama_llama-4-scout_online_20250522200809.md (10/221)
✓ Deleted question_32_openai_gpt-4.1-mini_online_20250522192653.md (11/221)
✓ Deleted question_4_anthropic_claude-3-haiku_online_20250522185908.md (12/221)
✓ Deleted deep_research_Effect

{'total': 221, 'deleted': 221, 'failed': 0, 'total_bytes': 1394289}

In [24]:
# Example 2: Delete only assistant files
# delete_all_files(purpose="assistants", dry_run=False)
    
# Example 3: Delete files by name pattern
delete_files_by_name_pattern("question_", dry_run=False)
delete_files_by_name_pattern("deep_research_", dry_run=False)

No files found matching pattern 'question_'
No files found matching pattern 'deep_research_'


{'total': 0, 'deleted': 0, 'failed': 0}