# Checkpoint 1 — End‑to‑End Notebook
This Jupyter notebook fetches a small LiDAR **or** Sentinel‑2 sample, computes minimal stats, and asks an OpenAI model to describe the surface features in plain English.

**Steps:**
1. Install/verify dependencies
2. Configure your `OPENAI_API_KEY`
3. Choose `lidar` or `sentinel2` by setting the `DATASET_TYPE` environment variable (defaults to `lidar`)
4. Run the workflow and inspect the output

In [None]:
# Run once per Colab/runtime
%pip install -q rasterio laspy openai python-dotenv

In [None]:
import os, getpass, json, tempfile, requests, numpy as np, rasterio, laspy
from openai import OpenAI
from dotenv import load_dotenv

# ===============================================================================
# OpenAI ENVIRONMENT SETUP
# ===============================================================================
# Load variables from a local .env file (if one exists) into os.environ.
load_dotenv()

# Prompt interactively when the key is missing or blank so it never has to be
# hard-coded in the notebook.
if not os.environ.get('OPENAI_API_KEY', '').strip():
    os.environ['OPENAI_API_KEY'] = getpass.getpass('Enter your OpenAI API key: ')

# Constructed without arguments; relies on OPENAI_API_KEY set above.
client = OpenAI()
MODEL_NAME = "gpt-4o"   # or "o3-8k", "gpt-4o-mini"

# ===============================================================================
# DATASET CHOICE
# ===============================================================================
# Normalise case/whitespace so values such as "LiDAR " from a .env file still match.
DATASET_TYPE = os.getenv("DATASET_TYPE", "lidar").strip().lower()  # 'lidar' or 'sentinel2'
# Explicit raise instead of assert: assertions are stripped when Python runs with -O.
if DATASET_TYPE not in ("lidar", "sentinel2"):
    raise ValueError("DATASET_TYPE must be 'lidar' or 'sentinel2'")
print("Using dataset type:", DATASET_TYPE)


In [None]:
# ==============================================================================
# DATASET FETCHING
# ==============================================================================
def fetch_lidar(dataset_id: str = "OT.072016.32611.1"):
    """Download a LiDAR .laz file from OpenTopography into a temporary file.

    Args:
        dataset_id: Dataset identifier sent as the ``datasetID`` query
            parameter. Defaults to the example dataset.

    Returns:
        A ``(path, dataset_id)`` tuple. ``path`` points to a temporary
        ``.laz`` file created with ``delete=False``, so the caller is
        responsible for removing it when done.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    url = (f"https://portal.opentopography.org/getOTDataset?"
           f"datasetID={dataset_id}&fileFormat=LAZ")
    # Download first: a failed request must not leave an empty temp file behind.
    resp = requests.get(url, timeout=60)
    resp.raise_for_status()
    # The context manager guarantees the handle is closed even if the write fails.
    with tempfile.NamedTemporaryFile(suffix=".laz", delete=False) as tf:
        tf.write(resp.content)
    return tf.name, dataset_id

def fetch_sentinel2_bands(tile="22/M/UL", date_path="2024/5/26/0",
                          bands=("2", "3", "4")):
    """Download Sentinel-2 L1C band files from the AWS open-data bucket.

    Args:
        tile: Tile path in the form ``"<UTM zone>/<latitude band>/<grid
            square>"``. The latitude band is a single letter and the grid
            square two letters, so tile 22MUL is ``"22/M/UL"``.
        date_path: ``"<year>/<month>/<day>/<sequence>"`` path segment.
        bands: Single-digit band numbers as strings; each one is requested
            as ``B0<band>.jp2``. The default is B02, B03 and B04. Include
            ``"8"`` as well if ``sentinel2_stats`` should compute NDVI.

    Returns:
        A ``(band_files, scene_id)`` tuple. ``band_files`` maps each band
        string to the path of a temporary ``.jp2`` file created with
        ``delete=False``; the caller is responsible for removing them.

    Raises:
        requests.HTTPError: If any band request returns an error status.
            Files already downloaded in this call are removed first.
    """
    # NOTE(review): this bucket is believed to be requester-pays, in which case
    # anonymous HTTP GETs are rejected with 403 — confirm before relying on it.
    band_files = {}
    try:
        for band in bands:
            url = (f"https://sentinel-s2-l1c.s3.amazonaws.com/tiles/"
                   f"{tile}/{date_path}/B0{band}.jp2")
            # Request before creating the file so a failed download leaves nothing behind.
            resp = requests.get(url, timeout=60)
            resp.raise_for_status()
            with tempfile.NamedTemporaryFile(suffix=f"_B0{band}.jp2",
                                             delete=False) as tf:
                # Register the path before writing so a failed write is cleaned up too.
                band_files[band] = tf.name
                tf.write(resp.content)
    except Exception:
        # Do not leak the bands fetched so far when a later one fails.
        for path in band_files.values():
            try:
                os.unlink(path)
            except OSError:
                pass
        raise
    scene_id = f"S2_T{tile.replace('/','')}_{date_path.replace('/','')}"
    return band_files, scene_id

def fetch_dataset(kind="lidar"):
    """Fetch the sample data for ``kind`` by delegating to the matching fetcher.

    Args:
        kind: ``"lidar"`` or ``"sentinel2"``.

    Returns:
        Whatever the selected fetcher returns: the result of ``fetch_lidar()``
        for ``"lidar"``, or of ``fetch_sentinel2_bands()`` for ``"sentinel2"``.

    Raises:
        ValueError: If ``kind`` is neither of the two supported values.
    """
    # Reject unknown kinds up front so the happy path is a single expression.
    if kind not in ("lidar", "sentinel2"):
        raise ValueError("kind must be 'lidar' or 'sentinel2'")
    return fetch_lidar() if kind == "lidar" else fetch_sentinel2_bands()


In [None]:
# ==============================================================================
# FEATURE EXTRACTION
# ==============================================================================
def laspy_stats(laz_path: str) -> dict:
    """Compute basic elevation statistics for a LAS/LAZ point cloud.

    Args:
        laz_path: Path of the file to read with ``laspy.read``.

    Returns:
        A dict with ``mean_elev``, ``min_elev`` and ``max_elev`` (floats taken
        from the file's z dimension) and ``pt_count`` (number of points).

    Raises:
        ValueError: If the file contains no points.
    """
    las = laspy.read(laz_path)
    # Convert once so the three reductions below share a single array.
    # NOTE(review): las.z may be a scaled view rather than a plain ndarray,
    # depending on the installed laspy version — confirm.
    z = np.asarray(las.z)
    if len(z) == 0:
        # np.min/np.max would raise an opaque "zero-size array" ValueError here.
        raise ValueError(f"{laz_path} contains no points")
    return {
        "mean_elev": float(np.mean(z)),
        "min_elev": float(np.min(z)),
        "max_elev": float(np.max(z)),
        "pt_count": int(len(z)),
    }

def sentinel2_stats(band_files: dict) -> dict:
    """Compute per-band means (and mean NDVI when possible) for raster bands.

    Args:
        band_files: Mapping of band identifier (e.g. ``"4"``) to the path of
            a raster file readable by ``rasterio``.

    Returns:
        A dict with one ``mean_B0<band>`` entry per input band. When both
        ``"4"`` (used as red) and ``"8"`` (used as NIR) are present, a
        ``mean_NDVI`` entry is added as well.
    """
    stats = {}
    # Only the NDVI inputs are kept after their mean is taken; holding every
    # full-resolution band at once would needlessly raise peak memory.
    ndvi_inputs = {}
    for band, path in band_files.items():
        with rasterio.open(path) as src:
            arr = src.read(1).astype(np.float32)
        stats[f"mean_B0{band}"] = float(np.nanmean(arr))
        if band in ("4", "8"):
            ndvi_inputs[band] = arr
    if "4" in ndvi_inputs and "8" in ndvi_inputs:
        red, nir = ndvi_inputs["4"], ndvi_inputs["8"]
        # The small epsilon avoids division by zero where both bands are 0.
        ndvi = (nir - red) / (nir + red + 1e-6)
        stats["mean_NDVI"] = float(np.nanmean(ndvi))
    return stats


In [None]:
# ==============================================================================
# OpenAI RESPONSES API HELPER
# ==============================================================================
def describe_stats(stats: dict, model: str = MODEL_NAME) -> str:
    """Ask the OpenAI model for a plain-English description of ``stats``.

    Args:
        stats: JSON-serialisable statistics; embedded in the prompt via
            ``json.dumps``.
        model: Model name passed to the Responses API. Defaults to
            ``MODEL_NAME``.

    Returns:
        The response's ``output_text`` with surrounding whitespace stripped.
    """
    stats_json = json.dumps(stats)
    prompt = (f"Here are basic stats: {stats_json}\n"
              "Describe the surface features in plain English.")
    request = {
        "model": model,
        "instructions": "You are an archaeologist.",
        "input": prompt,
        "temperature": 0,
    }
    response = client.responses.create(**request)
    text = response.output_text
    return text.strip()


In [None]:
# ==============================================================================
# RUN END‑TO‑END
# ==============================================================================
# Fetch the sample, then pick the stats routine that matches the dataset type.
data, dataset_id = fetch_dataset(DATASET_TYPE)
stats_fn = laspy_stats if DATASET_TYPE == "lidar" else sentinel2_stats
stats = stats_fn(data)

# Ask the model to turn the numbers into a plain-English description.
summary = describe_stats(stats)

# Report the run configuration, the raw stats, and finally the model's summary.
report = (
    ("Model:", MODEL_NAME),
    ("Dataset ID:", dataset_id),
    ("Stats:", json.dumps(stats, indent=2)),
)
for label, value in report:
    print(label, value)
print("\nOpenAI summary:\n", summary)
