In [None]:
# Install deps (quiet mode)
# Third-party Python packages for the notebook; -q suppresses most of pip's output.
# NOTE(review): the `fitz` name on PyPI is not the PyMuPDF distribution (PyMuPDF is
# installed separately on the next line as `pymupdf`) — confirm `fitz` is really wanted.
!pip install geopandas fiona fitz rasterio sentence-transformers chromadb langchain langchain-community transformers -q
# Pin PyMuPDF to an exact version.
!pip install pymupdf==1.22.5
# Refresh the apt package index, then install the GDAL development package
# without an interactive prompt (-y).
!apt-get update
!apt-get install -y libgdal-dev
# geopandas and rasterio are requested a second time here; both are already
# listed in the first pip command above.
!pip install geopandas rasterio

In [None]:
import rasterio

tif_path = "/kaggle/input/nasa-kenya-urban/GHS_BUILT_S_E2030_GLOBE_R2023A_54009_1000_V1_0_R9_C22.tif"

# Open the raster, echo its georeferencing header, then report the dtype and
# value range of every band (band indexes are 1-based in rasterio).
with rasterio.open(tif_path) as src:
    header_fields = (
        ("TIFF file:", tif_path),
        ("CRS:", src.crs),
        ("Bounds:", src.bounds),
    )
    for label, value in header_fields:
        print(label, value)
    print("Width x Height:", src.width, "x", src.height)
    print("Number of bands:", src.count)

    i = 1
    while i <= src.count:
        band = src.read(i)
        band_min = band.min()
        band_max = band.max()
        print(f" Band {i}: dtype={band.dtype}, min={band_min}, max={band_max}")
        i += 1


In [None]:
import fiona

import geopandas as gpd

gdb_path = "/kaggle/input/nasa-kenya-urban/KEN.gdb"

# Enumerate every layer stored in the file geodatabase.
layers = fiona.listlayers(gdb_path)
print("Layers in GDB:", layers)

# Example: load first layer and show its schema, a preview, and its CRS.
first_layer_name = layers[0]
gdf_gdb = gpd.read_file(gdb_path, layer=first_layer_name)
print("Columns:", gdf_gdb.columns)
print("First 5 rows:", gdf_gdb.head(), sep="\n")
print("CRS:", gdf_gdb.crs)


In [None]:
import geopandas as gpd
import fiona
from shapely.geometry import Point

# Check layers in the GeoPackage
gpkg_path = '/kaggle/input/nasa-kenya-urban/KEN.gpkg'
print("Layers in KEN.gpkg:", fiona.listlayers(gpkg_path))

# Load the GeoDataFrame (no layer is named, so the reader's default layer is used)
gdf = gpd.read_file(gpkg_path)
print("Columns:", gdf.columns)
print("First 5 rows:")
print(gdf[['PIXELID', 'UrbanCenter', 'COUNTRY', 'geometry']].head())
print("CRS:", gdf.crs)

# Check unique values in potential columns
print("Unique values in 'UrbanCenter':", gdf['UrbanCenter'].unique())
print("Unique values in 'COUNTRY':", gdf['COUNTRY'].unique())

# Search for 'Nairobi' in all string columns
for col in gdf.select_dtypes(include=['object']).columns:
    try:
        nairobi_rows = gdf[gdf[col].str.contains('Nairobi', case=False, na=False)]
        if not nairobi_rows.empty:
            print(f"Found 'Nairobi' in column '{col}':")
            print(nairobi_rows)
    except Exception as e:
        # Object columns may hold non-string values for which .str is unavailable.
        print(f"Could not search in column '{col}': {e}")

# Try filtering by Nairobi's coordinates
# The literal below is longitude/latitude (EPSG:4326). The layer may be stored
# in a different CRS, in which case comparing raw coordinates can never match,
# so the point is reprojected into the layer's CRS before the containment test.
nairobi_point = Point(36.82, -1.28)
if gdf.crs is not None:
    nairobi_point_native = (
        gpd.GeoSeries([nairobi_point], crs="EPSG:4326").to_crs(gdf.crs).iloc[0]
    )
else:
    # No CRS recorded on the layer: fall back to comparing raw coordinates.
    nairobi_point_native = nairobi_point
nairobi_rows = gdf[gdf.geometry.contains(nairobi_point_native)]
if not nairobi_rows.empty:
    print("Rows containing Nairobi's coordinates:")
    print(nairobi_rows)
else:
    print("No rows found containing Nairobi's coordinates.")

In [None]:
import os
import geopandas as gpd
import rasterio
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from typing import List, Dict
import pandas as pd
import zipfile
import gc
from shapely.geometry import box

# Configuration
# DATA_DIR    - directory scanned for .gpkg / .gdb / .tif inputs by the code below.
# CHROMA_PATH - on-disk location handed to chromadb.PersistentClient and later zipped.
# MODEL_NAME  - SentenceTransformer model used to embed every document.
# BATCH_SIZE  - number of documents embedded and added to Chroma per call.
DATA_DIR = "/kaggle/input/nasa-kenya-urban"
CHROMA_PATH = "/kaggle/working/chroma_db"
MODEL_NAME = "all-mpnet-base-v2"  # 768 dimensions; change to "all-MiniLM-L6-v2" for 384 dimensions if needed
BATCH_SIZE = 5000  # Below Chroma's max batch size (5461)

# Initialize embedding model
# The model is created once at module level and reused by ingest_to_chroma.
# A load failure is reported and then re-raised, so the cell stops instead of
# continuing without a model.
try:
    embedding_model = SentenceTransformer(MODEL_NAME)
except Exception as e:
    print(f"Error loading SentenceTransformer: {e}")
    raise

# Load vector data for spatial metadata assignment
def load_vector_data() -> "gpd.GeoDataFrame | None":
    """Read the ``KEN_projections`` layer from every vector source in ``DATA_DIR``.

    Every directory entry ending in ``.gpkg`` or ``.gdb`` is read; sources that
    fail to load are reported and skipped. All frames are brought into one
    common CRS (the first CRS encountered) before being concatenated, because
    concatenating geometries that carry different CRSs is not meaningful.

    Returns:
        A single GeoDataFrame holding the rows of all readable sources, or
        ``None`` when no source could be read.
    """
    # Sort so the choice of "first" CRS and the row order do not depend on the
    # arbitrary order returned by os.listdir.
    vector_files = sorted(
        f for f in os.listdir(DATA_DIR) if f.endswith(('.gpkg', '.gdb'))
    )
    gdfs = []
    target_crs = None
    for f in vector_files:
        fpath = os.path.join(DATA_DIR, f)
        try:
            gdf = gpd.read_file(fpath, layer="KEN_projections")
            if target_crs is None:
                target_crs = gdf.crs
            elif gdf.crs is not None and gdf.crs != target_crs:
                # Align with the frames collected so far.
                gdf = gdf.to_crs(target_crs)
            gdfs.append(gdf)
        except Exception as e:
            print(f"Error reading {fpath}: {e}")
    if gdfs:
        return gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), crs=target_crs)
    return None

# Helper function to assign urban center based on spatial overlap
def get_urban_center_from_bounds(bounds, vector_gdf: gpd.GeoDataFrame) -> str:
    """Pick the urban center that dominates the area covered by ``bounds``.

    Args:
        bounds: Object exposing ``left``, ``bottom``, ``right`` and ``top``.
        vector_gdf: Vector features with an ``UrbanCenter`` column, or ``None``.

    Returns:
        The most frequent ``UrbanCenter`` value among the features that
        intersect the bounding box, or ``"unknown"`` when there is no vector
        data, no intersecting feature, no usable value, or an error occurs.
    """
    fallback = "unknown"
    if vector_gdf is None:
        return fallback

    try:
        tiff_bbox = box(bounds.left, bounds.bottom, bounds.right, bounds.top)

        # Bring the features into ESRI:54009 before comparing with the box.
        if vector_gdf.crs != "ESRI:54009":
            vector_gdf = vector_gdf.to_crs("ESRI:54009")

        hit_mask = vector_gdf.geometry.intersects(tiff_bbox)
        overlapping = vector_gdf[hit_mask]
        if overlapping.empty:
            return fallback

        counts = overlapping["UrbanCenter"].value_counts()
        if counts.empty:
            return fallback
        return counts.index[0]
    except Exception as e:
        print(f"Error in spatial overlap: {e}")
        return fallback

# Processing functions
def _metadata_label(row, column: str) -> str:
    """Return ``row[column]`` as a string, or ``"unknown"`` if absent or missing."""
    value = row.get(column, "unknown")
    # Without this check a missing value would be stored as the string "nan".
    if value is None or pd.isna(value):
        return "unknown"
    return str(value)


def process_geopkg_gdb(file_path: str, layer: str = "KEN_projections") -> List[Dict]:
    """Turn every feature of a vector layer into one text document plus metadata.

    Args:
        file_path: Path to the GeoPackage / file geodatabase to read.
        layer: Name of the layer to read; defaults to ``"KEN_projections"``.

    Returns:
        A list of dicts with keys ``id`` (``<file basename>_<row index>``),
        ``text`` (geometry summary followed by all non-null attributes) and
        ``metadata``. An empty list is returned when the file cannot be
        processed; the error is printed.
    """
    try:
        gdf = gpd.read_file(file_path, layer=layer)
        docs = []
        for idx, row in gdf.iterrows():
            geom = row.geometry
            if geom is None:
                # A feature without geometry is kept for its attributes
                # instead of aborting the whole file.
                geom_desc = "Geometry: None"
            else:
                # geom_type is the supported attribute (``type`` is deprecated
                # in Shapely 2). NOTE(review): the "sqm" label is only right
                # when the layer's CRS uses metres — confirm.
                geom_desc = f"Geometry: {geom.geom_type}, Area: {geom.area:.2f} sqm"
            attrs = " | ".join(
                f"{k}:{v}" for k, v in row.items() if k != "geometry" and pd.notna(v)
            )
            doc = f"{geom_desc} | Attributes: {attrs}"
            metadata = {
                "file": file_path,
                "type": "vector",
                "geometry": str(geom),
                "urban_center": _metadata_label(row, "UrbanCenter"),
                "country": _metadata_label(row, "COUNTRY"),
                "anthrome": _metadata_label(row, "ANTHROME"),
                "farm_sys": _metadata_label(row, "Farm_Sys"),
            }
            docs.append({
                "id": f"{os.path.basename(file_path)}_{idx}",
                "text": doc,
                "metadata": metadata
            })
        print(f"Processed {file_path}: {len(docs)} documents, urban_centers: {set(d['metadata']['urban_center'] for d in docs)}")
        return docs
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return []

def process_tif(file_path: str, vector_gdf: gpd.GeoDataFrame) -> List[Dict]:
    """Summarise band 1 of a raster as a single text document plus metadata.

    Statistics are computed on valid cells only: the band is read as a masked
    array so nodata cells do not leak into the mean, minimum or maximum. The
    mean is additionally restricted to strictly positive cells.

    Args:
        file_path: Path to the raster file.
        vector_gdf: Vector features used to look up the urban center for the
            raster's bounds; may be ``None``.

    Returns:
        A one-element list holding the document dict (``id``, ``text``,
        ``metadata``), or an empty list if the file cannot be processed; the
        error is printed.
    """
    try:
        with rasterio.open(file_path) as src:
            band = src.read(1, masked=True)
            # Drop masked (nodata) cells before computing any statistic.
            valid = band.compressed()
            positive = valid[valid > 0]
            mean_val = float(np.mean(positive)) if positive.size else 0.0
            # Report the real data range; the nodata value is not a minimum.
            min_val = valid.min() if valid.size else 0
            max_val = valid.max() if valid.size else 0
            doc = f"Raster stats: Mean {mean_val:.2f}, Shape {src.shape}, CRS {src.crs}, Min {min_val}, Max {max_val}"
            urban_center = get_urban_center_from_bounds(src.bounds, vector_gdf)
            return [{
                "id": os.path.basename(file_path),
                "text": doc,
                "metadata": {
                    "file": file_path,
                    "type": "raster",
                    "bounds": str(src.bounds),
                    "urban_center": urban_center,
                    "country": "KEN",
                    "anthrome": "unknown",
                    "farm_sys": "unknown"
                }
            }]
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return []

# Ingest to Chroma with batch processing
def ingest_to_chroma(docs: List[Dict], collection_name: str = "nasa_kenya_urban"):
    """Embed ``docs`` and store them in a freshly created Chroma collection.

    Any existing collection with the same name is dropped first so that the
    stored embedding dimension always matches the current model. Documents are
    embedded and added in slices of ``BATCH_SIZE``.

    Args:
        docs: Dicts with ``id``, ``text`` and ``metadata`` keys.
        collection_name: Name of the collection to (re)create.

    Returns:
        The populated collection, or ``None`` if anything fails; the error is
        printed.
    """
    try:
        client = chromadb.PersistentClient(path=CHROMA_PATH)
        # Delete existing collection to avoid dimension mismatch
        try:
            client.delete_collection(name=collection_name)
            print("Deleted existing collection to ensure correct embedding dimension.")
        except Exception as e:
            # Expected on a first run, when there is nothing to delete. Only
            # Exception is caught so KeyboardInterrupt/SystemExit still
            # propagate; the reason is shown instead of being discarded.
            print(f"No existing collection removed ({type(e).__name__}: {e})")
        collection = client.create_collection(name=collection_name)

        # Debug metadata distribution
        urban_centers = set(d["metadata"]["urban_center"] for d in docs)
        types = set(d["metadata"]["type"] for d in docs)
        print(f"Documents to ingest: {len(docs)}")
        print(f"Urban centers: {urban_centers}")
        print(f"Document types: {types}")

        # Process documents in batches
        total_docs = len(docs)
        for i in range(0, total_docs, BATCH_SIZE):
            batch_docs = docs[i:i + BATCH_SIZE]
            texts = [d["text"] for d in batch_docs]
            embeddings = embedding_model.encode(texts, show_progress_bar=True).tolist()
            ids = [d["id"] for d in batch_docs]
            metadatas = [d["metadata"] for d in batch_docs]
            collection.add(embeddings=embeddings, documents=texts, metadatas=metadatas, ids=ids)
            print(f"Ingested batch {i // BATCH_SIZE + 1}: {len(batch_docs)} documents")
            # Release the per-batch embedding lists before the next slice.
            gc.collect()

        print(f"Ingested {total_docs} chunks into Chroma.")
        return collection
    except Exception as e:
        print(f"Error ingesting to Chroma: {e}")
        return None

# Main execution
if __name__ == "__main__":
    # The Chroma directory must exist before the persistent client writes to it.
    os.makedirs(CHROMA_PATH, exist_ok=True)

    # Stop early, after listing what is mounted, when the dataset is absent.
    if not os.path.exists(DATA_DIR):
        print(f"Error: Dataset directory {DATA_DIR} not found.")
        print("Available datasets:")
        print(os.listdir("/kaggle/input"))
        raise FileNotFoundError(f"Dataset directory {DATA_DIR} not found.")

    # Vector features are needed to label rasters with an urban center.
    vector_gdf = load_vector_data()

    # Build one flat list of documents from every supported input file.
    all_docs = []
    for file in os.listdir(DATA_DIR):
        fpath = os.path.join(DATA_DIR, file)
        if file.endswith('.tif'):
            all_docs.extend(process_tif(fpath, vector_gdf))
        elif file.endswith(('.gpkg', '.gdb')):
            all_docs.extend(process_geopkg_gdb(fpath))
        gc.collect()

    if not all_docs:
        print("No documents processed.")
    else:
        collection = ingest_to_chroma(all_docs)
        if not collection:
            print("Failed to ingest data into Chroma.")
        else:
            # Zip Chroma DB, storing paths relative to the database directory.
            zip_path = '/kaggle/working/chroma_db.zip'
            with zipfile.ZipFile(zip_path, mode='w', compression=zipfile.ZIP_DEFLATED) as zipf:
                for root, _, files in os.walk(CHROMA_PATH):
                    for file in files:
                        file_path = os.path.join(root, file)
                        arcname = os.path.relpath(file_path, CHROMA_PATH)
                        zipf.write(file_path, arcname)
            print(f"Chroma DB zipped at {zip_path}")

In [None]:
# List the contents of the notebook's current working directory.
!ls