## Welcome to your notebook.


#### Run this cell to connect to your GIS and get started:

In [None]:
# from arcgis.gis import GIS
# import contextlib, io
# with contextlib.redirect_stderr(io.StringIO()):
#     gis = GIS("home")

#### Now you are ready to start!

In [None]:
# =============================================================================
# Title: NWS Active Weather Alerts to ArcGIS Feature Layer (Notebook Version)

# GitHub: https://github.com/yagaC64/Spring2026DAEN

# License: https://github.com/yagaC64/Spring2026DAEN/blob/main/LICENSE

# Description: A script for an ArcGIS Notebook that fetches active NWS weather
#              alerts for Puerto Rico, processes them, and updates a target
#              ArcGIS Feature Layer using a Truncate and Add workflow.
# Version: 2.8
#
# Optional: ArcGIS Online Sync
# If you want to publish updates to ArcGIS Online, set:
#   USE_ARCGIS=1
#   NWS_ALERTS_LAYER_ID (or FEATURE_LAYER_ITEM_ID)
# Then re-run the notebook from Cell 1.
# =============================================================================
# What's new in v2.8 (Definitive Geocoding Fix):
# - Reworked the API fallback logic to prevent coordinate overwriting on
#   exploded DataFrames.
# - The script now caches new coordinates and maps them back in a single,
#   safe operation, ensuring distinct coordinates are preserved.
# =============================================================================

# Cell 1: Install and Import Libraries
# =============================================================================
import sys
import subprocess
import logging
import json
import re
from datetime import datetime, timezone
import os  # Added for file diagnostics
from pathlib import Path

# Set USE_ARCGIS=1 to enable ArcGIS Online sync; otherwise run locally.
# Surrounding whitespace and letter case in the variable are ignored.
USE_ARCGIS = os.environ.get("USE_ARCGIS", "").strip().lower() in ("1", "true", "yes")

# Install required libraries in the notebook environment.
# Only packages that cannot be imported are installed, so re-running the
# notebook (or running it without network access when everything is already
# present) does not invoke pip at all.
import importlib.util

print("Installing required libraries...")
base_pkgs = ['pandas', 'openpyxl', 'requests']
if USE_ARCGIS:
    base_pkgs.append('arcgis')
# For these packages the pip distribution name equals the import name,
# so find_spec() can be used to detect what is already available.
missing_pkgs = [pkg for pkg in base_pkgs if importlib.util.find_spec(pkg) is None]
if missing_pkgs:
    # check_call raises CalledProcessError if pip exits with a non-zero status.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', *missing_pkgs])
    # Make freshly installed packages visible to the import system of this kernel.
    importlib.invalidate_caches()
print("Installation complete.")

import pandas as pd
import requests

if USE_ARCGIS:
    from arcgis.gis import GIS
    from arcgis.features import Feature
    from arcgis.geometry import Point, Polygon

# Rich DataFrame rendering comes from IPython when it is installed;
# otherwise `display` is simply an alias for the built-in print.
try:
    from IPython.display import display
except ImportError:
    display = print

# Send log records of level INFO and above to stdout, prefixed with a
# timestamp and the level name.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

print("\nCell 1: Libraries installed and imported.")

In [None]:
# Cell 2: Configuration
def resolve_file(filename, env_var=None, search_roots=None):
    """Locate a data file and return its path as a string.

    Resolution order:
      1. If ``env_var`` is given and set to a non-empty value, that value is
         returned as-is (it is not checked for existence).
      2. Otherwise each search root is scanned recursively for ``filename``
         and the first match is returned.

    Args:
        filename: File name (or glob pattern) passed to ``Path.rglob``.
        env_var: Optional name of an environment variable holding an explicit path.
        search_roots: Optional iterable of directories (``str`` or ``Path``) to
            search. Defaults to the current directory, its parent, and the
            home directory. ``/arcgis/home`` is searched too when it exists.
            The caller's list is never modified.

    Returns:
        The resolved path as a ``str``.

    Raises:
        FileNotFoundError: If no override is set and no root contains the file.
    """
    if env_var:
        env_val = os.environ.get(env_var)
        if env_val:
            return env_val

    default_roots = [Path.cwd(), Path.cwd().parent, Path.home()]
    # Build a fresh list so appending below cannot mutate the caller's argument,
    # and coerce entries to Path so plain strings are accepted as roots.
    roots = [Path(root) for root in (search_roots or default_roots)]
    arcgis_home = Path("/arcgis/home")
    if arcgis_home.exists() and arcgis_home not in roots:
        roots.append(arcgis_home)

    for root in roots:
        if root.exists():
            match = next(root.rglob(filename), None)
            if match:
                return str(match)

    env_hint = f"the {env_var} env var" if env_var else "the required env var"
    raise FileNotFoundError(
        f"Could not find '{filename}'. Set {env_hint} or place the file under the repo or /arcgis/home."
    )

# =============================================================================
# --- USER-DEFINED VARIABLES ---
# Everything below is read from the environment or derived at run time; the
# resulting module-level constants are used by the later cells.

# Target layer item ID: NWS_ALERTS_LAYER_ID takes precedence, with
# FEATURE_LAYER_ITEM_ID as the fallback when the first is unset or empty.
FEATURE_LAYER_ITEM_ID = os.environ.get("NWS_ALERTS_LAYER_ID") or os.environ.get("FEATURE_LAYER_ITEM_ID")
# Index into the item's `layers` list used when accessing the feature layer.
LAYER_INDEX = 0

# The item ID is only mandatory when ArcGIS sync is enabled.
if USE_ARCGIS and not FEATURE_LAYER_ITEM_ID:
    raise ValueError("Set NWS_ALERTS_LAYER_ID (or FEATURE_LAYER_ITEM_ID) in the environment.")

# Local file paths (resolved dynamically)
# resolve_file() raises FileNotFoundError here if PR_GEOCODER_XLSX is unset and
# the workbook cannot be found under any search root.
GEOCODER_FILE = resolve_file("puerto_rico_geocoder_reference.xlsx", env_var="PR_GEOCODER_XLSX")

# Local outputs (for non-ArcGIS runs)
# The directory is created unconditionally, including when USE_ARCGIS is on.
OUTPUT_DIR = Path(os.environ.get("OUTPUT_DIR", "outputs"))
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_CSV = OUTPUT_DIR / "nws_alerts.csv"
OUTPUT_GEOJSON = OUTPUT_DIR / "nws_alerts.geojson"

# NWS API Configuration
ALERTS_URL = "https://api.weather.gov/alerts/active"
# Headers sent with every request to api.weather.gov in this notebook; the
# User-Agent can be overridden through the NWS_USER_AGENT environment variable.
HEADERS = {
    "User-Agent": os.environ.get("NWS_USER_AGENT", "DAEN-NWS-Notebook/1.0 (contact)"),
    "Accept": "application/geo+json"
}

print("Cell 2: Configuration variables set.")

In [None]:
# Cell 3: Connect to ArcGIS Organization (optional)
# =============================================================================
# `gis` ends up as a connected GIS object when sync is enabled, or None for a
# local-only run. A failed connection is logged and terminates via sys.exit(1).
if not USE_ARCGIS:
    gis = None
    logging.info("ArcGIS disabled; running locally only.")
else:
    try:
        logging.info("Connecting to ArcGIS environment...")
        gis = GIS("home")
        logging.info("Successfully connected to %s.", gis.properties.portalHostname)
    except Exception as e:
        logging.error(f"FATAL: Failed to connect to ArcGIS. Error: {e}")
        sys.exit(1)

    print("Cell 3: ArcGIS connection established.")

In [None]:
# Cell 4: Fetch NWS Alerts
# =============================================================================
# This cell contains the logic to fetch and process NWS alerts.

# --- 4.1: Build Puerto Rico Zone Map ---
# pr_zone_map translates zone IDs (e.g., PRZ001) to names ("San Juan and Vicinity").
# Later cells use it to query per-zone alerts and to label records.
pr_zone_map = {}
logging.info("--- Building Puerto Rico Zone Map ---")
try:
    zones_url = "https://api.weather.gov/zones?area=PR&type=forecast"
    r = requests.get(zones_url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    pr_zones = r.json().get("features") or []
    for zone in pr_zones:
        # `or {}` guards against a feature whose "properties" is null.
        zone_props = zone.get("properties") or {}
        zone_id = zone_props.get("id")
        zone_name = zone_props.get("name")
        if zone_id and zone_name:
            pr_zone_map[zone_id] = zone_name
    logging.info(f"Successfully mapped {len(pr_zone_map)} PR forecast zones.")
except Exception as e:
    logging.error(f"Could not build PR zone map, using fallbacks. Reason: {e}")

# Fall back to generic names whenever the map is still empty. This covers both
# a failed request and a successful response that contained no usable zones.
if not pr_zone_map:
    logging.warning("PR zone map is empty; using generic fallback zones PRZ001-PRZ013.")
    for i in range(1, 14):
        pr_zone_map[f"PRZ{i:03d}"] = f"Puerto Rico Zone {i:03d}"

# --- 4.2: Fetch Data ---
# Collects raw alert features from two sources into all_features. Duplicates
# between the two sources are expected and removed in step 4.3.
all_features = []
logging.info("\n--- Starting Data Fetch ---")
# Strategy 1: Fetch all active US alerts
logging.info(f"Fetching all active US alerts from → {ALERTS_URL}")
try:
    r = requests.get(ALERTS_URL, headers=HEADERS, timeout=90)
    r.raise_for_status()
    nationwide_features = r.json().get("features") or []
    all_features.extend(nationwide_features)
    logging.info(f"Discovered {len(nationwide_features)} total active alerts nationwide.")
except Exception as e:
    # Best effort: the per-zone queries below can still supply PR alerts.
    logging.warning(f"Failed to fetch nationwide alerts → {e}")

# Strategy 2: Fetch alerts for specific Puerto Rico zones
logging.info("\nFetching alerts for all known PR zones...")
failed_zone_ids = []
for zone_id in pr_zone_map:
    zone_url = f"https://api.weather.gov/alerts/active/zone/{zone_id}"
    try:
        r = requests.get(zone_url, headers=HEADERS, timeout=30)
        r.raise_for_status()
        all_features.extend(r.json().get("features") or [])
    except Exception as e:
        # Keep going so one bad zone does not abort the run, but record the
        # failure instead of discarding it silently.
        failed_zone_ids.append(zone_id)
        logging.debug("Alert lookup failed for zone %s: %s", zone_id, e)

if failed_zone_ids:
    # One summary line rather than one warning per zone.
    logging.warning(
        "Alert lookup failed for %d of %d PR zones: %s",
        len(failed_zone_ids), len(pr_zone_map), ", ".join(failed_zone_ids),
    )

logging.info("\n--- Data Fetch Complete ---")

# --- 4.3: Deduplicate and Filter ---
# Features are keyed by their "id"; a later duplicate replaces an earlier one
# and features without an id are dropped.
features_by_id = {}
for fetched in all_features:
    fetched_id = fetched.get("id")
    if fetched_id:
        features_by_id[fetched_id] = fetched
unique_features = features_by_id.values()
logging.info(f"Total unique features scraped → {len(unique_features)}")

# An alert is kept when it was sent by "NWS San Juan PR" or when any of the
# searched text fields matches the Puerto Rico pattern.
pr_alerts = []
if unique_features:
    pr_pattern = re.compile(r"puerto\s+rico|PRZ\d{3}|AMZ\d{3}|\bPR\b", re.I)
    text_search_fields = ["areaDesc", "headline", "description"]

    for candidate in unique_features:
        props = candidate.get("properties", {})
        if props.get("senderName") == "NWS San Juan PR":
            pr_alerts.append(candidate)
            continue
        for field_name in text_search_fields:
            if pr_pattern.search(str(props.get(field_name, ""))):
                pr_alerts.append(candidate)
                break

logging.info(f"Found {len(pr_alerts)} alerts related to Puerto Rico after filtering.")

print("\nCell 4: NWS Alert data fetching and processing complete.")

In [None]:
# Cell 5: Build and Geocode DataFrame
# =============================================================================
logging.info("--- Building and Geocoding DataFrame ---")
all_records = []
found_zone_ids = set()

# Process real alerts: one record per alert, carrying the list of zones it
# affects. The list is exploded into one row per zone further below.
for feature in pr_alerts:
    p = feature.get("properties") or {}
    # affectedZones holds URLs; the last path segment is used as the zone ID.
    affected_zones = [url.split('/')[-1] for url in (p.get("affectedZones") or [])]
    if not affected_zones:
        # No zone list: fall back to the free-text area description.
        affected_zones = [p.get("areaDesc")]

    found_zone_ids.update(affected_zones)
    all_records.append({
        "sent": p.get("sent"), "event": p.get("event"), "headline": p.get("headline"),
        "description": p.get("description"), "instruction": p.get("instruction"),
        "response": p.get("response"), "severity": p.get("severity"),
        "urgency": p.get("urgency"), "certainty": p.get("certainty"),
        "expires": p.get("expires"), "status": p.get("status"), "id": p.get("id"),
        "affected_zones": affected_zones
    })

# Add placeholders for zones with no alerts
time_of_check = datetime.now(timezone.utc)
# Second precision keeps the placeholder timestamps in one consistent text
# format, so pandas' format inference does not reject (and coerce to NaT)
# values that carry microseconds.
# NOTE(review): assumes NWS "sent" values have no fractional seconds — confirm.
checked_at = time_of_check.replace(microsecond=0).isoformat()
placeholder_stamp = time_of_check.isoformat()
for zone_id, zone_name in pr_zone_map.items():
    if zone_id not in found_zone_ids:
        all_records.append({
            "sent": checked_at, "event": "No Active Alerts",
            "headline": f"No Active Alerts for {zone_name}", "description": "N/A",
            "instruction": "N/A", "response": "N/A", "severity": "N/A",
            "urgency": "N/A", "certainty": "N/A",
            "expires": None, "status": "None",
            "id": f"placeholder_{zone_id}_{placeholder_stamp}",
            "affected_zones": [zone_id]
        })

if all_records:
    alerts_df = (
        pd.DataFrame(all_records)
        .explode('affected_zones')
        .rename(columns={'affected_zones': 'zone_id'})
        # explode() repeats the parent row label; give every row a unique one.
        .reset_index(drop=True)
    )
    logging.info(f"DataFrame exploded. Total records now: {len(alerts_df)}")

    # utc=True normalises values with different UTC offsets (alert timestamps
    # vs. the UTC placeholders) into a single timezone-aware datetime column.
    # Without it, mixed offsets yield an object column that breaks `.dt` access.
    alerts_df['sent'] = pd.to_datetime(alerts_df['sent'], errors='coerce', utc=True)
    alerts_df['expires'] = pd.to_datetime(alerts_df['expires'], errors='coerce', utc=True)
    # Human-readable zone name where known; otherwise keep the raw zone value.
    alerts_df['area_desc'] = alerts_df['zone_id'].map(pr_zone_map).fillna(alerts_df['zone_id'])
else:
    alerts_df = pd.DataFrame()

# --- FINAL GEOCODING SWEEP ---
# Step 1: left-join coordinates from the local reference workbook on area_desc.
# Step 2: for zones still lacking a latitude, query the NWS zones API and use
#         the mean of the first ring's vertices as an approximate centre.
# Step 3: write the cached API results back onto the rows that need them.
if not alerts_df.empty:
    logging.info("--- STARTING FINAL GEOCODING SWEEP ---")
    # Diagnostics: Check if file exists
    if os.path.exists(GEOCODER_FILE):
        print("Geocoder file found.")
    else:
        print("Geocoder file not found. Set PR_GEOCODER_XLSX or place the file under the repo or /arcgis/home.")

    try:
        geo_df = pd.read_excel(GEOCODER_FILE)
        # Peek at columns for sanity
        print("Geocoder file columns:", geo_df.columns.tolist())
        # Pull extras if present
        cols_to_merge = ['designated_area', 'latitude', 'longitude', 'state']
        if 'geometry_api' in geo_df.columns:
            cols_to_merge.append('geometry_api')
        geo_merge_df = geo_df[cols_to_merge].copy()

        # Merge the dataframes, keeping all alerts
        merged_df = pd.merge(
            alerts_df,
            geo_merge_df,
            left_on='area_desc',
            right_on='designated_area',
            how='left'
        )
        merged_df = merged_df.drop(columns=['designated_area'])

        # Identify zones that still need coordinates
        zones_to_fetch = merged_df.loc[merged_df['latitude'].isna(), 'zone_id'].unique()

        # Cache of API results: zone_id -> {'latitude', 'longitude', 'state'}
        new_coords_cache = {}

        if len(zones_to_fetch) > 0:
            logging.info(f"Found {len(zones_to_fetch)} zones missing from local file. Attempting live API lookup...")
            for zone_id in zones_to_fetch:
                # Missing or non-text zone values cannot be turned into a zone URL.
                if not isinstance(zone_id, str):
                    logging.warning(f"Skipping API lookup for non-text zone id: {zone_id!r}")
                    continue
                try:
                    # Choose the zones endpoint from the ID prefix.
                    zone_type = 'forecast'
                    if 'PRC' in zone_id:
                        zone_type = 'county'
                    if 'AMZ' in zone_id:
                        zone_type = 'marine'

                    zone_url = f"https://api.weather.gov/zones/{zone_type}/{zone_id}"
                    z_r = requests.get(zone_url, headers=HEADERS, timeout=20)
                    z_r.raise_for_status()
                    z_data = z_r.json()

                    geom_data = z_data.get("geometry")
                    if geom_data and geom_data.get('coordinates'):
                        # Polygon: coordinates[0] is the first ring.
                        # MultiPolygon: one level deeper (first polygon, first ring).
                        first_ring = geom_data['coordinates'][0]
                        if geom_data.get('type') == 'MultiPolygon':
                            first_ring = first_ring[0]

                        if first_ring:
                            # Mean of the ring vertices (an approximation, not a true centroid).
                            lon = sum(pt[0] for pt in first_ring) / len(first_ring)
                            lat = sum(pt[1] for pt in first_ring) / len(first_ring)
                            state = (z_data.get("properties") or {}).get("state", "PR")
                            new_coords_cache[zone_id] = {'latitude': lat, 'longitude': lon, 'state': state}
                            # If we want geometry_api from API (as JSON str), add: 'geometry_api': str(geom_data)
                            logging.info(f"✓ Cached API Fallback for '{zone_id}' -> Lat: {lat:.4f}, Lon: {lon:.4f}")
                except Exception as e:
                    # One failed lookup must not abort the sweep for the other zones.
                    logging.warning(f"API lookup failed for '{zone_id}': {e}")

        # Apply the cached coordinates in one pass per column.
        if new_coords_cache:
            # Touch only rows that lack a latitude AND have a cached result, so
            # existing 'state' values on rows without a lookup are not wiped.
            fill_mask = merged_df['latitude'].isna() & merged_df['zone_id'].isin(list(new_coords_cache))
            # 'state' can be an all-NaN float column after the left merge; make
            # it object-typed so string values can be assigned into it.
            merged_df['state'] = merged_df['state'].astype(object)
            zone_ids_to_fill = merged_df.loc[fill_mask, 'zone_id']
            for col in ('latitude', 'longitude', 'state'):
                merged_df.loc[fill_mask, col] = zone_ids_to_fill.map(
                    lambda z, key=col: new_coords_cache[z][key]
                )
            # If caching geometry_api: add 'geometry_api' to the tuple above.

        logging.info("--- GEOCODING SWEEP COMPLETE ---")
        alerts_df = merged_df

    except FileNotFoundError:
        logging.warning(f"Geocoder file not found at {GEOCODER_FILE}. Geocoding skipped – no coords added!")
    except Exception as e:
        # logging.exception records the message together with the full traceback.
        logging.exception(f"Geocoding sweep bombed: {e}")

# Display a preview of the full DataFrame
# alerts_df_sorted is assigned on every run (the empty frame when there are no
# records), so later cells never hit a NameError or reuse a stale frame left
# over from a previous execution.
if not alerts_df.empty:
    alerts_df_sorted = alerts_df.sort_values(by=['event', 'area_desc'], ascending=[False, True])
    logging.info("DataFrame Preview:")
    display(alerts_df_sorted)
else:
    alerts_df_sorted = alerts_df
    logging.info("No records to process.")

print("\nCell 5: DataFrame built and geocoded.")


# Local export (non-ArcGIS runs): writes every record to CSV and the records
# that have coordinates to a GeoJSON FeatureCollection of points.
if not USE_ARCGIS:
    # Use the sorted frame only when this run produced data; a leftover
    # alerts_df_sorted from an earlier execution must not be exported.
    has_fresh_sorted = (not alerts_df.empty) and ("alerts_df_sorted" in globals())
    df_out = alerts_df_sorted if has_fresh_sorted else alerts_df
    if df_out is None or df_out.empty:
        logging.info("No records to write locally.")
    else:
        df_out.to_csv(OUTPUT_CSV, index=False)

        def to_jsonable(val):
            """Convert a single cell value into something json.dump accepts.

            Missing values (NaN/NaT/None) become None, datetime-like values
            become ISO-8601 strings, values exposing ``.item()`` (e.g. numpy
            scalars) are unwrapped, and anything else is returned unchanged.
            """
            # Null check first: pd.NaT is a datetime subclass, so it must be
            # caught before the isoformat branch below.
            try:
                if pd.isna(val):
                    return None
            except Exception:
                # pd.isna on list-like values yields an array whose truth
                # value is ambiguous; treat such values as not-null.
                pass
            # Covers pd.Timestamp as well as plain datetime objects, which
            # appear when a datetime column has object dtype.
            if isinstance(val, (datetime, pd.Timestamp)):
                return val.isoformat()
            if hasattr(val, "item"):
                try:
                    return val.item()
                except Exception:
                    pass
            return val

        features = []
        if {"longitude", "latitude"}.issubset(df_out.columns):
            for _, row in df_out.iterrows():
                lon = row.get("longitude")
                lat = row.get("latitude")
                # Rows without coordinates are left out of the GeoJSON (they
                # are still present in the CSV).
                if pd.notna(lon) and pd.notna(lat):
                    props = row.drop(labels=["longitude", "latitude"]).to_dict()
                    props = {k: to_jsonable(v) for k, v in props.items()}
                    features.append({
                        "type": "Feature",
                        "geometry": {"type": "Point", "coordinates": [float(lon), float(lat)]},
                        "properties": props
                    })
        geojson = {"type": "FeatureCollection", "features": features}
        with open(OUTPUT_GEOJSON, "w", encoding="utf-8") as f:
            json.dump(geojson, f, ensure_ascii=False, indent=2)
        logging.info("Local outputs written: %s and %s", OUTPUT_CSV, OUTPUT_GEOJSON)


if USE_ARCGIS:
    # Cell 6: Access and Truncate Target Feature Layer
    # =============================================================================
    # NOTE(review): the layer is cleared before the new features are built and
    # pushed, so a failure in Cell 7 leaves the layer empty until the next run.
    try:
        logging.info("Accessing Feature Layer")
        feature_layer_item = gis.content.get(FEATURE_LAYER_ITEM_ID)
        # Fail with a clear message instead of an AttributeError on `.layers`
        # if no item was returned for the configured ID.
        if feature_layer_item is None:
            raise ValueError(f"No item found for ID '{FEATURE_LAYER_ITEM_ID}'.")
        flayer = feature_layer_item.layers[LAYER_INDEX]
        logging.info(f"Target layer: '{flayer.properties.name}'")

        # Truncate the layer
        count = flayer.query(return_count_only=True)
        logging.info(f"Existing feature count: {count}")
        if count > 0:
            flayer.delete_features(where="1=1")
            logging.info("Layer successfully cleared.")
        else:
            logging.info("Layer is already empty.")
    except Exception as e:
        logging.error(f"FATAL: Could not access or truncate the feature layer. Error: {e}")
        sys.exit(1)

    print("\nCell 6: Feature Layer truncate complete.")


    # Cell 7: Prepare and Push Data to ArcGIS
    # =============================================================================
    if not alerts_df.empty:
        try:
            adds = []
            # Use the sorted dataframe for processing
            df_clean = alerts_df_sorted.copy()

            # Convert datetimes to strings for AGOL.
            # Re-parsing with utc=True guarantees a datetime dtype even if the
            # column arrived as objects (e.g. values with mixed UTC offsets),
            # so `.dt` cannot fail. The resulting strings are expressed in UTC.
            for ts_col in ('sent', 'expires'):
                df_clean[ts_col] = (
                    pd.to_datetime(df_clean[ts_col], errors='coerce', utc=True)
                    .dt.strftime('%Y-%m-%d %H:%M:%S')
                    .fillna('')
                )

            # Columns used internally that are not sent as attributes.
            internal_cols = ['zone_id']

            for _, row in df_clean.iterrows():
                attrs = row.to_dict()

                # Drop internal columns and missing values
                clean_attrs = {k: v for k, v in attrs.items() if k not in internal_cols and pd.notna(v)}

                # Explicitly cast numeric types to plain Python floats
                if 'latitude' in clean_attrs:
                    clean_attrs['latitude'] = float(clean_attrs['latitude'])
                if 'longitude' in clean_attrs:
                    clean_attrs['longitude'] = float(clean_attrs['longitude'])

                # Build a WGS84 point when both coordinates are present;
                # otherwise the feature is added without geometry.
                geom = None
                if pd.notna(row.get('longitude')) and pd.notna(row.get('latitude')):
                    geom = Point({
                        "x": float(row['longitude']),
                        "y": float(row['latitude']),
                        "spatialReference": {"wkid": 4326}
                    })

                adds.append(Feature(geometry=geom, attributes=clean_attrs))

            logging.info(f"Prepared {len(adds)} features for upload.")

            # Push edits to the feature layer
            if adds:
                result = flayer.edit_features(adds=adds, rollback_on_failure=True)

                # Verify success: every submitted feature must report success.
                add_results = result.get("addResults", [])
                success_count = sum(1 for r in add_results if r.get("success"))
                if success_count == len(adds):
                    logging.info(f"✔ Successfully added {success_count} features to the layer.")
                else:
                    logging.error("Some features failed to add. See detailed response:")
                    logging.error(result)
            else:
                logging.info("No features to add.")

        except Exception as e:
            # logging.exception records the message together with the full traceback.
            logging.exception(f"FATAL: An error occurred while preparing or pushing edits: {e}")
            sys.exit(1)
    else:
        logging.info("DataFrame is empty, nothing to push to ArcGIS.")

    print("\n--- WORKFLOW COMPLETE ---")

In [None]:
# pandas full tables
import warnings

# Hide FutureWarning messages for the rest of the session.
warnings.simplefilter(action="ignore", category=FutureWarning)

# Display settings: show up to 100 rows, and remove the limits on column
# count, total width and per-cell text width. Chained-assignment checks are
# switched off as well.
for option_name, option_value in (
    ('mode.chained_assignment', None),
    ('display.max_rows', 100),
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

In [None]:
# Print a structural summary of the sorted alerts frame.
# Requires alerts_df_sorted from the "Cell 5" code above.
alerts_df_sorted.info()

In [None]:
# Full-column view (uncomment to use):
# display(alerts_df_sorted[["sent","event","headline","description","instruction","response","severity","urgency","certainty","expires","status","id","zone_id","latitude","longitude","state","area_desc"]])
# Compact view of the location-related columns.
# NOTE(review): latitude/longitude/state are only present when the geocoding merge succeeded — confirm before relying on this cell.
display(alerts_df_sorted[["zone_id","event", "latitude","longitude","state","area_desc", "expires"]])
# Whole frame, all columns (uncomment to use):
# display(alerts_df_sorted)