In [1]:
import rasterio
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt

# ---------------------------------------------------------------
# 1️⃣ Load raster
# ---------------------------------------------------------------
tif_path = "Land_surface_temperature.tif"
with rasterio.open(tif_path) as src:
    lst = src.read(1)
    transform = src.transform
    bounds = src.bounds
    crs = src.crs

# Mask invalid values
mask = np.isfinite(lst)
valid_vals = lst[mask]
print(f"Valid pixels: {mask.sum()}")

# ---------------------------------------------------------------
# 2️⃣ Compute sampling weights (hotter areas = higher weight)
# ---------------------------------------------------------------
# Shift temperatures to positive range
temp_shift = valid_vals - np.nanmin(valid_vals)
# Add small constant to keep low temps from being zero-weight
weights = temp_shift + 1  
# Normalize to sum to 1
weights = weights / np.sum(weights)

# ---------------------------------------------------------------
# 3️⃣ Choose 500 weighted random pixels
# ---------------------------------------------------------------
n_samples = 500
flat_indices = np.arange(mask.size)
valid_indices = flat_indices[mask.flatten()]
sample_indices = np.random.choice(valid_indices, size=n_samples, replace=False, p=weights)

# Convert flat index → row, col
rows, cols = np.unravel_index(sample_indices, lst.shape)

# Get x, y coordinates
xs, ys = rasterio.transform.xy(transform, rows, cols)
xs = np.array(xs)
ys = np.array(ys)
temps = lst[rows, cols]

# ---------------------------------------------------------------
# 4️⃣ Create GeoDataFrame
# ---------------------------------------------------------------
gdf = gpd.GeoDataFrame(
    {'x': xs, 'y': ys, 'temperature_K': temps},
    geometry=[Point(x, y) for x, y in zip(xs, ys)],
    crs=crs
)

# Save for later ML
out_path = "LST_samples_weighted_500.geojson"
gdf.to_file(out_path, driver="GeoJSON")
print(f"✅ Saved weighted samples to {out_path}")

# ---------------------------------------------------------------
# 5️⃣ Quick visualization
# ---------------------------------------------------------------
plt.figure(figsize=(8, 6))
plt.imshow(lst, cmap='turbo', vmin=290, vmax=335)
plt.scatter(cols, rows, s=8, c='white', edgecolor='black', linewidth=0.2)
plt.title("Weighted Sample Points over LST Raster")
plt.axis("off")
plt.show()


ImportError: dlopen(/opt/miniconda3/envs/pge383/lib/python3.13/site-packages/rasterio/_base.cpython-313-darwin.so, 0x0002): Library not loaded: @rpath/libnetcdf.19.dylib
  Referenced from: <A2860E87-DCA6-3E1C-869C-97EF1B16E899> /opt/miniconda3/envs/pge383/lib/libgdal.32.3.6.2.dylib
  Reason: tried: '/opt/miniconda3/envs/pge383/lib/libnetcdf.19.dylib' (no such file), '/opt/miniconda3/envs/pge383/lib/python3.13/site-packages/rasterio/../../../libnetcdf.19.dylib' (no such file), '/opt/miniconda3/envs/pge383/lib/python3.13/site-packages/rasterio/../../../libnetcdf.19.dylib' (no such file), '/opt/miniconda3/envs/pge383/bin/../lib/libnetcdf.19.dylib' (no such file), '/opt/miniconda3/envs/pge383/bin/../lib/libnetcdf.19.dylib' (no such file), '/usr/local/lib/libnetcdf.19.dylib' (no such file), '/usr/lib/libnetcdf.19.dylib' (no such file, not in dyld cache)

In [None]:
# !pip install requests pandas schedule tqdm


Collecting schedule
  Downloading schedule-1.2.2-py3-none-any.whl.metadata (3.8 kB)
Downloading schedule-1.2.2-py3-none-any.whl (12 kB)
Installing collected packages: schedule
Successfully installed schedule-1.2.2


In [5]:
import requests, os

API_KEY = os.getenv("PURPLEAIR_API_KEY") or "13847B4A-B6CB-11F0-BDE5-4201AC1DC121"
headers = {"X-API-Key": API_KEY}

url = "https://api.purpleair.com/v1/sensors?fields=latitude,longitude&nwlng=-122.1&nwlat=37.45&selng=-121.6&selat=37.15"
r = requests.get(url, headers=headers, timeout=30)

print("Status:", r.status_code)
print(r.text[:300])


Status: 200
{
  "api_version" : "V1.2.0-1.1.45",
  "time_stamp" : 1761965042,
  "data_time_stamp" : 1761965022,
  "max_age" : 604800,
  "firmware_default_version" : "7.02",
  "fields" : ["sensor_index","latitude","longitude"],
  "data" : [
    [2858,37.324482,-122.062096],
    [2916,37.362175,-122.06556],
    [


In [9]:
import requests, pandas as pd

url = "https://api.purpleair.com/v1/sensors.json"
r = requests.get(url)
r.raise_for_status()
js = r.json()

df = pd.DataFrame(js["results"])
print("✅ Total sensors:", len(df))
df = df[(df["Lat"].between(37.15, 37.45)) & (df["Lon"].between(-122.10, -121.60))]
print("✅ Found", len(df), "sensors in San José AOI")
df[["ID","Label","Lat","Lon"]].head()


HTTPError: 400 Client Error: Bad Request for url: https://api.purpleair.com/v1/sensors.json

In [8]:
import os, requests, pandas as pd

API_KEY = os.getenv("PURPLEAIR_API_KEY") or "YOUR_KEY_HERE"
BASE_URL = "https://api.purpleair.com/v1/sensors"

FIELDS = "sensor_index,name,latitude,longitude,altitude,location_type,confidence,last_seen"
headers = {"X-API-Key": API_KEY}
params  = {"fields": FIELDS, "location_type": 0}

r = requests.get(BASE_URL, headers=headers, params=params, timeout=60)
r.raise_for_status()
js = r.json()

df = pd.DataFrame(js["data"], columns=js["fields"])

# Filter locally for San José area
lat_min, lat_max = 37.15, 37.45
lon_min, lon_max = -122.10, -121.60
df = df[(df.latitude.between(lat_min, lat_max)) & (df.longitude.between(lon_min, lon_max))]

print(f"✅ Found {len(df)} sensors in local AOI")
print(df.head())


HTTPError: 403 Client Error: Forbidden for url: https://api.purpleair.com/v1/sensors?fields=sensor_index%2Cname%2Clatitude%2Clongitude%2Caltitude%2Clocation_type%2Cconfidence%2Clast_seen&location_type=0

In [None]:
import os, time, requests, pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm

API_KEY = os.getenv("PURPLEAIR_API_KEY")
BASE_URL = "https://api.purpleair.com/v1/sensors"

# San José AOI
BBOX = {
    "nwlat": 37.45,
    "nwlng": -122.10,
    "selat": 37.15,
    "selng": -121.60
}

FIELDS = ",".join([
    "sensor_index","name","latitude","longitude","altitude",
    "last_seen","pm1.0","pm2.5","pm10.0",
    "pm1.0_atm","pm2.5_atm","pm10.0_atm",
    "temperature","humidity","pressure","voc",
    "ozone1","analog_input","rssi","confidence",
    "channel_flags","hardware","firmware_version"
])

def get_snapshot():
    headers = {"X-API-Key": API_KEY}
    params = {
        "fields": FIELDS,
        "location_type": 0,
        **BBOX
    }
    r = requests.get(BASE_URL, headers=headers, params=params, timeout=30)
    r.raise_for_status()
    data = r.json()["data"]
    keys = r.json()["fields"]
    df = pd.DataFrame(data, columns=keys)
    df["timestamp"] = datetime.utcnow()
    return df

def save_snapshot(df):
    ts = datetime.utcnow().strftime("%Y%m%d_%H%M")
    out_dir = "purpleair_snapshots"
    os.makedirs(out_dir, exist_ok=True)
    df.to_parquet(f"{out_dir}/snapshot_{ts}.parquet", index=False)
    print(f"Saved {len(df)} sensors at {ts}")

if __name__ == "__main__":
    START = datetime(2025,10,1,0,0)     # UTC
    DAYS = 10                           # or use 5
    INTERVAL_MIN = 30
    END = START + timedelta(days=DAYS)
    current = datetime.utcnow()
    # If start is in the future relative to now, you'll run live; 
    # If start is past, need history endpoint instead.
    while current < END:
        try:
            df = get_snapshot()
            save_snapshot(df)
        except Exception as e:
            print("Error:", e)
        time.sleep(INTERVAL_MIN * 60)
        current = datetime.utcnow()


SyntaxError: invalid syntax (1183916758.py, line 1)