In [None]:
import os
import requests
import pandas as pd
import time
from datetime import datetime, timedelta, timezone

# Pull secrets from a .env file into the process environment before reading them.
from dotenv import load_dotenv
load_dotenv()  # called with no arguments, as in the original setup

# The OpenAQ key must be present; stop right away with a clear message otherwise.
API_KEY = os.environ.get("OPENAQ_API_KEY")
if API_KEY is None or API_KEY == "":
    raise RuntimeError("OPENAQ_API_KEY not set in environment (.env missing or not loaded)")

# Shared by every request made further down in the notebook.
BASE_URL = "https://api.openaq.org/v3"
headers = {"X-API-Key": API_KEY}

print("Các thư viện và biến đã được khởi tạo.")

def fetch_measurements(loc_id, start, end, max_retries=3, timeout=30, retry_wait=10):
    """Fetch all measurement records for one location within [start, end].

    Pages through ``{BASE_URL}/measurements`` 1000 records at a time until an
    empty page is returned.

    Args:
        loc_id: Location id sent as the ``location_id`` query parameter.
        start: ``datetime`` sent (via ``isoformat()``) as ``datetime_from``.
        end: ``datetime`` sent (via ``isoformat()``) as ``datetime_to``.
        max_retries: Maximum number of attempts per page when the API answers
            HTTP 429. When they are exhausted the function gives up and
            returns what it has collected so far.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook.
        retry_wait: Seconds to sleep after each HTTP 429 response.

    Returns:
        list: The concatenated ``results`` entries of every successful page.
        The list is partial (possibly empty) if a 404, another non-200
        status, or a persistent rate limit stops the crawl early.
    """
    # NOTE(review): every station crawled in this notebook came back empty,
    # and a 404 is treated as "no data" below. Confirm against the OpenAQ v3
    # docs that this route and its `location_id` filter exist (v3 may expose
    # measurements per sensor instead).
    url = f"{BASE_URL}/measurements"
    results_all = []
    page = 1
    while True:
        # Let requests build the query string: isoformat() of a tz-aware
        # datetime contains "+", which must be percent-encoded or the server
        # will read it as a space.
        params = {
            "location_id": loc_id,
            "datetime_from": start.isoformat(),
            "datetime_to": end.isoformat(),
            "limit": 1000,
            "page": page,
        }
        for _attempt in range(max_retries):
            r = requests.get(url, headers=headers, params=params, timeout=timeout)
            if r.status_code == 200:
                break
            if r.status_code == 429:
                print(f"Rate limit hit, sleeping {retry_wait}s...")
                time.sleep(retry_wait)
                continue
            if r.status_code != 404:
                print(f"Error {r.status_code} for loc {loc_id}, skipping.")
            # 404 and every other error end the crawl with what we have.
            return results_all
        else:
            # Retries exhausted without a 200: stop instead of restarting the
            # same page forever.
            print(f"Rate limit persisted after {max_retries} attempts "
                  f"for loc {loc_id}, giving up.")
            return results_all

        data = r.json().get("results", [])
        if not data:
            # An empty page marks the end of the result set.
            return results_all
        results_all.extend(data)
        page += 1


# Cell 2 – Crawl danh sách trạm trong Hà Nội

In [2]:
# Bounding box sent to the /locations endpoint.
# NOTE(review): presumably "min_lon,min_lat,max_lon,max_lat" around Hanoi — confirm
# the ordering against the OpenAQ docs.
bbox = "105.7,20.9,106.0,21.2"
locations_response = requests.get(
    f"{BASE_URL}/locations?bbox={bbox}&limit=100", headers=headers, timeout=30
)
# Fail with the real HTTP error (bad key, rate limit, ...) instead of a
# confusing KeyError on "results" further down.
locations_response.raise_for_status()
resp = locations_response.json()
locations = resp["results"]
print("Tổng số trạm trong bbox:", len(locations))

# Only keep stations whose last observation is on or after 2023-01-01.
# Plain string comparison is enough as long as both sides use the same
# "YYYY-MM-DDTHH:MM:SSZ" layout.
threshold = "2023-01-01T00:00:00Z"

active_locations = []
for loc in locations:
    dt_last_obj = loc.get("datetimeLast")  # may be None or a dict
    if dt_last_obj and dt_last_obj.get("utc") and dt_last_obj["utc"] >= threshold:
        active_locations.append(loc)

print("Số trạm có dữ liệu từ 2023 đến nay:", len(active_locations))
for loc in active_locations:
    # `or {}` covers both a missing key and an explicit None value.
    dt_first = (loc.get("datetimeFirst") or {}).get("utc")
    dt_last = (loc.get("datetimeLast") or {}).get("utc")
    print(f"ID: {loc['id']:<8} | Name: {loc['name']:<40} | First: {dt_first} | Last: {dt_last}")


Tổng số trạm trong bbox: 33
Số trạm có dữ liệu từ 2023 đến nay: 29
ID: 7441     | Name: Hanoi                                    | First: 2016-11-09T18:00:00Z | Last: 2025-04-09T15:00:00Z
ID: 2161290  | Name: An Khánh                                 | First: 2024-01-29T06:00:00Z | Last: 2025-06-10T03:00:00Z
ID: 2161291  | Name: Cầu Diễn                                 | First: 2024-01-22T01:00:00Z | Last: 2024-12-11T14:00:00Z
ID: 2161292  | Name: Số 46, phố Lưu Quang Vũ                  | First: 2024-01-29T16:00:00Z | Last: 2025-10-02T02:00:00Z
ID: 2161293  | Name: Chúc Sơn                                 | First: 2024-01-09T21:00:00Z | Last: 2025-02-05T08:00:00Z
ID: 2161294  | Name: Cung thiếu nhi                           | First: 2024-01-29T06:00:00Z | Last: 2025-02-05T03:00:00Z
ID: 2161295  | Name: Đầm Trấu                                 | First: 2024-01-29T06:00:00Z | Last: 2024-01-30T08:00:00Z
ID: 2161296  | Name: Đào Duy Từ                               | First: 2024-01-15T07:0

# Cell 3a – Trạm lâu năm (ví dụ ID 7441: “Hanoi”)

In [3]:
# Crawl data for the long-running station (Hanoi - ID 7441)

loc_id = 7441
loc_name = "Hanoi_long_term"

# Fetch the station metadata; surface HTTP errors instead of failing later
# on a missing "results" key.
loc_response = requests.get(f"{BASE_URL}/locations/{loc_id}", headers=headers, timeout=30)
loc_response.raise_for_status()
loc_info = loc_response.json()["results"][0]
# `or {}` is needed because the field can be present with a None value, in
# which case .get(key, {}) would return None and the chained .get would crash.
dt_first = (loc_info.get("datetimeFirst") or {}).get("utc")
dt_last  = (loc_info.get("datetimeLast") or {}).get("utc")

if dt_first and dt_last:
    # Never start earlier than 2023-01-01, even if the station is older.
    start_date = max(datetime(2023, 1, 1, tzinfo=timezone.utc),
                     datetime.fromisoformat(dt_first.replace("Z", "+00:00")))
    end_date   = datetime.fromisoformat(dt_last.replace("Z", "+00:00"))

    print(f"{loc_name} data range: {start_date} → {end_date}")

    # Probe the first week (at most) to see whether the API returns anything.
    test = fetch_measurements(loc_id, start_date, min(end_date, start_date + timedelta(days=7)))
    if len(test) == 0:
        print(f"⚠️ Trạm {loc_id} ({loc_name}) không có measurements qua API. "
              f"Cần dùng bulk dataset (AWS S3).")
    else:
        # Crawl in roughly month-sized windows.
        data = []
        cur = start_date
        while cur < end_date:
            # Jump 32 days ahead and snap to day 1 of that month, clamped to end_date.
            next_month = (cur + timedelta(days=32)).replace(day=1)
            if next_month > end_date:
                next_month = end_date
            chunk = fetch_measurements(loc_id, cur, next_month)
            data.extend(chunk)
            cur = next_month
            time.sleep(0.5)  # short pause between windows

        df = pd.DataFrame(data)
        print(f"{loc_name}: {len(df)} rows")
        df.to_csv(f"{loc_name}_{loc_id}.csv", index=False)
        display(df.head())
else:
    print(f"❌ Không có datetimeFirst/datetimeLast cho trạm {loc_id}")


Hanoi_long_term data range: 2023-01-01 00:00:00+00:00 → 2025-04-09 15:00:00+00:00
⚠️ Trạm 7441 (Hanoi_long_term) không có measurements qua API. Cần dùng bulk dataset (AWS S3).


In [4]:
# Crawl data for the newer station (Pho Luu Quang Vu - ID 2161292)

loc_id = 2161292
loc_name = "Pho_Luu_Quang_Vu"

# Fetch the station metadata; surface HTTP errors instead of failing later
# on a missing "results" key.
loc_response = requests.get(f"{BASE_URL}/locations/{loc_id}", headers=headers, timeout=30)
loc_response.raise_for_status()
loc_info = loc_response.json()["results"][0]
# `or {}` is needed because the field can be present with a None value, in
# which case .get(key, {}) would return None and the chained .get would crash.
dt_first = (loc_info.get("datetimeFirst") or {}).get("utc")
dt_last  = (loc_info.get("datetimeLast") or {}).get("utc")

if dt_first and dt_last:
    start_date = datetime.fromisoformat(dt_first.replace("Z", "+00:00"))
    end_date   = datetime.fromisoformat(dt_last.replace("Z", "+00:00"))

    print(f"{loc_name} data range: {start_date} → {end_date}")

    # Probe the first week (at most) to see whether the API returns anything.
    test = fetch_measurements(loc_id, start_date, min(end_date, start_date + timedelta(days=7)))
    if len(test) == 0:
        print(f"⚠️ Trạm {loc_id} ({loc_name}) không có measurements qua API. "
              f"Cần dùng bulk dataset (AWS S3).")
    else:
        # Crawl in roughly month-sized windows.
        data = []
        cur = start_date
        while cur < end_date:
            # Jump 32 days ahead and snap to day 1 of that month, clamped to end_date.
            next_month = (cur + timedelta(days=32)).replace(day=1)
            if next_month > end_date:
                next_month = end_date
            chunk = fetch_measurements(loc_id, cur, next_month)
            data.extend(chunk)
            cur = next_month
            time.sleep(0.5)  # short pause between windows

        df = pd.DataFrame(data)
        print(f"{loc_name}: {len(df)} rows")
        df.to_csv(f"{loc_name}_{loc_id}.csv", index=False)
        display(df.head())
else:
    print(f"❌ Không có datetimeFirst/datetimeLast cho trạm {loc_id}")


Pho_Luu_Quang_Vu data range: 2024-01-29 16:00:00+00:00 → 2025-10-02 02:00:00+00:00
⚠️ Trạm 2161292 (Pho_Luu_Quang_Vu) không có measurements qua API. Cần dùng bulk dataset (AWS S3).


In [5]:
# Crawl data for the very recent station (Nguyen Van Cu - ID 4946811)

loc_id = 4946811
loc_name = "Nguyen_Van_Cu"

# Fetch the station metadata; surface HTTP errors instead of failing later
# on a missing "results" key.
loc_response = requests.get(f"{BASE_URL}/locations/{loc_id}", headers=headers, timeout=30)
loc_response.raise_for_status()
loc_info = loc_response.json()["results"][0]
# `or {}` is needed because the field can be present with a None value, in
# which case .get(key, {}) would return None and the chained .get would crash.
dt_first = (loc_info.get("datetimeFirst") or {}).get("utc")
dt_last  = (loc_info.get("datetimeLast") or {}).get("utc")

if dt_first and dt_last:
    start_date = datetime.fromisoformat(dt_first.replace("Z", "+00:00"))
    end_date   = datetime.fromisoformat(dt_last.replace("Z", "+00:00"))

    print(f"{loc_name} data range: {start_date} → {end_date}")

    # Probe the first week (at most) to see whether the API returns anything.
    test = fetch_measurements(loc_id, start_date, min(end_date, start_date + timedelta(days=7)))
    if len(test) == 0:
        print(f"⚠️ Trạm {loc_id} ({loc_name}) không có measurements qua API. "
              f"Cần dùng bulk dataset (AWS S3).")
    else:
        # Crawl in roughly month-sized windows.
        data = []
        cur = start_date
        while cur < end_date:
            # Jump 32 days ahead and snap to day 1 of that month, clamped to end_date.
            next_month = (cur + timedelta(days=32)).replace(day=1)
            if next_month > end_date:
                next_month = end_date
            chunk = fetch_measurements(loc_id, cur, next_month)
            data.extend(chunk)
            cur = next_month
            time.sleep(0.5)  # short pause between windows

        df = pd.DataFrame(data)
        print(f"{loc_name}: {len(df)} rows")
        df.to_csv(f"{loc_name}_{loc_id}.csv", index=False)
        display(df.head())
else:
    print(f"❌ Không có datetimeFirst/datetimeLast cho trạm {loc_id}")


Nguyen_Van_Cu data range: 2025-07-03 15:40:00+00:00 → 2025-10-02 02:45:00+00:00
⚠️ Trạm 4946811 (Nguyen_Van_Cu) không có measurements qua API. Cần dùng bulk dataset (AWS S3).


# Cell 3b – Trạm mới (2024 → nay, ví dụ ID 2161292: “Số 46, phố Lưu Quang Vũ”)

In [None]:
# Crawl data for the newer station (Pho Luu Quang Vu - ID 2161292)

loc_id = 2161292
loc_name = "Pho_Luu_Quang_Vu"

# Fetch the station metadata; surface HTTP errors instead of failing later
# on a missing "results" key.
loc_response = requests.get(f"{BASE_URL}/locations/{loc_id}", headers=headers, timeout=30)
loc_response.raise_for_status()
loc_info = loc_response.json()["results"][0]
dt_first = loc_info["datetimeFirst"]["utc"] if loc_info.get("datetimeFirst") else None
dt_last  = loc_info["datetimeLast"]["utc"] if loc_info.get("datetimeLast") else None

# Either bound can be None (see the fallbacks above); parsing it blindly
# would raise AttributeError on None.replace, so guard before using them.
if dt_first and dt_last:
    start_date = datetime.fromisoformat(dt_first.replace("Z", "+00:00"))
    end_date   = datetime.fromisoformat(dt_last.replace("Z", "+00:00"))

    print(f"{loc_name} data range: {start_date} → {end_date}")

    # Crawl in roughly month-sized windows.
    data = []
    cur = start_date
    while cur < end_date:
        # Jump 32 days ahead and snap to day 1 of that month, clamped to end_date.
        next_month = (cur + timedelta(days=32)).replace(day=1)
        if next_month > end_date:
            next_month = end_date
        chunk = fetch_measurements(loc_id, cur, next_month)
        data.extend(chunk)
        cur = next_month
        time.sleep(0.5)  # short pause between windows

    df = pd.DataFrame(data)
    print(f"{loc_name}: {len(df)} rows")
    df.to_csv(f"{loc_name}_{loc_id}.csv", index=False)
    # Inside a branch a bare expression is not rendered, so display explicitly.
    display(df.head())
else:
    print(f"❌ Không có datetimeFirst/datetimeLast cho trạm {loc_id}")


# Cell 3c – Trạm rất mới (2025 → nay, ví dụ ID 4946811: “556 Nguyễn Văn Cừ”)

In [None]:
# Crawl data for the very recent station (Nguyen Van Cu - ID 4946811)

loc_id = 4946811
loc_name = "Nguyen_Van_Cu"

# Fetch the station metadata; surface HTTP errors instead of failing later
# on a missing "results" key.
loc_response = requests.get(f"{BASE_URL}/locations/{loc_id}", headers=headers, timeout=30)
loc_response.raise_for_status()
loc_info = loc_response.json()["results"][0]
dt_first = loc_info["datetimeFirst"]["utc"] if loc_info.get("datetimeFirst") else None
dt_last  = loc_info["datetimeLast"]["utc"] if loc_info.get("datetimeLast") else None

# Either bound can be None (see the fallbacks above); parsing it blindly
# would raise AttributeError on None.replace, so guard before using them.
if dt_first and dt_last:
    start_date = datetime.fromisoformat(dt_first.replace("Z", "+00:00"))
    end_date   = datetime.fromisoformat(dt_last.replace("Z", "+00:00"))

    print(f"{loc_name} data range: {start_date} → {end_date}")

    # Crawl in roughly month-sized windows.
    data = []
    cur = start_date
    while cur < end_date:
        # Jump 32 days ahead and snap to day 1 of that month, clamped to end_date.
        next_month = (cur + timedelta(days=32)).replace(day=1)
        if next_month > end_date:
            next_month = end_date
        chunk = fetch_measurements(loc_id, cur, next_month)
        data.extend(chunk)
        cur = next_month
        time.sleep(0.5)  # short pause between windows

    df = pd.DataFrame(data)
    print(f"{loc_name}: {len(df)} rows")
    df.to_csv(f"{loc_name}_{loc_id}.csv", index=False)
    # Inside a branch a bare expression is not rendered, so display explicitly.
    display(df.head())
else:
    print(f"❌ Không có datetimeFirst/datetimeLast cho trạm {loc_id}")
