In [None]:
import requests
import pandas as pd
from pyspark.sql.functions import current_timestamp, lit, col
from pyspark.sql.types import DoubleType
from datetime import datetime
from api_key import MY_API_KEY

# API credential, imported from the local `api_key` module.
API_KEY = MY_API_KEY

# OpenAQ v3 "locations" endpoint queried by the pagination loop below.
BASE_URL = "https://api.openaq.org/v3/locations"

# Query-string parameters sent with every request. "page" is overwritten by
# the pagination loop before each call; "limit" doubles as the page size used
# to detect the last page.
params = dict(
    limit=1000,
    page=1,
    # Search centre (NYC, per the banner message) — presumably "lat,lon";
    # confirm the expected order against the API docs.
    coordinates="40.7128,-74.0060",
    # Search radius around the centre — presumably metres; confirm.
    radius="25000",
)

# HTTP headers sent with every request; the key travels in "X-API-Key".
headers = {"X-API-Key": API_KEY, "Content-Type": "application/json"}

print("OpenAQ NYC Air Quality Locations Ingestion")

# Imported once here (rather than inside the loop) and kept local so this
# section does not rely on a top-of-file `import time`.
import time

# Accumulates the raw location dicts from every page fetched so far.
all_locations = []
page = 1
max_pages = 10          # hard cap on the number of pages requested per run
max_retries = 3         # retries allowed for one page (HTTP 429 or timeout)
retry_wait_seconds = 5  # pause before retrying after an HTTP 429
retries = 0             # retries used so far for the current page

# Page through the endpoint until an empty/short page, an error, or the cap.
# NOTE: on an error `break`, pages fetched earlier stay in `all_locations`,
# so later steps may operate on a partial result.
while page <= max_pages:
    print(f"\nFetching page {page}...")
    params["page"] = page

    try:
        response = requests.get(BASE_URL, headers=headers, params=params, timeout=30)

        if response.status_code == 401:
            print("Authentication failed! Check your API key.")
            break
        elif response.status_code == 429:
            # Bounded retry: without the counter a persistent rate limit
            # would keep this loop spinning forever on the same page.
            retries += 1
            if retries > max_retries:
                print(f"Rate limit still exceeded after {max_retries} retries. Giving up.")
                break
            print("Rate limit exceeded. Waiting...")
            time.sleep(retry_wait_seconds)
            continue
        elif response.status_code != 200:
            print(f"Error {response.status_code}: {response.text}")
            break

        data = response.json()
        results = data.get('results', [])

        if not results:
            print(f"No more data. Stopping at page {page}.")
            break

        all_locations.extend(results)
        found_total = data.get('meta', {}).get('found', 'unknown')
        print(f"Fetched {len(results)} locations")
        print(f"Total so far: {len(all_locations)}")
        print(f"API reports {found_total} total locations in radius")

        # A page shorter than the requested page size is the last one.
        if len(results) < params["limit"]:
            print(f"Reached last page (page {page})")
            break

        page += 1
        retries = 0  # the retry budget is per page

    except requests.exceptions.Timeout:
        # Same bounded-retry policy as for HTTP 429.
        retries += 1
        if retries > max_retries:
            print(f"Timeout on page {page} after {max_retries} retries. Giving up.")
            break
        print(f"Timeout on page {page}. Retrying...")
        continue
    except Exception as e:
        # Any other failure (connection error, invalid JSON, ...) stops the run.
        print(f"Error: {str(e)}")
        break
else:
    # Runs only when the loop ended because `page` exceeded `max_pages`
    # (no `break`), i.e. every requested page came back full.
    print(f"Stopped at the max_pages cap ({max_pages}); more locations may exist.")

# Convert the fetched locations to a Spark DataFrame, write them to the bronze
# path as parquet, print a short summary, and register a temp view.
if all_locations:
    print(f"Successfully fetched {len(all_locations)} locations!")

    # Flatten nested JSON objects into dotted column names.
    df_pandas = pd.json_normalize(all_locations)

    print(f"\nColumns found: {len(df_pandas.columns)}")
    print(f"   Preview: {', '.join(df_pandas.columns[:10])}...")

    # `spark` is the session provided by the notebook runtime.
    df_spark = spark.createDataFrame(df_pandas)

    # Replace the dots json_normalize introduced so columns can be referenced
    # by plain name (e.g. "country_name" below).
    for col_name in df_spark.columns:
        if "." in col_name:
            df_spark = df_spark.withColumnRenamed(col_name, col_name.replace(".", "_"))

    if 'distance' in df_spark.columns:
        try:
            df_spark = df_spark.withColumn("distance", col("distance").cast(DoubleType()))
        except Exception:
            # Best-effort: drop the column if it cannot be cast. `Exception`
            # (not a bare `except`) so KeyboardInterrupt/SystemExit propagate.
            df_spark = df_spark.drop("distance")
            print("Dropped 'distance' column (unsupported type)")

    # Ingestion metadata. The date is a string literal computed once in Python
    # from the local clock of the process running this cell.
    df_spark = df_spark \
        .withColumn("ingestion_timestamp", current_timestamp()) \
        .withColumn("ingestion_date", lit(datetime.now().strftime("%Y-%m-%d")))

    print("\nPreview of data:")
    df_spark.select("id", "name", "country_name", "locality").show(10, truncate=False)

    # Nested/complex columns that are removed before writing, when present.
    columns_to_drop = [
        name
        for name in ("licenses", "sensors", "instruments", "bounds")
        if name in df_spark.columns
    ]

    if columns_to_drop:
        df_spark_clean = df_spark.drop(*columns_to_drop)
        print(f"\nDropped complex columns: {', '.join(columns_to_drop)}")
    else:
        df_spark_clean = df_spark

    output_path = "Files/bronze/openaq/locations"
    print(f"\nSaving to: {output_path}")

    # "overwrite" replaces whatever was previously stored at output_path.
    df_spark_clean.write \
        .mode("overwrite") \
        .format("parquet") \
        .save(output_path)

    print("Data saved successfully!")
    print(f"Total records: {df_spark_clean.count()}")
    print(f"Location: {output_path}")

    print("\nSummary:")
    print(f"Unique location IDs: {df_spark_clean.select('id').distinct().count()}")
    print(f"Countries: {df_spark_clean.select('country_name').distinct().count()}")
    print(f"Cities/Localities: {df_spark_clean.select('locality').distinct().count()}")

    print("\nLocations by country:")
    df_spark_clean.groupBy("country_name").count().orderBy(col("count").desc()).show(10)

    # Expose the cleaned data to SQL for the rest of the Spark session.
    df_spark_clean.createOrReplaceTempView("bronze_openaq_locations")
    print("\nCreated temp view: bronze_openaq_locations")

else:
    print("\nNo data was fetched. Check your API key and connection.")

print("Ingestion complete!")

StatementMeta(, 755d2bf2-fb50-46eb-8654-4194d29d6de8, 3, Finished, Available, Finished)

OpenAQ NYC Air Quality Locations Ingestion

Fetching page 1...
Fetched 59 locations
Total so far: 59
API reports 59 total locations in radius
Reached last page (page 1)
Successfully fetched 59 locations!

Columns found: 24
   Preview: id, name, locality, timezone, isMobile, isMonitor, instruments, sensors, licenses, bounds...

Preview of data:
+---+---------------+-------------+----------------------------------------+
|id |name           |country_name |locality                                |
+---+---------------+-------------+----------------------------------------+
|384|CCNY           |United States|New York-Northern New Jersey-Long Island|
|386|Susan Wagner   |United States|NULL                                    |
|625|Manhattan/IS143|United States|New York-Northern New Jersey-Long Island|
|626|Bronx - IS52   |United States|New York-Northern New Jersey-Long Island|
|628|Maspeth        |United States|New York-Northern New Jersey-Long Island|
|631|Queens         |United States|New