In [None]:
import requests
import pandas as pd
from pyspark.sql.functions import *
from pyspark.sql.types import *
import time
from api_key import MY_API_KEY

# Every OpenAQ request below authenticates through the X-API-Key header.
API_KEY = MY_API_KEY
headers = {"X-API-Key": API_KEY}

print("\nReading locations...")
# Load the bronze locations dataset and pull just the id/name pairs to the
# driver; the sensor lookup loop iterates over these rows one by one.
locations_df = spark.read.format("parquet").load("Files/bronze/openaq/locations")
locations = locations_df.select(["id", "name"]).collect()
location_total = len(locations)
print(f"    Found {location_total} locations")

print("\nGetting sensors...")

# Pollutants kept for the silver layer; sensors measuring anything else are
# ignored. Built once as a set instead of a list literal per sensor.
relevant_parameters = {'pm25', 'pm10', 'no2', 'o3', 'co', 'so2'}

# One dict per relevant sensor: sensor_id, location_id, location_name, parameter.
all_sensors = []
# Locations whose sensor lookup failed (network error, bad JSON, non-200).
failed_locations = 0

for location in locations:
    location_id = location['id']
    location_name = location['name']

    sensors_url = f"https://api.openaq.org/v3/locations/{location_id}/sensors"

    try:
        response = requests.get(sensors_url, headers=headers, timeout=30)
        # Only a 200 response is treated as carrying a usable JSON body.
        data = response.json() if response.status_code == 200 else None
    except (requests.RequestException, ValueError) as e:
        # Best effort: a single unreachable location must not abort the run,
        # but the failure is reported instead of being silently dropped.
        failed_locations += 1
        print(f"    Warning: sensor lookup failed for location {location_id}: {e}")
        continue
    finally:
        # Small pause after every request attempt to stay gentle on the API.
        time.sleep(0.05)

    if data is None:
        failed_locations += 1
        print(f"    Warning: location {location_id} returned HTTP {response.status_code}")
        continue

    # `or` guards against keys that are present but explicitly null.
    for sensor in data.get('results') or []:
        sensor_id = sensor.get('id')
        parameter_obj = sensor.get('parameter') or {}
        parameter_name = parameter_obj.get('name', 'unknown')

        if parameter_name in relevant_parameters:
            all_sensors.append({
                'sensor_id': sensor_id,
                'location_id': location_id,
                'location_name': location_name,
                'parameter': parameter_name
            })

print(f"    Found {len(all_sensors)} relevant sensors")
if failed_locations:
    print(f"    Warning: {failed_locations} location(s) could not be queried")

print(f"\nFetching daily measurements...")

# The requested window (2019-01-01..2024-12-31) spans more days than fit in
# a single page of 1000 records, so each sensor is read page by page.
page_limit = 1000
# Safety valve so a misbehaving endpoint can never cause an endless loop.
max_pages = 20

# One dict per sensor-day with the daily summary statistics.
all_measurements = []
sensors_with_data = 0
sensors_processed = 0
# Sensors for which at least one page request failed.
sensors_failed = 0

for sensor in all_sensors:
    sensors_processed += 1

    if sensors_processed % 20 == 0:
        print(f"    Progress: {sensors_processed}/{len(all_sensors)}, {len(all_measurements)} measurements...")

    sensor_id = sensor['sensor_id']
    days_url = f"https://api.openaq.org/v3/sensors/{sensor_id}/days"

    sensor_has_data = False
    page = 1

    while page <= max_pages:
        params = {
            "date_from": "2019-01-01",
            "date_to": "2024-12-31",
            "limit": page_limit,
            "page": page,
        }

        try:
            response = requests.get(days_url, headers=headers, params=params, timeout=30)
            # Only a 200 response is treated as carrying a usable JSON body.
            data = response.json() if response.status_code == 200 else None
        except (requests.RequestException, ValueError) as e:
            # Best effort: keep whatever was already collected for this sensor
            # and move on, but report the failure instead of hiding it.
            sensors_failed += 1
            print(f"    Warning: request failed for sensor {sensor_id} (page {page}): {e}")
            break
        finally:
            # Pause after every request attempt to stay gentle on the API.
            time.sleep(0.1)

        if data is None:
            sensors_failed += 1
            print(f"    Warning: sensor {sensor_id} returned HTTP {response.status_code} (page {page})")
            break

        # `or` guards against keys that are present but explicitly null.
        results = data.get('results') or []

        if results:
            sensor_has_data = True

        for record in results:
            period = record.get('period') or {}
            datetime_from = period.get('datetimeFrom') or {}

            # The start of the period arrives either as an object holding a
            # 'utc' timestamp or as a plain value.
            if isinstance(datetime_from, dict):
                date_str = str(datetime_from.get('utc') or '')
            else:
                date_str = str(datetime_from)

            # Keep only the leading YYYY-MM-DD part of the timestamp.
            # NOTE(review): this is the UTC calendar date; if the API
            # aggregates by local day the date may be off by one for
            # locations far from UTC - confirm against the API docs.
            date_value = date_str[:10] if len(date_str) >= 10 else None
            summary = record.get('summary') or {}
            coverage = record.get('coverage') or {}

            all_measurements.append({
                'sensor_id': sensor_id,
                'location_id': sensor['location_id'],
                'location_name': sensor['location_name'],
                'parameter': sensor['parameter'],
                'date': date_value,
                'value_avg': summary.get('avg'),
                'value_min': summary.get('min'),
                'value_max': summary.get('max'),
                'value_sd': summary.get('sd'),
                'value_count': coverage.get('observedCount')
            })

        # A short page means the last page has been reached.
        if len(results) < page_limit:
            break
        page += 1

    if sensor_has_data:
        sensors_with_data += 1

print(f"\n    Complete! {sensors_with_data}/{len(all_sensors)} sensors, {len(all_measurements)} measurements")
if sensors_failed:
    print(f"    Warning: {sensors_failed} sensor(s) had failed requests")

# Build, validate and persist the silver table from the collected daily
# measurements; nothing is written when no measurements were fetched.
if all_measurements:
    print(f"\nCreating Silver DataFrame...")

    df_pandas = pd.DataFrame(all_measurements)
    df_spark = spark.createDataFrame(df_pandas)

    # Missing numbers can arrive from pandas as NaN rather than null
    # (depending on how the pandas->Spark conversion is configured). NaN is
    # invisible to isNull() checks and does not survive an integer cast
    # cleanly, so every numeric column is normalised to double with NaN
    # replaced by null before any further casting.
    null_double = lit(None).cast(DoubleType())
    for numeric_column in ["value_avg", "value_min", "value_max", "value_sd", "value_count"]:
        df_spark = df_spark.withColumn(
            numeric_column,
            nanvl(col(numeric_column).cast(DoubleType()), null_double)
        )

    # NOTE(review): the unit below is applied to every parameter; gaseous
    # pollutants may be reported in other units (e.g. ppm/ppb) - confirm
    # against the units the API reports for each sensor.
    df_spark = df_spark \
        .withColumn("date", to_date(col("date"))) \
        .withColumn("value_count", col("value_count").cast(IntegerType())) \
        .withColumn("year", year("date")) \
        .withColumn("month", month("date")) \
        .withColumn("day", dayofmonth("date")) \
        .withColumn("dayofweek", dayofweek("date")) \
        .withColumn("dayname", date_format("date", "EEEE")) \
        .withColumn("is_weekend", when(col("dayofweek").isin([1, 7]), True).otherwise(False)) \
        .withColumn("unit", lit("µg/m³")) \
        .withColumn("ingestion_timestamp", current_timestamp())

    # `round` here is the Spark column function brought in by the wildcard
    # import of pyspark.sql.functions, not the Python builtin.
    df_spark = df_spark \
        .withColumn("value_avg", round(col("value_avg"), 2)) \
        .withColumn("value_min", round(col("value_min"), 2)) \
        .withColumn("value_max", round(col("value_max"), 2)) \
        .withColumn("value_sd", round(col("value_sd"), 2))

    # Rows whose date could not be parsed are unusable downstream.
    df_spark = df_spark.filter(col("date").isNotNull())

    print("\n Data Quality Checks:")
    null_counts = df_spark.select([
        count(when(col(c).isNull(), c)).alias(c)
        for c in ["date", "value_avg", "location_name", "parameter"]
    ])
    null_counts.show()

    negative_count = df_spark.filter(col("value_avg") < 0).count()
    print(f"Negative values: {negative_count}")

    extreme_count = df_spark.filter(col("value_avg") > 500).count()
    print(f"Extreme values (>500): {extreme_count}")

    # Keep only plausible averages; rows with a null average are dropped as
    # well because the comparison evaluates to null. Null dates were already
    # removed above, so no second date filter is needed.
    df_spark = df_spark.filter(
        (col("value_avg") >= 0) &
        (col("value_avg") <= 500)
    )

    # Counted once and reused for both report lines below.
    clean_count = df_spark.count()
    print(f"   Records after cleaning: {clean_count:,}")

    print(f"\n    Quality Summary:")
    df_spark.groupBy("parameter").agg(
        count("*").alias("measurements"),
        countDistinct("location_name").alias("locations"),
        round(avg("value_avg"), 2).alias("avg_value"),
        round(min("value_avg"), 2).alias("min_value"),
        round(max("value_avg"), 2).alias("max_value")
    ).show()

    print(f"DataFrame: {clean_count:,} measurements")

    print(f"\n Sample:")
    df_spark.select("date", "location_name", "parameter", "value_avg").orderBy("date").show(15, truncate=False)

    print(f"\n    By parameter:")
    df_spark.groupBy("parameter").agg(
        count("*").alias("days"),
        countDistinct("location_name").alias("locations"),
        round(avg("value_avg"), 2).alias("avg"),
        min("date").alias("first_date"),
        max("date").alias("last_date")
    ).orderBy(desc("days")).show()

    print(f"\nSaving...")

    # Replaces the whole silver table on every run, partitioned for
    # time- and pollutant-based queries.
    df_spark.write.mode("overwrite").partitionBy("year", "month", "parameter").format("delta").saveAsTable("silver_openaq")

    # Session-scoped view over the same cleaned data for ad-hoc SQL.
    df_spark.createOrReplaceTempView("silver_openaq_daily")
    print(f"    View: silver_openaq_daily")
    print(" \nOPENAQ SILVER COMPLETE!")

else:
    print("\n No data")

StatementMeta(, ae59fca5-0667-4e45-b7b3-9803a0e4130f, 3, Finished, Available, Finished)


Reading locations...
    Found 59 locations

Getting sensors...
    Found 97 relevant sensors

Fetching daily measurements...
    Progress: 20/97, 984 measurements...
    Progress: 40/97, 984 measurements...
    Progress: 60/97, 984 measurements...
    Progress: 80/97, 2584 measurements...

    Complete! 23/97 sensors, 4058 measurements

Creating Silver DataFrame...

 Data Quality Checks:
+----+---------+-------------+---------+
|date|value_avg|location_name|parameter|
+----+---------+-------------+---------+
|   0|        0|            0|        0|
+----+---------+-------------+---------+

Negative values: 15
Extreme values (>500): 14
   Records after cleaning: 4,029

    Quality Summary:
+---------+------------+---------+---------+---------+---------+
|parameter|measurements|locations|avg_value|min_value|max_value|
+---------+------------+---------+---------+---------+---------+
|     pm10|        1503|        8|    12.14|     0.01|   113.78|
|     pm25|        2526|        9|     9