In [5]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
# from rasterstats import zonal_stats
import requests
from pprint import pp
from tqdm.notebook import tqdm
import requests
import time
from zkyhaxpy import io_tools, pd_tools, gis_tools
from datetime import datetime, timedelta
from configparser import ConfigParser

In [10]:
#Register for OpenAQ's API Keys at https://api.openaq.org/register
#Then place API Key in openaq.ini following this format
#[openaq]
#api_key = <OPENAQ_API_KEY>
#or save into variable openaq_api_key directly
if os.path.exists('openaq.ini'):
    openaq_config = ConfigParser()
    openaq_config.read('openaq_api_key.ini')
    openaq_api_key = openaq_config.get('openaq', 'api_key')
else:
    openaq_api_key = '<OPENAQ_API_KEY>'


NoSectionError: No section: 'openaq'

In [11]:
def reformat_dict_measurement(dict_measurement_raw):
    '''
    Reformat dict measurement into simple form that can be added into dataframe easily.
    '''
    dict_measurement = dict_measurement_raw.copy()
    dict_measurement['date_utc'] = dict_measurement['date']['utc']
    dict_measurement['lat'] = dict_measurement['coordinates']['latitude']
    dict_measurement['long'] = dict_measurement['coordinates']['longitude']
    del(dict_measurement['date'], dict_measurement['coordinates'])

    return dict_measurement

def get_df_measurements(api_key, date_from, date_to, country='TH', parameter='pm25', limit=10000, offset=0):
    '''
    Get a dataframe of measurements
    '''
    params = {
        "limit": limit,
        "offset": 0,
        "parameter": parameter,
        "date_from": date_from,
        "date_to": date_to,
        "country": country,
    }
    url = "https://api.openaq.org/v2/measurements"
    headers = {"X-API-Key": api_key}

    # Send the request and get the data
    finished = False
    while not finished:
        response = requests.get(url, headers=headers, params=params)
        data = response.json()
        if 'meta' in data.keys():
            if type(data['meta']['found'])==int:
                finished = True
            else:
                if params['limit']<50000:
                    params['limit'] = params['limit'] + limit
                else:
                    raise ValueError
        else:
            print(params)
            print(data)
            raise ValueError

    #Transform data into dataframe
    list_dict_measurements = data['results']
    list_dict_measurements = [reformat_dict_measurement(dict_measurement) for dict_measurement in list_dict_measurements]
    

    return pd.DataFrame(list_dict_measurements)





In [12]:
from datetime import datetime, timedelta

def get_first_and_last_date(year, month):
    # Get the first date of the month
    first_date = datetime(year, month, 1)

    # Compute the last date of the month
    if month == 12:
        last_date = datetime(year + 1, 1, 1) - timedelta(days=1)
    else:
        last_date = datetime(year, month + 1, 1) - timedelta(days=1)

    # Return the dates in 'yyyy-mm-dd' format
    return first_date.strftime('%Y-%m-%d'), last_date.strftime('%Y-%m-%d')



In [13]:
def get_dates_between(start_date, end_date):
    # Convert input strings to datetime objects
    start_dt = datetime.strptime(start_date, "%Y-%m-%d")
    end_dt = datetime.strptime(end_date, "%Y-%m-%d")

    # Initialize an empty list to store the result
    result_dates = []

    # Iterate through the days between start_date and end_date
    current_dt = start_dt
    while current_dt <= end_dt:
        result_dates.append(current_dt.strftime("%Y-%m-%d"))
        current_dt += timedelta(days=1)

    return result_dates



In [14]:
list_df_measurements = []
list_finished_year_month = []

In [None]:
'''
### Notes ###
We can expect a few errors to occur for this cell of code that come from the API's unresponsiveness.
To resume the download, we can simply rerun this cell again.
This is a bit uncultured way to handle the error but it is easily implemented.
So I will keep it this way.
'''
YEAR_FROM = 2019
YEAR_TO = 2024
list_year = list(range(YEAR_FROM, YEAR_TO+1))
list_month = range(1, 13)
pbar_year = tqdm(list_year)
for year in pbar_year:
    pbar_year.set_description(f'Getting data for year {year}')
    pbar_month = tqdm(list_month)
    for month in pbar_month:
        pbar_month.set_description(f'Month {month}')
        year_month = f'{year}-{month}'
        if year_month in list_finished_year_month:
            continue

        if (year == 2024) & (month > 5):
            break
        date_from, date_to = get_first_and_last_date(year, month)
        try:
            df_measurements = get_df_measurements(api_key, date_from=date_from, date_to=date_to, country='TH', parameter='pm25', limit=10000)
            list_df_measurements.append(df_measurements)
        except Exception as e:
            print(e)
            list_date = get_dates_between(date_from, date_to)
            list_df_measurements_month = []
            pbar_date = tqdm(list_date)
            for date_curr in pbar_date:
                pbar_date.set_description(f'{date_curr}')
                for _ in range(0, 10):
                    try:
                        df_measurements = get_df_measurements(api_key, date_from=date_curr, date_to=date_curr, country='TH', parameter='pm25', limit=10000)
                        list_df_measurements_month.append(df_measurements)
                    except ValueError:
                        time.sleep(15)

            list_df_measurements = list_df_measurements + list_df_measurements_month

        list_finished_year_month.append(year_month)

df_measurements = pd.concat(list_df_measurements)

df_measurements.to_csv('openaqi_thailand_pm25_data.csv', index=False)

In [None]:
list_col_check = []
for col in df_measurements.columns:

    nunique = df_measurements[col].nunique()
    missing_rate = df_measurements[col].isnull().mean()
    min_val = df_measurements[col].min()
    max_val = df_measurements[col].max()
    dict_col_check = {
        'col':col,
        'nunique':nunique,
        'missing_rate':missing_rate,
        'min_val':min_val,
        'max_val':max_val
    }

    list_col_check.append(dict_col_check)

df_col_check = pd.DataFrame(list_col_check)
df_col_check