In [1]:
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import openaq
import warnings
import numpy as np
import statsmodels.api as sm
import sklearn
sklearn.__version__

warnings.simplefilter('ignore')

%matplotlib inline

# Set major seaborn asthetics
sns.set("notebook", style='ticks', font_scale=1.0)

# Increase the quality of inline plots
mpl.rcParams['figure.dpi']= 500

print ("pandas v{}".format(pd.__version__))
print ("matplotlib v{}".format(mpl.__version__))
print ("seaborn v{}".format(sns.__version__))
print ("openaq v{}".format(openaq.__version__))

pandas v1.4.4
matplotlib v3.5.2
seaborn v0.12.2
openaq v1.1.0


***FIRST REQUEST***

In [2]:
# Create the OpenAQ client object; every request below goes through `api`.
api = openaq.OpenAQ()

***CITIES***

El endpoint API de cities lista las ciudades disponibles dentro de la plataforma. Los resultados pueden ser subseleccionados por país y paginados para recuperar todos los resultados en la base de datos. Comencemos realizando una consulta básica con un límite aumentado (para que podamos obtener todas las ciudades) y devolverlo como un DataFrame:

In [3]:
# Request the cities listing as a DataFrame (df=True). The limit is raised to
# 10000 so one call can return every city; the recorded run returned 3276 rows.
resp = api.cities(df=True, limit=10000)

***DATA CRAWLING API CITIES***

In [4]:
# Inspect the cities frame: row count, column dtypes and non-null counts.
resp.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   country    3276 non-null   object
 1   name       3276 non-null   object
 2   city       3276 non-null   object
 3   count      3276 non-null   int64 
 4   locations  3276 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 128.1+ KB


In [5]:
# Drop every row that has a missing value in any column and rebind `resp`.
# In the recorded run all 5 columns were already fully populated (3276 non-null
# each), so no rows are removed for that snapshot.
resp=resp.dropna()

In [6]:
# Per-column flag: True if the column still holds at least one missing value.
resp.isna().any()

country      False
name         False
city         False
count        False
locations    False
dtype: bool

In [7]:
# Per-column count of missing values remaining after the dropna() step.
resp.isna().sum()

country      0
name         0
city         0
count        0
locations    0
dtype: int64

In [8]:
# Plain-text preview of the first 10 rows of the cities frame.
print (resp.head(10))

  country       name       city  count  locations
0      US        007        007  55743          7
1      US        019        019   2112          1
2      US        037        037  48525         23
3      US        039        039   1140          1
4      US        047        047  47331          4
5      US        051        051    990          7
6      US        077        077   9982          1
7      CH     Aargau     Aargau  12162          1
8      MX    Abasolo    Abasolo   1095          1
9      US  ABBEVILLE  ABBEVILLE   4619          1


In [9]:
# Keep only the rows whose `country` column equals 'IN' and print them as
# plain text (200 rows in the recorded run; pandas elides the middle rows).
print (resp.query("country == 'IN'"))

     country           name           city    count  locations
28        IN       Agartala       Agartala   212295          4
29        IN           Agra           Agra  1290629         10
32        IN      Ahmedabad      Ahmedabad  1516674         18
41        IN         Aizawl         Aizawl   239586          2
42        IN          Ajmer          Ajmer   831131          4
...      ...            ...            ...      ...        ...
2656      IN  Visakhapatnam  Visakhapatnam  1133283          9
2667      IN      Vrindavan      Vrindavan   118368          3
2749      IN         Yadgir         Yadgir   329778          2
2756      IN    Yamunanagar    Yamunanagar       18          1
2757      IN   Yamuna Nagar   Yamuna Nagar   536475          1

[200 rows x 5 columns]


***COUNTRIES***

Similar al endpoint de ciudades, el endpoint de países lista los países disponibles. Los únicos parámetros con los que contamos son el límite y el número de página. Si queremos obtenerlos todos, podemos simplemente aumentar el límite al máximo (10000).

In [10]:
# Fetch the countries listing as a DataFrame, with the limit raised to 10000
# so that a single request is expected to return every country.
res = api.countries(df=True, limit=10000)

# Plain-text preview of the first five rows.
print(res.head(n=5))

  code     count  locations  cities                  name
0   AD    726810          3       2               Andorra
1   AE  23351954         32       9  United Arab Emirates
2   AF   3148590          3       2           Afghanistan
3   AG     94563          1       1                    AG
4   AJ     11668         10       1                    AJ


***DATA CRAWLING API COUNTRIES***

In [11]:
# Inspect the countries frame: row count, column dtypes and non-null counts.
res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   code       159 non-null    object
 1   count      159 non-null    int64 
 2   locations  159 non-null    int64 
 3   cities     159 non-null    int64 
 4   name       159 non-null    object
dtypes: int64(3), object(2)
memory usage: 6.3+ KB


In [12]:
# Drop every row that has a missing value in any column and rebind `res`.
# In the recorded run all 159 rows were complete, so nothing is removed for
# that snapshot.
res=res.dropna()

In [13]:
# Per-column flag: True if the column still holds at least one missing value.
res.isna().any()

code         False
count        False
locations    False
cities       False
name         False
dtype: bool

In [14]:
# Per-column count of missing values remaining after the dropna() step.
res.isna().sum()

code         0
count        0
locations    0
cities       0
name         0
dtype: int64

***PARAMETERS***

El endpoint de parámetros proporcionará un listado de todos los parámetros disponibles:





In [15]:
# Retrieve the catalogue of measurable parameters as a DataFrame.
# Note that `res` is rebound here and no longer refers to the countries frame.
res = api.parameters(df=True)

# Dump the whole table as plain text (23 rows in the recorded run).
print(res)

       id   name                                        description  \
0       1   pm10  Particulate matter less than 10 micrometers in...   
1       2   pm25  Particulate matter less than 2.5 micrometers i...   
2       3     o3                           Ozone mass concentration   
3       4     co                 Carbon Monoxide mass concentration   
4       5    no2                Nitrogen Dioxide mass concentration   
5       6    so2                  Sulfur Dioxide mass concentration   
6       7    no2                     Nitrogen Dioxide concentration   
7       8     co                      Carbon Monoxide concentration   
8       9    so2                       Sulfur Dioxide concentration   
9      10     o3                                Ozone concentration   
10     11     bc                    Black Carbon mass concentration   
11     19    pm1  Particulate matter less than 1 micrometer in d...   
12     21    co2                       Carbon Dioxide concentration   
13    

***DATA CRAWLING API PARAMETERS***

In [16]:
# Inspect the parameters frame: row count, column dtypes and non-null counts.
res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             23 non-null     int64 
 1   name           23 non-null     object
 2   description    23 non-null     object
 3   preferredUnit  23 non-null     object
dtypes: int64(1), object(3)
memory usage: 864.0+ bytes


In [17]:
# Drop every row that has a missing value in any column and rebind `res`.
# In the recorded run all 23 rows were complete, so nothing is removed for
# that snapshot.
res=res.dropna()

In [18]:
# Per-column flag: True if the column still holds at least one missing value.
res.isna().any()

id               False
name             False
description      False
preferredUnit    False
dtype: bool

In [19]:
# Per-column count of missing values remaining after the dropna() step.
res.isna().sum()

id               0
name             0
description      0
preferredUnit    0
dtype: int64

***SOURCES***

El endpoint de sources proporcionará una lista de las fuentes de donde provino la información cruda.

In [20]:
# Retrieve the list of upstream data sources as a DataFrame.
# Note that `res` is rebound here and no longer refers to the parameters frame.
res = api.sources(df=True)

# Print the whole sources table as plain text (not only the first entry);
# pandas wraps the wide frame and elides the middle rows.
print(res)


                                                  url      adapter  \
0   http://agaar.mn/aqdata/stationlist?period=h&la...     agaar_mn   
1                        http://files.airnowtech.org/  airnow-http   
2                                   ftp.airnowapi.org   airnow-ftp   
3   http://www.dpccairdata.com/dpccairdata/display...        india   
4   http://www.juntadeandalucia.es/medioambiente/s...    andalucia   
..                                                ...          ...   
95  http://dosairnowdata.org/dos/RSS/Accra/Accra-P...     stateair   
96  http://dosairnowdata.org/dos/RSS/AddisAbabaCen...     stateair   
97  http://dosairnowdata.org/dos/RSS/AddisAbabaSch...     stateair   
98  http://dosairnowdata.org/dos/RSS/Algiers/Algie...     stateair   
99  http://dosairnowdata.org/dos/RSS/Amman/Amman-P...     stateair   

                            name         city country  \
0                       Agaar.mn  Ulaanbaatar      MN   
1                         AirNow             

***DATA CRAWLING SOURCES***

In [21]:
# Inspect the sources frame: row count, column dtypes and non-null counts.
# In the recorded run `city`, `description` and `resolution` contain nulls.
res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   url          100 non-null    object
 1   adapter      100 non-null    object
 2   name         100 non-null    object
 3   city         89 non-null     object
 4   country      100 non-null    object
 5   description  99 non-null     object
 6   sourceURL    100 non-null    object
 7   resolution   37 non-null     object
 8   contacts     100 non-null    object
 9   active       100 non-null    bool  
dtypes: bool(1), object(9)
memory usage: 7.3+ KB


In [22]:
# Drop every row that has a missing value in ANY column and rebind `res`.
# NOTE(review): in the recorded run `resolution` is non-null in only 37 of the
# 100 rows, so this blanket dropna() discards at least 63 sources because of
# one optional-looking metadata column. Consider dropna(subset=[...]) limited
# to the columns the analysis really requires — confirm which ones those are.
res=res.dropna()

In [23]:
# Per-column flag: True if the column still holds at least one missing value.
res.isna().any()

url            False
adapter        False
name           False
city           False
country        False
description    False
sourceURL      False
resolution     False
contacts       False
active         False
dtype: bool

In [24]:
# Per-column count of missing values remaining after the dropna() step.
res.isna().sum()

url            0
adapter        0
name           0
city           0
country        0
description    0
sourceURL      0
resolution     0
contacts       0
active         0
dtype: int64

***LATEST***

Obtiene los datos más recientes de una o varias ubicaciones.

En este caso respondemos a la pregunta: ¿cuál fue el valor más reciente de SO2 en Hilo? El resultado se muestra en la tabla siguiente.

In [25]:
# Ask for the most recent readings for the city 'Hilo', requesting the 'so2'
# parameter, and get the answer back as a DataFrame.
# NOTE(review): the recorded output also contains pm25 rows, so the
# `parameter` argument does not appear to restrict the returned rows — confirm.
# NOTE(review): the pm25 `unit` values are shown as a bytes repr
# (b'\xc2\xb5g/m\xc2\xb3') instead of readable text — looks like an encoding
# issue upstream; verify before using `unit` in labels.
res = api.latest(df=True, parameter='so2', city='Hilo')

# Show the frame with the notebook's rich display.
res

Unnamed: 0_level_0,parameter,value,unit,sourceName,country,city,location,averagingPeriod.value,averagingPeriod.unit
lastUpdated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-03-26 04:00:00+00:00,so2,0.0,ppm,AirNow,US,Hilo,Hawaii Volcanoes NP,3600,seconds
2023-03-26 04:00:00+00:00,pm25,8.0,b'\xc2\xb5g/m\xc2\xb3',AirNow,US,Hilo,Pahala,3600,seconds
2023-03-26 04:00:00+00:00,so2,0.0,ppm,AirNow,US,Hilo,Pahala,3600,seconds
2023-03-26 04:00:00+00:00,so2,0.0,ppm,AirNow,US,Hilo,Kona,3600,seconds
2023-03-26 04:00:00+00:00,pm25,4.0,b'\xc2\xb5g/m\xc2\xb3',AirNow,US,Hilo,Kona,3600,seconds
2023-03-26 04:00:00+00:00,pm25,0.0,b'\xc2\xb5g/m\xc2\xb3',AirNow,US,Hilo,Hilo,3600,seconds
2023-03-26 04:00:00+00:00,so2,0.0,ppm,AirNow,US,Hilo,Hilo,3600,seconds
2023-03-26 04:00:00+00:00,so2,0.001,ppm,AirNow,US,Hilo,Ocean View,3600,seconds
2023-03-26 04:00:00+00:00,pm25,2.0,b'\xc2\xb5g/m\xc2\xb3',AirNow,US,Hilo,Ocean View,3600,seconds
2018-07-02 20:00:00+00:00,so2,0.178,ppm,AirNow,US,Hilo,Hawaii Volcanoes NP,3600,seconds


***DATA CRAWLING LATEST***

In [26]:
# Inspect the latest-readings frame: index range, dtypes and non-null counts.
res.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 12 entries, 2023-03-26 04:00:00+00:00 to 2017-01-26 17:00:00+00:00
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   parameter              12 non-null     object 
 1   value                  12 non-null     float64
 2   unit                   12 non-null     object 
 3   sourceName             12 non-null     object 
 4   country                12 non-null     object 
 5   city                   12 non-null     object 
 6   location               12 non-null     object 
 7   averagingPeriod.value  12 non-null     int64  
 8   averagingPeriod.unit   12 non-null     object 
dtypes: float64(1), int64(1), object(7)
memory usage: 960.0+ bytes


In [27]:
# Drop every row that has a missing value in any column and rebind `res`.
# In the recorded run all 12 rows were complete, so nothing is removed for
# that snapshot.
res=res.dropna()

In [28]:
# Per-column flag: True if the column still holds at least one missing value.
res.isna().any()

parameter                False
value                    False
unit                     False
sourceName               False
country                  False
city                     False
location                 False
averagingPeriod.value    False
averagingPeriod.unit     False
dtype: bool

In [29]:
# Per-column count of missing values remaining after the dropna() step.
res.isna().sum()

parameter                0
value                    0
unit                     0
sourceName               0
country                  0
city                     0
location                 0
averagingPeriod.value    0
averagingPeriod.unit     0
dtype: int64

***MEASUREMENTS***

Finalmente, ¡el endpoint que todos hemos estado esperando! Measurements te permite obtener ¡todos los datos! Puedes consultar sobre una gran cantidad de parámetros que se encuentran listados en la documentación de la API. Vamos a sumergirnos:

Obtengamos los últimos 10000 puntos de datos para PM2.5 en Delhi:



In [30]:
# Fetch PM2.5 measurements for Delhi as a DataFrame, asking for up to 10000
# records.
res = api.measurements(df=True, limit=10000, parameter='pm25', city='Delhi')

# Summary statistics of `value` on a per-location basis.
# NOTE(review): the recorded summary shows negative minima (e.g. -212.75) and
# a maximum of exactly 1000.0, which look like invalid or sentinel readings for
# a concentration — consider filtering them before interpreting the means.
res.groupby('location')['value'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"Alipur, Delhi - DPCC",233.0,57.107296,40.764268,0.0,29.0,53.0,83.0,290.0
"Anand Vihar, Delhi - DPCC",1.0,283.0,,283.0,283.0,283.0,283.0,283.0
"Ashok Vihar, Delhi - DPCC",365.0,54.131507,36.80479,0.0,25.0,48.0,77.0,277.0
"Aya Nagar, Delhi - IMD",160.0,37.006188,40.657587,-212.75,14.99,32.395,58.0925,212.54
"Bawana, Delhi - DPCC",286.0,53.877622,39.713832,2.0,23.0,48.0,77.0,296.0
"CRRI Mathura Road, Delhi - IMD",122.0,50.537623,35.674027,0.0,23.7575,48.37,73.93,224.72
"DTU, Delhi - CPCB",425.0,42.849412,113.939143,6.0,18.0,26.0,35.0,1000.0
"Dr. Karni Singh Shooting Range, Delhi - DPCC",2.0,154.5,127.986327,64.0,109.25,154.5,199.75,245.0
"Dwarka-Sector 8, Delhi - DPCC",382.0,53.188482,34.259333,1.0,27.0,47.0,78.0,246.0
"IGI Airport (T3), Delhi - IMD",244.0,48.230738,33.170737,-138.33,25.3825,46.365,71.375,126.87


In [31]:
# Reuse the `res` frame fetched by the previous cell rather than sending the
# identical 10000-record measurements request a second time; this also
# guarantees that the rows shown here are the ones summarised above.

# Display the raw measurement rows (pandas elides the middle of a long frame).
res

Unnamed: 0_level_0,location,parameter,value,unit,country,city,date.utc,coordinates.latitude,coordinates.longitude
date.local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-03-26 11:00:00,US Diplomatic Post: New Delhi,pm25,33.0,b'\xc2\xb5g/m\xc2\xb3',IN,Delhi,2023-03-26 05:30:00+00:00,28.635760,77.224450
2023-03-26 10:00:00,US Diplomatic Post: New Delhi,pm25,45.0,b'\xc2\xb5g/m\xc2\xb3',IN,Delhi,2023-03-26 04:30:00+00:00,28.635760,77.224450
2023-03-26 09:00:00,US Diplomatic Post: New Delhi,pm25,58.0,b'\xc2\xb5g/m\xc2\xb3',IN,Delhi,2023-03-26 03:30:00+00:00,28.635760,77.224450
2023-03-26 08:00:00,US Diplomatic Post: New Delhi,pm25,65.0,b'\xc2\xb5g/m\xc2\xb3',IN,Delhi,2023-03-26 02:30:00+00:00,28.635760,77.224450
2023-03-26 07:00:00,US Diplomatic Post: New Delhi,pm25,53.0,b'\xc2\xb5g/m\xc2\xb3',IN,Delhi,2023-03-26 01:30:00+00:00,28.635760,77.224450
...,...,...,...,...,...,...,...,...,...
2022-09-26 05:30:00,"NSIT Dwarka, Delhi - CPCB",pm25,9.5,b'\xc2\xb5g/m\xc2\xb3',IN,Delhi,2022-09-26 00:00:00+00:00,28.609090,77.032541
2022-09-26 05:15:00,"ITO, Delhi - CPCB",pm25,71.0,b'\xc2\xb5g/m\xc2\xb3',IN,Delhi,2022-09-25 23:45:00+00:00,28.628624,77.241060
2022-09-26 05:15:00,"DTU, Delhi - CPCB",pm25,21.0,b'\xc2\xb5g/m\xc2\xb3',IN,Delhi,2022-09-25 23:45:00+00:00,28.750050,77.111261
2022-09-26 05:15:00,"Sirifort, Delhi - CPCB",pm25,37.0,b'\xc2\xb5g/m\xc2\xb3',IN,Delhi,2022-09-25 23:45:00+00:00,28.550425,77.215938


***DATA CRAWLING MEASUREMENTS***

In [32]:
# Inspect the measurements frame: index range, dtypes and non-null counts.
# NOTE(review): the recorded output lists 14962 rows although the request used
# limit=10000 — confirm how the client applies `limit` (e.g. per page).
res.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 14962 entries, 2023-03-26 11:00:00 to 2022-09-26 05:00:00
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   location               14962 non-null  object             
 1   parameter              14962 non-null  object             
 2   value                  14962 non-null  float64            
 3   unit                   14962 non-null  object             
 4   country                14962 non-null  object             
 5   city                   14962 non-null  object             
 6   date.utc               14962 non-null  datetime64[ns, UTC]
 7   coordinates.latitude   14962 non-null  float64            
 8   coordinates.longitude  14962 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(3), object(5)
memory usage: 1.1+ MB


In [33]:
# Drop every row that has a missing value in any column and rebind `res`.
# In the recorded run all 14962 rows were complete, so nothing is removed for
# that snapshot.
res=res.dropna()

In [34]:
# Per-column flag: True if the column still holds at least one missing value.
res.isna().any()

location                 False
parameter                False
value                    False
unit                     False
country                  False
city                     False
date.utc                 False
coordinates.latitude     False
coordinates.longitude    False
dtype: bool

In [35]:
# Per-column count of missing values remaining after the dropna() step.
res.isna().sum()

location                 0
parameter                0
value                    0
unit                     0
country                  0
city                     0
date.utc                 0
coordinates.latitude     0
coordinates.longitude    0
dtype: int64