## Librerías

In [88]:
import os
import warnings

import pandas as pd
import requests

# Suprime todas las advertencias
warnings.filterwarnings("ignore")

## Descarga de datos

In [89]:
# Key under which each pollutant's DataFrame is stored in `dfs`
param_names = ['df_PM25', 'df_SO2', 'df_CO', 'df_NO2']
# AQS parameter code of each pollutant, in the same order as `param_names`
params = [88502, 42401, 42101, 42602]
# Begin/end dates of each query window, paired by position (YYYYMMDD, as the AQS API expects)
bdates = [20210304, 20210804, 20211219, 20220304, 20220804, 20221219]
edates = [20210310, 20210810, 20211226, 20220310, 20220810, 20221226]

# Endpoint and credentials. The AQS_EMAIL / AQS_KEY environment variables take precedence.
# NOTE(review): the literal fallbacks only keep the notebook running as before; a key that
# has been committed to source should be rotated and then supplied via the environment only.
AQS_SAMPLE_URL = "https://aqs.epa.gov/data/api/sampleData/byCounty"
aqs_email = os.environ.get("AQS_EMAIL", "rodrigodavid.1993@gmail.com")
aqs_key = os.environ.get("AQS_KEY", "greymallard41")

# One DataFrame per pollutant, keyed by the names in `param_names`
dfs = {}

for param, param_name in zip(params, param_names):
    # Collect one frame per date window and concatenate once at the end
    # (growing a DataFrame with concat inside the loop re-copies all rows each time).
    frames = []

    for bdate, edate in zip(bdates, edates):
        query = {
            "email": aqs_email,
            "key": aqs_key,
            "param": param,
            "bdate": bdate,
            "edate": edate,
            "duration": "1",
            "state": "36",
            "county": "081",
        }
        # Let requests build and encode the query string; the timeout prevents an
        # unresponsive server from hanging the cell forever.
        response = requests.get(AQS_SAMPLE_URL, params=query, timeout=120)

        if response.status_code != 200:
            # Keep the best-effort behaviour (skip this window) but make the gap visible
            print(f"Solicitud fallida (HTTP {response.status_code}) para {param_name}, periodo {bdate}-{edate}")
            continue

        data = response.json()
        data_list = data.get("Data", [])

        if data_list:
            frames.append(pd.DataFrame(data_list))

    # An empty DataFrame is kept when no window returned data, as before
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Plain Python attribute kept for compatibility; pandas does not carry it over to copies
    df.name = param_name

    # Store the DataFrame in the dictionary
    dfs[param_name] = df

# Convenience names for the raw downloads
df_PM25 = dfs['df_PM25']
df_SO2 = dfs['df_SO2']
df_CO = dfs['df_CO']
df_NO2 = dfs['df_NO2']



## Guardar los datos

In [90]:
# Persist each raw download; the DataFrame index is written as the first CSV column
raw_exports = [
    (df_PM25, 'Datasets/Inputs/Input_PM25.csv'),
    (df_SO2, 'Datasets/Inputs/Input_SO2.csv'),
    (df_CO, 'Datasets/Inputs/Input_CO.csv'),
    (df_NO2, 'Datasets/Inputs/Input_NO2.csv'),
]
for raw_frame, csv_path in raw_exports:
    raw_frame.to_csv(csv_path)

## Dataset Índice de Contaminante

### 1. Selección de los parámetros (Nombres) y sus respectivos códigos

In [91]:
# Show column names, non-null counts and dtypes of the raw PM2.5 download
df_PM25.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   state_code             2111 non-null   object 
 1   county_code            2111 non-null   object 
 2   site_number            2111 non-null   object 
 3   parameter_code         2111 non-null   object 
 4   poc                    2111 non-null   int64  
 5   latitude               2111 non-null   float64
 6   longitude              2111 non-null   float64
 7   datum                  2111 non-null   object 
 8   parameter              2111 non-null   object 
 9   date_local             2111 non-null   object 
 10  time_local             2111 non-null   object 
 11  date_gmt               2111 non-null   object 
 12  time_gmt               2111 non-null   object 
 13  sample_measurement     1994 non-null   float64
 14  units_of_measure       2111 non-null   object 
 15  unit

In [92]:
# Distinct pollutant codes and names found in each raw download
for raw_frame in (df_PM25, df_SO2, df_CO, df_NO2):
    codes = raw_frame['parameter_code'].unique()
    names = raw_frame['parameter'].unique()
    print(f"{codes} , {names}")

['88502'] , ['Acceptable PM2.5 AQI & Speciation Mass']
['42401'] , ['Sulfur dioxide']
['42101'] , ['Carbon monoxide']
['42602'] , ['Nitrogen dioxide (NO2)']


### 2. Confección del dataset "**Parameter_index**"

In [93]:
# Reduce every raw download to its distinct (parameter_code, parameter) pairs
PM25, SO2, CO, NO2 = (
    raw_frame[['parameter_code', 'parameter']].drop_duplicates()
    for raw_frame in (df_PM25, df_SO2, df_CO, df_NO2)
)

# Stack the four small tables into a single lookup table with a fresh 0..n-1 index
Parameter_index = pd.concat([PM25, SO2, CO, NO2], ignore_index=True)


In [94]:
# Display the code -> name lookup table
Parameter_index

Unnamed: 0,parameter_code,parameter
0,88502,Acceptable PM2.5 AQI & Speciation Mass
1,42401,Sulfur dioxide
2,42101,Carbon monoxide
3,42602,Nitrogen dioxide (NO2)


### 3. Guardar como un archivo csv

In [95]:
# Save the lookup table as Parameter_index.csv (the DataFrame index is not written)
Parameter_index.to_csv('Datasets/Outputs/Parameter_index.csv', index=False)

## DATASET: df_PM25

### Selección y tratamiento de columnas

In [96]:
# Work on an independent copy that holds only the columns used downstream
columns_to_keep = ['parameter_code', 'date_local', 'time_local', 'sample_measurement', 'latitude', 'longitude']
selected_columns = df_PM25.loc[:, columns_to_keep].copy()


In [97]:
# Preview the first five rows of the column subset
selected_columns.head(5)

Unnamed: 0,parameter_code,date_local,time_local,sample_measurement,latitude,longitude
0,88502,2021-03-04,00:00,8.8,40.72698,-73.89313
1,88502,2021-03-04,01:00,8.2,40.72698,-73.89313
2,88502,2021-03-04,02:00,8.2,40.72698,-73.89313
3,88502,2021-03-04,03:00,7.9,40.72698,-73.89313
4,88502,2021-03-04,04:00,9.2,40.72698,-73.89313


### Columna "parameter_code"

### 1. Cambiar a tipo entero (int)

In [98]:
# The API delivers the code as text; store it as an integer
selected_columns = selected_columns.astype({'parameter_code': int})

### Columna "date_local"

### 1. Creación de las columnas "year" , "month" , "day"

In [99]:
# Split the 'date_local' text on '-' into separate year, month and day columns
date_parts = selected_columns['date_local'].str.split('-', expand=True)
selected_columns[['year', 'month', 'day']] = date_parts

In [100]:
# Cast the freshly created date-part columns from text to integers
for date_part in ('year', 'month', 'day'):
    selected_columns[date_part] = selected_columns[date_part].astype(int)

In [101]:
# 'date_local' is now redundant with year/month/day, so remove it
selected_columns = selected_columns.drop(columns="date_local")

In [102]:
# Preview the frame after replacing 'date_local' with year/month/day
selected_columns.head(3)

Unnamed: 0,parameter_code,time_local,sample_measurement,latitude,longitude,year,month,day
0,88502,00:00,8.8,40.72698,-73.89313,2021,3,4
1,88502,01:00,8.2,40.72698,-73.89313,2021,3,4
2,88502,02:00,8.2,40.72698,-73.89313,2021,3,4


### 2. Detección de datos atípicos de "year" , "month" , "day"

In [103]:
# Count year/day values that fall outside an inclusive integer range
def outliers_y_d(col, n, N, df=None):
    """Print how many rows hold a value outside the inclusive integer range n..N.

    Args:
        col: Name of the column to check.
        n: Lowest accepted value.
        N: Highest accepted value (inclusive).
        df: DataFrame to inspect. When omitted, the notebook-level
            `selected_columns` frame is used, exactly as before.

    Returns:
        None. The count is reported with print().
    """
    frame = selected_columns if df is None else df

    # Vectorised membership test instead of a Python-level iterrows() loop.
    # Values that are missing or not whole numbers in n..N count as out of range.
    inside_range = frame[col].isin(range(n, N + 1))
    registros_fuera_de_rango = int((~inside_range).sum())

    print(f"Total de registros fuera del rango ({n}, {N}): {registros_fuera_de_rango}")

In [104]:
# Check for years outside the 2021-2023 range
outliers_y_d('year' , 2021 , 2023)

Total de registros fuera del rango (2021, 2023): 0


In [105]:
# Count month values that are not among the accepted months
def outliers_m(column_name, valid_months, df=None):
    """Print how many rows hold a value that is not listed in `valid_months`.

    Args:
        column_name: Name of the column to check.
        valid_months: Collection of accepted values.
        df: DataFrame to inspect. When omitted, the notebook-level
            `selected_columns` frame is used, exactly as before.

    Returns:
        None. The count is reported with print().
    """
    frame = selected_columns if df is None else df

    # Vectorised membership test instead of a Python-level iterrows() loop
    is_valid = frame[column_name].isin(valid_months)
    records_outside_range = int((~is_valid).sum())

    print(f"Total de registros fuera del rango: {records_outside_range}")

# Months covered by the download windows (March, August, December)
list_months = [3, 8, 12]
outliers_m('month', list_months)

Total de registros fuera del rango: 0


In [106]:
# Check for days outside the 1-31 range
outliers_y_d('day', 1 , 31)

Total de registros fuera del rango (1, 31): 0


### Columna "time_local"

### 1. Seleccionar las horas (hh)

In [107]:
# Keep only the first two characters of each time string (the hour part, e.g. "06:00" -> "06")
selected_columns['time_local'] = selected_columns['time_local'].str.slice(stop=2)

In [108]:
# Store the two-character hour text as an integer
selected_columns = selected_columns.astype({'time_local': int})

In [109]:
# Keep only the readings taken from hour 6 through hour 20 (range end is exclusive)
time = range(6,21)
in_time_window = selected_columns['time_local'].isin(time)
selected_columns = selected_columns[in_time_window]

In [110]:
# List the distinct hours that remain after the filter, in ascending order
hours_present = sorted(selected_columns['time_local'].unique())
for hour_value in hours_present:
    print(hour_value)

6
7
8
9
10
11
12
13
14
15
16
17
18
19
20


### 2. Cambiar el nombre a 'hour'

In [111]:
# The column now holds whole hours, so name it accordingly
selected_columns = selected_columns.rename(columns={'time_local': 'hour'})

### 3. Mover la columna al final

In [112]:
def move_column(dataframe, column):
    """Return `dataframe` with `column` placed after all the other columns.

    Args:
        dataframe: DataFrame whose columns are to be reordered; it is not modified.
        column: Label of the column that should become the last one.

    Returns:
        A column selection of `dataframe` with the same rows and the new column order.
    """
    # Every label except the target keeps its relative position...
    new_order = [name for name in dataframe.columns if name != column]
    # ...and the target is appended as the final column
    new_order.append(column)
    return dataframe[new_order]

# Re-bind the name: move_column returns a reordered frame rather than modifying its argument
selected_columns = move_column(selected_columns, 'hour')

In [113]:
# Preview the frame with 'hour' as the last column
selected_columns.head(4)

Unnamed: 0,parameter_code,sample_measurement,latitude,longitude,year,month,day,hour
6,88502,7.3,40.72698,-73.89313,2021,3,4,6
7,88502,7.4,40.72698,-73.89313,2021,3,4,7
8,88502,5.5,40.72698,-73.89313,2021,3,4,8
9,88502,4.2,40.72698,-73.89313,2021,3,4,9


### Columna "sample_measurement"

### 1. Tratamiento de faltantes

In [114]:
# Number of rows without a measurement
missing_measurement = selected_columns['sample_measurement'].isna()
count_nan = missing_measurement.sum()
print(f"Total de filas con NaN en 'sample_measurement': {count_nan}")

# Total number of rows in the frame
count_rows = selected_columns.shape[0]
print(f"Total de filas: {count_rows}")

Total de filas con NaN en 'sample_measurement': 84
Total de filas: 1319


In [115]:
# Rows expected to remain once the rows without a measurement are removed.
# Derived from the counts computed above instead of numbers copied from their output,
# so the value stays correct when the downloaded data changes.
int(count_rows - count_nan)

1235

Los datos de esta columna son indispensables para los análisis; si faltan, toda la fila pierde su utilidad y, por lo tanto, esas filas se eliminarán.

In [116]:
# Remove the rows with missing values. Note that dropna() without `subset` checks every
# column of the frame, not only 'sample_measurement'.
selected_columns = selected_columns.dropna()

## Información del dataset

In [117]:
# Total number of rows left in the frame
count_rows = selected_columns.shape[0]
print(f"Total de filas: {count_rows}")

Total de filas: 1235


In [118]:
# From here on df_PM25 names the cleaned frame (the same object as selected_columns, not a copy);
# the raw PM2.5 download remains available as dfs['df_PM25'].
df_PM25 = selected_columns

In [119]:
# Confirm the final columns, dtypes and row count of the cleaned PM2.5 frame
df_PM25.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1235 entries, 6 to 2103
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   parameter_code      1235 non-null   int64  
 1   sample_measurement  1235 non-null   float64
 2   latitude            1235 non-null   float64
 3   longitude           1235 non-null   float64
 4   year                1235 non-null   int64  
 5   month               1235 non-null   int64  
 6   day                 1235 non-null   int64  
 7   hour                1235 non-null   int64  
dtypes: float64(3), int64(5)
memory usage: 86.8 KB


## ETL del resto de archivos

In [120]:
def ETL_air_quality(df, hours=range(6, 21)):
    """Clean a raw sample download into the layout used by the rest of the notebook.

    Steps: keep the six relevant columns, cast 'parameter_code' to int, split
    'date_local' into integer 'year'/'month'/'day' columns, reduce 'time_local'
    to an integer 'hour' column, keep only the rows whose hour is in `hours`,
    and drop the rows that still contain a missing value.

    Args:
        df: DataFrame holding at least the columns 'parameter_code',
            'date_local' (text in 'year-month-day' form), 'time_local' (text
            starting with a two-digit hour), 'sample_measurement', 'latitude'
            and 'longitude'. It is not modified.
        hours: Collection of hours to keep. Defaults to 6 through 20 inclusive,
            the window that was previously hard-coded.

    Returns:
        A new DataFrame with the columns parameter_code, sample_measurement,
        latitude, longitude, year, month, day, hour (in that order). The
        surviving rows keep their original index labels.

    Raises:
        KeyError: if one of the required columns is missing from `df`.
    """
    required = ['parameter_code', 'date_local', 'time_local',
                'sample_measurement', 'latitude', 'longitude']
    # Independent copy, so none of the assignments below touch the caller's frame
    cleaned = df[required].copy()

    # "parameter_code": delivered as text, stored as integer
    cleaned['parameter_code'] = cleaned['parameter_code'].astype(int)

    # "date_local": split on '-' into integer year / month / day columns
    date_parts = cleaned['date_local'].str.split('-', expand=True)
    for position, part_name in enumerate(('year', 'month', 'day')):
        cleaned[part_name] = date_parts[position].astype(int)

    # "time_local": the first two characters are the hour. Creating 'hour' as a new
    # last column and dropping the sources gives the same column order as the old
    # rename-then-move sequence, without in-place edits on a filtered slice.
    cleaned['hour'] = cleaned['time_local'].str[:2].astype(int)
    cleaned = cleaned.drop(columns=['date_local', 'time_local'])

    # Keep only the requested hours
    cleaned = cleaned[cleaned['hour'].isin(hours)]

    # Drop rows with missing values (dropna() checks every remaining column)
    return cleaned.dropna()

In [121]:
# Clean the raw SO2 download. Reading it from `dfs` keeps this cell re-runnable: after the
# first run `df_SO2` holds the cleaned frame, which no longer has the 'date_local' and
# 'time_local' columns that ETL_air_quality needs.
df_SO2 = ETL_air_quality(dfs['df_SO2'])

In [122]:
# Confirm the columns, dtypes and row count of the cleaned SO2 frame
df_SO2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 642 entries, 6 to 1052
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   parameter_code      642 non-null    int64  
 1   sample_measurement  642 non-null    float64
 2   latitude            642 non-null    float64
 3   longitude           642 non-null    float64
 4   year                642 non-null    int64  
 5   month               642 non-null    int64  
 6   day                 642 non-null    int64  
 7   hour                642 non-null    int64  
dtypes: float64(3), int64(5)
memory usage: 45.1 KB


In [123]:
# Clean the raw CO download and inspect the result. The raw frame is read from `dfs` so the
# cell can be re-run: `df_CO` itself is overwritten with the cleaned frame, which no longer
# has the 'date_local' and 'time_local' columns that ETL_air_quality needs.
df_CO = ETL_air_quality(dfs['df_CO'])
df_CO.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1239 entries, 11 to 2049
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   parameter_code      1239 non-null   int64  
 1   sample_measurement  1239 non-null   float64
 2   latitude            1239 non-null   float64
 3   longitude           1239 non-null   float64
 4   year                1239 non-null   int64  
 5   month               1239 non-null   int64  
 6   day                 1239 non-null   int64  
 7   hour                1239 non-null   int64  
dtypes: float64(3), int64(5)
memory usage: 87.1 KB


In [124]:
# Clean the raw NO2 download and inspect the result. The raw frame is read from `dfs` so the
# cell can be re-run: `df_NO2` itself is overwritten with the cleaned frame, which no longer
# has the 'date_local' and 'time_local' columns that ETL_air_quality needs.
df_NO2 = ETL_air_quality(dfs['df_NO2'])
df_NO2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1261 entries, 26 to 2105
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   parameter_code      1261 non-null   int64  
 1   sample_measurement  1261 non-null   float64
 2   latitude            1261 non-null   float64
 3   longitude           1261 non-null   float64
 4   year                1261 non-null   int64  
 5   month               1261 non-null   int64  
 6   day                 1261 non-null   int64  
 7   hour                1261 non-null   int64  
dtypes: float64(3), int64(5)
memory usage: 88.7 KB


## Unión y Dataset Final "**Queens_air_quality**"

In [125]:
# Stack the four cleaned pollutant frames into one table with a fresh 0..n-1 index
cleaned_frames = [df_PM25, df_SO2, df_CO, df_NO2]
Queens_air_quality = pd.concat(cleaned_frames, ignore_index=True)

In [126]:
# Add a sequential identifier column "ID_aq" (0..n-1) as the first column.
# Dropping any existing "ID_aq" first makes the cell safe to re-run: the previous in-place
# set_index / reset_index / rename sequence produced a second "ID_aq" column on a second run.
Queens_air_quality = Queens_air_quality.drop(columns='ID_aq', errors='ignore').reset_index(drop=True)
Queens_air_quality.insert(0, 'ID_aq', range(len(Queens_air_quality)))

In [127]:
# Confirm the columns, dtypes and row count of the combined dataset
Queens_air_quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4377 entries, 0 to 4376
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID_aq               4377 non-null   int64  
 1   parameter_code      4377 non-null   int64  
 2   sample_measurement  4377 non-null   float64
 3   latitude            4377 non-null   float64
 4   longitude           4377 non-null   float64
 5   year                4377 non-null   int64  
 6   month               4377 non-null   int64  
 7   day                 4377 non-null   int64  
 8   hour                4377 non-null   int64  
dtypes: float64(3), int64(6)
memory usage: 307.9 KB


## Guardar el dataset como "**Queens_air_quality.csv**"

In [128]:
# Write the combined dataset to disk; the DataFrame index is not written (index=False)
Queens_air_quality.to_csv('Datasets/Outputs/Queens_air_quality.csv', index=False)

In [129]:
import pandas as pd

# Build a small example DataFrame (this re-binds the notebook-level names `data` and `df`)
data = dict(Column1=[1, 2, 3, 4], Column2=[5, 6, 7, 8])
df = pd.DataFrame(data)

# Plain-text print of the frame; the default index appears as the unlabeled left-hand column
print(df)



   Column1  Column2
0        1        5
1        2        6
2        3        7
3        4        8


In [130]:
# Show the index type, columns and dtypes of the example frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Column1  4 non-null      int64
 1   Column2  4 non-null      int64
dtypes: int64(2)
memory usage: 192.0 bytes
