In [1]:
import os
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
def convertir_ugm3_a_ppb(ugm3, peso_molecular):
    # 24.45 es el volumen molar del gas ideal a 25°C y 1 atm en L/mol
    volumen_molar = 24.45
    ppb = (ugm3 * volumen_molar) / peso_molecular
    return ppb

Pre-procesamiento, eliminación datos faltantes y valores negativos, normalización, y cambio de unidades

In [None]:
dir = '..\\preprocessing_airpollution_data/all_data_2005xstation_15_05_2024'  
dir_sinNaN = "datos_por_estacion_pre_processing"
if not os.path.exists(dir_sinNaN ):
    os.makedirs(dir_sinNaN )
files = os.listdir(dir)

info_datasets = []
columnas = ["file_name", "len_dataset","%datosFaltantes"]

for file_name in files:
    file_path = os.path.join(dir,file_name)
    print(file_path)
    dataset = pd.read_csv(file_path)
    dataset[['year', 'month', 'day']] = dataset['date'].str.split('/', expand=True)
    dataset["hour"] = dataset["day"]
    dataset[["day", "hour"]] = dataset["day"].str.split(' ', expand=True)
    dataset[["hour","minute"]] = dataset["hour"].str.split(':', expand=True)

    porcentaje_total_vacios = round(dataset.isna().mean().mean() * 100,2)
    info_datasets.append([file_name, len(dataset), porcentaje_total_vacios])
    
    #dataset = dataset.dropna(axis=1, how='all')
    df = dataset.copy()

    # Borra por columnas, si toda la columna esta vacía (suma 0) entonces la borra completa
    for i in dataset:
        suma = dataset[i].sum()
        if suma == 0:
            df.drop(columns=[i], inplace=True)

    dataset = df
    # Borra por fila, Drop rows which contain missing values.
    dataset = dataset.dropna(axis=0)
    df = dataset[["date", "year", "month", "day", "hour", "minute"]].copy()
    dataset = dataset.drop(["date", "year", "month", "day", "hour", "minute"], axis=1)
    # Elimina aquellas columnas que tienen solo un valor único en todas sus filas.
    for column in dataset.columns:
        if dataset[column].nunique() == 1:
            dataset = dataset.drop(columns=[column], axis=1)

    # Eliminación de las filas con valores negativos
    dataset = dataset[(dataset >= 0).all(axis=1)]

    # Cambio de unidades. Conversión de ppm a ppb
    if "CO" in dataset.columns:
        dataset["CO"] = dataset["CO"] * 1000

    if len(dataset) > 0 and len(dataset.columns) > 0:
        # Normalización
        scaler = MinMaxScaler()
        df_normalized = scaler.fit_transform(dataset)
        dataset = pd.DataFrame(df_normalized, columns=dataset.columns)
        df.reset_index(inplace=True, drop=True)

        dataset.insert(0, "date", df["date"])
        df = df.drop(["date"], axis=1)
        dataset[["year", "month", "day", "hour", "minute"]] = df    

        name = os.path.join(dir_sinNaN, file_name)
        dataset.to_csv(name, index=False)

info_name = "info_split_datasets.csv"
info_datasets = pd.DataFrame(info_datasets, columns=columnas)
info_datasets.to_csv(info_name, index=False)

Información de los datos

In [None]:
columnas_especificas = ['CO', 'NO', 'NOX', 'NO2', 'O3', 'PM10', 'PM25', 'RH', 'SO2', 'TMP', 'WDR', 'WSP']
dir = '..\\data_processing/datos_por_estacion_pre_processing'  
files = os.listdir(dir)
lista = []
for file_name in files:
    columnas_cont = []
    print(file_name[:-4])
    file_path = os.path.join(dir,file_name)
    df = pd.read_csv(file_path)
    valores_minimos = df.min()
    valores_maximos = df.max()
    df.drop(columns=["date", "year", "month", "day", "hour", "minute"], inplace=True)
    print(valores_minimos.date, valores_maximos.date, len(df), df.columns)
    columnas_cont = [file_name[:-4], valores_minimos.date, valores_minimos.year, valores_maximos.date, valores_maximos.year, len(df)]
    for columna in columnas_especificas:
        if columna in df.columns:
            columnas_cont.append(1)
        else:
            columnas_cont.append(0)
    lista.append(columnas_cont)
    print("")

info_name = "datos_data_pre-processed.csv"
lista_columnas = ['estacion', 'primer_registro','oldest_year', 'ultimo_registro', 'new_year', 'num_datos', 'CO', 'NO', 'NOX', 'NO2', 'O3', 'PM10', 'PM25', 'RH', 'SO2', 'TMP', 'WDR', 'WSP']
info_datasets = pd.DataFrame(lista, columns=lista_columnas)
info_datasets.to_csv(info_name, index=False)

In [2]:
dir = '..\\preprocessing_airpollution_data/all_data_2005xstation_15_05_2024'
archivos = os.listdir(dir)
file_name = "PED.csv"
file_path = os.path.join(dir,file_name)
file_path= "C:/Users/valer/Downloads/PED_raw_data.csv"
print(file_path)
df = pd.read_csv(file_path)
valores_minimos = df.min()
valores_maximos = df.max()
print("Valores mínimos de cada columna:")
print(valores_minimos)
print("\nValores máximos de cada columna:")
print(valores_maximos)
print(len(df))

C:/Users/valer/Downloads/PED_raw_data.csv
Valores mínimos de cada columna:
date    2005/01/01 00:00
CO                 -0.37
NO                  -3.0
NOX                  0.0
NO2                  0.0
O3                   0.0
PM10                 0.0
PM25                 0.0
RH                   0.0
SO2                  0.0
TMP                  0.0
WDR                  0.0
WSP                  0.0
dtype: object

Valores máximos de cada columna:
date    2024/05/15 17:00
CO                   7.5
NO                 360.0
NOX                394.0
NO2                153.0
O3                 220.0
PM10               576.0
PM25               179.0
RH                 100.0
SO2                197.0
TMP                 33.0
WDR                360.0
WSP                  9.8
dtype: object
255672


In [15]:
df.describe()

Unnamed: 0,CO,NO,NOX,NO2,O3,PM10,PM25,RH,SO2,TMP,WDR,WSP
count,242335.0,226956.0,241496.0,241488.0,139269.0,101380.0,60967.0,119018.0,132509.0,119914.0,128717.0,128569.0
mean,0.543302,22.803503,35.850167,25.425483,33.555343,36.105396,19.394935,51.547564,3.823227,16.94893,188.403101,1.688662
std,0.495481,51.440607,27.557762,14.0904,30.135606,22.860302,12.962814,21.566037,6.201895,4.715651,111.24688,0.996897
min,-0.37,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.2,1.0,17.0,15.0,11.0,19.0,10.0,35.0,1.0,14.0,72.0,1.0
50%,0.4,5.0,28.0,23.0,24.0,32.0,17.0,51.0,2.0,16.0,214.0,1.6
75%,0.7,20.0,46.0,32.0,48.0,48.0,26.0,69.0,4.0,20.0,277.0,2.2
max,7.5,360.0,394.0,153.0,220.0,576.0,179.0,100.0,197.0,33.0,360.0,9.8
