In [1]:
from pathlib import Path

import pandas as pd

data_path = Path.cwd() / "raw_data_2" / "Luftqualitaet_Zusammenfassung.csv"

df = pd.read_csv(data_path, encoding="UTF-8", sep=";")
df = df.rename(
    columns={
        "Stationscode": "code",
        "Stationsname": "name",
        "Stationsumgebung": "area",
        "Art der Station": "type",
        "Timestamp": "timestamp",
        "Einheit": "unit",
        "Feinstaub (PM₁₀)": "pm10",
        "Feinstaub (PM₂,₅)": "pm25",
        "Kohlenmonoxid (CO)": "co",
        "Ozon (O₃)": "o3",
        "Schwefeldioxid (SO₂)": "so2",
        "Stickstoffdioxid (NO₂)": "no2",
    }
)
df = df.drop(columns=["co"])

# to numeric, coerce errors to NaN
pollutant_cols = ["pm10", "pm25", "o3", "so2", "no2"]
for col in pollutant_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# to datetime, coerce errors to NaT
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
df["time"] = df["timestamp"].dt.time
df["date"] = df["timestamp"].dt.date

df.head()

  df = pd.read_csv(data_path, encoding="UTF-8", sep=";")


Unnamed: 0,code,name,area,type,timestamp,unit,pm10,pm25,o3,so2,no2,time,date
0,DEBB021,Potsdam-Zentrum,städtisches Gebiet,Hintergrund,2023-01-01 01:00:00,µg/m³,,,54.0,4.0,8.0,01:00:00,2023-01-01
1,DEBB021,Potsdam-Zentrum,städtisches Gebiet,Hintergrund,2023-01-01 02:00:00,µg/m³,,,59.0,2.0,4.0,02:00:00,2023-01-01
2,DEBB021,Potsdam-Zentrum,städtisches Gebiet,Hintergrund,2023-01-01 03:00:00,µg/m³,,,59.0,1.0,4.0,03:00:00,2023-01-01
3,DEBB021,Potsdam-Zentrum,städtisches Gebiet,Hintergrund,2023-01-01 04:00:00,µg/m³,,,60.0,1.0,4.0,04:00:00,2023-01-01
4,DEBB021,Potsdam-Zentrum,städtisches Gebiet,Hintergrund,2023-01-01 05:00:00,µg/m³,,,61.0,,4.0,05:00:00,2023-01-01


In [2]:
after_newyear_df = df[df["date"] > pd.to_datetime("2023-01-01").date()]

In [3]:
def get_hightimes(df, pollutant):
    max_df = df.groupby(["code", "date"])[pollutant].max().dropna()
    return df.merge(max_df, on=["code", "date", pollutant], how="right")

In [4]:
SHOW_N = 6

for pollutant in pollutant_cols:
    pollutant_df = get_hightimes(df, pollutant)
    print(pollutant_df.info())
    print(f"Highest {pollutant} values:")
    print(f"{pollutant_df['timestamp'].dt.hour.mean():.3f}")
    print(f"{pollutant_df['timestamp'].dt.hour.median()}")

    for time, count in list(pollutant_df["time"].value_counts().items())[:SHOW_N]:
        mean = pollutant_df[pollutant_df["time"] == time][pollutant].mean()
        median = pollutant_df[pollutant_df["time"] == time][pollutant].median()
        std = pollutant_df[pollutant_df["time"] == time][pollutant].std()
        print(f"{time} ({count}): {mean:.3f} (median: {median:.3f}, std: {std:.3f})")
    
    print()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198142 entries, 0 to 198141
Data columns (total 13 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   code       198142 non-null  object        
 1   name       198142 non-null  object        
 2   area       198142 non-null  object        
 3   type       198142 non-null  object        
 4   timestamp  198142 non-null  datetime64[ns]
 5   unit       198142 non-null  object        
 6   pm10       198142 non-null  float64       
 7   pm25       150751 non-null  float64       
 8   o3         128208 non-null  float64       
 9   so2        28982 non-null   float64       
 10  no2        184324 non-null  float64       
 11  time       198142 non-null  object        
 12  date       198142 non-null  object        
dtypes: datetime64[ns](1), float64(5), object(7)
memory usage: 19.7+ MB
None
Highest pm10 values:
12.441
12.0
23:00:00 (17297): 19.434 (median: 17.000, std: 12.319)
01:0