In [20]:
from pathlib import Path

import pandas as pd

# Location of the raw air-quality export, resolved relative to the current
# working directory.
data_path = Path.cwd() / "raw_data_2" / "Luftqualitaet_Zusammenfassung.csv"

# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk. This avoids the warning read_csv raised on this
# call (presumably a mixed-type DtypeWarning — confirm); the measurement
# columns are converted to numbers explicitly below either way.
df = (
    pd.read_csv(data_path, encoding="UTF-8", sep=";", low_memory=False)
    # Map the German CSV headers to the short names used in this notebook.
    .rename(
        columns={
            "Stationscode": "code",
            "Stationsname": "name",
            "Stationsumgebung": "area",
            "Art der Station": "type",
            "Timestamp": "timestamp",
            "Einheit": "unit",
            "Feinstaub (PM₁₀)": "pm10",
            "Feinstaub (PM₂,₅)": "pm25",
            "Kohlenmonoxid (CO)": "co",
            "Ozon (O₃)": "o3",
            "Schwefeldioxid (SO₂)": "so2",
            "Stickstoffdioxid (NO₂)": "no2",
        }
    )
    # Carbon monoxide is not analysed in this notebook.
    .drop(columns=["co"])
)

# Convert the measurement columns to numbers; anything unparseable becomes NaN.
pollutant_cols = ["pm10", "pm25", "o3", "so2", "no2"]
for col in pollutant_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Parse timestamps; anything unparseable becomes NaT.
# NOTE(review): coercion is silent — if the source labels midnight as "24:00",
# those rows would end up as NaT. Confirm against the raw file.
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
# Separate clock-time and calendar-date parts, used later for grouping.
df["time"] = df["timestamp"].dt.time
df["date"] = df["timestamp"].dt.date

df.head()

  df = pd.read_csv(data_path, encoding="UTF-8", sep=";")


Unnamed: 0,code,name,area,type,timestamp,unit,pm10,pm25,o3,so2,no2,time,date
0,DEBB021,Potsdam-Zentrum,städtisches Gebiet,Hintergrund,2023-01-01 01:00:00,µg/m³,,,54.0,4.0,8.0,01:00:00,2023-01-01
1,DEBB021,Potsdam-Zentrum,städtisches Gebiet,Hintergrund,2023-01-01 02:00:00,µg/m³,,,59.0,2.0,4.0,02:00:00,2023-01-01
2,DEBB021,Potsdam-Zentrum,städtisches Gebiet,Hintergrund,2023-01-01 03:00:00,µg/m³,,,59.0,1.0,4.0,03:00:00,2023-01-01
3,DEBB021,Potsdam-Zentrum,städtisches Gebiet,Hintergrund,2023-01-01 04:00:00,µg/m³,,,60.0,1.0,4.0,04:00:00,2023-01-01
4,DEBB021,Potsdam-Zentrum,städtisches Gebiet,Hintergrund,2023-01-01 05:00:00,µg/m³,,,61.0,,4.0,05:00:00,2023-01-01


In [52]:
# Keep only the rows dated strictly after 1 January 2023.
newyear_date = pd.to_datetime("2023-01-01").date()
is_after_newyear = df["date"] > newyear_date
after_newyear_df = df[is_after_newyear]

In [58]:
def get_hightimes(df, pollutant):
    """Return the rows at which each station recorded its daily maximum.

    Args:
        df: Frame with at least the columns ``code``, ``date`` and
            ``pollutant``.
        pollutant: Name of the measurement column to analyse.

    Returns:
        The rows of ``df`` whose ``pollutant`` value equals the maximum of
        their ``(code, date)`` group. Groups without any valid measurement
        are left out; ties within a group yield one row per tied measurement.
    """
    # GroupBy.max ignores NaN by default, so no ``skipna`` argument is passed:
    # that keyword is not accepted by GroupBy.max in every pandas release.
    # Groups consisting only of NaN produce NaN and are dropped.
    max_df = df.groupby(["code", "date"])[pollutant].max().dropna()
    # Right join keeps exactly the (code, date, max value) combinations and
    # pulls in the full rows from ``df`` that carry that value.
    return df.merge(max_df, on=["code", "date", pollutant], how="right")

In [None]:
# Number of most frequent peak times to list per pollutant.
SHOW_N = 6

# NOTE(review): this summarises the full `df`; the `after_newyear_df` subset
# defined in an earlier cell is not used here — confirm that including
# New Year's Day is intended.
for pollutant in pollutant_cols:
    pollutant_df = get_hightimes(df, pollutant)
    peak_hours = pollutant_df["timestamp"].dt.hour

    print(f"Highest {pollutant} values:")
    # Mean and median hour of day of the rows returned by get_hightimes.
    print(f"{peak_hours.mean():.3f}")
    print(f"{peak_hours.median()}")

    # The SHOW_N most frequent clock times among those rows, each with
    # statistics of the pollutant values recorded at that time.
    for time, count in pollutant_df["time"].value_counts().head(SHOW_N).items():
        # Filter once and reuse the selection for all three statistics.
        values_at_time = pollutant_df.loc[pollutant_df["time"] == time, pollutant]
        mean = values_at_time.mean()
        median = values_at_time.median()
        std = values_at_time.std()
        print(f"{time} ({count}): {mean:.3f} (median: {median:.3f}, std: {std:.3f})")

    print()

Highest pm10 values:
12.441
12.0
23:00:00 (17297): 19.434 (median: 17.000, std: 12.319)
01:00:00 (15069): 21.863 (median: 17.000, std: 39.204)
09:00:00 (12646): 21.856 (median: 19.000, std: 16.357)
22:00:00 (12635): 19.868 (median: 17.000, std: 14.660)
21:00:00 (11160): 20.222 (median: 17.000, std: 15.190)
10:00:00 (10772): 21.170 (median: 19.000, std: 14.148)

Highest pm25 values:
12.187
11.0
23:00:00 (20791): 11.764 (median: 10.000, std: 8.534)
01:00:00 (16528): 13.296 (median: 10.000, std: 24.755)
22:00:00 (15100): 11.139 (median: 9.000, std: 8.666)
09:00:00 (14125): 11.079 (median: 9.000, std: 7.870)
08:00:00 (13155): 10.682 (median: 9.000, std: 7.142)
21:00:00 (12787): 11.188 (median: 9.000, std: 10.357)

Highest o3 values:
14.316
16.0
16:00:00 (15933): 82.802 (median: 81.000, std: 24.156)
17:00:00 (15822): 89.024 (median: 87.000, std: 22.500)
18:00:00 (13893): 93.242 (median: 92.000, std: 22.587)
15:00:00 (13862): 77.286 (median: 75.000, std: 25.001)
19:00:00 (10323): 93.170 (med