# Alle notwendigen Inputdaten einlesen

## Imports & generische Konfiguration

In [18]:
#Libraries (standard library)
import glob
import json
import os
from datetime import date
from urllib.parse import urljoin

#Libraries (third-party)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sqlalchemy import text

#Variables
import config

#Preprocessing
import preprocessing_functions as pf

# Set up the database connection via the helper in preprocessing_functions.
# The return value (if any) is discarded; later cells talk to the database through
# config.db_login — presumably prepared by this call or by config itself; TODO confirm.
pf.db_connect(config.db_name)


## Historische Feinstaubdaten (ZH) auslesen

In [19]:
# Landing page of the dataset; it links to the daily-value CSV files.
url = 'https://data.stadt-zuerich.ch/dataset/ugz_luftschadstoffmessung_tageswerte'

# Fetch the landing page. Fail loudly on HTTP errors instead of parsing an error page,
# and use a timeout so the cell cannot hang forever on a stalled connection.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the page
soup = BeautifulSoup(response.text, 'html.parser')

# Collect all matching CSV links (each one only once)
csv_links = []
for a in soup.find_all('a', href=True):
    link = a['href']
    if 'ugz_ogd_air_d1_2' in link and link.endswith('.csv'):
        # Resolve against the page URL so relative hrefs work too;
        # absolute hrefs pass through urljoin unchanged.
        link = urljoin(url, link)
        if link not in csv_links:
            csv_links.append(link)

# Download the CSV files into csv-files/ (created on first run)
os.makedirs('csv-files', exist_ok=True)
for link in csv_links:
    response = requests.get(link, timeout=60)
    # Without this check a 404/500 body would be saved as a ".csv" and parsed below.
    response.raise_for_status()
    with open('csv-files/'+link.split('/')[-1], 'wb') as f:
        f.write(response.content)

# All ugz CSV files in csv-files/, sorted so the row order is reproducible
# (glob itself returns files in arbitrary order).
csv_files = sorted(glob.glob('csv-files/ugz*.csv'))

# Filter every file down to PM10 at Zch_Stampfenbachstrasse and collect the pieces.
filtered_frames = []
for file in csv_files:
    df = pd.read_csv(file)
    wanted_rows = (df['Parameter'] == 'PM10') & (df['Standort'] == 'Zch_Stampfenbachstrasse')
    # .copy() gives an independent frame, so the column assignment below does not
    # write into a slice of df (avoids SettingWithCopyWarning).
    df_filtered = df.loc[wanted_rows].copy()
    df_filtered['Datum'] = df_filtered['Datum'].apply(pf.format_timestamp)
    filtered_frames.append(df_filtered)

# Concatenate once instead of growing the frame inside the loop; ignore_index=True
# yields the same fresh 0..n-1 index the former reset_index(drop=True) produced.
# (DataFrame.append is not an option: it was removed in pandas 2.0.)
if filtered_frames:
    AQI_history_ZH = pd.concat(filtered_frames, ignore_index=True)
else:
    # No CSV files found: keep the previous behaviour of an empty frame.
    AQI_history_ZH = pd.DataFrame()

# Write the DataFrame to the database.
# NOTE(review): if_exists='append' inserts all rows again on every run of this cell,
# so re-running the notebook duplicates the table contents — confirm whether
# 'replace' (or a de-duplication step) is wanted here.
AQI_history_ZH.to_sql(config.db_AQI_history, config.db_login, if_exists='append', index=False)


Unnamed: 0,Datum,Standort,Parameter,Intervall,Einheit,Wert,Status
0,04.01.01,Zch_Stampfenbachstrasse,PM10,d1,µg/m3,22.84,bereinigt
1,05.01.01,Zch_Stampfenbachstrasse,PM10,d1,µg/m3,22.63,bereinigt
2,06.01.01,Zch_Stampfenbachstrasse,PM10,d1,µg/m3,10.93,bereinigt
3,07.01.01,Zch_Stampfenbachstrasse,PM10,d1,µg/m3,,bereinigt
4,08.01.01,Zch_Stampfenbachstrasse,PM10,d1,µg/m3,18.01,bereinigt
...,...,...,...,...,...,...,...
8163,15.05.23,Zch_Stampfenbachstrasse,PM10,d1,µg/m3,7.07,provisorisch
8164,16.05.23,Zch_Stampfenbachstrasse,PM10,d1,µg/m3,15.43,provisorisch
8165,17.05.23,Zch_Stampfenbachstrasse,PM10,d1,µg/m3,9.23,provisorisch
8166,18.05.23,Zch_Stampfenbachstrasse,PM10,d1,µg/m3,11.80,provisorisch


## Historische Wetterdaten (ZH) aus csv einlesen

In [20]:
# Load the historical weather CSV, normalise the 'datetime' column with the shared
# timestamp formatter and make sure the frame carries a fresh 0..n-1 index.
weather_history_ZH = (
    pd.read_csv(config.csv_weatherhistory)
    .assign(datetime=lambda frame: frame['datetime'].apply(pf.format_timestamp))
    .reset_index(drop=True)
)

# Persist the frame to the database.
# NOTE(review): if_exists='append' adds the complete CSV again on every run of this
# cell; the feature columns computed in the later enrichment cell are not part of
# this write — confirm both are intended.
weather_history_ZH.to_sql(
    config.db_weather_history,
    config.db_login,
    if_exists='append',
    index=False,
)


Unnamed: 0,name,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,solarenergy,uvindex,severerisk,sunrise,sunset,moonphase,conditions,description,icon,stations
0,Zch_Stampfenbachstrasse,01.01.01,1.3,-5.9,-1.8,1.3,-5.9,-2.2,-4.5,81.9,...,,,,2001-01-01T08:13:21,2001-01-01T16:45:47,0.22,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,066890999990667009999906660099999
1,Zch_Stampfenbachstrasse,02.01.01,5.6,0.9,3.4,5.3,-1.7,2.1,2.0,90.1,...,,,,2001-01-02T08:13:19,2001-01-02T16:46:46,0.25,"Snow, Rain, Overcast",Cloudy skies throughout the day with a chance ...,rain,066890999990667009999906660099999
2,Zch_Stampfenbachstrasse,03.01.01,7.2,2.1,5.1,4.6,0.8,2.6,1.7,78.7,...,,,,2001-01-03T08:13:14,2001-01-03T16:47:46,0.29,"Rain, Partially cloudy",Partly cloudy throughout the day with early mo...,rain,066890999990667009999906660099999
3,Zch_Stampfenbachstrasse,04.01.01,6.6,0.1,3.3,4.0,0.1,2.1,0.5,82.0,...,,,,2001-01-04T08:13:07,2001-01-04T16:48:49,0.32,"Rain, Partially cloudy",Partly cloudy throughout the day with late aft...,rain,066890999990667009999906660099999
4,Zch_Stampfenbachstrasse,05.01.01,7.3,1.1,4.3,6.4,1.1,3.9,1.9,84.8,...,,,,2001-01-05T08:12:57,2001-01-05T16:49:54,0.35,"Rain, Partially cloudy",Partly cloudy throughout the day with rain.,rain,066890999990667009999906660099999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8151,Zch_Stampfenbachstrasse,27.04.23,17.8,7.9,12.5,17.8,7.9,12.5,5.9,67.0,...,16.2,10.0,10.0,2023-04-27T06:16:38,2023-04-27T20:31:25,0.25,"Rain, Partially cloudy",Partly cloudy throughout the day with a chance...,rain,"06670099999,06647099999,06673099999,0666009999..."
8152,Zch_Stampfenbachstrasse,28.04.23,14.5,10.4,12.0,14.5,10.4,12.0,10.5,90.9,...,4.4,3.0,10.0,2023-04-28T06:14:56,2023-04-28T20:32:48,0.26,"Rain, Partially cloudy",Partly cloudy throughout the day with a chance...,rain,"06670099999,06647099999,06673099999,C3619,0666..."
8153,Zch_Stampfenbachstrasse,29.04.23,19.6,12.0,15.2,19.6,12.0,15.2,10.3,75.3,...,12.3,8.0,10.0,2023-04-29T06:13:16,2023-04-29T20:34:11,0.30,"Rain, Partially cloudy",Partly cloudy throughout the day with a chance...,rain,"06679099999,06670099999,06647099999,0667309999..."
8154,Zch_Stampfenbachstrasse,30.04.23,15.9,10.2,12.7,15.9,10.2,12.7,7.3,70.3,...,8.0,4.0,10.0,2023-04-30T06:11:37,2023-04-30T20:35:34,0.33,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"06670099999,06673099999,06660099999,0667409999..."


In [23]:
# Feature enrichment of the historical weather frame.
# NOTE(review): the saved output of this cell also lists Is_Raining / Is_Snowing
# columns that no visible code creates — presumably left over from a removed cell; confirm.

# Simple moving averages of the temperature over 7, 14 and 30 rows
window_sizes_temp = [7, 14, 30]
temperature = weather_history_ZH['temp']

for window_size_temp in window_sizes_temp:
    weather_history_ZH[f"SMA_temp{window_size_temp}"] = (
        temperature.rolling(window=window_size_temp).mean()
    )

# Simple moving averages of the humidity over 7, 14 and 30 rows
window_sizes_hum = [7, 14, 30]
humidity = weather_history_ZH['humidity']

for window_size_hum in window_sizes_hum:
    weather_history_ZH[f"SMA_humidity{window_size_hum}"] = (
        humidity.rolling(window=window_size_hum).mean()
    )

# Daily temperature range (max minus min)
weather_history_ZH['Temp_Range'] = weather_history_ZH['tempmax'].sub(weather_history_ZH['tempmin'])

# Season: take the second '.'-separated field of 'datetime' as the month number,
# fold it into a season code 1..4 (12, 1, 2 -> 1; 3-5 -> 2; 6-8 -> 3; 9-11 -> 4)
# and translate the code into its label.
seasons = {1: 'Winter', 2: 'Frühling', 3: 'Sommer', 4: 'Herbst'}
month_numbers = weather_history_ZH['datetime'].apply(lambda x: int(x.split('.')[1]))
season_codes = (month_numbers % 12 + 3) // 3
weather_history_ZH['Season'] = season_codes.map(seasons)

# Weather conditions
def weather_type(condition):
    """Collapse a free-text conditions string into one coarse weather label.

    Matching is case-insensitive and the keywords are tested in a fixed order:
    'cloudy', then 'rain', then 'snow'. The first keyword found decides the label,
    so "Rain, Partially cloudy" is labelled 'Cloudy'.

    NOTE(review): because 'cloudy' is tested first, 'Rainy'/'Snowy' are only
    returned for texts without the word "cloudy" — confirm this precedence is wanted.

    Args:
        condition: Conditions text of one row. Non-string values (e.g. NaN or
            None for a missing entry) are accepted.

    Returns:
        'Cloudy', 'Rainy' or 'Snowy' for the first matching keyword, 'Clear' when
        no keyword matches, and None when ``condition`` is not a string.
    """
    # Missing values arrive as float NaN / None; .lower() would raise on them.
    if not isinstance(condition, str):
        return None

    lowered = condition.lower()
    if 'cloudy' in lowered:
        return 'Cloudy'
    elif 'rain' in lowered:
        return 'Rainy'
    elif 'snow' in lowered:
        return 'Snowy'
    else:
        return 'Clear'

# Label every row with the coarse weather type derived from its conditions text
condition_labels = weather_history_ZH['conditions'].apply(weather_type)
weather_history_ZH['Weather_Type'] = condition_labels

Unnamed: 0,name,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,SMA_temp14,SMA_temp30,SMA_humidity7,SMA_humidity14,SMA_humidity30,Temp_Range,Season,Is_Raining,Is_Snowing,Weather_Type
0,Zch_Stampfenbachstrasse,01.01.01,1.3,-5.9,-1.8,1.3,-5.9,-2.2,-4.5,81.9,...,,,,,,7.2,Winter,0,0,Cloudy
1,Zch_Stampfenbachstrasse,02.01.01,5.6,0.9,3.4,5.3,-1.7,2.1,2.0,90.1,...,,,,,,4.7,Winter,1,1,Rainy
2,Zch_Stampfenbachstrasse,03.01.01,7.2,2.1,5.1,4.6,0.8,2.6,1.7,78.7,...,,,,,,5.1,Winter,1,0,Cloudy
3,Zch_Stampfenbachstrasse,04.01.01,6.6,0.1,3.3,4.0,0.1,2.1,0.5,82.0,...,,,,,,6.5,Winter,1,0,Cloudy
4,Zch_Stampfenbachstrasse,05.01.01,7.3,1.1,4.3,6.4,1.1,3.9,1.9,84.8,...,,,,,,6.2,Winter,1,0,Cloudy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8151,Zch_Stampfenbachstrasse,27.04.23,17.8,7.9,12.5,17.8,7.9,12.5,5.9,67.0,...,8.635714,8.073333,74.600000,76.778571,71.556667,9.9,Frühling,1,0,Cloudy
8152,Zch_Stampfenbachstrasse,28.04.23,14.5,10.4,12.0,14.5,10.4,12.0,10.5,90.9,...,9.028571,8.123333,75.542857,78.528571,72.563333,4.1,Frühling,1,0,Cloudy
8153,Zch_Stampfenbachstrasse,29.04.23,19.6,12.0,15.2,19.6,12.0,15.2,10.3,75.3,...,9.642857,8.213333,76.071429,78.707143,72.806667,7.6,Frühling,1,0,Cloudy
8154,Zch_Stampfenbachstrasse,30.04.23,15.9,10.2,12.7,15.9,10.2,12.7,7.3,70.3,...,10.078571,8.293333,75.114286,77.014286,72.823333,5.7,Frühling,0,0,Cloudy


## DB Testen für eingelesene Daten

In [None]:
# Sanity check: read the AQI history table back from the database
SQLquery = text('SELECT * FROM ' + config.db_AQI_history + ' AS AQI_History')

# Open the connection in a with-block so it is released after the read instead of
# staying open for the lifetime of the kernel.
# (assumes config.db_login is a SQLAlchemy engine — TODO confirm)
with config.db_login.connect() as connection:
    df_AQIHistoryDB = pd.read_sql(SQLquery, con=connection)

df_AQIHistoryDB.head()

In [None]:
# Sanity check: read the weather history table back from the database
SQLquery = text('SELECT * FROM ' + config.db_weather_history + ' AS weather_History')

# Open the connection in a with-block so it is released after the read instead of
# staying open for the lifetime of the kernel.
# (assumes config.db_login is a SQLAlchemy engine — TODO confirm)
with config.db_login.connect() as connection:
    df_weatherHistory = pd.read_sql(SQLquery, con=connection)

df_weatherHistory.head()

In [None]:
# Combined query: join weather and AQI history on their date columns
SQLquery = text('SELECT * FROM ' + config.db_weather_history + ' AS w JOIN ' + config.db_AQI_history + ' AS aqi ON w."datetime" = aqi."Datum"')

# Open the connection in a with-block so it is released after the read instead of
# staying open for the lifetime of the kernel.
# (assumes config.db_login is a SQLAlchemy engine — TODO confirm)
with config.db_login.connect() as connection:
    df_METEODB = pd.read_sql(SQLquery, con=connection)

df_METEODB.head()