### Retrieve wind speed data from DWD's [Open Data Server](https://www.dwd.de/EN/ourservices/opendata/opendata.html).

In [1]:
from bs4 import BeautifulSoup
import requests

#### List files to be downloaded

In [2]:
# Only links whose href ends with this suffix are kept from the listing.
ext = 'zip'
# Directory index on the DWD Open Data Server (hourly / wind / recent).
url = 'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/wind/recent/'

def listFD(url, ext=''):
    """Return absolute URLs of the links on an HTML index page.

    Parameters
    ----------
    url : str
        Address of the directory listing to scan. A trailing slash is
        optional; the address is always treated as a directory.
    ext : str, optional
        Keep only links whose ``href`` ends with this suffix. The default
        empty string keeps every link.

    Returns
    -------
    list of str
        One absolute URL per matching ``<a href=...>`` element, in page order.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.RequestException
        On connection problems or when the request times out.
    """
    from urllib.parse import urljoin

    response = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page and returning no links.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Treat ``url`` as a directory: without a trailing slash urljoin would
    # replace its last path segment, with one we avoid emitting "//".
    base = url if url.endswith('/') else url + '/'

    # ``href=True`` skips anchors without an href attribute, which would
    # otherwise yield None and break the ``endswith`` check.
    return [
        urljoin(base, node['href'])
        for node in soup.find_all('a', href=True)
        if node['href'].endswith(ext)
    ]

# Collect the download link of every archive offered in the listing.
urls = listFD(url, ext=ext)

#### Download zip files, extract CSVs from them, and merge into dataframe
(Method adopted from [this blog post](https://stereopickle.medium.com/how-to-download-unzip-zip-files-in-python-5f326bb1a829))

In [3]:
from io import BytesIO
import pandas as pd
from zipfile import ZipFile
from urllib.request import urlopen

In [4]:
# Gather one frame per archive and concatenate once after the loop; calling
# pd.concat on the growing result inside the loop re-copies every row that
# has been accumulated so far on each iteration.
frames = []

for link in urls:
    # Download the archive into memory; the context manager closes the
    # HTTP response once the body has been read.
    with urlopen(link) as resp:
        payload = BytesIO(resp.read())

    # The ``with`` block closes the archive even if parsing the CSV fails.
    with ZipFile(payload) as archive:
        members = archive.namelist()
        # Prefer the member named like the data files ("produkt_...");
        # otherwise fall back to the last member, as was done previously.
        fname = next(
            (name for name in reversed(members) if name.startswith('produkt_')),
            members[-1],
        )
        # Read every column as a string (dtype=object); conversion to
        # numeric / datetime types is left to later steps.
        with archive.open(fname) as member:
            df = pd.read_csv(member, delimiter=';', dtype=object)

    frames.append(df)

    # Progress report: one line per processed archive.
    print(f'{fname} Completed')

# pd.concat raises on an empty list, so keep the empty-frame result for the
# case where no links were found.
full_df = (
    pd.concat(frames, ignore_index=True, sort=False) if frames else pd.DataFrame()
)

produkt_ff_stunde_20200819_20200930_00011.txt Completed
produkt_ff_stunde_20200819_20220219_00090.txt Completed
produkt_ff_stunde_20200819_20220219_00096.txt Completed
produkt_ff_stunde_20200819_20220219_00102.txt Completed
produkt_ff_stunde_20200819_20220131_00125.txt Completed
produkt_ff_stunde_20200819_20220219_00161.txt Completed
produkt_ff_stunde_20200819_20220219_00164.txt Completed
produkt_ff_stunde_20200819_20220219_00183.txt Completed
produkt_ff_stunde_20200819_20220219_00197.txt Completed
produkt_ff_stunde_20200819_20220219_00198.txt Completed
produkt_ff_stunde_20200819_20220219_00232.txt Completed
produkt_ff_stunde_20200819_20220219_00282.txt Completed
produkt_ff_stunde_20200819_20220219_00298.txt Completed
produkt_ff_stunde_20200819_20220219_00303.txt Completed
produkt_ff_stunde_20200819_20220219_00342.txt Completed
produkt_ff_stunde_20200819_20220131_00368.txt Completed
produkt_ff_stunde_20200819_20220219_00427.txt Completed
produkt_ff_stunde_20200819_20210505_00430.txt Co

produkt_ff_stunde_20200819_20220219_03534.txt Completed
produkt_ff_stunde_20200819_20220131_03623.txt Completed
produkt_ff_stunde_20200819_20220219_03631.txt Completed
produkt_ff_stunde_20200819_20220131_03639.txt Completed
produkt_ff_stunde_20200819_20220219_03651.txt Completed
produkt_ff_stunde_20200819_20220219_03660.txt Completed
produkt_ff_stunde_20200819_20220219_03668.txt Completed
produkt_ff_stunde_20200819_20220219_03730.txt Completed
produkt_ff_stunde_20200819_20220219_03761.txt Completed
produkt_ff_stunde_20200819_20220219_03811.txt Completed
produkt_ff_stunde_20200819_20220219_03821.txt Completed
produkt_ff_stunde_20200819_20220219_03897.txt Completed
produkt_ff_stunde_20200819_20220219_03905.txt Completed
produkt_ff_stunde_20200819_20220219_03925.txt Completed
produkt_ff_stunde_20200819_20220219_03946.txt Completed
produkt_ff_stunde_20200819_20220219_03987.txt Completed
produkt_ff_stunde_20200819_20220219_04024.txt Completed
produkt_ff_stunde_20200819_20220219_04032.txt Co

In [5]:
# Work on an independent (deep) copy so the merged raw frame stays untouched.
df = full_df.copy(deep=True)
# Row / column count of the working copy (shown as the cell output).
df.shape

(3730620, 6)

In [6]:
# Remove leading/trailing whitespace from every column label.
df.columns = df.columns.map(str.strip)

In [7]:
# Move the station identifier out of the columns and use it as the row index.
df = df.set_index(keys='STATIONS_ID')

In [8]:
# Convert the columns at positions 1-3 to floats and floor negative values
# at 0.
# Assign by column label instead of ``df.iloc[:, [...]] = ...``: recent
# pandas versions write positional assignments into the existing columns in
# place, so an object-dtype column would keep dtype ``object`` and merely
# hold float objects.
# NOTE(review): if negative entries are missing-data sentinels, flooring
# them to 0 turns them into ordinary readings - confirm whether NaN would be
# the better replacement.
numeric_cols = df.columns[[1, 2, 3]]
df[numeric_cols] = df[numeric_cols].astype('float').clip(lower=0)

In [9]:
# Parse the first column (timestamps written as YYYYMMDDHH) into datetimes.
# Assign by label so the column is replaced and really becomes datetime64;
# writing through ``df.iloc[:, 0] = ...`` is done in place by recent pandas
# versions and can leave an object-dtype column of Timestamp objects, which
# breaks later comparisons against date strings.
timestamp_col = df.columns[0]
df[timestamp_col] = pd.to_datetime(df[timestamp_col], format='%Y%m%d%H')

In [10]:
# Keep only the observations taken on or after 1 January 2022.
from_2022 = df['MESS_DATUM'] >= '2022-01-01'
df = df[from_2022]
# Row / column count after filtering (shown as the cell output).
df.shape

(332461, 5)

In [11]:
# Discard the 'eor' column before export.
df = df.drop(labels=['eor'], axis='columns')

In [12]:
# Write the cleaned frame (index included) to a CSV file in the working directory.
df.to_csv(path_or_buf='./wind_speed.csv')