In [1]:
server = "opendata.dwd.de"
user   = "anonymous"
passwd = ""

In [2]:
# The topic of interest.
topic_dir = "/hourly/precipitation/historical/"
#topic_dir = "/annual/kl/historical/"

# This is the search pattern common to ALL station description file names 
station_desc_pattern = "_Beschreibung_Stationen.txt"

# Below this directory tree node all climate data are stored.
ftp_climate_data_dir = "/climate_environment/CDC/observations_germany/climate/"
ftp_dir =  ftp_climate_data_dir + topic_dir

In [3]:
local_ftp_dir         = "../data/original/DWD/"      # Local directory to store local ftp data copies, the local data source or input data. 
local_ftp_station_dir = local_ftp_dir + topic_dir # Local directory where local station info is located
local_ftp_ts_dir      = local_ftp_dir + topic_dir # Local directory where time series downloaded from ftp are located

local_generated_dir   = "../data/generated/DWD/" # The generated of derived data in contrast to local_ftp_dir
local_station_dir     = local_generated_dir + topic_dir # Derived station data, i.e. the CSV file
local_ts_merged_dir   = local_generated_dir + topic_dir # Parallelly merged time series, wide data frame with one TS per column
local_ts_appended_dir = local_generated_dir + topic_dir # Serially appended time series, long data frame for QGIS TimeManager Plugin

In [4]:
print(local_ftp_dir)
print(local_ftp_station_dir)
print(local_ftp_ts_dir)
print()
print(local_generated_dir)
print(local_station_dir)
print(local_ts_merged_dir)
print(local_ts_appended_dir)

../data/original/DWD/
../data/original/DWD//hourly/precipitation/historical/
../data/original/DWD//hourly/precipitation/historical/

../data/generated/DWD/
../data/generated/DWD//hourly/precipitation/historical/
../data/generated/DWD//hourly/precipitation/historical/
../data/generated/DWD//hourly/precipitation/historical/


In [5]:
import os
os.makedirs(local_ftp_dir,exist_ok = True) # it does not complain if the dir already exists.
os.makedirs(local_ftp_station_dir,exist_ok = True)
os.makedirs(local_ftp_ts_dir,exist_ok = True)

os.makedirs(local_generated_dir,exist_ok = True)
os.makedirs(local_station_dir,exist_ok = True)
os.makedirs(local_ts_merged_dir,exist_ok = True)
os.makedirs(local_ts_appended_dir,exist_ok = True)

In [6]:
import ftplib
ftp = ftplib.FTP(server)
res = ftp.login(user=user, passwd = passwd)
print(res)

230 Login successful.


In [7]:
ret = ftp.cwd(".")

In [8]:
def grabFile(ftpfullname,localfullname):
    try:
        ret = ftp.cwd(".") # A dummy action to chack the connection and to provoke an exception if necessary.
        localfile = open(localfullname, 'wb')
        ftp.retrbinary('RETR ' + ftpfullname, localfile.write, 1024)
        localfile.close()
    
    except ftplib.error_perm:
        print("FTP ERROR. Operation not permitted. File not found?")

    except ftplib.error_temp:
        print("FTP ERROR. Timeout.")

    except ConnectionAbortedError:
        print("FTP ERROR. Connection aborted.")

In [9]:
import pandas as pd
import os

def gen_df_from_ftp_dir_listing(ftp, ftpdir):
    lines = []
    flist = []
    try:    
        res = ftp.retrlines("LIST "+ftpdir, lines.append)
    except:
        print("Error: ftp.retrlines() failed. ftp timeout? Reconnect!")
        return
        
    if len(lines) == 0:
        print("Error: ftp dir is empty")
        return
    
    for line in lines:
#        print(line)
        [ftype, fsize, fname] = [line[0:1], int(line[31:42]), line[56:]]
#        itemlist = [line[0:1], int(line[31:42]), line[56:]]
#        flist.append(itemlist)
        
        fext = os.path.splitext(fname)[-1]
        
        if fext == ".zip":
            station_id = int(fname.split("_")[2])
        else:
            station_id = -1 
        
        flist.append([station_id, fname, fext, fsize, ftype])
        
        

    df_ftpdir = pd.DataFrame(flist,columns=["station_id", "name", "ext", "size", "type"])
    return(df_ftpdir)

In [10]:
df_ftpdir = gen_df_from_ftp_dir_listing(ftp, ftp_dir)

In [11]:
df_ftpdir.head(10)

Unnamed: 0,station_id,name,ext,size,type
0,-1,BESCHREIBUNG_obsgermany_climate_hourly_precipi...,.pdf,71445,-
1,-1,DESCRIPTION_obsgermany_climate_hourly_precipit...,.pdf,69716,-
2,-1,RR_Stundenwerte_Beschreibung_Stationen.txt,.txt,209079,-
3,3,stundenwerte_RR_00003_19950901_20110401_hist.zip,.zip,419265,-
4,20,stundenwerte_RR_00020_20040814_20191231_hist.zip,.zip,407378,-
5,44,stundenwerte_RR_00044_20070401_20191231_hist.zip,.zip,320516,-
6,53,stundenwerte_RR_00053_20051001_20191231_hist.zip,.zip,361931,-
7,71,stundenwerte_RR_00071_20041022_20191231_hist.zip,.zip,402880,-
8,73,stundenwerte_RR_00073_20070401_20191231_hist.zip,.zip,333070,-
9,78,stundenwerte_RR_00078_20041101_20191231_hist.zip,.zip,384729,-


In [12]:
#df_ftpdir["ext"]==".zip"
df_zips = df_ftpdir[df_ftpdir["ext"]==".zip"]
df_zips.set_index("station_id", inplace = True)
df_zips.head(10)

Unnamed: 0_level_0,name,ext,size,type
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,stundenwerte_RR_00003_19950901_20110401_hist.zip,.zip,419265,-
20,stundenwerte_RR_00020_20040814_20191231_hist.zip,.zip,407378,-
44,stundenwerte_RR_00044_20070401_20191231_hist.zip,.zip,320516,-
53,stundenwerte_RR_00053_20051001_20191231_hist.zip,.zip,361931,-
71,stundenwerte_RR_00071_20041022_20191231_hist.zip,.zip,402880,-
73,stundenwerte_RR_00073_20070401_20191231_hist.zip,.zip,333070,-
78,stundenwerte_RR_00078_20041101_20191231_hist.zip,.zip,384729,-
87,stundenwerte_RR_00087_20050201_20191231_hist.zip,.zip,379869,-
91,stundenwerte_RR_00091_20040901_20191231_hist.zip,.zip,394972,-
103,stundenwerte_RR_00103_20040701_20191231_hist.zip,.zip,402997,-


In [13]:
station_fname = df_ftpdir[df_ftpdir['name'].str.contains(station_desc_pattern)]["name"].values[0]
print(station_fname)

# ALternative
#station_fname2 = df_ftpdir[df_ftpdir["name"].str.match("^.*Beschreibung_Stationen.*txt$")]["name"].values[0]
#print(station_fname2)

RR_Stundenwerte_Beschreibung_Stationen.txt


In [14]:
print("grabFile: ")
print("From: " + ftp_dir + station_fname)
print("To:   " + local_ftp_station_dir + station_fname)
grabFile(ftp_dir + station_fname, local_ftp_station_dir + station_fname)

grabFile: 
From: /climate_environment/CDC/observations_germany/climate//hourly/precipitation/historical/RR_Stundenwerte_Beschreibung_Stationen.txt
To:   ../data/original/DWD//hourly/precipitation/historical/RR_Stundenwerte_Beschreibung_Stationen.txt


In [15]:
# extract column names. They are in German (de)
# We have to use codecs because of difficulties with character encoding (German Umlaute)
import codecs

def station_desc_txt_to_csv(txtfile, csvfile):
    file = codecs.open(txtfile,"r","utf-8")
    r = file.readline()
    file.close()
    colnames_de = r.split()
    colnames_de
    
    translate = \
    {'Stations_id':'station_id',
     'von_datum':'date_from',
     'bis_datum':'date_to',
     'Stationshoehe':'altitude',
     'geoBreite': 'latitude',
     'geoLaenge': 'longitude',
     'Stationsname':'name',
     'Bundesland':'state'}
    
    colnames_en = [translate[h] for h in colnames_de]
    
    # Skip the first two rows and set the column names.
    df = pd.read_fwf(txtfile,skiprows=2,names=colnames_en, parse_dates=["date_from","date_to"],index_col = 0)
    
    # write csv
    df.to_csv(csvfile, sep = ";")
    return(df)

In [16]:
basename = os.path.splitext(station_fname)[0]
df_stations = station_desc_txt_to_csv(local_ftp_station_dir + station_fname, local_station_dir + basename + ".csv")
df_stations.head()

Unnamed: 0_level_0,date_from,date_to,altitude,latitude,longitude,name,state
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,1995-09-01,2011-04-01,202,50.7827,6.0941,Aachen,Nordrhein-Westfalen
20,2004-08-14,2021-03-20,432,48.922,9.9129,Abtsgm�nd-Untergr�ningen,Baden-W�rttemberg
44,2007-04-01,2021-03-20,44,52.9336,8.237,Gro�enkneten,Niedersachsen
53,2005-10-01,2021-03-20,60,52.585,13.5634,Ahrensfelde,Brandenburg
71,2004-10-22,2020-01-01,759,48.2156,8.9784,Albstadt-Badkap,Baden-W�rttemberg


In [17]:
station_ids_selected = df_stations[df_stations['state'].str.contains("Nordrhein")].index
station_ids_selected

Int64Index([    3,   216,   326,   389,   390,   554,   555,   599,   603,
              613,   617,   644,   796,   871,   902,   934,   989,  1024,
             1046,  1078,  1241,  1246,  1300,  1303,  1327,  1590,  1595,
             1766,  2027,  2110,  2254,  2473,  2483,  2497,  2629,  2667,
             2703,  2810,  2947,  2968,  2999,  3028,  3031,  3081,  3098,
             3215,  3321,  3339,  3499,  3540,  3591,  3795,  3913,  4063,
             4127,  4150,  4154,  4313,  4368,  4371,  4400,  4488,  4692,
             4741,  4849,  5064,  5347,  5360,  5468,  5480,  5513,  5619,
             5699,  5717,  5719,  5733,  6197,  6264,  6276,  6313,  6337,
             7106,  7330,  7344,  7374,  7378, 13669, 13670, 13671, 13696,
            13700, 13713, 15000],
           dtype='int64', name='station_id')

In [18]:
# Create variable with TRUE if state is Nordrhein-Westfalen

# isNRW = df_stations['state'] == "Nordrhein-Westfalen"
isNRW = df_stations['state'].str.contains("Nordrhein")

# Create variable with TRUE if date_to is latest date (indicates operation up to now)
isOperational = df_stations['date_to'] == df_stations.date_to.max() 

#isBefore1950 = df_stations['date_from'] < '1950'
#dfNRW = df_stations[isNRW & isOperational & isBefore1950]

# select on both conditions

dfNRW = df_stations[isNRW & isOperational]

#print("Number of stations in NRW: \n", dfNRW.count())
dfNRW

Unnamed: 0_level_0,date_from,date_to,altitude,latitude,longitude,name,state
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
216,2004-10-01,2021-03-20,298,51.1143,7.8807,Attendorn-Neulisternohl,Nordrhein-Westfalen
389,2009-11-01,2021-03-20,436,51.0148,8.4318,"Berleburg, Bad-Arfeld",Nordrhein-Westfalen
390,2004-07-01,2021-03-20,610,50.9837,8.3683,"Berleburg, Bad-St�nzel",Nordrhein-Westfalen
554,1995-09-01,2021-03-20,23,51.8293,6.5365,Bocholt-Liedern (Wasserwerk),Nordrhein-Westfalen
613,2004-11-01,2021-03-20,206,51.5677,9.2324,Borgentreich,Nordrhein-Westfalen
...,...,...,...,...,...,...,...
13671,2007-12-01,2021-03-20,221,50.9655,7.2753,Overath-B�ke,Nordrhein-Westfalen
13696,2007-12-01,2021-03-20,60,51.5966,7.4048,Waltrop-Abdinghof,Nordrhein-Westfalen
13700,2008-05-01,2021-03-20,205,51.3329,7.3411,Gevelsberg-Oberbr�king,Nordrhein-Westfalen
13713,2007-11-01,2021-03-20,386,51.0899,7.6289,Meinerzhagen-Redlendorf,Nordrhein-Westfalen


In [19]:
print(df_zips)

                                                        name   ext    size  \
station_id                                                                   
3           stundenwerte_RR_00003_19950901_20110401_hist.zip  .zip  419265   
20          stundenwerte_RR_00020_20040814_20191231_hist.zip  .zip  407378   
44          stundenwerte_RR_00044_20070401_20191231_hist.zip  .zip  320516   
53          stundenwerte_RR_00053_20051001_20191231_hist.zip  .zip  361931   
71          stundenwerte_RR_00071_20041022_20191231_hist.zip  .zip  402880   
...                                                      ...   ...     ...   
15478       stundenwerte_RR_15478_20150201_20191231_hist.zip  .zip  126658   
15490       stundenwerte_RR_15490_20161201_20191231_hist.zip  .zip   81166   
15512       stundenwerte_RR_15512_20160901_20191231_hist.zip  .zip   83934   
15514       stundenwerte_RR_15514_20171101_20191231_hist.zip  .zip   56846   
15555       stundenwerte_RR_15555_20160501_20191231_hist.zip  .z

In [20]:
list(dfNRW.index)

[216,
 389,
 390,
 554,
 613,
 617,
 644,
 796,
 871,
 902,
 934,
 989,
 1024,
 1046,
 1078,
 1241,
 1246,
 1300,
 1303,
 1327,
 1590,
 1595,
 1766,
 2027,
 2110,
 2254,
 2473,
 2483,
 2497,
 2629,
 2667,
 2810,
 2947,
 2968,
 2999,
 3028,
 3031,
 3081,
 3098,
 3215,
 3321,
 3339,
 3499,
 3540,
 3591,
 3795,
 3913,
 4063,
 4127,
 4150,
 4313,
 4368,
 4371,
 4400,
 4488,
 4741,
 4849,
 5064,
 5347,
 5360,
 5480,
 5513,
 5619,
 5699,
 5717,
 5733,
 6197,
 6264,
 6313,
 6337,
 7106,
 7330,
 7344,
 7374,
 7378,
 13669,
 13670,
 13671,
 13696,
 13700,
 13713,
 15000]

In [21]:
# Add the names of the zip files only to a list. 
local_zip_list = []

station_ids_selected = list(dfNRW.index)

for station_id in station_ids_selected:
    try:
        fname = df_zips["name"][station_id]
        print(fname)
        grabFile(ftp_dir + fname, local_ftp_ts_dir + fname)
        local_zip_list.append(fname)
    except:
        print("WARNING: TS file for key %d not found in FTP directory." % station_id)

stundenwerte_RR_00216_20041001_20191231_hist.zip
stundenwerte_RR_00389_20091101_20191231_hist.zip
stundenwerte_RR_00390_20040701_20191231_hist.zip
stundenwerte_RR_00554_19950901_20191231_hist.zip
stundenwerte_RR_00613_20041101_20191231_hist.zip
stundenwerte_RR_00617_20040601_20191231_hist.zip
stundenwerte_RR_00644_20050101_20191231_hist.zip
stundenwerte_RR_00796_20041101_20191231_hist.zip
stundenwerte_RR_00871_20050801_20191231_hist.zip
stundenwerte_RR_00902_20061001_20191231_hist.zip
stundenwerte_RR_00934_20041001_20191231_hist.zip
stundenwerte_RR_00989_20050201_20191231_hist.zip
stundenwerte_RR_01024_20060801_20191231_hist.zip
stundenwerte_RR_01046_20041001_20191231_hist.zip
stundenwerte_RR_01078_19950901_20191231_hist.zip
stundenwerte_RR_01241_20061201_20191231_hist.zip
stundenwerte_RR_01246_20150801_20191231_hist.zip
stundenwerte_RR_01300_20040601_20191231_hist.zip
stundenwerte_RR_01303_19950901_20191231_hist.zip
stundenwerte_RR_01327_20040801_20191231_hist.zip
stundenwerte_RR_0159

In [22]:
from datetime import datetime

def prec_ts_to_df(fname):
    
    dateparse = lambda dates: [datetime.strptime(str(d), '%Y%m%d%H') for d in dates]

    df = pd.read_csv(fname, delimiter=";", encoding="utf8", index_col="MESS_DATUM", parse_dates = ["MESS_DATUM"], date_parser = dateparse, na_values = [-999.0, -999])

    #df = pd.read_csv(fname, delimiter=";", encoding="iso8859_2",\
    #             index_col="MESS_DATUM", parse_dates = ["MESS_DATUM"], date_parser = dateparse)
    
    # https://medium.com/@chaimgluck1/working-with-pandas-fixing-messy-column-names-42a54a6659cd

    # Column headers: remove leading blanks (strip), replace " " with "_", and convert to lower case.
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
    df.index.name = df.index.name.strip().lower().replace(' ', '_').replace('(', '').replace(')', '')
    return(df)

In [23]:
from datetime import datetime

def temp_ts_to_df(fname):
    
    dateparse = lambda dates: [datetime.strptime(str(d), '%Y%m%d') for d in dates]

    df = pd.read_csv(fname, delimiter=";", encoding="utf8", index_col="MESS_DATUM_BEGINN", parse_dates = ["MESS_DATUM_BEGINN"], date_parser = dateparse, na_values = [-999.0, -999])

    #df = pd.read_csv(fname, delimiter=";", encoding="iso8859_2",\
    #             index_col="MESS_DATUM", parse_dates = ["MESS_DATUM"], date_parser = dateparse)
    
    # https://medium.com/@chaimgluck1/working-with-pandas-fixing-messy-column-names-42a54a6659cd

    # Column headers: remove leading blanks (strip), replace " " with "_", and convert to lower case.
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
    df.index.name = df.index.name.strip().lower().replace(' ', '_').replace('(', '').replace(')', '')
    return(df)

In [24]:
from zipfile import ZipFile

In [25]:
# PRECIPITATION
def prec_ts_merge():
    # Very compact code.
    df = pd.DataFrame()
    for elt in local_zip_list:
        ffname = local_ftp_ts_dir + elt
        print("Zip archive: " + ffname)
        with ZipFile(ffname) as myzip:
            # read the time series data from the file starting with "produkt"
            prodfilename = [elt for elt in myzip.namelist() if elt.split("_")[0]=="produkt"][0] 
            print("Extract product file: %s" % prodfilename)
            print()
            with myzip.open(prodfilename) as myfile:
                dftmp = prec_ts_to_df(myfile)
                s = dftmp["r1"].rename(dftmp["stations_id"][0]).to_frame()
                # outer merge.
                df = pd.merge(df, s, left_index=True, right_index=True, how='outer')

    #df.index.names = ["year"]
    df.index.rename(name = "time", inplace = True)
    return(df)

In [26]:
df_merged_ts = prec_ts_merge()

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_00216_20041001_20191231_hist.zip
Extract product file: produkt_rr_stunde_20041001_20191231_00216.txt



  df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')


Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_00389_20091101_20191231_hist.zip
Extract product file: produkt_rr_stunde_20091101_20191231_00389.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_00390_20040701_20191231_hist.zip
Extract product file: produkt_rr_stunde_20040701_20191231_00390.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_00554_19950901_20191231_hist.zip
Extract product file: produkt_rr_stunde_19950901_20191231_00554.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_00613_20041101_20191231_hist.zip
Extract product file: produkt_rr_stunde_20041101_20191231_00613.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_00617_20040601_20191231_hist.zip
Extract product file: produkt_rr_stunde_20040601_20191231_00617.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenw

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_03913_20040701_20191231_hist.zip
Extract product file: produkt_rr_stunde_20040701_20191231_03913.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_04063_20030701_20191231_hist.zip
Extract product file: produkt_rr_stunde_20030701_20191231_04063.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_04127_20050101_20191231_hist.zip
Extract product file: produkt_rr_stunde_20050101_20191231_04127.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_04150_20051201_20191231_hist.zip
Extract product file: produkt_rr_stunde_20051201_20191231_04150.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_04313_20040801_20191231_hist.zip
Extract product file: produkt_rr_stunde_20040801_20191231_04313.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenw

In [27]:
df_merged_ts.shape

(213225, 82)

In [28]:
df_merged_ts.tail()

Unnamed: 0_level_0,216,389,390,554,613,617,644,796,871,902,...,7344,7374,7378,13669,13670,13671,13696,13700,13713,15000
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-31 19:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-12-31 20:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-12-31 21:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-12-31 22:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-12-31 23:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# prec or temp?
filepathname = local_ts_merged_dir + "prec_ts_merged.csv"
print("df_merged_ts is saved to: %s" % (filepathname))
df_merged_ts.to_csv(filepathname,sep=";")

df_merged_ts is saved to: ../data/generated/DWD//hourly/precipitation/historical/prec_ts_merged.csv


In [30]:
df_merged_ts_transposed = df_merged_ts.transpose()

In [31]:
df_merged_ts_transposed.index.names = ['station_id']

In [32]:
df_merged_ts_transposed.shape

(82, 213225)

In [33]:
df_merged_ts_transposed.head()

time,1995-09-01 00:00:00,1995-09-01 01:00:00,1995-09-01 02:00:00,1995-09-01 03:00:00,1995-09-01 04:00:00,1995-09-01 05:00:00,1995-09-01 06:00:00,1995-09-01 07:00:00,1995-09-01 08:00:00,1995-09-01 09:00:00,...,2019-12-31 14:00:00,2019-12-31 15:00:00,2019-12-31 16:00:00,2019-12-31 17:00:00,2019-12-31 18:00:00,2019-12-31 19:00:00,2019-12-31 20:00:00,2019-12-31 21:00:00,2019-12-31 22:00:00,2019-12-31 23:00:00
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
216,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
389,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
390,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
613,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
filepathname = local_ts_merged_dir + "prec_ts_merged_transposed.csv"
print("df_merged_ts_transposed is saved to: %s" % (filepathname))
df_merged_ts_transposed.to_csv(filepathname,sep=";")

df_merged_ts_transposed is saved to: ../data/generated/DWD//hourly/precipitation/historical/prec_ts_merged_transposed.csv


In [35]:
def ts_append():
    # Very compact code.
    df = pd.DataFrame()
    for elt in local_zip_list:
        ffname = local_ftp_ts_dir + elt
        print("Zip archive: " + ffname)
        with ZipFile(ffname) as myzip:
            # read the time series data from the file starting with "produkt"
            prodfilename = [elt for elt in myzip.namelist() if elt.split("_")[0]=="produkt"][0] 
            print("Extract product file: %s" % prodfilename)
            print()
            with myzip.open(prodfilename) as myfile:
# TEMPERATURE                dftmp = temp_ts_to_df(myfile)
# PRECIPIATION
                dftmp = prec_ts_to_df(myfile)
                dftmp = dftmp.merge(df_stations,how="inner",left_on="stations_id",right_on="station_id",right_index=True)
#                print(dftmp.head(5))
                df = df.append(dftmp)

    #df.index.names = ["year"]
    #df.index.rename(name = "time", inplace = True)
    return(df)

In [36]:
df_appended_ts = ts_append()

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_00216_20041001_20191231_hist.zip
Extract product file: produkt_rr_stunde_20041001_20191231_00216.txt



  df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')


Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_00389_20091101_20191231_hist.zip
Extract product file: produkt_rr_stunde_20091101_20191231_00389.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_00390_20040701_20191231_hist.zip
Extract product file: produkt_rr_stunde_20040701_20191231_00390.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_00554_19950901_20191231_hist.zip
Extract product file: produkt_rr_stunde_19950901_20191231_00554.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_00613_20041101_20191231_hist.zip
Extract product file: produkt_rr_stunde_20041101_20191231_00613.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_00617_20040601_20191231_hist.zip
Extract product file: produkt_rr_stunde_20040601_20191231_00617.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenw

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_03913_20040701_20191231_hist.zip
Extract product file: produkt_rr_stunde_20040701_20191231_03913.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_04063_20030701_20191231_hist.zip
Extract product file: produkt_rr_stunde_20030701_20191231_04063.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_04127_20050101_20191231_hist.zip
Extract product file: produkt_rr_stunde_20050101_20191231_04127.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_04150_20051201_20191231_hist.zip
Extract product file: produkt_rr_stunde_20051201_20191231_04150.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenwerte_RR_04313_20040801_20191231_hist.zip
Extract product file: produkt_rr_stunde_20040801_20191231_04313.txt

Zip archive: ../data/original/DWD//hourly/precipitation/historical/stundenw

In [37]:
df_appended_ts.shape

(10903220, 13)

In [38]:
filepathname = local_ts_appended_dir + "prec_ts_appended.csv"
print("df_appended_ts saved to: %s" % (filepathname))
df_appended_ts.to_csv(filepathname,sep=";")

df_appended_ts saved to: ../data/generated/DWD//hourly/precipitation/historical/prec_ts_appended.csv


In [39]:
df_appended_ts[["stations_id","r1","altitude","latitude", "longitude","name", "state"]]

Unnamed: 0_level_0,stations_id,r1,altitude,latitude,longitude,name,state
mess_datum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2004-10-01 00:00:00,216,0.0,298,51.1143,7.8807,Attendorn-Neulisternohl,Nordrhein-Westfalen
2004-10-01 01:00:00,216,0.0,298,51.1143,7.8807,Attendorn-Neulisternohl,Nordrhein-Westfalen
2004-10-01 02:00:00,216,0.0,298,51.1143,7.8807,Attendorn-Neulisternohl,Nordrhein-Westfalen
2004-10-01 03:00:00,216,0.0,298,51.1143,7.8807,Attendorn-Neulisternohl,Nordrhein-Westfalen
2004-10-01 04:00:00,216,0.0,298,51.1143,7.8807,Attendorn-Neulisternohl,Nordrhein-Westfalen
...,...,...,...,...,...,...,...
2019-12-31 19:00:00,15000,0.0,231,50.7983,6.0244,Aachen-Orsbach,Nordrhein-Westfalen
2019-12-31 20:00:00,15000,0.0,231,50.7983,6.0244,Aachen-Orsbach,Nordrhein-Westfalen
2019-12-31 21:00:00,15000,0.0,231,50.7983,6.0244,Aachen-Orsbach,Nordrhein-Westfalen
2019-12-31 22:00:00,15000,0.0,231,50.7983,6.0244,Aachen-Orsbach,Nordrhein-Westfalen


In [40]:
idx = (df_appended_ts.index >= '2017-04-16 00:00:00') & (df_appended_ts.index < '2017-08-17 00:00:00')

In [41]:
df_appended_ts_all = df_appended_ts[idx][["stations_id","r1","altitude","latitude", "longitude","name", "state"]]

In [42]:
filepathname = local_ts_appended_dir + "prec_ts_appended_all.csv"
print("df_appended_ts_all saved to: %s" % (filepathname))
df_appended_ts_all.to_csv(filepathname,sep=";")

df_appended_ts_all saved to: ../data/generated/DWD//hourly/precipitation/historical/prec_ts_appended_all.csv


In [43]:
df_appended_ts_all

Unnamed: 0_level_0,stations_id,r1,altitude,latitude,longitude,name,state
mess_datum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-04-16 00:00:00,216,0.0,298,51.1143,7.8807,Attendorn-Neulisternohl,Nordrhein-Westfalen
2017-04-16 01:00:00,216,0.1,298,51.1143,7.8807,Attendorn-Neulisternohl,Nordrhein-Westfalen
2017-04-16 02:00:00,216,0.1,298,51.1143,7.8807,Attendorn-Neulisternohl,Nordrhein-Westfalen
2017-04-16 03:00:00,216,0.0,298,51.1143,7.8807,Attendorn-Neulisternohl,Nordrhein-Westfalen
2017-04-16 04:00:00,216,0.0,298,51.1143,7.8807,Attendorn-Neulisternohl,Nordrhein-Westfalen
...,...,...,...,...,...,...,...
2017-08-16 19:00:00,15000,0.0,231,50.7983,6.0244,Aachen-Orsbach,Nordrhein-Westfalen
2017-08-16 20:00:00,15000,0.0,231,50.7983,6.0244,Aachen-Orsbach,Nordrhein-Westfalen
2017-08-16 21:00:00,15000,0.0,231,50.7983,6.0244,Aachen-Orsbach,Nordrhein-Westfalen
2017-08-16 22:00:00,15000,0.0,231,50.7983,6.0244,Aachen-Orsbach,Nordrhein-Westfalen


In [44]:
df = df_appended_ts_all[(df_appended_ts_all['stations_id']==216) |
               (df_appended_ts_all['stations_id']==1300) |
               (df_appended_ts_all['stations_id']==389) |
                        (df_appended_ts_all['stations_id']==390) |
                        (df_appended_ts_all['stations_id']==613) |
                        (df_appended_ts_all['stations_id']==644) |
                        (df_appended_ts_all['stations_id']==796) |
                        (df_appended_ts_all['stations_id']==934) |
                        (df_appended_ts_all['stations_id']==3028) |
                        (df_appended_ts_all['stations_id']==3031) |
                        (df_appended_ts_all['stations_id']==3098) |
                        (df_appended_ts_all['stations_id']==3499) |
                        (df_appended_ts_all['stations_id']==4217) |
                        (df_appended_ts_all['stations_id']==4313) |
                        (df_appended_ts_all['stations_id']==4368) |
                        (df_appended_ts_all['stations_id']==4400) |
                        (df_appended_ts_all['stations_id']==4692) |
                        (df_appended_ts_all['stations_id']==5347) |
                        (df_appended_ts_all['stations_id']==5360) |
                        (df_appended_ts_all['stations_id']==5480) |
                        (df_appended_ts_all['stations_id']==5619) |
                        (df_appended_ts_all['stations_id']==5699) |
                        (df_appended_ts_all['stations_id']==6276) |
                        (df_appended_ts_all['stations_id']==6313) |
                        (df_appended_ts_all['stations_id']==13700) |
                        (df_appended_ts_all['stations_id']==13713) |
               (df_appended_ts_all['stations_id']==2483) |
               (df_appended_ts_all['stations_id']==2947) |
               (df_appended_ts_all['stations_id']==3215) |
               (df_appended_ts_all['stations_id']==4488) |
               (df_appended_ts_all['stations_id']==5468) |
               (df_appended_ts_all['stations_id']==6264) |
               (df_appended_ts_all['stations_id']==7330)]

In [45]:
df.to_csv(r"13_counties_stations_4_months_data.csv")