# Download and process BAST traffic count data
Source: https://www.bast.de/BASt_2017/DE/Verkehrstechnik/Fachthemen/v2-verkehrszaehlung/zaehl_node.html

In [1]:
import os
import requests

import geopandas as gpd
import numpy as np
import pandas as pd

from zipfile import ZipFile
from bs4 import BeautifulSoup


### Functions for download 
Functions for downloading and unzipping the hourly BASt data.

In [2]:
#actual downloading -> Author: Johannes Gensheimer
def DownloadURL(url, save_path, chunk_size=8192, timeout=60):
    """Stream the resource at ``url`` into the file ``save_path``.

    Adapted from:
    https://stackoverflow.com/questions/9419162/download-returned-zip-file-from-url/14260592

    Parameters
    ----------
    url : str
        Address of the file to download.
    save_path : str
        Local file the response body is written to (opened in binary mode).
    chunk_size : int, optional
        Number of bytes requested per write while streaming.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout`` argument.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (e.g. 404 for a
        station/year combination that does not exist).
    """
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Fail before touching the disk so an HTML error page is never
        # stored under a ".zip" name.
        r.raise_for_status()
        with open(save_path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)

#call download and unzip
def DownloadHourData_BAST(CounterNumber, year, FolderSave):
    """Download and unpack the hourly-data archive of one station and year.

    The archive ``zst<CounterNumber>.zip`` is fetched from the BASt server,
    stored in ``FolderSave`` as ``zst<CounterNumber>_<year>.zip``, extracted
    into ``FolderSave`` and then deleted again.

    Parameters
    ----------
    CounterNumber : int or str
        Number of the counting station.
    year : int or str
        Year whose data should be fetched.
    FolderSave : str
        Directory that receives the extracted file(s).

    Raises
    ------
    Exception
        Whatever the download or ``zipfile`` raises (for example
        ``zipfile.BadZipFile`` when the server did not return a zip archive).
        The downloaded archive is removed in that case as well.
    """
    archive_stem = 'zst' + str(CounterNumber)
    BAST_Link = 'https://www.bast.de/videos/' + str(year) + '/' + archive_stem + '.zip'
    # os.path.join works with and without a trailing separator on FolderSave
    FileName = os.path.join(FolderSave, archive_stem + '_' + str(year) + '.zip')

    try:
        # download zip
        DownloadURL(BAST_Link, FileName)

        # unzip file if downloading was successful
        with ZipFile(FileName, 'r') as zf:
            zf.extractall(FolderSave)
    finally:
        # remove zip file (also after a failed download/extraction, so no
        # broken archives pile up in the folder)
        if os.path.exists(FileName):
            os.remove(FileName)

### Function to process download request 
DZ_Nr_unique: IDs (numbers) of the counting stations to download <br>
years: years that should be downloaded <br>
path: path to the temp directory where the downloaded data is temporarily stored <br>
keepOrigData: Bool, whether the downloaded raw files are kept: True: kept, False: deleted after reading <br>

In [3]:
def Process(DZ_Nr_unique, years, path, keepOrigData):
    """Download the hourly BASt CSV of every station/year pair and stack them.

    Parameters
    ----------
    DZ_Nr_unique : sequence
        Numbers of the counting stations to download.
    years : list or numpy.ndarray
        Years to download. They are processed in ascending order; the
        caller's object is left untouched.
    path : str
        Temporary directory for the downloaded files (created if missing).
    keepOrigData : bool
        ``True`` keeps the extracted CSV files in ``path``, ``False`` deletes
        each file after it has been read.

    Returns
    -------
    pandas.DataFrame
        All successfully read files concatenated row-wise (original row
        indices are kept). Empty if nothing could be downloaded.
    """
    # Sort a copy so the caller's list/array is not reordered as a side effect
    years = years.copy()
    years.sort()
    print(years)

    # create temporary directory (no shell involved, no error if it exists)
    if path:
        os.makedirs(path, exist_ok=True)

    # total number of counting stations that should be processed
    total_count = len(DZ_Nr_unique)

    # some prints
    print('All counter stations: ', DZ_Nr_unique)
    print('In total # = ', total_count)

    # Collect the frames and concatenate once at the end; concatenating inside
    # the loop copies all previously read rows again for every file.
    frames = []

    # loop over all counting stations
    for count, DZ_Nr in enumerate(DZ_Nr_unique, start=1):
        print('Processing ' , count, ' of ', total_count)

        # loop over years
        for year in years:
            # file that the archive is expected to contain once extracted into the temp dir
            csv_path = os.path.join(path, 'zst' + str(DZ_Nr) + '_' + str(year) + '.csv')
            try:
                # download data
                DownloadHourData_BAST(DZ_Nr, year, path)

                frames.append(pd.read_csv(csv_path, delimiter=';'))

                # remove downloaded file if keepOrigData not True
                if not keepOrigData:
                    os.remove(csv_path)
            except Exception as e:
                # Best effort: a missing station/year must not abort the run,
                # but report why it failed instead of hiding the reason.
                print("Could not download data for:" + str(DZ_Nr) + "/" + str(year)
                      + f" ({type(e).__name__}: {e})")

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

In [4]:
def delete_unused_locations(location_list, path):
    """Delete raw-data files that do not belong to one of the given stations.

    Every sub-directory of ``path`` is scanned (one level deep). A regular
    file is kept only if the first six characters of its name equal
    ``'BY' + str(location)`` for some entry of ``location_list``; all other
    regular files are removed. Nested directories are left alone.

    Parameters
    ----------
    location_list : iterable
        Station numbers to keep. Because only the first six characters of a
        file name are compared, the numbers are expected to be four
        characters long.
    path : str
        Directory containing one sub-directory per downloaded archive.
    """
    wanted_prefixes = {'BY' + str(location) for location in location_list}
    for folder in os.listdir(path):
        folder_path = os.path.join(path, folder)
        # Skip '.DS_Store' and any other stray file next to the data folders
        if not os.path.isdir(folder_path):
            continue
        for file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file)
            if os.path.isfile(file_path) and file[:6] not in wanted_prefixes:
                # os.remove instead of a shell 'rm': portable and safe for
                # names containing spaces or shell metacharacters
                os.remove(file_path)

In [5]:
def extract_append_raw_data(df_old, path, newFile='BAST_CountingStations_daily_new.csv'):
    """Parse the raw counter files below ``path`` and append them to ``df_old``.

    Every entry of ``path`` except ``.DS_Store`` is treated as a folder, and
    every entry inside such a folder is read as a whitespace-separated text
    file consisting of three header lines followed by data rows. For each file
    a frame with the columns of ``df_old`` is filled and appended.

    Values taken from the file:

    * header line 1: characters 1-4 of the first token -> ``TKNR``,
      characters 5-8 -> ``Zst``; tokens 2-4 -> ``Land``, ``Strklas``, ``Strnum``
    * header line 2: first token (without its first character) and second
      token -> number of lanes in direction 1 and direction 2
    * body: column 0 -> ``Datum``, part of column 1 before ``":"`` -> ``Stunde``;
      the remaining columns hold one value per lane whose last character is
      stored as check flag (``K_*`` columns) and whose leading part is the count

    Parameters
    ----------
    df_old : pandas.DataFrame
        Frame the new rows are appended to; it also provides the column schema.
    path : str
        Directory containing one folder per unpacked raw-data archive.
    newFile : str, optional
        CSV file the combined frame is written to with ``DataFrame.to_csv``.

    Returns
    -------
    pandas.DataFrame
        ``df_old`` followed by the rows of all parsed files.
    """

    for folder in os.listdir(path):
        if folder == '.DS_Store': continue
        print(f"Processing {folder}")
        new_path = os.path.join(path, folder + '/')
        for file in os.listdir(new_path):

            # Handle header rows individually, due to different number of elements
            df_header1 = pd.read_csv(new_path + file, encoding= 'unicode_escape', nrows=1, header=None, delim_whitespace=True)
            df_header2 = pd.read_csv(new_path + file, encoding= 'unicode_escape', nrows=1, header=None, delim_whitespace=True, skiprows=1)
            # NOTE(review): df_header3 is read but not used anywhere below
            df_header3 = pd.read_csv(new_path + file, encoding= 'unicode_escape', nrows=1, header=None, delim_whitespace=True, skiprows=2)

            # Copy Dataframe Schema: empty frame with the columns of df_old.
            # NOTE(review): reindex_like builds an all-NaN frame as large as the
            # current df_old before dropna() empties it again, and df_old grows
            # with every file - looks expensive for many files, confirm.
            df_new = pd.DataFrame().reindex_like(df_old).dropna()
            df_body = pd.read_csv(new_path + file, encoding= 'unicode_escape', skiprows=3, header=None,  delim_whitespace=True)

            # Drop faulty line (necessary because of measurements in march for some reason):
            # every row whose first column contains the letter 'i' is removed
            df_body = df_body.drop(df_body[df_body[0].astype(str).str.contains('i', regex=False)].index)

            # Date, hour and a first (raw) KFZ_R1 value come from the body rows;
            # KFZ_R1 is overwritten with the summed lane counts further down
            df_new['Datum'] = df_body[0]
            df_new['Stunde'] = df_body[1].str.split(":").str[0].astype(int)
            df_new['KFZ_R1'] = df_body[2]
            df_new['Wotag'] = pd.to_datetime(df_new['Datum'].astype(str), format='%y%m%d').dt.weekday
            df_new['Wotag'] += 1 # dt.weekday specifies Monday=0 .. Sunday=6

            # Fill NaN's with 0 (applies to every schema column not set so far;
            # columns that are not assigned below therefore stay 0)
            df_new = df_new.fillna(0)

            # Extract necessary information from the first header line
            df_new = df_new.assign(TKNR=df_header1.iloc[0, 0][1:5])
            df_new = df_new.assign(Zst = df_header1.iloc[0, 0][5:9])
            df_new = df_new.assign(Land = df_header1.iloc[0, 1])
            df_new = df_new.assign(Strklas = df_header1.iloc[0, 2])
            df_new = df_new.assign(Strnum = df_header1.iloc[0, 3])
            # Fahrtzw is set to the constant 'n' for all raw-data rows
            df_new = df_new.assign(Fahrtzw = 'n')

            # number of lanes in each direction
            lanes_per_direction = [int(df_header2.iloc[0, 0][1:]), int(df_header2.iloc[0, 1])]
            # body columns 0 and 1 are date and time, counts start at column 2
            offset_date_time = 2
            # two columns (KFZ, Lkw) per lane over both directions precede the other classes
            # presumably this mirrors the BASt raw-data layout - TODO confirm against the format description
            offset_kfz_sv = 2 * (lanes_per_direction[0] + lanes_per_direction[1])

            # For some reason 1 station is different than all the other stations..
            # (here only one column per lane is read and only KFZ is filled)
            if 'BY9223' in file:
                column = 'KFZ_R'

                for direction in range(2): # code only works for 2 directions
                    # KFZ and LKW (SV)
                    count_name = column + str(direction + 1)
                    check_name = 'K_' + column + str(direction + 1)
                    # first lane of the direction: value without its last character is the count,
                    # the last character is kept as check flag
                    df_temp_count = df_body.iloc[:][offset_date_time + (direction * lanes_per_direction[0])].str.strip().str[:-1].astype(int)
                    df_temp_check = df_body.iloc[:][offset_date_time + (direction * lanes_per_direction[0])].str.strip().str[-1:]
                    # add the counts of the remaining lanes; the flag stays that of the first lane
                    for lane in range(1, lanes_per_direction[direction]):
                        df_temp_count += df_body.iloc[:][offset_date_time + (direction * lanes_per_direction[0]) + lane].str.strip().str[:-1].astype(int)

                    df_new[count_name] = df_temp_count
                    df_new[check_name] = df_temp_check
            else:

                columns_kfz_lkw = ['KFZ_R', 'Lkw_R']
                columns_rest = ['Mot_R', 'Pkw_R', 'Lfw_R', 'PmA_R', 'Bus_R',
                                'LoA_R', 'Lzg_R', 'Sat_R', 'Son_R'] # attention, Lzg = LmA + Sat
                # NOTE(review): columns_calc is not used anywhere below
                columns_calc = ['PLZ_R', 'Lzg_R'] # Lzg = LmA + Sat, PLZ = Mot + Pkw + Lfw

                for direction in range(2):
                    # KFZ and LKW (SV): two columns per lane, direction 2 starts
                    # after the columns of all direction-1 lanes
                    for idx, column in enumerate(columns_kfz_lkw):
                        count_name = column + str(direction + 1)
                        check_name = 'K_' + column + str(direction + 1)
                        df_temp_count = df_body.iloc[:][offset_date_time + idx + (direction * 2 * lanes_per_direction[0])].str.strip().str[:-1].astype(int)
                        df_temp_check = df_body.iloc[:][offset_date_time + idx + (direction * 2 * lanes_per_direction[0])].str.strip().str[-1:]
                        # sum over the remaining lanes (stride 2); flag of the first lane is kept
                        for lane in range(1, lanes_per_direction[direction]):
                            df_temp_count += df_body.iloc[:][offset_date_time + idx + (direction * 2 * lanes_per_direction[0]) + (2 * lane)].str.strip().str[:-1].astype(int)

                        df_new[count_name] = df_temp_count
                        df_new[check_name] = df_temp_check
                    # Other classes: nine columns per lane, located after the KFZ/Lkw columns
                    for idx, column in enumerate(columns_rest):
                        count_name = column + str(direction + 1)
                        check_name = 'K_' + column + str(direction + 1)
                        df_temp_count = df_body.iloc[:][offset_date_time + offset_kfz_sv + idx + (direction * 9 * lanes_per_direction[0])].str.strip().str[:-1].astype(int)
                        df_temp_check = df_body.iloc[:][offset_date_time + offset_kfz_sv + idx + (direction * 9 * lanes_per_direction[0])].str.strip().str[-1:]
                        # sum over the remaining lanes (stride 9); flag of the first lane is kept
                        for lane in range(1, lanes_per_direction[direction]):
                            df_temp_count += df_body.iloc[:][offset_date_time + offset_kfz_sv + idx + (direction * 9 * lanes_per_direction[0]) + (9 * lane)].str.strip().str[:-1].astype(int)
                        df_new[count_name] = df_temp_count
                        df_new[check_name] = df_temp_check

                    # Lzg = LmA + Sat (the column read as 'Lzg_R' above holds LmA until here)
                    df_new['Lzg_R' + str(direction + 1)] += df_new['Sat_R' + str(direction + 1)]
                    # PLZ = Mot + Pkw + Lfw
                    df_new['PLZ_R' + str(direction + 1)] = df_new['Mot_R' + str(direction + 1)] + df_new['Pkw_R' + str(direction + 1)] + df_new['Lfw_R' + str(direction + 1)]
                    # the PLZ check flag is simply copied from the Mot flag
                    df_new['K_PLZ_R' + str(direction + 1)] = df_new['K_Mot_R' + str(direction + 1)] # Not sure how to handle this
            # append the rows of this file; the grown frame is also the schema source for the next file
            df_old = pd.concat([df_old, df_new])
        print(f"Completed {folder}")
    print(f"Writing to file {newFile}")
    df_old.to_csv(newFile)
    return df_old


# Run the functions

In [6]:
# Counting stations of interest: one entry per distinct MST_ID in the selected locations file
bast_locations = gpd.read_file('bast_locations_selected.gpkg')
station_ids = np.array(bast_locations['MST_ID'].unique()) # counting station number

years = np.array([2018,2019,2020,2021]) # years of interest

# Download the processed datasets from the BASt homepage (via Process) and save them as CSV
df_daily = Process(station_ids, years, path='temp/', keepOrigData=False)
df_daily.to_csv('BAST_CountingStations_daily.csv')

[2018 2019 2020 2021]
All counter stations:  [9140 9141 9772 9773 9082 9003 9083 9115 9066 9063 9242 9064 9217 9006
 9987 9986 9985 9151 9043 9222 9106 9207 9775 9774 9215 9212 9213 9220
 9219 9218 9211 9155 9228 9244 9229 9221 9830 9820 9810 9320 9102 9988]
In total # =  42
Processing  1  of  42
Processing  2  of  42
Processing  3  of  42
Could not download data for:9772/2021
Processing  4  of  42
Could not download data for:9773/2018
Could not download data for:9773/2019
Could not download data for:9773/2020
Could not download data for:9773/2021
Processing  5  of  42
Could not download data for:9082/2018
Could not download data for:9082/2019
Could not download data for:9082/2020
Processing  6  of  42
Processing  7  of  42
Could not download data for:9083/2018
Could not download data for:9083/2019
Could not download data for:9083/2020
Processing  8  of  42
Could not download data for:9115/2018
Could not download data for:9115/2019
Processing  9  of  42
Processing  10  of  42
Could not

In [7]:
# Raw data

# Create the directory that holds the manually downloaded raw data.
# exist_ok=True replaces the separate existence check and is safe to re-run.
temp_path = 'temp_raw/'
os.makedirs(temp_path, exist_ok=True)

# The raw data was downloaded manually from the following link and unzipped into the temp_raw/ folder:
# URL = 'https://www.bast.de/DE/Publikationen/Daten/Verkehrstechnik/DZ.html'
# The downloaded zip file contains one zip file per month.

# Remove the files of all stations that are not listed in station_ids
delete_unused_locations(station_ids, temp_path)
# Extract the raw data and append it to the previously downloaded frame
extract_append_raw_data(df_daily, temp_path, 'BAST_CountingStations_daily_new.csv')

Processing DZ_2022_01_Rohdaten
Completed DZ_2022_01_Rohdaten
Processing DZ_2022_07_Rohdaten
Completed DZ_2022_07_Rohdaten
Processing DZ_2022_06_Rohdaten
Completed DZ_2022_06_Rohdaten
Processing DZ_2022_10_Rohdaten
Completed DZ_2022_10_Rohdaten
Processing DZ_2022_11_Rohdaten
Completed DZ_2022_11_Rohdaten
Processing DZ_2022_09_Rohdaten
Completed DZ_2022_09_Rohdaten
Processing DZ_2022_08_Rohdaten
Completed DZ_2022_08_Rohdaten
Processing DZ_2022_03_Rohdaten
Completed DZ_2022_03_Rohdaten
Processing DZ_2022_02_Rohdaten
Completed DZ_2022_02_Rohdaten
Processing DZ_2022_12_Rohdaten
Completed DZ_2022_12_Rohdaten
Processing DZ_2022_04_Rohdaten
Completed DZ_2022_04_Rohdaten
Processing DZ_2022_05_Rohdaten
Completed DZ_2022_05_Rohdaten
Writing to file BAST_CountingStations_daily_new.csv


Unnamed: 0,TKNR,Zst,Land,Strklas,Strnum,Datum,Wotag,Fahrtzw,Stunde,KFZ_R1,...,Bus_R2,K_Bus_R2,LoA_R2,K_LoA_R2,Lzg_R2,K_Lzg_R2,Sat_R2,K_Sat_R2,Son_R2,K_Son_R2
0,7834,9140,9,A,8,180101,1,s,1,76,...,0,a,0,a,0,a,0,a,0,a
1,7834,9140,9,A,8,180101,1,s,2,222,...,0,a,0,a,0,a,0,a,0,a
2,7834,9140,9,A,8,180101,1,s,3,237,...,0,a,0,a,0,a,0,a,0,a
3,7834,9140,9,A,8,180101,1,s,4,149,...,0,a,0,a,0,a,0,a,0,a
4,7834,9140,9,A,8,180101,1,s,5,95,...,0,a,0,a,0,a,0,a,0,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,7936,9244,9,A,99,220531,2,n,20,1967,...,8,-,40,-,184,-,154,-,25,-
740,7936,9244,9,A,99,220531,2,n,21,1302,...,11,-,20,-,121,-,103,-,11,-
741,7936,9244,9,A,99,220531,2,n,22,865,...,2,-,17,-,105,-,81,-,20,-
742,7936,9244,9,A,99,220531,2,n,23,733,...,4,-,12,-,112,-,94,-,11,-


In [None]:
# Reload the combined file, order it by date and show the last rows
df_test = pd.read_csv('BAST_CountingStations_daily_new.csv', index_col=0)
df_test = df_test.sort_values('Datum')
df_test.tail()