
*Technical University of Munich<br>
Professorship of Environmental Sensing and Modeling<br><br>*
**Author:**  Ali Ahmad Khan<br>
**Date:**  13.11.2023

--- 

# LHM Counting Data processing

This script loads the 'Jahresexport_MST_Detektoren*.csv' and 'Q*_2019.csv' files, cleans the dataset and converts the data into a predetermined data model. Only the sensors present in 'mst_locations_selected.gpkg' are processed.<br>

**Required steps**
- Import file and convert columns to meaningful datatypes
- Delete meaningless columns and rows for detectors not included in locations
- Convert the ART column into given vehicle classes
- Merge counting data with location data 

In [1]:
import sys
import glob
import numpy as np
import pandas as pd
import geopandas as gpd

# import custom modules
sys.path.append('../../utils/')
import data_paths

### Import and Clean raw data from *.csv file

In [2]:
# Path to the MST counting data directory (expected to be a string prefix,
# since file names are appended to it with '+')
data_path = data_paths.MST_COUNTING_PATH

# Read the MST locations GeoPackage file
mst_loc = gpd.read_file(data_path+'mst_locations_selected.gpkg')

# List of file patterns to match
file_patterns = ['Jahresexport_MST_Detektoren*.csv', 'Q*_2019.csv']

# Collect one DataFrame per file and concatenate once at the end; calling
# pd.concat inside the loop would re-copy all previously read rows each time
raw_frames = []

# Column names of the first file read; every later file is renamed to them
reference_columns = None

# Iterate over each file pattern
for file_pattern in file_patterns:

    # glob returns matches in arbitrary order; sorting makes the read order
    # (and therefore which file defines the reference column names) reproducible
    file_paths = sorted(glob.glob(data_path + file_pattern))

    # Iterate over the file paths and read each CSV file
    for file_path in file_paths:

        df = pd.read_csv(file_path, delimiter=';', decimal=',', encoding='ISO-8859-1')

        if reference_columns is None:
            reference_columns = df.columns.to_list()
        else:
            # Rename the columns positionally so they match the first file read
            df = df.rename(columns=dict(zip(df.columns.to_list(), reference_columns)))

        raw_frames.append(df)

# Fail early with a clear message instead of a KeyError on 'MST' further down
if not raw_frames:
    raise FileNotFoundError(
        'No counting data files matching ' + str(file_patterns) + ' found in ' + str(data_path)
    )

# Combine the data of all files into a single DataFrame with a fresh index
mst_raw_combined_df = pd.concat(raw_frames, ignore_index=True)

# Keep only rows whose MST is present in our GeoPackage; .copy() makes the
# result an independent frame so the column assignment below is unambiguous
mst_raw_combined_df = mst_raw_combined_df[mst_raw_combined_df['MST'].isin(mst_loc['MST_ID'])].copy()

# Parse the DD.MM.YYYY strings in DATUM into datetime values
mst_raw_combined_df['date'] = pd.to_datetime(mst_raw_combined_df['DATUM'],format='%d.%m.%Y')

# Remove unnecessary columns
# NOTE(review): 'Unnamed: 30' is presumably an empty column produced by a
# trailing delimiter in the CSV files - confirm against the raw data
mst_raw_combined_df = mst_raw_combined_df.drop(['DATUM','MST','MQ','Unnamed: 30'], axis = 1)

# Rename the columns to their English alternatives
mst_raw_combined_df.rename(columns={'DETEKTOR_ID': 'detector_id', 'TAGES_SUMME':'daily_value'}, inplace=True)

## Data Transformation

### Create DataFrame for volume of traffic for LHM

In [3]:
# Dict to convert ART volume values to vehicle class
art_to_vehicle_class = {   
                    'QPKW': 'PC',
                    'QLFW': 'LCV',
                    'QPKWA': 'PC',
                    'QLKWA': 'SNF',
                    'QLKW': 'SNF',
                    'QSATTEL_KFZ': 'SNF',
                    'QBUS': 'BUS',
                    'QKRAD': 'MOT'
                }

# Create the raw volume DataFrame from a copy so the combined data stays untouched
mst_raw_volume = mst_raw_combined_df.copy()

# Map the ART volume categories to vehicle classes; any ART value that is not
# a key of the dict above becomes NaN
mst_raw_volume['vehicle_class'] = mst_raw_volume['ART'].map(art_to_vehicle_class)

# Drop the ART column
mst_raw_volume = mst_raw_volume.drop(['ART'], axis = 1)

# Drop rows without a vehicle class explicitly; groupby would silently discard
# these NaN keys anyway, so this only makes the filtering visible
mst_raw_volume = mst_raw_volume.dropna(subset=['vehicle_class'])

# Sum the values of all ART categories that share a vehicle class,
# per date and detector
mst_raw_volume = mst_raw_volume.groupby(['date', 'detector_id', 'vehicle_class'], as_index=False).sum()

# Assign the detectors their type. Every remaining row has a vehicle class, so
# the type is constant (the former np.where(..., np.NaN, '8+1') could never
# take its NaN branch, and np.NaN no longer exists in NumPy 2.0)
mst_raw_volume['detector_type'] = '8+1'

# Create a metric column with volume value
mst_raw_volume['metric'] = 'volume'

### Create DataFrame for speed of traffic for LHM

In [4]:
# Dict to convert ART speed values to vehicle class
art_to_vehicle_class = {   
                    'VPKW': 'PC',
                    'VLFW': 'LCV',
                    'VPKWA': 'PC',
                    'VLKWA': 'SNF',
                    'VLKW': 'SNF',
                    'VSATTEL_KFZ': 'SNF',
                    'VBUS': 'BUS',
                    'VKRAD': 'MOT'
                }

# Create the raw speed DataFrame from a copy so the combined data stays untouched
mst_raw_speed = mst_raw_combined_df.copy()

# Map the ART speed categories to vehicle classes; any ART value that is not
# a key of the dict above becomes NaN
mst_raw_speed['vehicle_class'] = mst_raw_speed['ART'].map(art_to_vehicle_class)

# Drop the ART column
mst_raw_speed = mst_raw_speed.drop(['ART'], axis = 1)

# Drop rows without a vehicle class explicitly; groupby would silently discard
# these NaN keys anyway, so this only makes the filtering visible
mst_raw_speed = mst_raw_speed.dropna(subset=['vehicle_class'])

# Average the values of all ART categories that share a vehicle class,
# per date and detector
# NOTE(review): this is an unweighted mean over the ART categories and it also
# averages 'daily_value' - confirm this is the intended speed aggregation
mst_raw_speed = mst_raw_speed.groupby(['date', 'detector_id', 'vehicle_class'], as_index=False).mean()

# Assign the detectors their type. Every remaining row has a vehicle class, so
# the type is constant (the former np.where(..., np.NaN, '8+1') could never
# take its NaN branch, and np.NaN no longer exists in NumPy 2.0)
mst_raw_speed['detector_type'] = '8+1'

# Create a metric column with speed value
mst_raw_speed['metric'] = 'speed'

### Concatenate the volume and speed dataframes for lhm & Merge with location data

In [5]:
# Stack the speed rows on top of the volume rows
mst_concat = pd.concat([mst_raw_speed, mst_raw_volume])

# Left-join the location attributes onto every measurement row via the detector id
mst_preprocessed = pd.merge(
    mst_concat,
    mst_loc,
    how='left',
    left_on='detector_id',
    right_on='DETEKTOR_ID',
)

# Identifying columns first, followed by the 24 hourly columns
# '00:00-01:00' ... '23:00-24:00'
leading_columns = [
    'date',
    'road_link_id',
    'detector_id',
    'detector_type',
    'vehicle_class',
    'metric',
    'daily_value',
]
hourly_columns = [f'{hour:02d}:00-{hour + 1:02d}:00' for hour in range(24)]

# Keep only these columns, in the predetermined order
mst_preprocessed = mst_preprocessed[leading_columns + hourly_columns]

## Store as Parquet File

In [6]:
# Write the preprocessed table as a Parquet file into the counting data
# directory, leaving out the DataFrame index
output_file = data_path+'preprocessed_lhm_counting_data.parquet'
mst_preprocessed.to_parquet(output_file, index=False)

In [7]:
# Bare expression as the last statement of the cell: the notebook renders the
# final DataFrame below for a visual check
mst_preprocessed

Unnamed: 0,date,road_link_id,detector_id,detector_type,vehicle_class,metric,daily_value,00:00-01:00,01:00-02:00,02:00-03:00,...,14:00-15:00,15:00-16:00,16:00-17:00,17:00-18:00,18:00-19:00,19:00-20:00,20:00-21:00,21:00-22:00,22:00-23:00,23:00-24:00
0,2018-01-01,80645.0,4010011,8+1,PC,speed,0.0,49.0,49.0,50.0,...,48.0,46.0,48.0,45.0,47.0,48.0,0.0,0.0,48.0,0.0
1,2018-01-01,80645.0,4010011,8+1,SNF,speed,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2018-01-01,80645.0,4010012,8+1,PC,speed,0.0,53.0,51.0,51.0,...,48.0,48.0,49.0,46.0,50.0,51.0,0.0,0.0,54.0,0.0
3,2018-01-01,80645.0,4010012,8+1,SNF,speed,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2018-01-01,80645.0,4010013,8+1,PC,speed,0.0,54.0,52.0,51.0,...,49.0,48.0,49.0,46.0,49.0,50.0,0.0,0.0,52.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2000201,2022-12-31,13587.0,4173023,8+1,BUS,volume,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2000202,2022-12-31,13587.0,4173023,8+1,LCV,volume,108.0,2.0,3.0,0.0,...,14.0,10.0,8.0,2.0,3.0,2.0,1.0,2.0,0.0,0.0
2000203,2022-12-31,13587.0,4173023,8+1,MOT,volume,23.0,0.0,0.0,0.0,...,4.0,5.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
2000204,2022-12-31,13587.0,4173023,8+1,PC,volume,1910.0,40.0,31.0,16.0,...,146.0,160.0,168.0,146.0,109.0,97.0,45.0,35.0,25.0,18.0
