The first step is to convert the raw .json file to .csv.

In [10]:
import pandas as pd
import json
from pathlib import Path

# --- Path handling for Jupyter Notebooks ---
# Everything is resolved from the notebook's working directory.
CURRENT_DIR = Path.cwd()

# Input JSON and output CSV both sit in the 'sources' folder, which is a
# sibling of the working directory.
sources_dir = CURRENT_DIR.parent / 'sources'
json_file_name = sources_dir / 'raw_post.json'
csv_file_name = sources_dir / 'deutschepost_raw.csv'
# -----------------------------------------------------------

try:
    # Parse the whole JSON document in one go
    with json_file_name.open(encoding='utf-8') as f:
        data = json.load(f)

    # The records to tabulate are stored under the 'pfLocations' key
    list_of_records = data['pfLocations']

    # Build the table from the records and write it out as CSV
    # ('utf-8-sig' prefixes the file with a BOM)
    df = pd.DataFrame(list_of_records)
    df.to_csv(csv_file_name, index=False, encoding='utf-8-sig')

    print(f"✅ File '{json_file_name.name}' successfully converted to '{csv_file_name.name}'")

except FileNotFoundError:
    # Reported instead of raised: the cell finishes without a traceback
    print(f"❌ ERROR: File '{json_file_name.name}' not found. Make sure it is in the 'sources' subfolder.")
except KeyError:
    # The JSON was readable but lacks the expected top-level key
    print(f"❌ KEY ERROR: Key 'pfLocations' not found! Please check the JSON file again.")
except Exception as e:
    # Any other failure is reported with its message only
    print(f"❌ An unexpected error occurred: {e}")

✅ File 'raw_post.json' successfully converted to 'deutschepost_raw.csv'


Next, we extract the opening hours and geo-coordinates, and convert the weekday numbers (1–7) into abbreviated day names (Mo–Su).

In [11]:
import ast

# --- SETTINGS ---
CURRENT_DIR = Path.cwd()

# File names of the CSV to read and the CSV to write
input_file_name = 'deutschepost_raw.csv'
output_file_name = 'deutschepost_final_data_raw.csv'

# Both files are located in the 'sources' folder beside the working directory
input_csv_file, output_csv_file = (
    CURRENT_DIR.parent / 'sources' / file_name
    for file_name in (input_file_name, output_file_name)
)

# Columns of the raw table that hold the time-info and geo-position values
hours_column, geo_column = 'pfTimeinfos', 'geoPosition'
# --------------------

def format_opening_hours_as_string(data_list):
    """Render a list of time-info entries as one opening-hours string.

    Parameters
    ----------
    data_list : list of dict
        Time-info entries. Only entries whose ``'type'`` equals
        ``'OPENINGHOUR'`` and whose ``'weekday'`` is one of 1-7 are used.
        ``'timefrom'`` / ``'timeto'`` fall back to ``'N/A'`` when the key
        is absent. Entries that are not dicts are skipped.

    Returns
    -------
    str
        Days in weekday order (Mo..Su) separated by ``'; '``, for example
        ``'Mo: 08:00-12:00, 14:00-18:00; Tu: 09:00-17:00'``. Several
        intervals on the same day are kept in input order and joined with
        ``', '``; exact duplicates are listed once. Returns
        ``'Not available'`` when ``data_list`` is not a list or yields no
        usable opening-hour entry.
    """
    if not isinstance(data_list, list):
        return "Not available"

    weekday_map = {
        1: 'Mo', 2: 'Tu', 3: 'We', 4: 'Th',
        5: 'Fr', 6: 'Sa', 7: 'Su'
    }

    # Collect ALL intervals per day number. Storing a single string per day
    # would let a later interval overwrite an earlier one, silently dropping
    # e.g. the morning slot of a location that closes over lunch.
    hours_by_day_num = {}
    for item in data_list:
        # Skip malformed entries rather than failing on `.get`
        if not isinstance(item, dict) or item.get('type') != 'OPENINGHOUR':
            continue

        weekday_num = item.get('weekday')
        if weekday_num not in weekday_map:
            continue

        time_from = item.get('timefrom', 'N/A')
        time_to = item.get('timeto', 'N/A')
        interval = f"{time_from}-{time_to}"

        intervals = hours_by_day_num.setdefault(weekday_num, [])
        # Repeated identical entries should not be printed twice
        if interval not in intervals:
            intervals.append(interval)

    if not hours_by_day_num:
        return "Not available"

    # Sort by day number so the output always runs Mo -> Su
    return "; ".join(
        f"{weekday_map[day_num]}: {', '.join(hours_by_day_num[day_num])}"
        for day_num in sorted(hours_by_day_num)
    )

def _parse_literal(value):
    """Turn a stringified Python literal read from the CSV back into an object.

    Returns ``None`` for anything that is not a string (an empty CSV cell is
    read as NaN), so a missing value does not make ``ast.literal_eval`` raise.
    Malformed strings still raise, as before.
    """
    if isinstance(value, str):
        return ast.literal_eval(value)
    return None


# 1. Read the initial source CSV file
# NOTE(review): 'zipCode' is inferred as int64 here (see the info output), so
# zip codes with a leading zero would lose it — confirm this is acceptable.
df = pd.read_csv(input_csv_file)

# --- PART 1: PROCESS OPENING HOURS  ---
# Create a single 'opening_hours' column with a formatted string.
# Rows without time info end up as "Not available" via the formatter's guard.
df['opening_hours'] = (
    df[hours_column]
    .map(_parse_literal)
    .map(format_opening_hours_as_string)
)

# --- PART 2: PROCESS GEOPOSITION  ---
# Expand each geo-position dict into its own columns. Building one DataFrame
# from the list of dicts avoids constructing a pd.Series per row; rows with a
# missing or non-dict value contribute an empty dict and therefore NaN.
geo_records = [
    record if isinstance(record, dict) else {}
    for record in df[geo_column].map(_parse_literal)
]
geo_df = pd.DataFrame(geo_records, index=df.index)
df = pd.concat([df, geo_df], axis=1)

# --- FINAL CLEANUP AND SAVE ---
columns_to_drop = [hours_column, geo_column, 'distance']
# errors='ignore' skips any listed column that is absent (e.g. 'distance')
df_final = df.drop(columns=columns_to_drop, errors='ignore')

df_final.to_csv(output_csv_file, index=False, encoding='utf-8-sig')
print(f"✅ Done! All data processed. Final file saved as '{output_file_name}'")

# --- Display the final result ---
print("\nFinal DataFrame Info:")
df_final.info()
print("\nFirst 5 rows of the final DataFrame:")
print(df_final.head())

✅ Done! All data processed. Final file saved as 'deutschepost_final_data_raw.csv'

Final DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   zipCode                   250 non-null    int64  
 1   city                      250 non-null    object 
 2   district                  246 non-null    object 
 3   additionalInfo            0 non-null      float64
 4   street                    250 non-null    object 
 5   houseNo                   250 non-null    object 
 6   format1                   250 non-null    object 
 7   format2                   250 non-null    object 
 8   keyWord                   250 non-null    object 
 9   locationType              250 non-null    object 
 10  locationName              240 non-null    object 
 11  primaryKeyDeliverySystem  250 non-null    int64  
 12  primaryKeyZipRe

Since we are only interested in post office branches, check which location categories (`keyWord` values) are present in the data.

In [12]:
# Continue working under the shorter name `df` (same object as df_final)
df = df_final

# Distinct values found in the 'keyWord' column
df['keyWord'].unique()

array(['Postfiliale', 'Postbank Filiale', 'Poststation'], dtype=object)

In [13]:
# Print the column names, non-null counts and dtypes of the working frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   zipCode                   250 non-null    int64  
 1   city                      250 non-null    object 
 2   district                  246 non-null    object 
 3   additionalInfo            0 non-null      float64
 4   street                    250 non-null    object 
 5   houseNo                   250 non-null    object 
 6   format1                   250 non-null    object 
 7   format2                   250 non-null    object 
 8   keyWord                   250 non-null    object 
 9   locationType              250 non-null    object 
 10  locationName              240 non-null    object 
 11  primaryKeyDeliverySystem  250 non-null    int64  
 12  primaryKeyZipRegion       228 non-null    float64
 13  systemID                  250 non-null    int64  
 14  primaryKey

Some unwanted 'Poststation' rows were found in the data and will now be removed.

In [14]:
# Keep only the rows whose 'keyWord' is one of the wanted categories

# Categories that should remain in the data
values_to_keep = ['Postfiliale', 'Postbank Filiale']

# Boolean mask: True for rows whose 'keyWord' appears in the list above
is_wanted_keyword = df['keyWord'].isin(values_to_keep)

# Rebind `df` to an independent copy of the matching rows
df = df.loc[is_wanted_keyword].copy()

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 244 entries, 0 to 249
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   zipCode                   244 non-null    int64  
 1   city                      244 non-null    object 
 2   district                  240 non-null    object 
 3   additionalInfo            0 non-null      float64
 4   street                    244 non-null    object 
 5   houseNo                   244 non-null    object 
 6   format1                   244 non-null    object 
 7   format2                   244 non-null    object 
 8   keyWord                   244 non-null    object 
 9   locationType              244 non-null    object 
 10  locationName              234 non-null    object 
 11  primaryKeyDeliverySystem  244 non-null    int64  
 12  primaryKeyZipRegion       228 non-null    float64
 13  systemID                  244 non-null    int64  
 14  primaryKeyPF   

The next step involves several actions:
* Dropping the unneeded columns.
* Renaming the columns that we need.
* Changing the data types of the identifier columns (`zip_code` and `id`).

In [15]:
# Columns that are not needed in the cleaned data set
columns_to_drop = [
    'additionalInfo',
    'format1',
    'format2',
    'systemID',
    'primaryKeyPF',
    'pfServicetypes',
    'pfAccessibilitytypes',
    'pfOtherinfos',
    'poststationID',
    'keyWord',
    'district',
    'primaryKeyZipRegion',
]

# No errors='ignore' here: a missing column raises, exposing schema changes
df = df.drop(columns=columns_to_drop)
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 244 entries, 0 to 249
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   zipCode                   244 non-null    int64  
 1   city                      244 non-null    object 
 2   street                    244 non-null    object 
 3   houseNo                   244 non-null    object 
 4   locationType              244 non-null    object 
 5   locationName              234 non-null    object 
 6   primaryKeyDeliverySystem  244 non-null    int64  
 7   pfClosureperiods          244 non-null    object 
 8   opening_hours             244 non-null    object 
 9   latitude                  244 non-null    float64
 10  longitude                 244 non-null    float64
dtypes: float64(2), int64(2), object(7)
memory usage: 22.9+ KB


In [16]:
# Mapping of original camelCase column names to their snake_case replacements
columns_to_rename = dict(
    zipCode='zip_code',
    houseNo='house_no',
    locationName='location_name',
    locationType='location_type',
    primaryKeyDeliverySystem='id',
    pfClosureperiods='closure_periods',
)

# Columns not listed in the mapping keep their current names
df = df.rename(columns=columns_to_rename)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 244 entries, 0 to 249
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   zip_code         244 non-null    int64  
 1   city             244 non-null    object 
 2   street           244 non-null    object 
 3   house_no         244 non-null    object 
 4   location_type    244 non-null    object 
 5   location_name    234 non-null    object 
 6   id               244 non-null    int64  
 7   closure_periods  244 non-null    object 
 8   opening_hours    244 non-null    object 
 9   latitude         244 non-null    float64
 10  longitude        244 non-null    float64
dtypes: float64(2), int64(2), object(7)
memory usage: 22.9+ KB


In [17]:
# Columns whose dtype should become `object` instead of int64
columns_to_change = ['zip_code', 'id']

# Build a {column: object} mapping and let astype return the converted frame;
# all other columns keep their dtype
df = df.astype(dict.fromkeys(columns_to_change, object))
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 244 entries, 0 to 249
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   zip_code         244 non-null    object 
 1   city             244 non-null    object 
 2   street           244 non-null    object 
 3   house_no         244 non-null    object 
 4   location_type    244 non-null    object 
 5   location_name    234 non-null    object 
 6   id               244 non-null    object 
 7   closure_periods  244 non-null    object 
 8   opening_hours    244 non-null    object 
 9   latitude         244 non-null    float64
 10  longitude        244 non-null    float64
dtypes: float64(2), object(9)
memory usage: 22.9+ KB


In [20]:
# Preview the first five rows of the cleaned frame
df.head()

Unnamed: 0,zip_code,city,street,house_no,location_type,location_name,id,closure_periods,opening_hours,latitude,longitude
0,10178,Berlin,Spandauer Str.,2,RETAIL_OUTLET,City Shop,4340626,[],Mo: 08:00-18:00; Tu: 08:00-18:00; We: 08:00-18...,52.521144,13.403767
1,10178,Berlin,Rathausstr.,5,POSTBANK_FINANCE_CENTER,Postbank Filiale,6730,"[{'type': 'closure', 'fromDate': '2025-10-29T0...",Mo: 09:30-18:30; Tu: 09:30-18:30; We: 09:30-18...,52.519737,13.411517
2,10178,Berlin,Karl-Liebknecht-Str.,13,RETAIL_OUTLET,Lotto Post Tabak,4307374,[],Mo: 08:00-19:00; Tu: 08:00-19:00; We: 08:00-19...,52.522327,13.408074
3,10179,Berlin,Grunerstr.,20,RETAIL_OUTLET,"GECO im ALEXA, Untergeschoss/Baseme",4125530,[],Mo: 09:00-19:45; Tu: 09:00-19:45; We: 09:00-19...,52.518764,13.416384
4,10179,Berlin,Brückenstr.,1a,RETAIL_OUTLET,Lotto-Post-Schreibwaren,4326999,[],Mo: 09:00-19:00; Tu: 09:00-19:00; We: 09:00-19...,52.511505,13.416914


In [21]:
# Save the final result to a file

# Build the target path the same way as the input paths in the earlier cells:
# the 'clean' folder sits one level above the notebook's working directory.
full_path = CURRENT_DIR.parent / 'clean' / 'deutschepost_clean.csv'

# Create the 'clean' folder if it is missing so the save cannot fail on a
# fresh checkout; an existing folder is left untouched.
full_path.parent.mkdir(parents=True, exist_ok=True)

# index=False leaves out the row index, which has gaps after the filtering step
df.to_csv(full_path, index=False, encoding='utf-8-sig')