The first step is to convert the raw JSON file (`sources/raw_post.json`) into a CSV file (`sources/deutschepost_raw.csv`).

In [None]:
import pandas as pd
import json
from pathlib import Path

# --- Path handling for Jupyter Notebooks ---
# Everything is resolved against the directory the notebook is run from.
CURRENT_DIR = Path.cwd()

# Input (raw JSON) and output (CSV) both live in the 'sources' subfolder.
json_file_name = CURRENT_DIR.joinpath('sources', 'raw_post.json')
csv_file_name = CURRENT_DIR.joinpath('sources', 'deutschepost_raw.csv')
# -----------------------------------------------------------

try:
    # Read the whole JSON document into memory and parse it in one go.
    data = json.loads(json_file_name.read_text(encoding='utf-8'))

    # The records to export sit under the top-level 'pfLocations' key.
    list_of_records = data['pfLocations']

    # One row per record; written without the index, UTF-8 with BOM.
    df = pd.DataFrame(list_of_records)
    df.to_csv(csv_file_name, index=False, encoding='utf-8-sig')

    print(f"✅ File '{json_file_name.name}' successfully converted to '{csv_file_name.name}'")

except FileNotFoundError:
    # Raised when the JSON file (or the 'sources' folder) does not exist.
    print(f"❌ ERROR: File '{json_file_name.name}' not found. Make sure it is in the 'sources' subfolder.")
except KeyError:
    # The JSON parsed fine but has no 'pfLocations' entry.
    print(f"❌ KEY ERROR: Key 'pfLocations' not found! Please check the JSON file again.")
except Exception as exc:
    # Anything else (invalid JSON, write failure, ...) is reported, not raised.
    print(f"❌ An unexpected error occurred: {exc}")

✅ File 'raw_post.json' successfully converted to 'deutschepost_raw.csv'


Next, we expand the nested opening hours and geo-coordinates into separate columns, renaming the weekday columns from numbers (1–7) to names (Monday–Sunday).

In [12]:
import ast


# --- SETTINGS ---
# Names of the two columns that are unpacked further down in this cell.
hours_column = 'pfTimeinfos'
geo_column = 'geoPosition'

# File names of the raw input and the processed output.
input_file_name = 'deutschepost_raw.csv'
output_file_name = 'deutschepost_final_data_raw.csv'

# Both files are resolved inside the 'sources' folder next to the notebook.
CURRENT_DIR = Path.cwd()
sources_dir = CURRENT_DIR.joinpath('sources')
input_csv_file = sources_dir.joinpath(input_file_name)
output_csv_file = sources_dir.joinpath(output_file_name)
# --------------------

def parse_opening_hours(data_list):
    """Collapse a list of time-info entries into a ``{weekday: hours}`` mapping.

    Only entries whose ``'type'`` is ``'OPENINGHOUR'`` are used. Each one
    contributes a ``"<timefrom>-<timeto>"`` interval, with ``'N/A'`` standing
    in for a missing bound. If a weekday has more than one interval (for
    example a morning and an afternoon slot), all of them are kept in input
    order and joined with ``", "`` instead of the last one overwriting the
    others. Exact duplicate intervals for a weekday are listed once.

    Args:
        data_list: List of dicts with ``'type'``, ``'weekday'``,
            ``'timefrom'`` and ``'timeto'`` keys. Any non-list value is
            treated as "no data".

    Returns:
        dict: Maps each weekday value, exactly as found in the entries, to
        its opening-hours string. Empty when ``data_list`` is not a list or
        contains no ``'OPENINGHOUR'`` entries.
    """
    if not isinstance(data_list, list):
        return {}

    intervals_by_day = {}
    for item in data_list:
        # Skip malformed entries (e.g. None) rather than failing on .get().
        if not isinstance(item, dict):
            continue
        if item.get('type') != 'OPENINGHOUR':
            continue

        time_from = item.get('timefrom', 'N/A')
        time_to = item.get('timeto', 'N/A')
        interval = f"{time_from}-{time_to}"

        # Accumulate per weekday so split opening hours are not lost.
        day_intervals = intervals_by_day.setdefault(item.get('weekday'), [])
        if interval not in day_intervals:
            day_intervals.append(interval)

    return {day: ", ".join(intervals) for day, intervals in intervals_by_day.items()}

def _parse_literal_cell(cell):
    """Turn a stringified Python literal read from the CSV back into an object.

    Missing cells arrive from ``read_csv`` as NaN (a float), which
    ``ast.literal_eval`` rejects; any non-string cell is mapped to ``None``
    so that a single empty cell does not abort the whole run. Malformed
    strings still raise, so corrupted data is not silently hidden.
    """
    if not isinstance(cell, str):
        return None
    return ast.literal_eval(cell)


# 1. Read the initial source CSV file
# NOTE(review): column dtypes are inferred here, so a zipCode with a leading
# zero would be read as a number and lose it — confirm whether it should be
# loaded as text.
df = pd.read_csv(input_csv_file)

# --- PART 1: PROCESS OPENING HOURS ---
# Each cell holds a stringified list of time-info dicts; reduce it to a
# {weekday: "from-to"} dict per row (kept on df, dropped again before saving).
df['opening_hours_dict'] = df[hours_column].apply(_parse_literal_cell).apply(parse_opening_hours)
weekday_map = {
    1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday',
    5: 'Friday', 6: 'Saturday', 7: 'Sunday'
}
# Build the per-weekday columns in one go rather than one Series per row;
# reusing df.index keeps the rows aligned for the concat below.
hours_df = pd.DataFrame(df['opening_hours_dict'].tolist(), index=df.index)
hours_df = hours_df.rename(columns=weekday_map)
df = pd.concat([df, hours_df], axis=1)

# --- PART 2: PROCESS GEOPOSITION ---
# Each cell is expected to be a stringified dict; its keys become columns.
# Rows without a usable dict simply get NaN in those columns.
geo_records = [
    record if isinstance(record, dict) else {}
    for record in df[geo_column].apply(_parse_literal_cell)
]
geo_df = pd.DataFrame(geo_records, index=df.index)
df = pd.concat([df, geo_df], axis=1)

# --- FINAL CLEANUP AND SAVE ---
columns_to_drop = [hours_column, geo_column, 'opening_hours_dict', 'distance']
# errors='ignore': 'distance' is not created in this cell, so tolerate raw
# exports that do not contain it instead of failing with a KeyError.
df_final = df.drop(columns=columns_to_drop, errors='ignore')

df_final.to_csv(output_csv_file, index=False, encoding='utf-8-sig')

print(f"✅ Done! All data processed. Final file saved as '{output_file_name}'")

# --- Display the final result ---
print("\nFinal DataFrame Info:")
df_final.info()

print("\nFirst 5 rows of the final DataFrame:")
df_final.head()

✅ Done! All data processed. Final file saved as 'deutschepost_final_data_raw.csv'

Final DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   zipCode                   250 non-null    int64  
 1   city                      250 non-null    object 
 2   district                  246 non-null    object 
 3   additionalInfo            0 non-null      float64
 4   street                    250 non-null    object 
 5   houseNo                   250 non-null    object 
 6   format1                   250 non-null    object 
 7   format2                   250 non-null    object 
 8   keyWord                   250 non-null    object 
 9   locationType              250 non-null    object 
 10  locationName              240 non-null    object 
 11  primaryKeyDeliverySystem  250 non-null    int64  
 12  primaryKeyZipRe

Unnamed: 0,zipCode,city,district,additionalInfo,street,houseNo,format1,format2,keyWord,locationType,...,poststationID,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,latitude,longitude
0,10178,Berlin,Mitte,,Spandauer Str.,2,csbnan7s8n2s3,csbn2s3n1,Postfiliale,RETAIL_OUTLET,...,,08:00-18:00,08:00-18:00,08:00-18:00,08:00-18:00,08:00-18:00,08:00-16:00,,52.521144,13.403767
1,10178,Berlin,Mitte,,Rathausstr.,5,csbnan7s8n2s3,csbn2s3n1,Postbank Filiale,POSTBANK_FINANCE_CENTER,...,,09:30-18:30,09:30-18:30,09:30-18:30,09:30-18:30,09:30-18:30,09:00-14:00,,52.519737,13.411517
2,10178,Berlin,Mitte,,Karl-Liebknecht-Str.,13,csbnan7s8n2s3,csbn2s3n1,Postfiliale,RETAIL_OUTLET,...,,08:00-19:00,08:00-19:00,08:00-19:00,08:00-19:00,08:00-19:00,08:00-19:00,,52.522327,13.408074
3,10179,Berlin,Mitte,,Grunerstr.,20,csbnan7s8n2s3,csbn2s3n1,Postfiliale,RETAIL_OUTLET,...,,09:00-19:45,09:00-19:45,09:00-19:45,09:00-19:45,09:00-19:45,09:00-19:45,,52.518764,13.416384
4,10179,Berlin,Mitte,,Brückenstr.,1a,csbnan7s8n2s3,csbn2s3n1,Postfiliale,RETAIL_OUTLET,...,,09:00-19:00,09:00-19:00,09:00-19:00,09:00-19:00,09:00-19:00,09:00-15:00,,52.511505,13.416914
