The first step is to convert the raw .json file to .csv.

In [48]:
import pandas as pd
import json
from pathlib import Path

# --- Path handling for Jupyter Notebooks ---
# Get the current working directory
CURRENT_DIR = Path.cwd()

# Define file paths relative to the notebook's new location
json_file_name = CURRENT_DIR / 'sources' / 'raw_post.json'
csv_file_name = CURRENT_DIR / 'sources' / 'deutschepost_raw.csv'
# -----------------------------------------------------------

try:
    # Open and load the entire JSON file
    with open(json_file_name, encoding='utf-8') as f:
        data = json.load(f)

    # Use the correct key to access the list of data
    list_of_records = data['pfLocations']

    # Create a DataFrame from this list
    df = pd.DataFrame(list_of_records)
    
    # Save the DataFrame to CSV
    df.to_csv(csv_file_name, index=False, encoding='utf-8-sig')

    print(f"✅ File '{json_file_name.name}' successfully converted to '{csv_file_name.name}'")

except FileNotFoundError:
    print(f"❌ ERROR: File '{json_file_name.name}' not found. Make sure it is in the 'sources' subfolder.")
except KeyError:
    print(f"❌ KEY ERROR: Key 'pfLocations' not found! Please check the JSON file again.")
except Exception as e:
    print(f"❌ An unexpected error occurred: {e}")

✅ File 'raw_post.json' successfully converted to 'deutschepost_raw.csv'


Next, we extract the opening hours and geo-coordinates, and rename the days of the week from numbers to names.

In [49]:
import ast

# --- SETTINGS ---
CURRENT_DIR = Path.cwd()
# Define file names
input_file_name = 'deutschepost_raw.csv'
output_file_name = 'deutschepost_final_data_raw.csv'

# Build the full paths relative to the notebook's location
input_csv_file = CURRENT_DIR / 'sources' / input_file_name
output_csv_file = CURRENT_DIR / 'sources' / output_file_name

# Names of the columns to process
hours_column = 'pfTimeinfos'
geo_column = 'geoPosition'
# --------------------

def parse_hours_to_dict(data_list):
    """
    This function takes a list of dictionaries and extracts
    the opening hours into a single, clean dictionary.
    """
    if not isinstance(data_list, list):
        return None # Return None if the cell is empty or not a list
    
    weekday_map = {
        1: 'Mo', 2: 'Tu', 3: 'We', 4: 'Th',
        5: 'Fr', 6: 'Sa', 7: 'Su'
    }
    opening_hours = {}
    
    for item in data_list:
        if item.get('type') == 'OPENINGHOUR':
            weekday_num = item.get('weekday')
            weekday_name = weekday_map.get(weekday_num) # Convert number to name
            
            if weekday_name:
                time_from = item.get('timefrom', 'N/A')
                time_to = item.get('timeto', 'N/A')
                opening_hours[weekday_name] = f"{time_from}-{time_to}"
    
    # Return the dictionary only if it's not empty, otherwise return None
    return opening_hours if opening_hours else None

# 1. Read the initial source CSV file
df = pd.read_csv(input_csv_file)

# --- PART 1: PROCESS OPENING HOURS INTO A SINGLE COLUMN ---
df['opening_hours'] = df[hours_column].apply(ast.literal_eval).apply(parse_hours_to_dict)

# --- PART 2: PROCESS GEOPOSITION ---
geo_df = df[geo_column].apply(ast.literal_eval).apply(pd.Series)
df = pd.concat([df, geo_df], axis=1)

# --- FINAL CLEANUP AND SAVE ---
columns_to_drop = [hours_column, geo_column, 'distance']
df_final = df.drop(columns=columns_to_drop)

# Save the final result to a new file
df_final.to_csv(output_csv_file, index=False, encoding='utf-8-sig')


print(f"✅ Done! LLM-ready data has been saved to '{output_file_name}'")

# --- Display the final result ---
print("\nFinal DataFrame Info:")
df_final.info()

print("\nFirst 5 rows of the final DataFrame:")
df_final.head()

✅ Done! LLM-ready data has been saved to 'deutschepost_final_data_raw.csv'

Final DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   zipCode                   250 non-null    int64  
 1   city                      250 non-null    object 
 2   district                  246 non-null    object 
 3   additionalInfo            0 non-null      float64
 4   street                    250 non-null    object 
 5   houseNo                   250 non-null    object 
 6   format1                   250 non-null    object 
 7   format2                   250 non-null    object 
 8   keyWord                   250 non-null    object 
 9   locationType              250 non-null    object 
 10  locationName              240 non-null    object 
 11  primaryKeyDeliverySystem  250 non-null    int64  
 12  primaryKeyZipRegion   

Unnamed: 0,zipCode,city,district,additionalInfo,street,houseNo,format1,format2,keyWord,locationType,...,systemID,primaryKeyPF,pfServicetypes,pfAccessibilitytypes,pfClosureperiods,pfOtherinfos,poststationID,opening_hours,latitude,longitude
0,10178,Berlin,Mitte,,Spandauer Str.,2,csbnan7s8n2s3,csbn2s3n1,Postfiliale,RETAIL_OUTLET,...,8,8-4340626,[],[],[],"[{'type': 'Bezeichnung', 'content': 'Berlin 3'...",,"{'Mo': '08:00-18:00', 'Tu': '08:00-18:00', 'We...",52.521144,13.403767
1,10178,Berlin,Mitte,,Rathausstr.,5,csbnan7s8n2s3,csbn2s3n1,Postbank Filiale,POSTBANK_FINANCE_CENTER,...,8,8-6730,[],[],"[{'type': 'closure', 'fromDate': '2025-10-29T0...","[{'type': 'christmasHintEN', 'content': ' '}, ...",,"{'Mo': '09:30-18:30', 'Tu': '09:30-18:30', 'We...",52.519737,13.411517
2,10178,Berlin,Mitte,,Karl-Liebknecht-Str.,13,csbnan7s8n2s3,csbn2s3n1,Postfiliale,RETAIL_OUTLET,...,8,8-4307374,[],[],[],"[{'type': 'christmasHintEN', 'content': ' '}, ...",,"{'Mo': '08:00-19:00', 'Tu': '08:00-19:00', 'We...",52.522327,13.408074
3,10179,Berlin,Mitte,,Grunerstr.,20,csbnan7s8n2s3,csbn2s3n1,Postfiliale,RETAIL_OUTLET,...,8,8-4125530,[],[],[],"[{'type': 'christmasHintEN', 'content': ' '}, ...",,"{'Mo': '09:00-19:45', 'Tu': '09:00-19:45', 'We...",52.518764,13.416384
4,10179,Berlin,Mitte,,Brückenstr.,1a,csbnan7s8n2s3,csbn2s3n1,Postfiliale,RETAIL_OUTLET,...,8,8-4326999,[],[],[],"[{'type': 'christmasHintEN', 'content': ' '}, ...",,"{'Mo': '09:00-19:00', 'Tu': '09:00-19:00', 'We...",52.511505,13.416914


The next step involves several actions:
* Dropping the unneeded columns.
* Renaming the columns that we need.
* Changing the data formats of these columns.

In [50]:
columns_to_drop =['additionalInfo','format1', 'format2', 'systemID','primaryKeyPF','pfServicetypes','pfAccessibilitytypes','pfOtherinfos','poststationID','keyWord','district', 'primaryKeyZipRegion']
df_final = df_final.drop(columns=columns_to_drop)
df_final.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   zipCode                   250 non-null    int64  
 1   city                      250 non-null    object 
 2   street                    250 non-null    object 
 3   houseNo                   250 non-null    object 
 4   locationType              250 non-null    object 
 5   locationName              240 non-null    object 
 6   primaryKeyDeliverySystem  250 non-null    int64  
 7   pfClosureperiods          250 non-null    object 
 8   opening_hours             244 non-null    object 
 9   latitude                  250 non-null    float64
 10  longitude                 250 non-null    float64
dtypes: float64(2), int64(2), object(7)
memory usage: 21.6+ KB


In [51]:
# Create a dictionary to define the changes
columns_to_rename = {
    'zipCode': 'zip_code',
    'houseNo': 'house_no',
    'locationName': 'location_name',
    'locationType': 'location_type',
    'primaryKeyDeliverySystem': 'id',
    'pfClosureperiods': 'closure_periods'
}

df_final = df_final.rename(columns=columns_to_rename)
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   zip_code         250 non-null    int64  
 1   city             250 non-null    object 
 2   street           250 non-null    object 
 3   house_no         250 non-null    object 
 4   location_type    250 non-null    object 
 5   location_name    240 non-null    object 
 6   id               250 non-null    int64  
 7   closure_periods  250 non-null    object 
 8   opening_hours    244 non-null    object 
 9   latitude         250 non-null    float64
 10  longitude        250 non-null    float64
dtypes: float64(2), int64(2), object(7)
memory usage: 21.6+ KB


In [53]:
# Create a list of columns you want to change
columns_to_change = ['zip_code', 'city', 'street','house_no','location_type','location_name', 'id']

# Select these columns and apply the type change
df_final[columns_to_change] = df_final[columns_to_change].astype(object)
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   zip_code         250 non-null    object 
 1   city             250 non-null    object 
 2   street           250 non-null    object 
 3   house_no         250 non-null    object 
 4   location_type    250 non-null    object 
 5   location_name    250 non-null    object 
 6   id               250 non-null    object 
 7   closure_periods  250 non-null    object 
 8   opening_hours    244 non-null    object 
 9   latitude         250 non-null    float64
 10  longitude        250 non-null    float64
dtypes: float64(2), object(9)
memory usage: 21.6+ KB


In [57]:
print(df_final['location_type'].value_counts())

location_type
RETAIL_OUTLET              229
POSTBANK_FINANCE_CENTER     15
POSTSTATION                  6
Name: count, dtype: int64


In [None]:
df_final.head()
# Save the final result to a file
df_final.to_csv(output_csv_file, index=False, encoding='utf-8-sig')

In [None]:
# --- SETTINGS ---
CURRENT_DIR = Path.cwd()
# The input is the file you created in the previous step
input_file_name = 'deutschepost_final_data_raw.csv'
# The final, clean output file
output_file_name = 'deutschepost_final_data_clean.csv'

input_csv_file = CURRENT_DIR / 'sources' / input_file_name
output_csv_file = CURRENT_DIR / 'sources' / output_file_name
# --------------------


def parse_closure_periods(data_string):
    """Parses closure periods into a human-readable summary."""
    try:
        periods_list = ast.literal_eval(data_string)
        if not isinstance(periods_list, list) or not periods_list:
            return None
    except (ValueError, SyntaxError):
        return None
    
    summaries = []
    for period in periods_list:
        closure_type = period.get('type', 'Info').capitalize()
        from_date_str = period.get('fromDate', '')
        to_date_str = period.get('toDate')
        from_date = from_date_str[:10] if from_date_str else 'N/A'
        summary = f"{closure_type}: from {from_date}"
        if to_date_str:
            to_date = to_date_str[:10]
            summary = f"{closure_type}: {from_date} to {to_date}"
        summaries.append(summary)
    return "; ".join(summaries)

# --- FINAL PROCESSING STEPS ---

# 1. Load the partially processed file
df = pd.read_csv(input_csv_file)
print(f"Loaded {len(df)} rows from '{input_file_name}'")

# 2. Process the remaining complex column: pfClosureperiods
df['closure_summary'] = df['pfClosureperiods'].apply(parse_closure_periods)

# 3. Filter out rows that are temporarily closed
df = df[~df['closure_summary'].str.contains('Closure', na=False)]
print(f"Rows remaining after filtering out closures: {len(df)}")

# 4. Drop all remaining unnecessary columns
columns_to_drop = [
    # The last complex column
    'pfClosureperiods',
    
    # Other unnecessary columns
    'additionalInfo',
    'format1',
    'format2',
    'systemID',
    'primaryKeyPF',
    'pfServicetypes',
    'pfAccessibilitytypes',
    'pfOtherinfos',
    'poststationID'
]
df_final = df.drop(columns=columns_to_drop)

# 5. Save the final, fully cleaned file
df_final.to_csv(output_csv_file, index=False, encoding='utf-8-sig')

print(f"\n✅ Done! Final cleaned data saved to '{output_file_name}'")
print("\nFinal DataFrame structure:")
df_final.info()

In [36]:
duplicate_count = df_final.duplicated(subset=['primaryKeyZipRegion']).sum()

print(f"Number of duplicate values found: {duplicate_count}")

Number of duplicate values found: 80


In [39]:
is_duplicate = df_final.duplicated(subset=['primaryKeyZipRegion'], keep=False)

# Filter the DataFrame to show only the duplicate rows and sort them for easy comparison
duplicate_rows = df_final[is_duplicate].sort_values(by='primaryKeyZipRegion')

# Display the result
print(f"Found {len(duplicate_rows)} rows that are part of a duplicate set:")
duplicate_rows

Found 128 rows that are part of a duplicate set:


Unnamed: 0,zipCode,city,district,street,houseNo,keyWord,locationType,locationName,primaryKeyDeliverySystem,primaryKeyZipRegion,pfClosureperiods,opening_hours,latitude,longitude
72,12435,Berlin,Alt-Treptow,Karl-Kunger-Str.,56,Postfiliale,RETAIL_OUTLET,Lotto Tabak Post,4288412,414.0,[],"{'Mo': '06:00-21:00', 'Tu': '06:00-21:00', 'We...",52.490893,13.448134
184,14059,Berlin,Charlottenburg,Nehringstr.,1,Postfiliale,RETAIL_OUTLET,Lotto Tabak,4341616,414.0,[],"{'Mo': '09:00-20:00', 'Tu': '09:00-20:00', 'We...",52.517513,13.293425
88,10827,Berlin,Schöneberg,Hauptstr.,152,Postfiliale,RETAIL_OUTLET,,4138988,415.0,[],"{'Mo': '07:00-20:00', 'Tu': '07:00-20:00', 'We...",52.488446,13.358844
68,13353,Berlin,Wedding,Burgsdorfstr.,3a,Postfiliale,RETAIL_OUTLET,Drinks House,4287503,415.0,[],"{'Mo': '11:30-23:00', 'Tu': '11:30-23:00', 'We...",52.542620,13.362131
175,10589,Berlin,Charlottenburg,Osnabrücker Str.,24,Postfiliale,RETAIL_OUTLET,Lotto Tabak Post,4337729,426.0,[],"{'Mo': '08:00-19:00', 'Tu': '08:00-19:00', 'We...",52.525889,13.300534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,12681,Berlin,Marzahn,Märkische Allee,166-172,Postfiliale,RETAIL_OUTLET,Tabakwaren-Lotto im Kaufland,4085701,,[],"{'Mo': '08:00-14:00', 'Tu': '08:00-14:00', 'We...",52.533376,13.538268
232,12359,Berlin,Britz,Gutschmidtstr.,17-19,Postfiliale,RETAIL_OUTLET,Lorenz Tabakwaren,4076881,,[],"{'Mo': '09:00-20:00', 'Tu': '09:00-20:00', 'We...",52.436764,13.447597
239,13439,Berlin,Märkisches Viertel,Senftenberger Ring,17,Poststation,POSTSTATION,"Märkische Zeile, am Parkhaus",840,,[],,52.599460,13.355556
243,12439,Berlin,Niederschöneweide,Schnellerstr.,21,Postfiliale,RETAIL_OUTLET,CIGO,4319710,,[],"{'Mo': '08:00-19:00', 'Tu': '08:00-19:00', 'We...",52.456415,13.509211
