In [36]:
import pandas as pd
import geopandas as gpd
from pathlib import Path

In [37]:
pip install --upgrade pandas

Note: you may need to restart the kernel to use updated packages.


Add geo-data to `df`: assign each post office its district and neighborhood via a spatial join.

In [38]:
# --- 1. Load Your Data ---

# Path to the CSV file we are enriching
csv_path = Path('../clean/deutschepost_clean.csv')
df = pd.read_csv(csv_path)

# Path to the GeoJSON file (in the same 'scripts' folder)
geojson_path = 'lor_ortsteile.geojson'
gdf_polygons = gpd.read_file(geojson_path)


# --- 2. Create GeoDataFrames ---

# Convert the DataFrame of post offices into a GeoDataFrame of points
gdf_points = gpd.GeoDataFrame(
    df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326"
)

# Ensure both GeoDataFrames use the same Coordinate Reference System (CRS)
gdf_polygons = gdf_polygons.to_crs(gdf_points.crs)


# --- 3. Perform the Spatial Join ---

# Polygon attributes that hold the district and neighborhood names
district_col_name = 'BEZIRK'
neighborhood_col_name = 'OTEIL'

# Find which polygon (neighborhood) each point lies in. A left join keeps every
# point and preserves the index of `gdf_points` (and therefore of `df`).
gdf_joined = gpd.sjoin(gdf_points, gdf_polygons, how="left", predicate='within')

# If polygons overlap, one point can match several of them and appear more than
# once in the result. Keep only the first match per point so that the joined
# frame has exactly one row for every row of `df`.
gdf_joined = gdf_joined[~gdf_joined.index.duplicated(keep='first')]
assert len(gdf_joined) == len(df), "spatial join must yield one row per post office"


# --- 4. Add the New Columns to Your Original DataFrame ---

# Assign by index alignment instead of by position: each value lands on the row
# it was computed for, regardless of the row order returned by the join.
df['district'] = gdf_joined[district_col_name]
df['neighborhood'] = gdf_joined[neighborhood_col_name]

# Points outside every polygon get NaN from the left join; surface them early.
unmatched_count = int(df['district'].isna().sum())
if unmatched_count:
    print(f"Warning: {unmatched_count} location(s) did not fall inside any polygon.")


# --- 5. Check the Result ---
print("New columns have been added successfully! ✅")
print(df[['city', 'district', 'neighborhood']].head())




New columns have been added successfully! ✅
     city district neighborhood
0  Berlin    Mitte        Mitte
1  Berlin    Mitte        Mitte
2  Berlin    Mitte        Mitte
3  Berlin    Mitte        Mitte
4  Berlin    Mitte        Mitte


Now we need to add the columns `district_id` and `neighborhood_id` from the database lookup tables `layered-populate-data-pool-da\post_offices\sources\districts.csv` and `layered-populate-data-pool-da\post_offices\sources\neighborhoods.csv`.

In [39]:
# --- Load the new lookup tables ---

# Both lookup CSVs live in the sibling 'sources' folder
districts_path = Path('../sources/districts.csv')
neighborhoods_path = Path('../sources/neighborhoods.csv')

districts_df = pd.read_csv(districts_path)
neighborhoods_df = pd.read_csv(neighborhoods_path)

# --- Inspect the DataFrames ---

# Print the same two-part report (schema summary, then first rows) for each
# table, with a separator line between the two reports.
for position, (label, table) in enumerate(
    [("Districts", districts_df), ("Neighborhoods", neighborhoods_df)]
):
    if position > 0:
        print("\n" + "="*50 + "\n") # A separator for clarity
    print(f"--- {label} Table Info ---")
    table.info()
    print(f"\n--- {label} Table Head ---")
    print(table.head())

--- Districts Table Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   district_id  12 non-null     int64 
 1   district     12 non-null     object
 2   geometry     12 non-null     object
dtypes: int64(1), object(2)
memory usage: 420.0+ bytes

--- Districts Table Head ---
   district_id                    district  \
0     11012012               Reinickendorf   
1     11004004  Charlottenburg-Wilmersdorf   
2     11009009            Treptow-Köpenick   
3     11003003                      Pankow   
4     11008008                    Neukölln   

                                            geometry  
0  MULTIPOLYGON (((13.320744327762688 52.62659906...  
1  MULTIPOLYGON (((13.321109641281137 52.52446299...  
2  MULTIPOLYGON (((13.579253945950567 52.39083025...  
3  MULTIPOLYGON (((13.504807966473637 52.61959821...  
4  MULTIPOLYGON (((13.45832

In [40]:
# --- 0. Make the cell safe to re-run ---

# If this cell already ran, the ID columns exist and a second merge would create
# 'district_id_x' / 'district_id_y' duplicates. Drop them first (no-op on first run).
df = df.drop(columns=['district_id', 'neighborhood_id'], errors='ignore')


# --- 1. Merge with Districts Table to add 'district_id' ---

# We only need the ID and the key from the districts table
districts_lookup = districts_df[['district_id', 'district']]

# 'how="left"' keeps all rows from original 'df'.
# 'validate="many_to_one"' raises if a district name occurs more than once in
# the lookup table, which would otherwise silently duplicate post-office rows.
df = pd.merge(df, districts_lookup, on='district', how='left', validate='many_to_one')


# --- 2. Merge with Neighborhoods Table to add 'neighborhood_id' ---

# We only need the ID and the key from the neighborhoods table
neighborhoods_lookup = neighborhoods_df[['neighborhood_id', 'neighborhood']]

# Same safeguards as for the district merge
df = pd.merge(df, neighborhoods_lookup, on='neighborhood', how='left', validate='many_to_one')


# --- 3. Final Check ---

# A name without a counterpart in the lookup table leaves a NaN ID behind.
for id_column in ('district_id', 'neighborhood_id'):
    missing_count = int(df[id_column].isna().sum())
    if missing_count:
        print(f"Warning: {missing_count} row(s) have no {id_column}.")

print("IDs have been added successfully! ✅")

# Display the key columns to verify the result
print(df[['district', 'district_id', 'neighborhood', 'neighborhood_id']].head())

IDs have been added successfully! ✅
  district  district_id neighborhood  neighborhood_id
0    Mitte     11001001        Mitte              101
1    Mitte     11001001        Mitte              101
2    Mitte     11001001        Mitte              101
3    Mitte     11001001        Mitte              101
4    Mitte     11001001        Mitte              101


Now we can drop the `district` and `neighborhood` name columns: the IDs identify them, so there is no need to duplicate the names in this table.

In [41]:
# The ID columns now carry this information, so remove the two name columns.
df = df.drop(['district', 'neighborhood'], axis='columns')

In [42]:
# Summarize columns, non-null counts and dtypes after dropping the name columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   zip_code         244 non-null    int64  
 1   city             244 non-null    object 
 2   street           244 non-null    object 
 3   house_no         244 non-null    object 
 4   location_type    244 non-null    object 
 5   location_name    234 non-null    object 
 6   id               244 non-null    int64  
 7   closure_periods  244 non-null    object 
 8   opening_hours    244 non-null    object 
 9   latitude         244 non-null    float64
 10  longitude        244 non-null    float64
 11  district_id      244 non-null    int64  
 12  neighborhood_id  244 non-null    int64  
dtypes: float64(2), int64(4), object(7)
memory usage: 24.9+ KB


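We need to change the types of some columns: the identifier-like columns (`zip_code`, `id`, `district_id`, `neighborhood_id`) are converted from integers to `object`.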

In [43]:
# Identifier-like columns that should no longer be stored as int64
columns_to_change = ['zip_code', 'id', 'district_id', 'neighborhood_id']

# Cast all of them to object in one call via a column -> dtype mapping
df = df.astype({column: object for column in columns_to_change})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   zip_code         244 non-null    object 
 1   city             244 non-null    object 
 2   street           244 non-null    object 
 3   house_no         244 non-null    object 
 4   location_type    244 non-null    object 
 5   location_name    234 non-null    object 
 6   id               244 non-null    object 
 7   closure_periods  244 non-null    object 
 8   opening_hours    244 non-null    object 
 9   latitude         244 non-null    float64
 10  longitude        244 non-null    float64
 11  district_id      244 non-null    object 
 12  neighborhood_id  244 non-null    object 
dtypes: float64(2), object(11)
memory usage: 24.9+ KB


In [44]:
# Write the enriched table next to the cleaned input file.
# 'utf-8-sig' prepends a BOM to the file; the DataFrame index is not written.
save_path = Path('../clean/deutschepost_clean_with_distr.csv')
df.to_csv(
    save_path,
    encoding='utf-8-sig',
    index=False,
)
print(f"\nDataFrame successfully saved to '{save_path}'")



DataFrame successfully saved to '..\clean\deutschepost_clean_with_distr.csv'


Validation & Quality Checks:
  - Check for duplicate rows. 
  - Check coordinates - location within Berlin boundaries.
  - Check final row count.

In [45]:
# Count rows that exactly repeat an earlier row (all columns compared).
df.duplicated().sum()

np.int64(0)

In [46]:
# Count rows whose district_id is missing.
df['district_id'].isnull().sum()


np.int64(0)

In [47]:
# Count rows whose neighborhood_id is missing.
df['neighborhood_id'].isnull().sum()

np.int64(0)

In [48]:
# Final check of the row count, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   zip_code         244 non-null    object 
 1   city             244 non-null    object 
 2   street           244 non-null    object 
 3   house_no         244 non-null    object 
 4   location_type    244 non-null    object 
 5   location_name    234 non-null    object 
 6   id               244 non-null    object 
 7   closure_periods  244 non-null    object 
 8   opening_hours    244 non-null    object 
 9   latitude         244 non-null    float64
 10  longitude        244 non-null    float64
 11  district_id      244 non-null    object 
 12  neighborhood_id  244 non-null    object 
dtypes: float64(2), object(11)
memory usage: 24.9+ KB


In [49]:
# Preview the first rows of the enriched table.
df.head()

Unnamed: 0,zip_code,city,street,house_no,location_type,location_name,id,closure_periods,opening_hours,latitude,longitude,district_id,neighborhood_id
0,10178,Berlin,Spandauer Str.,2,RETAIL_OUTLET,City Shop,4340626,[],Mo: 08:00-18:00; Tu: 08:00-18:00; We: 08:00-18...,52.521144,13.403767,11001001,101
1,10178,Berlin,Rathausstr.,5,POSTBANK_FINANCE_CENTER,Postbank Filiale,6730,"[{'type': 'closure', 'fromDate': '2025-10-29T0...",Mo: 09:30-18:30; Tu: 09:30-18:30; We: 09:30-18...,52.519737,13.411517,11001001,101
2,10178,Berlin,Karl-Liebknecht-Str.,13,RETAIL_OUTLET,Lotto Post Tabak,4307374,[],Mo: 08:00-19:00; Tu: 08:00-19:00; We: 08:00-19...,52.522327,13.408074,11001001,101
3,10179,Berlin,Grunerstr.,20,RETAIL_OUTLET,"GECO im ALEXA, Untergeschoss/Baseme",4125530,[],Mo: 09:00-19:45; Tu: 09:00-19:45; We: 09:00-19...,52.518764,13.416384,11001001,101
4,10179,Berlin,Brückenstr.,1a,RETAIL_OUTLET,Lotto-Post-Schreibwaren,4326999,[],Mo: 09:00-19:00; Tu: 09:00-19:00; We: 09:00-19...,52.511505,13.416914,11001001,101
