In [1]:
import pandas as pd
import geopandas as gpd
from pathlib import Path

Adding geo-data to df

In [2]:
# --- 1. Load Your Data ---

# Night-club records that will be enriched with district / neighborhood names
csv_path = Path('../clean/night_clubs_clean.csv')
df = pd.read_csv(csv_path)

# Polygon layer; the relative path is resolved against the notebook's working directory
geojson_path = 'lor_ortsteile.geojson'
gdf_polygons = gpd.read_file(geojson_path)


# --- 2. Create GeoDataFrames ---

# Turn every club's longitude/latitude pair into a point geometry (EPSG:4326)
gdf_points = gpd.GeoDataFrame(
    df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326"
)

# Ensure both GeoDataFrames use the same Coordinate Reference System (CRS)
gdf_polygons = gdf_polygons.to_crs(gdf_points.crs)


# --- 3. Perform the Spatial Join ---

# This finds which polygon (neighborhood) each point is in.
# how="left" keeps every club, including those that match no polygon.
gdf_joined = gpd.sjoin(gdf_points, gdf_polygons, how="left", predicate='within')

# A left spatial join emits one row per (club, matching polygon) pair, so a club that
# matches several polygons would appear several times. Keep the first match only, so
# the result has exactly one row per club and still carries the club's index label.
gdf_joined = gdf_joined[~gdf_joined.index.duplicated(keep='first')]


# --- 4. Add the New Columns to Your Original DataFrame ---

# Attribute columns of the polygon layer that hold the names we want
district_col_name = 'BEZIRK'
neighborhood_col_name = 'OTEIL'

# Assign by index label rather than by position: the joined frame shares the clubs'
# index, so every value lands on the club it was computed for even if the join
# returned the rows in a different order.
df['district'] = gdf_joined[district_col_name]
df['neighborhood'] = gdf_joined[neighborhood_col_name]


# --- 5. Check the Result ---
print("New columns have been added successfully! ✅")
# Preview the enrichment next to the pre-existing 'city' column
print(df[['city', 'district', 'neighborhood']].head())




New columns have been added successfully! ✅
     city                  district     neighborhood
0  Berlin                     Mitte            Mitte
1  Berlin                     Mitte            Mitte
2  Berlin  Friedrichshain-Kreuzberg   Friedrichshain
3  Berlin  Friedrichshain-Kreuzberg        Kreuzberg
4  Berlin                    Pankow  Prenzlauer Berg


In [3]:
# Inspect column dtypes and non-null counts now that 'district' and 'neighborhood' were added
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      142 non-null    object 
 1   club_name               141 non-null    object 
 2   phone                   59 non-null     object 
 3   website                 100 non-null    object 
 4   wheelchair              89 non-null     object 
 5   email                   18 non-null     object 
 6   toilets_wheelchair      36 non-null     object 
 7   wheelchair_description  11 non-null     object 
 8   city                    135 non-null    object 
 9   house_num               108 non-null    object 
 10  postcode                140 non-null    float64
 11  street                  141 non-null    object 
 12  suburb                  114 non-null    object 
 13  opening_hours           53 non-null     object 
 14  live_music              10 non-null     ob

Now I need to check that all clubs are within Berlin.

In [4]:
from shapely.ops import unary_union
import geopandas as gpd  # re-imported so this cell also runs on its own

# --- 1. Create a single "Berlin" polygon ---
# Merge all polygons of 'gdf_polygons' (already loaded and reprojected to the
# points' CRS) into one geometry
berlin_boundary = gpd.GeoSeries(unary_union(gdf_polygons.geometry), crs=gdf_polygons.crs)

# --- 2. Perform the check ---
# .within() yields one boolean per club: True when the point lies inside the merged
# geometry. .iloc[0] takes the single geometry by position instead of by label.
is_inside_berlin = gdf_points.within(berlin_boundary.iloc[0])

# --- 3. Report the results ---
num_outside = (~is_inside_berlin).sum()  # ~ inverts the mask, so this counts the points outside

if num_outside == 0:
    # Report the real number of checked points instead of a hard-coded count
    print(f"✅ All {len(gdf_points)} points are correctly located within the Berlin boundaries.")
else:
    print(f"⚠️ Warning: Found {num_outside} point(s) outside the Berlin boundaries.")

    # Show the offending rows from the plain 'df' for a compact text report
    print("\nClubs located outside Berlin:")
    print(df.loc[~is_inside_berlin, ['club_name', 'street', 'city', 'latitude', 'longitude']])

✅ All 142 points are correctly located within the Berlin boundaries.


As all clubs are within Berlin, I'll fill missing values in the 'city' column.

In [5]:
# Every club was verified to lie inside Berlin, so a missing city can only be 'Berlin'
df = df.fillna({'city': 'Berlin'})

Now we need to add the columns district_id and neighborhood_id, looked up from the tables districts.csv and neighborhoods.csv.

In [6]:
# --- Load the lookup tables that map names to IDs ---

districts_path = Path('../source/districts.csv')
neighborhoods_path = Path('../source/neighborhoods.csv')

districts_df = pd.read_csv(districts_path)
neighborhoods_df = pd.read_csv(neighborhoods_path)

# --- Inspect both tables with the same report layout ---

for position, (label, table) in enumerate(
    [("Districts", districts_df), ("Neighborhoods", neighborhoods_df)]
):
    if position > 0:
        print("\n" + "="*50 + "\n")  # separator between the two reports

    print(f"--- {label} Table Info ---")
    table.info()
    print(f"\n--- {label} Table Head ---")
    print(table.head())

--- Districts Table Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   district_id  12 non-null     int64 
 1   district     12 non-null     object
 2   geometry     12 non-null     object
dtypes: int64(1), object(2)
memory usage: 420.0+ bytes

--- Districts Table Head ---
   district_id                    district  \
0     11012012               Reinickendorf   
1     11004004  Charlottenburg-Wilmersdorf   
2     11009009            Treptow-Köpenick   
3     11003003                      Pankow   
4     11008008                    Neukölln   

                                            geometry  
0  MULTIPOLYGON (((13.320744327762688 52.62659906...  
1  MULTIPOLYGON (((13.321109641281137 52.52446299...  
2  MULTIPOLYGON (((13.579253945950567 52.39083025...  
3  MULTIPOLYGON (((13.504807966473637 52.61959821...  
4  MULTIPOLYGON (((13.45832

In [7]:
# --- 1. Build the lookup tables ---

# Only the ID and the name used as join key are needed from each table
districts_lookup = districts_df[['district_id', 'district']]
neighborhoods_lookup = neighborhoods_df[['neighborhood_id', 'neighborhood']]


# --- 2. Merge both IDs onto the clubs ---

# how="left" keeps every club, even when its name has no match in a lookup table.
# The result is built in a temporary frame so 'df' is only replaced once it passed
# the row-count check below.
enriched = (
    df
    .merge(districts_lookup, on='district', how='left')
    .merge(neighborhoods_lookup, on='neighborhood', how='left')
)

# A name that occurs more than once in a lookup table would duplicate club rows
assert len(enriched) == len(df), (
    f"Merging changed the row count from {len(df)} to {len(enriched)}; "
    "check the lookup tables for duplicate 'district' / 'neighborhood' names."
)
df = enriched


# --- 3. Final Check ---

# Count unmatched IDs here, while they are still real NaN values
missing_ids = df[['district_id', 'neighborhood_id']].isna().sum()
if missing_ids.any():
    print(f"⚠️ Warning: rows without a matching ID:\n{missing_ids[missing_ids > 0]}")
else:
    print("IDs have been added successfully! ✅")

# Display the key columns to verify the result
print(df[['district', 'district_id', 'neighborhood', 'neighborhood_id', 'suburb']].head())

IDs have been added successfully! ✅
                   district  district_id     neighborhood  neighborhood_id  \
0                     Mitte     11001001            Mitte              101   
1                     Mitte     11001001            Mitte              101   
2  Friedrichshain-Kreuzberg     11002002   Friedrichshain              201   
3  Friedrichshain-Kreuzberg     11002002        Kreuzberg              202   
4                    Pankow     11003003  Prenzlauer Berg              301   

            suburb  
0            Mitte  
1            Mitte  
2   Friedrichshain  
3              NaN  
4  Prenzlauer Berg  


I'll drop the 'suburb' column because it is incomplete and the 'neighborhood_id' column now carries the same information. The 'district' and 'neighborhood' name columns can be dropped as well, since the names can be looked up through the IDs and need not be duplicated in this table.

In [8]:
# Name columns that are superseded by the ID columns; errors='ignore' tolerates
# columns that are already gone (e.g. when the cell is executed a second time)
superseded_columns = ['district', 'neighborhood', 'suburb']
df = df.drop(columns=superseded_columns, errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      142 non-null    object 
 1   club_name               141 non-null    object 
 2   phone                   59 non-null     object 
 3   website                 100 non-null    object 
 4   wheelchair              89 non-null     object 
 5   email                   18 non-null     object 
 6   toilets_wheelchair      36 non-null     object 
 7   wheelchair_description  11 non-null     object 
 8   city                    142 non-null    object 
 9   house_num               108 non-null    object 
 10  postcode                140 non-null    float64
 11  street                  141 non-null    object 
 12  opening_hours           53 non-null     object 
 13  live_music              10 non-null     object 
 14  longitude               142 non-null    fl

We need to change some column types.

In [16]:
# Store the postcode and both IDs as text.
# Each column goes through the nullable Int64 dtype first, which removes a trailing
# '.0' from float-typed values. Missing entries are then restored as real nulls
# instead of the placeholder text a plain string cast would leave behind, so later
# null checks stay meaningful. pd.to_numeric also accepts values that are already
# text, which keeps the cell safe to re-run.
for column in ['postcode', 'district_id', 'neighborhood_id']:
    as_int = pd.to_numeric(df[column]).astype('Int64')
    df[column] = as_int.astype(str).where(as_int.notna())

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      142 non-null    object 
 1   club_name               141 non-null    object 
 2   phone                   59 non-null     object 
 3   website                 100 non-null    object 
 4   wheelchair              89 non-null     object 
 5   email                   18 non-null     object 
 6   toilets_wheelchair      36 non-null     object 
 7   wheelchair_description  11 non-null     object 
 8   city                    142 non-null    object 
 9   house_num               108 non-null    object 
 10  postcode                140 non-null    object 
 11  street                  141 non-null    object 
 12  opening_hours           53 non-null     object 
 13  live_music              10 non-null     object 
 14  longitude               142 non-null    fl

In [17]:
# Select every record whose 'club_name' is missing
missing_name_mask = df['club_name'].isna()
row_with_no_name = df.loc[missing_name_mask]

# Render through .to_string() so that no column is elided from the printout
print(row_with_no_name.to_string())

                   id club_name phone website wheelchair email toilets_wheelchair wheelchair_description    city house_num postcode     street opening_hours live_music  longitude   latitude district_id neighborhood_id
134  node/11435535669       NaN   NaN     NaN        NaN   NaN                NaN                    NaN  Berlin       NaN    13347  Seestraße           NaN        NaN  13.354057  52.550957    11001001             105


I will drop this row as it lacks the necessary information.

In [18]:
# Index labels of all records that have no 'club_name'
index_to_drop = df.index[df['club_name'].isna()]

# Remove exactly those records; the remaining rows keep their original index labels
df = df.drop(index=index_to_drop)

print("Row(s) with null 'club_name' have been dropped.")
df.info()

Row(s) with null 'club_name' have been dropped.
<class 'pandas.core.frame.DataFrame'>
Index: 141 entries, 0 to 141
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      141 non-null    object 
 1   club_name               141 non-null    object 
 2   phone                   59 non-null     object 
 3   website                 100 non-null    object 
 4   wheelchair              89 non-null     object 
 5   email                   18 non-null     object 
 6   toilets_wheelchair      36 non-null     object 
 7   wheelchair_description  11 non-null     object 
 8   city                    141 non-null    object 
 9   house_num               108 non-null    object 
 10  postcode                139 non-null    object 
 11  street                  140 non-null    object 
 12  opening_hours           53 non-null     object 
 13  live_music              10 non-null     object 
 14 

In [19]:
# Save the final enriched file.
# index=False leaves the DataFrame index out of the CSV.
# 'utf-8-sig' writes a UTF-8 byte-order mark at the start of the file
# (presumably so spreadsheet tools detect the encoding; confirm this is wanted).
save_path = Path('../clean/night_clubs_clean_with_distr.csv')
df.to_csv(save_path, index=False, encoding='utf-8-sig')
print(f"\nDataFrame successfully saved to '{save_path}'")


DataFrame successfully saved to '..\clean\night_clubs_clean_with_distr.csv'


Validation & Quality Checks:
  - Check for duplicate rows. 
  - Check final row count.

In [20]:
# Count rows that exactly repeat an earlier row across all columns; 0 means no duplicates
df.duplicated().sum()

np.int64(0)

In [21]:
# 'district_id' holds text at this point, so a missing ID can show up either as a real
# null or as a placeholder string left behind by a string cast; count both kinds.
(df['district_id'].isnull() | df['district_id'].isin(['nan', '<NA>', 'None'])).sum()

np.int64(0)

In [22]:
# 'neighborhood_id' holds text at this point, so a missing ID can show up either as a
# real null or as a placeholder string left behind by a string cast; count both kinds.
(df['neighborhood_id'].isnull() | df['neighborhood_id'].isin(['nan', '<NA>', 'None'])).sum()

np.int64(0)

In [23]:
# Number of distinct 'id' values; it equals the DataFrame's row count when every id is unique
df['id'].nunique()

141

In [24]:
# Final overview: row count, per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 141 entries, 0 to 141
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      141 non-null    object 
 1   club_name               141 non-null    object 
 2   phone                   59 non-null     object 
 3   website                 100 non-null    object 
 4   wheelchair              89 non-null     object 
 5   email                   18 non-null     object 
 6   toilets_wheelchair      36 non-null     object 
 7   wheelchair_description  11 non-null     object 
 8   city                    141 non-null    object 
 9   house_num               108 non-null    object 
 10  postcode                139 non-null    object 
 11  street                  140 non-null    object 
 12  opening_hours           53 non-null     object 
 13  live_music              10 non-null     object 
 14  longitude               141 non-null    float64