## üõ† Step 2: Data Transformation

### Public Parks (Gr√ºnenanlage) Berlin

In [84]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
from time import sleep
import psycopg2
from sqlalchemy import create_engine, text
import re
import warnings
warnings.filterwarnings("ignore")

In [85]:
# Load your CSV
df = pd.read_csv("../sources/public_parks.csv", sep=';')


In [86]:
df.head()

Unnamed: 0,Technischer Schl√ºssel,Schl√ºssel,Objektnummer,Bezirk,Ortsteil,Art der Gr√ºnanlage,Name der Gr√ºnanlage,Namenszusatz der Gr√ºnanlage,Baujahr,letztes Sanierungsjahr,Gr√∂√üe in m¬≤ (Kataster),Widmung,Nummer des Planungsraumes,Name des Planungsraumes
0,00008100_001042bb,00008100:001042bb,37,Reinickendorf,Frohnau,Gr√ºnanlage,"Im Fischgrund, ""Rosenanger""",Rosenanger,-,-,1699150,gewidmet,12400721,Frohnau Ost
1,00008100_00104621,00008100:00104621,1179,Reinickendorf,L√ºbars,Gr√ºnanlage,Kl√∂tzbecken bis Zabel-Kr√ºger-Damm,einschl. Kl√∂tzbecken,-,-,5222460,gewidmet,12500929,L√ºbars
2,00008100_001044bd,00008100:001044bd,1074,Reinickendorf,Hermsdorf,Gr√ºnanlage,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",-,-,-,301200,gewidmet,12400722,Hermsdorf West
3,00008100_00104620,00008100:00104620,1180,Reinickendorf,L√ºbars,Gr√ºnanlage,"Wittenauer Str., s√ºdl. AEG-Siedlung",-,-,-,337420,gewidmet,12500929,L√ºbars
4,00008100_00104438,00008100:00104438,476,Reinickendorf,Reinickendorf,Gr√ºnanlage,Kuhnpromenade u. Lindauer Allee 59/61,-,-,-,312200,gewidmet,12100206,Humboldtstra√üe


In [87]:
df.columns

Index(['Technischer Schl√ºssel', 'Schl√ºssel', 'Objektnummer', 'Bezirk',
       'Ortsteil', 'Art der Gr√ºnanlage', 'Name der Gr√ºnanlage',
       'Namenszusatz der Gr√ºnanlage', 'Baujahr', 'letztes Sanierungsjahr',
       'Gr√∂√üe in m¬≤ (Kataster)', 'Widmung', 'Nummer des Planungsraumes',
       'Name des Planungsraumes'],
      dtype='object')

### Rename the Columns 

In [88]:
df.rename(columns={
    'Technischer Schl√ºssel': 'Technical ID',
    'Schl√ºssel': 'Key',
    'Objektnummer': 'Object Number',
    'Bezirk': 'neighborhood',
    'Ortsteil': 'Locality',
    'Art der Gr√ºnanlage': 'Type of Green Space',
    'Name der Gr√ºnanlage': 'Green Space Name',
    'Namenszusatz der Gr√ºnanlage': 'Name Extension',
    'Baujahr': 'Year Built',
    'letztes Sanierungsjahr': 'Last Renovation Year',
    'Gr√∂√üe in m¬≤ (Kataster)': 'Size sqm',
    'Widmung': 'Dedication',
    'Nummer des Planungsraumes': 'Planning Area Number',
    'Name des Planungsraumes': 'Planning Area Name'
}, inplace=True)


In [89]:
df.columns


Index(['Technical ID', 'Key', 'Object Number', 'neighborhood', 'Locality',
       'Type of Green Space', 'Green Space Name', 'Name Extension',
       'Year Built', 'Last Renovation Year', 'Size sqm', 'Dedication',
       'Planning Area Number', 'Planning Area Name'],
      dtype='object')

### Change column names to snake_case

In [90]:
def to_snake_case(col):
    col = col.strip()  # remove leading/trailing whitespace
    col = re.sub(r'[\s\-]+', '_', col)  # replace spaces and hyphens with underscores
    col = re.sub(r'([a-z])([A-Z])', r'\1_\2', col)  # add underscore between camelCase words
    col = col.lower()
    return col

In [91]:
df.columns = [to_snake_case(col) for col in df.columns]

In [92]:
df.columns

Index(['technical_id', 'key', 'object_number', 'neighborhood', 'locality',
       'type_of_green_space', 'green_space_name', 'name_extension',
       'year_built', 'last_renovation_year', 'size_sqm', 'dedication',
       'planning_area_number', 'planning_area_name'],
      dtype='object')

### Check for null values 

In [93]:
print(df.isnull().sum())

technical_id            0
key                     0
object_number           0
neighborhood            0
locality                0
type_of_green_space     0
green_space_name        0
name_extension          0
year_built              0
last_renovation_year    0
size_sqm                0
dedication              0
planning_area_number    0
planning_area_name      0
dtype: int64


### Change the DataType of the columns for analysis 

In [94]:
df.head(5)

Unnamed: 0,technical_id,key,object_number,neighborhood,locality,type_of_green_space,green_space_name,name_extension,year_built,last_renovation_year,size_sqm,dedication,planning_area_number,planning_area_name
0,00008100_001042bb,00008100:001042bb,37,Reinickendorf,Frohnau,Gr√ºnanlage,"Im Fischgrund, ""Rosenanger""",Rosenanger,-,-,1699150,gewidmet,12400721,Frohnau Ost
1,00008100_00104621,00008100:00104621,1179,Reinickendorf,L√ºbars,Gr√ºnanlage,Kl√∂tzbecken bis Zabel-Kr√ºger-Damm,einschl. Kl√∂tzbecken,-,-,5222460,gewidmet,12500929,L√ºbars
2,00008100_001044bd,00008100:001044bd,1074,Reinickendorf,Hermsdorf,Gr√ºnanlage,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",-,-,-,301200,gewidmet,12400722,Hermsdorf West
3,00008100_00104620,00008100:00104620,1180,Reinickendorf,L√ºbars,Gr√ºnanlage,"Wittenauer Str., s√ºdl. AEG-Siedlung",-,-,-,337420,gewidmet,12500929,L√ºbars
4,00008100_00104438,00008100:00104438,476,Reinickendorf,Reinickendorf,Gr√ºnanlage,Kuhnpromenade u. Lindauer Allee 59/61,-,-,-,312200,gewidmet,12100206,Humboldtstra√üe


In [95]:
# Replace "-" with NaN
df["size_sqm"] = df["size_sqm"].replace("-", np.nan)

# Convert to string for cleaning, then back to float
df["size_sqm"] = df["size_sqm"].astype(str) \
    .str.replace(".", "", regex=False) \
    .str.replace(",", ".", regex=False)

# Replace "nan" string with actual NaN and convert to float
df["size_sqm"] = df["size_sqm"].replace("nan", np.nan).astype(float)

# Convert year columns to numeric (invalid entries like "-" will become NaN)
df["year_built"] = pd.to_numeric(df["year_built"], errors="coerce")
df["last_renovation_year"] = pd.to_numeric(df["last_renovation_year"], errors="coerce")

# Convert planning area number to numeric (if needed)
df["planning_area_number"] = pd.to_numeric(df["planning_area_number"], errors="coerce")

# Replace "-" with NaN in name_extension
df["name_extension"] = df["name_extension"].replace("-", np.nan)

# Convert selected text columns to category type to reduce memory usage
cat_columns = [
    "neighborhood", "locality", "type_of_green_space",
    "green_space_name", "name_extension", "dedication"
]
df[cat_columns] = df[cat_columns].astype('category')


In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556 entries, 0 to 2555
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   technical_id          2556 non-null   object  
 1   key                   2556 non-null   object  
 2   object_number         2556 non-null   object  
 3   neighborhood          2556 non-null   category
 4   locality              2556 non-null   category
 5   type_of_green_space   2556 non-null   category
 6   green_space_name      2556 non-null   category
 7   name_extension        1411 non-null   category
 8   year_built            362 non-null    float64 
 9   last_renovation_year  167 non-null    float64 
 10  size_sqm              2553 non-null   float64 
 11  dedication            2556 non-null   category
 12  planning_area_number  2555 non-null   float64 
 13  planning_area_name    2556 non-null   object  
dtypes: category(6), float64(4), object(4)
memory usage: 310.

### Handle Missing Values after change the datatype

In [97]:
df.isna().sum()

technical_id               0
key                        0
object_number              0
neighborhood               0
locality                   0
type_of_green_space        0
green_space_name           0
name_extension          1145
year_built              2194
last_renovation_year    2389
size_sqm                   3
dedication                 0
planning_area_number       1
planning_area_name         0
dtype: int64

* Fill missing values (imputation) For numeric columns:

In [98]:
df["size_sqm"] = df["size_sqm"].fillna(df["size_sqm"].median())
df["planning_area_number"] = df["planning_area_number"].fillna(df["planning_area_number"].median())

In [99]:
df.isna().sum()

technical_id               0
key                        0
object_number              0
neighborhood               0
locality                   0
type_of_green_space        0
green_space_name           0
name_extension          1145
year_built              2194
last_renovation_year    2389
size_sqm                   0
dedication                 0
planning_area_number       0
planning_area_name         0
dtype: int64

In [101]:
print(df['year_built'].min())
print(df['year_built'].max())
print(df['year_built'].isna().sum())

1229.0
2024.0
2194


In [102]:
print(df['last_renovation_year'].min())
print(df['last_renovation_year'].max())
print(df['last_renovation_year'].isna().sum())

1922.0
2024.0
2389


### Create Address1 Column with Green Space Name

In [103]:
df['address1'] = (
    df['green_space_name'].astype(str) + ", Berlin, Germany" #change the datatype before apply the geocoding
)

df.head()

Unnamed: 0,technical_id,key,object_number,neighborhood,locality,type_of_green_space,green_space_name,name_extension,year_built,last_renovation_year,size_sqm,dedication,planning_area_number,planning_area_name,address1
0,00008100_001042bb,00008100:001042bb,37,Reinickendorf,Frohnau,Gr√ºnanlage,"Im Fischgrund, ""Rosenanger""",Rosenanger,,,16991.5,gewidmet,12400721.0,Frohnau Ost,"Im Fischgrund, ""Rosenanger"", Berlin, Germany"
1,00008100_00104621,00008100:00104621,1179,Reinickendorf,L√ºbars,Gr√ºnanlage,Kl√∂tzbecken bis Zabel-Kr√ºger-Damm,einschl. Kl√∂tzbecken,,,52224.6,gewidmet,12500929.0,L√ºbars,"Kl√∂tzbecken bis Zabel-Kr√ºger-Damm, Berlin, Ger..."
2,00008100_001044bd,00008100:001044bd,1074,Reinickendorf,Hermsdorf,Gr√ºnanlage,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",,,,3012.0,gewidmet,12400722.0,Hermsdorf West,"Heidenheimer Str. (ab Friedrichsthaler Weg), W..."
3,00008100_00104620,00008100:00104620,1180,Reinickendorf,L√ºbars,Gr√ºnanlage,"Wittenauer Str., s√ºdl. AEG-Siedlung",,,,3374.2,gewidmet,12500929.0,L√ºbars,"Wittenauer Str., s√ºdl. AEG-Siedlung, Berlin, G..."
4,00008100_00104438,00008100:00104438,476,Reinickendorf,Reinickendorf,Gr√ºnanlage,Kuhnpromenade u. Lindauer Allee 59/61,,,,3122.0,gewidmet,12100206.0,Humboldtstra√üe,"Kuhnpromenade u. Lindauer Allee 59/61, Berlin,..."


## Lets look for duplicates 

### Check if parks with the same name exist in different locations:

In [104]:
# Group by park name and check if it appears in more than one neighborhood or locality
duplicates_check = df.groupby("green_space_name").agg({
    "neighborhood": pd.Series.nunique,
    "locality": pd.Series.nunique,
    "object_number": "count"
}).reset_index()

# Filter only those parks that appear in more than one neighborhood or locality
potential_duplicates = duplicates_check[
    (duplicates_check["neighborhood"] > 1) | (duplicates_check["locality"] > 1)
]

# Display potentially duplicated parks
print(potential_duplicates)


                        green_space_name  neighborhood  locality  \
1328                        Leopoldplatz             2         2   
1768                         Rathauspark             2         2   
2077  Spreeuferpromenade Holsteiner Ufer             1         2   
2527              √ñffentliche Gr√ºnanlage             1         2   

      object_number  
1328              2  
1768              2  
2077              2  
2527              2  


### Visualize duplicates based only on name

In [105]:
# Show all entries where the park name is duplicated (for manual inspection)
df[df.duplicated(subset=["green_space_name"], keep=False)].sort_values("green_space_name")



Unnamed: 0,technical_id,key,object_number,neighborhood,locality,type_of_green_space,green_space_name,name_extension,year_built,last_renovation_year,size_sqm,dedication,planning_area_number,planning_area_name,address1
2324,00008100_0030314c,00008100:0030314c,110231,Treptow-K√∂penick,Friedrichshagen,Gr√ºnanlage,Am Neuhagener M√ºhlenflie√ü Fr GA,LSG von Landesgrenze bis Br√ºcke Hauptweg KGA W...,,,9455.0,-,9501736.0,Hirschgarten,"Am Neuhagener M√ºhlenflie√ü Fr GA, Berlin, Germany"
2294,00008100_003031db,00008100:003031db,110241,Treptow-K√∂penick,Friedrichshagen,Gr√ºnanlage,Am Neuhagener M√ºhlenflie√ü Fr GA,LSG von Br√ºcke Hauptweg KGA Wiesengrund bis Br...,,,5520.0,-,9501736.0,Hirschgarten,"Am Neuhagener M√ºhlenflie√ü Fr GA, Berlin, Germany"
2499,00008100_0035ab9e,00008100:0035ab9e,211420,Mitte,Moabit,Gr√ºnanlage,D√∂beritzer Gr√ºnzug S√ºd,zw. Lehrter Str. Stichstr. und Minna-Cauer-Str.,,,3183.0,gewidmet,1200624.0,Heidestra√üe,"D√∂beritzer Gr√ºnzug S√ºd, Berlin, Germany"
493,00008100_0014b9f1,00008100:0014b9f1,211430,Mitte,Moabit,Gr√ºnanlage,D√∂beritzer Gr√ºnzug S√ºd,zw. D√∂beritzer Verbindung und Minna.Cauer-Str.,,,35724.0,in Widmung,1200624.0,Heidestra√üe,"D√∂beritzer Gr√ºnzug S√ºd, Berlin, Germany"
2270,00008100_003031f6,00008100:003031f6,110261,Treptow-K√∂penick,Friedrichshagen,Gr√ºnanlage,Erpetal/ Neuhagener M√ºhlenflie√ü Fr GA,s√ºdlich von KGA Am Kurpark,,,153.0,gewidmet,9501736.0,Hirschgarten,"Erpetal/ Neuhagener M√ºhlenflie√ü Fr GA, Berlin,..."
2308,00008100_003031ee,00008100:003031ee,110251,Treptow-K√∂penick,Friedrichshagen,Gr√ºnanlage,Erpetal/ Neuhagener M√ºhlenflie√ü Fr GA,LSG von Br√ºcke KGA Am Kurpark bis Hinter dem K...,,,11026.0,-,9501736.0,Hirschgarten,"Erpetal/ Neuhagener M√ºhlenflie√ü Fr GA, Berlin,..."
1958,00008100_0024345c,00008100:0024345c,213051,Mitte,Moabit,Gr√ºnanlage,Fritz-Schlo√ü-Park,"Minigolfanlage (PO 213010, BG 200)",,,2454.0,gewidmet,1200623.0,Stephankiez,"Fritz-Schlo√ü-Park, Berlin, Germany"
1944,00008100_0014bc3b,00008100:0014bc3b,213010,Mitte,Moabit,Gr√ºnanlage,Fritz-Schlo√ü-Park,,,,102608.0,gewidmet,1200623.0,Stephankiez,"Fritz-Schlo√ü-Park, Berlin, Germany"
1897,00008100_0014ba6b,00008100:0014ba6b,339920,Mitte,Wedding,Gr√ºnanlage,Goethepark,Parkteil AK 3,1929.0,,21162.0,gewidmet,1400938.0,Kameruner Stra√üe,"Goethepark, Berlin, Germany"
1988,00008100_0014b9b3,00008100:0014b9b3,339910,Mitte,Wedding,Gr√ºnanlage,Goethepark,Parkteil AK 2,,,72938.0,gewidmet,1400938.0,Kameruner Stra√üe,"Goethepark, Berlin, Germany"


### Drop duplicates 

In [106]:
df_cleaned = df.drop_duplicates(subset=[
    "green_space_name",
    "neighborhood",
    "locality",
    "object_number"
])

In [107]:
df_cleaned.head(5)

Unnamed: 0,technical_id,key,object_number,neighborhood,locality,type_of_green_space,green_space_name,name_extension,year_built,last_renovation_year,size_sqm,dedication,planning_area_number,planning_area_name,address1
0,00008100_001042bb,00008100:001042bb,37,Reinickendorf,Frohnau,Gr√ºnanlage,"Im Fischgrund, ""Rosenanger""",Rosenanger,,,16991.5,gewidmet,12400721.0,Frohnau Ost,"Im Fischgrund, ""Rosenanger"", Berlin, Germany"
1,00008100_00104621,00008100:00104621,1179,Reinickendorf,L√ºbars,Gr√ºnanlage,Kl√∂tzbecken bis Zabel-Kr√ºger-Damm,einschl. Kl√∂tzbecken,,,52224.6,gewidmet,12500929.0,L√ºbars,"Kl√∂tzbecken bis Zabel-Kr√ºger-Damm, Berlin, Ger..."
2,00008100_001044bd,00008100:001044bd,1074,Reinickendorf,Hermsdorf,Gr√ºnanlage,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",,,,3012.0,gewidmet,12400722.0,Hermsdorf West,"Heidenheimer Str. (ab Friedrichsthaler Weg), W..."
3,00008100_00104620,00008100:00104620,1180,Reinickendorf,L√ºbars,Gr√ºnanlage,"Wittenauer Str., s√ºdl. AEG-Siedlung",,,,3374.2,gewidmet,12500929.0,L√ºbars,"Wittenauer Str., s√ºdl. AEG-Siedlung, Berlin, G..."
4,00008100_00104438,00008100:00104438,476,Reinickendorf,Reinickendorf,Gr√ºnanlage,Kuhnpromenade u. Lindauer Allee 59/61,,,,3122.0,gewidmet,12100206.0,Humboldtstra√üe,"Kuhnpromenade u. Lindauer Allee 59/61, Berlin,..."


### Create a unique address DataFrame

In [108]:
unique_addresses = df_cleaned[['address1']].copy()
unique_addresses

Unnamed: 0,address1
0,"Im Fischgrund, ""Rosenanger"", Berlin, Germany"
1,"Kl√∂tzbecken bis Zabel-Kr√ºger-Damm, Berlin, Ger..."
2,"Heidenheimer Str. (ab Friedrichsthaler Weg), W..."
3,"Wittenauer Str., s√ºdl. AEG-Siedlung, Berlin, G..."
4,"Kuhnpromenade u. Lindauer Allee 59/61, Berlin,..."
...,...
2551,"Essener Park, Berlin, Germany"
2552,"Nauener Platz, Berlin, Germany"
2553,"Leopoldplatz an der Alten Nazarethkirche, Berl..."
2554,"Mollstr. 15-18, Berlin, Germany"


### Geocode only the sample_df (10 rows) of unique addresses using OpenStreetMap‚Äôs Nominatim API 

In [109]:
sample_df = unique_addresses.loc[0:10]
sample_df

Unnamed: 0,address1
0,"Im Fischgrund, ""Rosenanger"", Berlin, Germany"
1,"Kl√∂tzbecken bis Zabel-Kr√ºger-Damm, Berlin, Ger..."
2,"Heidenheimer Str. (ab Friedrichsthaler Weg), W..."
3,"Wittenauer Str., s√ºdl. AEG-Siedlung, Berlin, G..."
4,"Kuhnpromenade u. Lindauer Allee 59/61, Berlin,..."
5,"Avenue Charles de Gaulle 32-33, Berlin, Germany"
6,"Platz der US-Berlin-Brigaden WG, Berlin, Germany"
7,"Sch√ºnemannweg N, Berlin, Germany"
8,"Grabens. Hlgs., Lindengraben, Berlin, Germany"
9,"BAB, √úberbauung Tunnel Tegel, Berlin, Germany"


In [110]:
geolocator = Nominatim(user_agent="berlin-geocoder")

def geocode_address(address):
    try:
        location = geolocator.geocode(address)
        sleep(1)
        if location:
            return pd.Series([location.latitude, location.longitude])
        else: 
            return pd.Series([None, None])
    except:
        return pd.Series([None, None])

In [111]:
# Geocode sample_df
sample_df[['latitude', 'longitude']] = sample_df['address1'].apply(geocode_address)

In [112]:
sample_df

Unnamed: 0,address1,latitude,longitude
0,"Im Fischgrund, ""Rosenanger"", Berlin, Germany",,
1,"Kl√∂tzbecken bis Zabel-Kr√ºger-Damm, Berlin, Ger...",,
2,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",,
3,"Wittenauer Str., s√ºdl. AEG-Siedlung, Berlin, G...",,
4,"Kuhnpromenade u. Lindauer Allee 59/61, Berlin,...",,
5,"Avenue Charles de Gaulle 32-33, Berlin, Germany",,
6,"Platz der US-Berlin-Brigaden WG, Berlin, Germany",,
7,"Sch√ºnemannweg N, Berlin, Germany",52.444492,13.352584
8,"Grabens. Hlgs., Lindengraben, Berlin, Germany",,
9,"BAB, √úberbauung Tunnel Tegel, Berlin, Germany",,


### Geocode the unique addresses of the entire dataset using OpenStreetMap‚Äôs Nominatim API 

In [113]:
geolocator = Nominatim(user_agent="berlin-geocoder")

def geocode_address(address):
    try:
        location = geolocator.geocode(address)
        sleep(1)
        if location:
            return pd.Series([location.latitude, location.longitude])
        else: 
            return pd.Series([None, None])
    except:
        return pd.Series([None, None])

In [114]:
# Geocode unique addresses
unique_addresses[['latitude', 'longitude']] = unique_addresses['address1'].apply(geocode_address)

In [115]:
unique_addresses

Unnamed: 0,address1,latitude,longitude
0,"Im Fischgrund, ""Rosenanger"", Berlin, Germany",,
1,"Kl√∂tzbecken bis Zabel-Kr√ºger-Damm, Berlin, Ger...",,
2,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",,
3,"Wittenauer Str., s√ºdl. AEG-Siedlung, Berlin, G...",,
4,"Kuhnpromenade u. Lindauer Allee 59/61, Berlin,...",,
...,...,...,...
2551,"Essener Park, Berlin, Germany",52.524730,13.340990
2552,"Nauener Platz, Berlin, Germany",52.551406,13.367170
2553,"Leopoldplatz an der Alten Nazarethkirche, Berl...",,
2554,"Mollstr. 15-18, Berlin, Germany",52.523370,13.424753


In [116]:
df.head()

Unnamed: 0,technical_id,key,object_number,neighborhood,locality,type_of_green_space,green_space_name,name_extension,year_built,last_renovation_year,size_sqm,dedication,planning_area_number,planning_area_name,address1
0,00008100_001042bb,00008100:001042bb,37,Reinickendorf,Frohnau,Gr√ºnanlage,"Im Fischgrund, ""Rosenanger""",Rosenanger,,,16991.5,gewidmet,12400721.0,Frohnau Ost,"Im Fischgrund, ""Rosenanger"", Berlin, Germany"
1,00008100_00104621,00008100:00104621,1179,Reinickendorf,L√ºbars,Gr√ºnanlage,Kl√∂tzbecken bis Zabel-Kr√ºger-Damm,einschl. Kl√∂tzbecken,,,52224.6,gewidmet,12500929.0,L√ºbars,"Kl√∂tzbecken bis Zabel-Kr√ºger-Damm, Berlin, Ger..."
2,00008100_001044bd,00008100:001044bd,1074,Reinickendorf,Hermsdorf,Gr√ºnanlage,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",,,,3012.0,gewidmet,12400722.0,Hermsdorf West,"Heidenheimer Str. (ab Friedrichsthaler Weg), W..."
3,00008100_00104620,00008100:00104620,1180,Reinickendorf,L√ºbars,Gr√ºnanlage,"Wittenauer Str., s√ºdl. AEG-Siedlung",,,,3374.2,gewidmet,12500929.0,L√ºbars,"Wittenauer Str., s√ºdl. AEG-Siedlung, Berlin, G..."
4,00008100_00104438,00008100:00104438,476,Reinickendorf,Reinickendorf,Gr√ºnanlage,Kuhnpromenade u. Lindauer Allee 59/61,,,,3122.0,gewidmet,12100206.0,Humboldtstra√üe,"Kuhnpromenade u. Lindauer Allee 59/61, Berlin,..."


### For full workflow is recomendable to save the table into a csv for avoid the long wait (101 minutes) for run the geocoder with Nominatim API

1. After geocoding (takes 101 mins):

In [117]:
unique_addresses.to_csv("unique_addresses_geocoded.csv", index=False)

2. In future sessions (fast, ~2s):

In [118]:
unique_addresses = pd.read_csv("unique_addresses_geocoded.csv")

3. Then merge cleanly:

In [119]:
df = df.drop(columns=[col for col in df.columns if 'latitude' in col or 'longitude' in col], errors='ignore')
df = df.merge(unique_addresses[['address1', 'latitude', 'longitude']], on='address1', how='left')
df.head()

Unnamed: 0,technical_id,key,object_number,neighborhood,locality,type_of_green_space,green_space_name,name_extension,year_built,last_renovation_year,size_sqm,dedication,planning_area_number,planning_area_name,address1,latitude,longitude
0,00008100_001042bb,00008100:001042bb,37,Reinickendorf,Frohnau,Gr√ºnanlage,"Im Fischgrund, ""Rosenanger""",Rosenanger,,,16991.5,gewidmet,12400721.0,Frohnau Ost,"Im Fischgrund, ""Rosenanger"", Berlin, Germany",,
1,00008100_00104621,00008100:00104621,1179,Reinickendorf,L√ºbars,Gr√ºnanlage,Kl√∂tzbecken bis Zabel-Kr√ºger-Damm,einschl. Kl√∂tzbecken,,,52224.6,gewidmet,12500929.0,L√ºbars,"Kl√∂tzbecken bis Zabel-Kr√ºger-Damm, Berlin, Ger...",,
2,00008100_001044bd,00008100:001044bd,1074,Reinickendorf,Hermsdorf,Gr√ºnanlage,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",,,,3012.0,gewidmet,12400722.0,Hermsdorf West,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",,
3,00008100_00104620,00008100:00104620,1180,Reinickendorf,L√ºbars,Gr√ºnanlage,"Wittenauer Str., s√ºdl. AEG-Siedlung",,,,3374.2,gewidmet,12500929.0,L√ºbars,"Wittenauer Str., s√ºdl. AEG-Siedlung, Berlin, G...",,
4,00008100_00104438,00008100:00104438,476,Reinickendorf,Reinickendorf,Gr√ºnanlage,Kuhnpromenade u. Lindauer Allee 59/61,,,,3122.0,gewidmet,12100206.0,Humboldtstra√üe,"Kuhnpromenade u. Lindauer Allee 59/61, Berlin,...",,


### Save the final table in CSV Format 

In [122]:
df.to_csv('public_parks_transformed.csv', index=False)


In [128]:
import os
print(os.getcwd())


/Users/dianaterraza/Desktop/webeet.io/layered-populate-data-pool-da/recreational_zones/scripts


In [130]:
df = pd.read_csv("../sources/public_parks_transformed.csv")

In [131]:
df.head(10)

Unnamed: 0,technical_id,key,object_number,neighborhood,locality,type_of_green_space,green_space_name,name_extension,year_built,last_renovation_year,size_sqm,dedication,planning_area_number,planning_area_name,address1,latitude,longitude
0,00008100_001042bb,00008100:001042bb,37,Reinickendorf,Frohnau,Gr√ºnanlage,"Im Fischgrund, ""Rosenanger""",Rosenanger,,,16991.5,gewidmet,12400721.0,Frohnau Ost,"Im Fischgrund, ""Rosenanger"", Berlin, Germany",,
1,00008100_00104621,00008100:00104621,1179,Reinickendorf,L√ºbars,Gr√ºnanlage,Kl√∂tzbecken bis Zabel-Kr√ºger-Damm,einschl. Kl√∂tzbecken,,,52224.6,gewidmet,12500929.0,L√ºbars,"Kl√∂tzbecken bis Zabel-Kr√ºger-Damm, Berlin, Ger...",,
2,00008100_001044bd,00008100:001044bd,1074,Reinickendorf,Hermsdorf,Gr√ºnanlage,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",,,,3012.0,gewidmet,12400722.0,Hermsdorf West,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",,
3,00008100_00104620,00008100:00104620,1180,Reinickendorf,L√ºbars,Gr√ºnanlage,"Wittenauer Str., s√ºdl. AEG-Siedlung",,,,3374.2,gewidmet,12500929.0,L√ºbars,"Wittenauer Str., s√ºdl. AEG-Siedlung, Berlin, G...",,
4,00008100_00104438,00008100:00104438,476,Reinickendorf,Reinickendorf,Gr√ºnanlage,Kuhnpromenade u. Lindauer Allee 59/61,,,,3122.0,gewidmet,12100206.0,Humboldtstra√üe,"Kuhnpromenade u. Lindauer Allee 59/61, Berlin,...",,
5,00008100_00104357,00008100:00104357,35060,Reinickendorf,Wittenau,Gr√ºnanlage,Avenue Charles de Gaulle 32-33,hinter Nimrodstr. u. am Packereigraben,,,8738.3,gewidmet,12500927.0,Wittenau Nord,"Avenue Charles de Gaulle 32-33, Berlin, Germany",,
6,00008100_00315c33,00008100:00315c33,103014,Steglitz-Zehlendorf,Lichterfelde,Gr√ºnanlage,Platz der US-Berlin-Brigaden WG,,,,2194.0,gewidmet,6300632.0,Schweizer Viertel,"Platz der US-Berlin-Brigaden WG, Berlin, Germany",,
7,00008100_000e3bb3,00008100:000e3bb3,102180,Steglitz-Zehlendorf,Steglitz,Gr√ºnanlage,Sch√ºnemannweg N,Sch√ºnemannweg 6A u.16A; Tuttlinger Weg,,,3889.0,gewidmet,6100205.0,S√ºdende,"Sch√ºnemannweg N, Berlin, Germany",52.444492,13.352584
8,00008100_00104488,00008100:00104488,599,Reinickendorf,Heiligensee,Gr√ºnanlage,"Grabens. Hlgs., Lindengraben",,,,12490.9,gewidmet,12400617.0,Alt-Heiligensee,"Grabens. Hlgs., Lindengraben, Berlin, Germany",,
9,00008100_00104409,00008100:00104409,714,Reinickendorf,Tegel,Gr√ºnanlage,"BAB, √úberbauung Tunnel Tegel",Ernststr. - Waidmannsluster Damm,,,29894.7,gewidmet,12500824.0,Ziekowstra√üe/Freie Scholle,"BAB, √úberbauung Tunnel Tegel, Berlin, Germany",,


In [132]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2596 entries, 0 to 2595
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   technical_id          2596 non-null   object 
 1   key                   2596 non-null   object 
 2   object_number         2596 non-null   object 
 3   neighborhood          2596 non-null   object 
 4   locality              2596 non-null   object 
 5   type_of_green_space   2596 non-null   object 
 6   green_space_name      2596 non-null   object 
 7   name_extension        1445 non-null   object 
 8   year_built            368 non-null    float64
 9   last_renovation_year  169 non-null    float64
 10  size_sqm              2596 non-null   float64
 11  dedication            2596 non-null   object 
 12  planning_area_number  2596 non-null   float64
 13  planning_area_name    2596 non-null   object 
 14  address1              2596 non-null   object 
 15  latitude             

### Append to DB (Populate the Database)

In [134]:
# SQLAlchemy connection string format:
# postgresql+psycopg2://user:password@host:port/dbname

DATABASE_URL = (
    "postgresql+psycopg2://diana_user:npg_ED89FugWtSQz"
    "@ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech:5432/neondb"
    "?sslmode=require"
)

# Create engine and establish connection
engine = create_engine(DATABASE_URL)

In [135]:
# Creating the new table with the specified schema
with engine.connect() as conn:
    conn.execute(text("""
    CREATE TABLE IF NOT EXISTS test_berlin_data.green_spaces (
        technical_id VARCHAR(20) NOT NULL,
        key VARCHAR(20) NOT NULL,
        object_number INT,
        neighborhood VARCHAR(100) NOT NULL, -- FK (foreign key),
        locality VARCHAR(100) NOT NULL,
        type_of_green_space VARCHAR(100) NOT NULL,
        green_space_name VARCHAR(100) NOT NULL,
        name_extension VARCHAR(100),
        year_built INT,
        last_renovation_year INT,
        size_sqm DECIMAL(9,6) NOT NULL,
        dedication VARCHAR(20),
        planning_area_number DECIMAL(9,6) NOT NULL,
        planning_area_name VARCHAR(20),
        address1 VARCHAR(225) NOT NULL,
        longitude DECIMAL(9,6),
        latitude DECIMAL(9,6),
        PRIMARY KEY (technical_id)
    );
    """))
conn.commit()

In [136]:
query = "SELECT * FROM test_berlin_data.green_spaces LIMIT 5;"
df = pd.read_sql(query, engine)
df.head()

Unnamed: 0,technical_id,key,object_number,neighborhood,locality,type_of_green_space,green_space_name,name_extension,year_built,last_renovation_year,size_sqm,dedication,planning_area_number,planning_area_name,address1,longitude,latitude


In [137]:
with engine.connect() as conn:
    result = conn.execute(text("SELECT inet_server_addr();"))
    print(result.fetchone())

('::1',)
