In [1]:
import requests
import csv
import time
import pandas as pd
from osm_script import osm_extractor_groups, fetch_osm_region
from reverse_geocode import geocode_lat_lon
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [3]:
#Several APIs available 

main_overpass_api = "https://overpass-api.de/api/interpreter"
lz4_overpass_api='https://lz4.overpass-api.de/api/interpreter'
osm_overpass_api='https://overpass.openstreetmap.ru/api/interpreter'

BUNDES_GROUPS = [
    ["Baden-Württemberg"],
    ["Bayern"],
    ["Berlin", "Bremen"],
    ["Brandenburg"],
    ["Hamburg"],
    ["Hessen","Mecklenburg-Vorpommern","Rheinland-Pfalz"],
    ["Niedersachsen"],
    ["Nordrhein-Westfalen"],
    ["Saarland","Sachsen","Sachsen-Anhalt", "Schleswig-Holstein","Thüringen"]
]

In [None]:
df_raw=osm_extractor_groups(BUNDES_GROUPS) #for first time getting the data



In [2]:

#for later import the latest file 
df_raw = pd.read_csv("awo_20250926-135243_osm_raw.csv")
df_raw.shape

(4604, 14)

In [7]:
#it seems like empty cells contain empty string, therefore it needs to be replaced with NA values 
#df_raw = df_raw.replace(r'^\s*$', pd.NA, regex=True)
df_raw.sample(10)

Unnamed: 0,osm_id,region,type,name,street,housenumber,postcode,city,lat,lon,phone,email,website,amenity
1956,8379708140,Niedersachsen,node,AWO Kita Saline,,,,,52.067313,10.012777,,,,kindergarten
4560,896967253,Thüringen,way,AWO Sozialzentrum Eisenberg,Klosterlausnitzer Straße,19,7607.0,Eisenberg,50.967206,11.897322,,,,
3675,563100226,Saarland,way,Arbeiterwohlfahrt,,,,,49.495393,6.836162,,,,social_facility
2129,209497004,Niedersachsen,way,Awo Tagesklinik Gifhorn,Campus,7,38518.0,Gifhorn,52.497244,10.54534,,,http://www.awo-psychiatriezentrum.de,clinic
3521,1311603269,Nordrhein-Westfalen,way,AWO Waldkindergarten,,,,,50.951345,7.144397,+49-160-8808680,waldkiga-frankenforst@awo-rhein-oberberg.de,,kindergarten
4304,366789642,Thüringen,node,Rabennest,Am Rabenhügel,31a,99099.0,Erfurt,50.962802,11.057175,+49 361 411052,,,kindergarten
3536,1372459327,Nordrhein-Westfalen,way,AWO Kindertagesstätte Am Rosenberg,Am Rosenberg,18a,45525.0,Hattingen,51.400061,7.196429,+49 2324 97 70 416,kita-rosenberg@awo-en.de,https://kita-am-rosenberg.awo-en.de/awo_rosenb...,kindergarten
1914,4061996670,Niedersachsen,node,Ausgang BraWo-Park,,,,,52.253055,10.540006,,,,
1081,9403761628,Bremen,node,Alte Hafenstraße,Alte Hafenstraße,64-66,28757.0,Bremen,53.170731,8.62618,,,,kindergarten
406,4240919792,Bayern,node,"AWO Kinderhaus Außenstelle ""Rasselbande""",Fasanenstraße,30,85591.0,Vaterstetten,48.109001,11.765877,+49 8106 32028,,https://awo-kv-ebe.de/kitas/khvco/,kindergarten


In [6]:
df_raw.region.value_counts()

region
Nordrhein-Westfalen       1236
Bayern                     679
Niedersachsen              513
Thüringen                  300
Baden-Württemberg          299
Sachsen                    281
Hessen                     237
Schleswig-Holstein         190
Brandenburg                163
Mecklenburg-Vorpommern     156
Sachsen-Anhalt             142
Saarland                   121
Rheinland-Pfalz             94
Berlin                      84
Bremen                      69
Hamburg                     40
Name: count, dtype: int64

In [8]:
df_raw.isnull().sum() 

osm_id            0
region            0
type              0
name            413
street         1705
housenumber    1740
postcode       1825
city           1842
lat               0
lon               0
phone          3138
email          3827
website        2846
amenity        1308
dtype: int64

Reverse geocoding to fill address for rows having only lat and lon 


In [9]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4604 entries, 0 to 4603
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   osm_id       4604 non-null   int64  
 1   region       4604 non-null   object 
 2   type         4604 non-null   object 
 3   name         4191 non-null   object 
 4   street       2899 non-null   object 
 5   housenumber  2864 non-null   object 
 6   postcode     2779 non-null   float64
 7   city         2762 non-null   object 
 8   lat          4604 non-null   float64
 9   lon          4604 non-null   float64
 10  phone        1466 non-null   object 
 11  email        777 non-null    object 
 12  website      1758 non-null   object 
 13  amenity      3296 non-null   object 
dtypes: float64(3), int64(1), object(10)
memory usage: 503.7+ KB


In [10]:
mask_empty_address = df_raw['city'].isna()
dict_empty_address = df_raw.loc[mask_empty_address].apply(lambda row:geocode_lat_lon(row['lat'], row['lon']), axis=1)


In [37]:
type(dict_empty_address)

pandas.core.series.Series

In [12]:
add_addresses = pd.DataFrame(list(dict_empty_address), index=dict_empty_address.index)
df_raw.loc[mask_empty_address, add_addresses.columns] = add_addresses

  df_raw.loc[mask_empty_address, add_addresses.columns] = add_addresses


In [14]:
df_raw.isna().sum()

osm_id            0
region            0
type              0
name            413
street           16
housenumber      47
postcode         18
city              0
lat               0
lon               0
phone          3138
email          3827
website        2846
amenity        1308
dtype: int64

In [None]:
df_raw[(df_raw['name'].isna()) & (df_raw['email'].notna())] # check which faclities have no name but have email and fill those values with email domain

Unnamed: 0,osm_id,region,type,name,street,housenumber,postcode,city,lat,lon,phone,email,website,amenity
1276,1104913364,Brandenburg,way,,Pietschkerstraße,44,14480.0,Potsdam,52.378131,13.125325,+49 331 887 49580,pfiffikus@awo-potsdam.de,https://awo-potsdam.de/de/standort/kita-pfiffi...,kindergarten
1994,12327386579,Niedersachsen,node,,Dorfstraße,58a,21365.0,Adendorf,53.287278,10.445362,+49 4131 898 00 78,info@awosozial.de,,social_facility
2304,1341369737,Niedersachsen,way,,Im Tale,2-4,,Celle,52.654001,10.070353,,kita.imtale.celle@awo-juki.de,https://www.awo-juki.de/index.php?id=930&L=it%...,kindergarten
3847,273517107,Sachsen,way,,Gerichtsstraße,2,2779.0,Großschönau,50.896862,14.665772,+49 35841 2430,kinderland@awo-oberlausitz.de,https://awo-oberlausitz.de/kindertagesstaetten...,childcare
4416,124874406,Thüringen,way,,Mörlaer Straße,8b,7407.0,Rudolstadt,50.719988,11.322286,+49 3672 422 552,feste-burg@awo-rudolstadt.de,http://www.awo-rudolstadt.de,
4444,181239665,Thüringen,way,,Löwentorstraße,33,99752.0,Bleicherode,51.442816,10.569425,+49 36338 42486,leiterin@kita-bleicherode.de,,


In [19]:
df_raw.loc[df_raw['name'].isna(), 'name'] = df_raw['email'].str.split('@').str[1]

In [None]:
df_raw.isna().sum() # postcode is missing, same function for reverse geocoding needs t obe applied with rows having NaN for postcode , street and housenumber

osm_id            0
region            0
type              0
name            407
street           16
housenumber      47
postcode         18
city              0
lat               0
lon               0
phone          3138
email          3827
website        2846
amenity        1308
dtype: int64

In [34]:
postcode_mask = df_raw['postcode'].isna()
postcodes = df_raw.loc[postcode_mask].apply(lambda row: geocode_lat_lon(row['lat'], row['lon']), axis=1)
add_postcodes = pd.DataFrame(list(postcodes), index=postcodes.index)


In [39]:
street_mask = df_raw['street'].isna()
streets = df_raw.loc[street_mask].apply(lambda row: geocode_lat_lon(row['lat'], row['lon']), axis=1)
add_streets = pd.DataFrame(list(streets), index=streets.index)

In [40]:
df_raw.loc[street_mask, add_streets.columns] = add_streets

In [41]:
housenumber_mask = df_raw['housenumber'].isna()
hnumbers =df_raw.loc[housenumber_mask].apply(lambda row: geocode_lat_lon(row['lat'], row['lon']), axis=1)
add_hnumbers = pd.DataFrame(list(hnumbers), index=hnumbers.index)
df_raw.loc[housenumber_mask, add_hnumbers.columns] = add_hnumbers

In [43]:
df_raw.isna().sum() 

osm_id            0
region            0
type              0
name            407
street            0
housenumber       0
postcode          0
city              0
lat               0
lon               0
phone          3138
email          3827
website        2846
amenity        1308
dtype: int64

In [25]:
df_raw[df_raw['name'].isna()]['amenity'].value_counts()

amenity
recycling                 128
parking                    64
kindergarten               25
social_facility            11
childcare                   7
bicycle_parking             7
social_centre               3
atm                         2
charging_station            2
nursing_home                2
fountain                    1
shelter                     1
bicycle_repair_station      1
community_centre            1
waste_basket                1
events_venue                1
drinking_water              1
waste_disposal              1
vending_machine             1
cafe                        1
give_box                    1
public_bookcase             1
parking_entrance            1
Name: count, dtype: int64

In [None]:
current_db = pd.read_excel("2025_09_16_Einrichtunsdatenbank_Export_descriptions_final.xlsx" , sheet_name = 'Facilities')
current_db.sample(6)

Unnamed: 0,verband_id,name,rechtsform,adresse_strasse,adresse_zusatz,adresse_plz,adresse_ort,adresse_telefon,adresse_email,postfach_adresse,...,bagfw_kategorie,anzahl_vollzeit,anzahl_teilzeit,anzahl_ehrenamt,anzahl_plaetze,anzahl_fsj,anzahl_bfd,target_group,tags,carrier_id
7990,2458,"AWO-Kinderhaus ""Rhein-Au""",,Hauensteinstraße 49,,79713,Bad Säckingen,07751 91120,anonymisiert@awo-waldshut.de,,...,2.2.1,0,0,0,70,0,0,,"Kinder,Tageseinrichtungen,Kita,Kinderkrippen,K...",9201700000.0
7001,11621,Kindertagesstätte Pusteblume,,Stargarder Str. 32,,17094,Groß Nemerow,,,,...,2.2.1,6,10,0,54,0,0,,"Kinder,Tageseinrichtungen,Kita,Kinderkrippen,K...",12001500000.0
824,5033,Tagesgruppe nach § 32 SGB VIII,,Erich-Weinert-Promenade 2,,23966,Wismar,,anonymisiert@awo-wismar.de,,...,,0,3,0,8,0,0,,"Teilstationäre Einrichtungen,Kinder,Jugendliche",12000200000.0
2592,700,"AWO-Kindertagesstätte ""Die Okerknirpse""",e.V.,In den Schönen Morgen 12,,38300,Wolfenbüttel,,,,...,2.2.1,9,7,0,45,1,0,Kinder,"Kinder,Tageseinrichtungen,Kita,Kinderkrippen,K...",
6779,560,"BISS Schaumburg Beratung Ehe-, Erziehungs-/Leb...",,Bahnhofstr. 15 a,,31655,Stadthagen,05121-995127,,,...,3.3.1,0,0,0,0,0,0,,"Beratungsstellen,Familie,Erziehung,Gewalt",5202100000.0
3534,3045,Mobiler Sozialer Hilfsdienst,,Kornmarkt 24,,97421,Schweinfurt,,anonymisiert@awo-schweinfurt.de,,...,,2,0,0,0,0,0,,Senioren,10400800000.0


It could be hard to compare since names might differ : Kita or Kindertagesstätte or AWO-Kindertagesstätte, then names are under " " while in osm file are not , then some names have "-" in between while others just space (example "Rhein-Au" vs. "Rhein Au"). This might be because db file contains official names while osm is filled manually by users. Also, what I noticed in osm file is that some cell values are ordered in correct column 

One option is to standardise names in both files and replace accordingly. 

In [27]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4604 entries, 0 to 4603
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   osm_id       4604 non-null   int64  
 1   region       4604 non-null   object 
 2   type         4604 non-null   object 
 3   name         4197 non-null   object 
 4   street       4588 non-null   object 
 5   housenumber  4557 non-null   object 
 6   postcode     4586 non-null   object 
 7   city         4604 non-null   object 
 8   lat          4604 non-null   float64
 9   lon          4604 non-null   float64
 10  phone        1466 non-null   object 
 11  email        777 non-null    object 
 12  website      1758 non-null   object 
 13  amenity      3296 non-null   object 
dtypes: float64(2), int64(1), object(11)
memory usage: 503.7+ KB


In [None]:
#name_datetime = time.strftime("%Y%m%d-%H%M%S")
df_raw.to_csv(f"awo_{name_datetime}_osm_raw.csv", index=False, encoding='utf-8')