In [703]:
import pandas as pd
import json
from pathlib import Path

input_json_file = Path('../source/doctors_and_clinics_raw.geojson')
output_dir = Path('../source')
output_csv_file = output_dir / 'doctors.csv'

# A list to store our flat data
flattened_data = []

print(f"Loading {input_json_file}...")
# Open and load the Overpass JSON
with open(input_json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

print("File loaded. Processing 'elements'...")
# Iterate over each "element" in the 'elements' list
for element in data['elements']:
    
    if 'tags' in element:
        properties = element['tags'].copy() # .copy()
                
        properties['osm_id'] = element['id']
        properties['osm_type'] = element['type']
        
        if element['type'] == 'node':
            properties['longitude'] = element.get('lon')
            properties['latitude'] = element.get('lat')
            
        elif 'center' in element:
            properties['longitude'] = element['center'].get('lon')
            properties['latitude'] = element['center'].get('lat')
        
        else:
            properties['longitude'] = None
            properties['latitude'] = None
        
        # Add the assembled row to our main list
        flattened_data.append(properties)

if not flattened_data:
    print("Warning: No elements with tags were found in the file.")
else:
    print(f"Found {len(flattened_data)} elements with tags.")

# Create a DataFrame from list
df = pd.DataFrame(flattened_data)

# 9. Save the DataFrame to a CSV
try:
    df.to_csv(output_csv_file, index=False, encoding='utf-8')
    print(f"✅ File '{output_csv_file}' was successfully created!")
except Exception as e:
    print(f"❌ An error occurred while saving the CSV: {e}")

Loading ..\source\doctors_and_clinics_raw.geojson...
File loaded. Processing 'elements'...
Found 1657 elements with tags.
✅ File '..\source\doctors.csv' was successfully created!


In [704]:
df.head()

Unnamed: 0,addr:city,addr:housenumber,addr:postcode,addr:street,amenity,healthcare,healthcare:speciality,name,opening_hours,operator,...,work_accident,building:material,type,building:parts,contact:city,contact:country,contact:housenumber,contact:postcode,contact:street,contact:suburb
0,Berlin,122,12621.0,Münsterberger Weg,doctors,doctor,ophthalmology,Augenarzt Dr. med. Bashar Moustafa,Mo 08:00-14:00; Tu 08:00-14:00; We 13:00-18:00...,Bashar Moustafa,...,,,,,,,,,,
1,Berlin,5B,13353.0,Willdenowstraße,doctors,doctor,general,Dr. Katja Meißner,"Mo,Th 09:00-12:00,16:00-18:00; We,Fr 09:00-12:...",,...,,,,,,,,,,
2,Berlin,22A,13593.0,Obstallee,doctors,centre,general;gynaecology;paediatrics;internal;psych...,Medizinisches Versorgungszentrum Heerstraße Nord,,,...,,,,,,,,,,
3,Berlin,12a,12621.0,Karlstraße,doctors,doctor,,,"Mo 08:15-13:00,15:00-18:00;Tu 08:15-14:00;We 0...",,...,,,,,,,,,,
4,,,,,doctors,doctor,,Polimedica,,,...,,,,,,,,,,


In [705]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1657 entries, 0 to 1656
Columns: 210 entries, addr:city to contact:suburb
dtypes: float64(2), int64(1), object(207)
memory usage: 2.7+ MB


I'll inspect all the columns and drop columns we don't need.

In [706]:
print(df.columns.to_list())

['addr:city', 'addr:housenumber', 'addr:postcode', 'addr:street', 'amenity', 'healthcare', 'healthcare:speciality', 'name', 'opening_hours', 'operator', 'website', 'osm_id', 'osm_type', 'longitude', 'latitude', 'addr:country', 'addr:suburb', 'check_date:opening_hours', 'contact:phone', 'contact:website', 'level', 'wheelchair', 'description', 'short_name', 'email', 'fax', 'health_facility:type', 'health_specialty:family_medicine', 'health_specialty:internal_medicine', 'medical_system:western', 'phone', 'toilets:wheelchair', 'check_date', 'contact:email', 'contact:fax', 'opening_hours:url', 'image', 'outdoor_seating', 'dispensing', 'office', 'comment', 'wheelchair:description', 'health_specialty:obstetrics', 'health_specialty:reproductive_medicine', 'source', 'healthcare:alternative', 'operator:type', 'internet_access', 'toilets', 'not:brand:wikidata', 'health_specialty:ophthalmology', 'min_age', 'operator:start_date', 'health_specialty:gynaecology', 'emergency', 'health_specialty:orthop

In [707]:
df['operator'].value_counts()

operator
AnthroMed Berlin-Brandenburg gGmbH                          8
MRT-Akademie                                                5
Charite Campus Virchow Klinik                               4
Dr. med. Malgorzata Kazimierczak                            3
Helios Kliniken                                             3
                                                           ..
Dr.med. Astrid Eilers-Lönnecker;Dr.med. Stephan Rackwitz    1
Andrea Jacobshagen;Stefanie Malanowski                      1
MEDICO LEOPOLDPLATZ Service GmbH                            1
Sana Gesundheitszentren Berlin-Brandenburg                  1
Priv. Doz. Dr. med. Sabine Fitzek                           1
Name: count, Length: 423, dtype: int64

In [708]:
df['osm_id'].nunique()

1657

In [709]:
df['osm_id'].head()

0    266680057
1    268915280
2    362631112
3    407094165
4    409925339
Name: osm_id, dtype: int64

In [710]:
df['osm_type'].value_counts()

osm_type
node        1591
way           61
relation       5
Name: count, dtype: int64

In [711]:
df['level'].value_counts()

level
1        77
0        48
2        39
3        23
4        20
5        10
6         3
0.5       3
1;2;3     1
0;1;2     1
1;2       1
0;1       1
Name: count, dtype: int64

In [712]:
df['description'].nunique()

99

In [713]:
df['short_name'].nunique()

4

In [714]:
df['health_facility:type'].value_counts()

health_facility:type
office           58
health_centre     6
clinic            3
doctors           1
Name: count, dtype: int64

In [715]:
df['health_specialty:family_medicine'].value_counts()

health_specialty:family_medicine
main    13
yes      3
Name: count, dtype: int64

In [716]:
df['healthcare:speciality'].value_counts()

healthcare:speciality
general                                                                                                                 289
gynaecology                                                                                                             119
paediatrics                                                                                                              93
ophthalmology                                                                                                            87
otolaryngology                                                                                                           58
                                                                                                                       ... 
orthopaedics;acupuncture;chiropractic;trauma                                                                              1
general;ophthalmology;surgery;dermatology;gynaecology;otolaryngology;paediatrics;naturopathy;orthopaedics;psyc

In [717]:
df['health_specialty:internal_medicine'].value_counts()

health_specialty:internal_medicine
yes     10
main     2
Name: count, dtype: int64

In [718]:
df['medical_system:western'].value_counts()

medical_system:western
yes    62
Name: count, dtype: int64

In [719]:
df['opening_hours:url'].value_counts() 

opening_hours:url
http://www.arztpraxis-mehdi-zadeh.de/sprechstunden.html                        1
http://www.praxis-jessen.de/#kontakt                                           1
https://www.berghafenpraxis.de/unser-team                                      1
http://www.hno-ratmann.de/sprechzeiten.html                                    1
https://praxis-zehlendorf.de/oeffnungszeiten/                                  1
https://www.kinderarzt-zimmermann.de/                                          1
https://www.kinderaerzte-im-netz.de/aerzte/berlin/kroeber/sprechzeiten.html    1
Name: count, dtype: int64

In [720]:
df['outdoor_seating'].value_counts()

outdoor_seating
no    1
Name: count, dtype: int64

In [721]:
df['dispensing'].value_counts()

dispensing
yes    1
Name: count, dtype: int64

In [722]:
df['office'].value_counts()

office
physician    35
Name: count, dtype: int64

In [723]:
df['comment'].value_counts()

comment
eingang Saltykowstraße    1
Name: count, dtype: int64

In [724]:
df['health_specialty:obstetrics'].value_counts()

health_specialty:obstetrics
main    4
yes     1
Name: count, dtype: int64

In [725]:
df['health_specialty:reproductive_medicine'].value_counts()

health_specialty:reproductive_medicine
yes    1
Name: count, dtype: int64

In [726]:
df['source'].value_counts()

source
Geoportal Berlin / Hauskoordinaten                           18
survey                                                        9
Geoportal Berlin / k5_2012_sw_sued.zip                        2
Geoportal Berlin / Hausumringe                                2
local knowledge                                               2
http://www.dr-stammeier.de/Kontakt.html                       1
93 Unter Den Eichen                                           1
Geoportal Berlin / Hauskoordinaten;survey;local knowledge     1
Dakota 20                                                     1
survey;website                                                1
Geoportal Berlin / ALKIS Berlin - Gebäude                     1
Name: count, dtype: int64

In [727]:
df['healthcare:alternative'].value_counts()

healthcare:alternative
acupuncture    1
Name: count, dtype: int64

In [728]:
df['operator:type'].value_counts()

operator:type
private     24
public       3
business     1
Name: count, dtype: int64

In [729]:
df['internet_access'].value_counts()

internet_access
wlan    6
no      5
yes     1
Name: count, dtype: int64

In [730]:
df['toilets'].value_counts()

toilets
yes    2
Name: count, dtype: int64

In [731]:
df['not:brand:wikidata'].value_counts()

not:brand:wikidata
Q1958759    1
Name: count, dtype: int64

In [732]:
df['health_specialty:ophthalmology'].value_counts()

health_specialty:ophthalmology
yes     1
main    1
Name: count, dtype: int64

In [733]:
df['min_age'].value_counts()

min_age
4    1
Name: count, dtype: int64

In [734]:
df['health_specialty:gynaecology'].value_counts()

health_specialty:gynaecology
main    4
Name: count, dtype: int64

In [735]:
df['emergency'].value_counts()

emergency
no     8
yes    2
Name: count, dtype: int64

In [736]:
df['health_specialty:orthopaedics'].value_counts()

health_specialty:orthopaedics
main    3
Name: count, dtype: int64

In [737]:
df['opening_hours:signed'].value_counts()

opening_hours:signed
no     65
yes     1
Name: count, dtype: int64

In [738]:
df['health_specialty:family_medicine'].value_counts()

health_specialty:family_medicine
main    13
yes      3
Name: count, dtype: int64

In [739]:
df['name:en'].value_counts() 

name:en
Dermatologist Helena Dröge                                                         1
AID Friedrichshain                                                                 1
Dr. med. J. Nicklas                                                                1
Gynecological Practice of Bettina Gassen                                           1
orthoteam.berlin - orthopedic medical                                              1
Spreemedizin MVZ Rehberge (General medicine)                                       1
Oncological Outpatient Department                                                  1
Spreemedizin MVZ Berlin                                                            1
Endocrinology Berlin                                                               1
OPTICUM Eye Clinic                                                                 1
Orthomed Berlin - Bartholomäus Gabrys                                              1
Dr. (TIP/Tr) Mehmet Emin Turgut                          

In [740]:
df['health_specialty:dermatology'].value_counts()

health_specialty:dermatology
partial    1
main       1
yes        1
Name: count, dtype: int64

In [741]:
df['alt_name'].value_counts()

alt_name
Ambulantes Rehazentrum                                                        1
Augenarzt                                                                     1
bonedoctor                                                                    1
Ärztehaus                                                                     1
Orthopädische und Unfallchirurgische Praxisgemeinschaft am Leipziger Platz    1
PIA                                                                           1
Zentrum für ambulante Rehabilitation Berlin                                   1
Urologische Praxis                                                            1
Ärztehaus Müllerstraße 139                                                    1
Orthopädie & Chirurgie Stadtmitte                                             1
MVZ am St. Marien-Krankenhaus Berlin                                          1
Hautarzt-Praxis Neukölln                                                      1
Praxis für Onkologie im MVZ Hav

In [742]:
df['note'].value_counts()

note
im MVZ am Bahnhof Spandau                                                                                                                                                                                                                                5
Erreichbarkeit von den Gropiuspassagen: Schild 'GesundheitsZentrum Gropiuspassagen' folgen'; 'direkt über Woolworth 1.OG und Primark EG'; Erreichbarkeit vom Kundenparkhaus P1: 'Fahrstuhl ins 2. OG'                                                    2
Allgemeinmedizin, Diab Point, Dermatologie, Diabetolog., Gynäkologie,Kardiologie; Logopädie, Neurologie, Orthopäd. Unfallchirurg, Orthopädie Schuhtechnik & Sanitätshaus, Physiotherapie, Podologie, SC Dental Labor, Schmerzpraxis,Urologie, Zahnarz    1
Fachaerztin fuer Allgemeinmedizin                                                                                                                                                                                                                 

In [743]:
df['health_specialty:venereology'].value_counts()

health_specialty:venereology
main    1
yes     1
Name: count, dtype: int64

In [744]:
df['addr:housename'].value_counts()

addr:housename
Marheineke Markthalle        1
103                          1
Nebenhaus                    1
Haus 16                      1
ÄrzteZENTRUM Ruschestraße    1
Haus 17                      1
Haus 19                      1
Haupthaus                    1
Haus 5.2                     1
Haus 5.1                     1
Haus 5.3                     1
Haus 20                      1
Werner-Otto-Haus             1
Name: count, dtype: int64

In [745]:
df['description:de'].value_counts()

description:de
Sprechzeiten sind unterschiedlich                                                      1
Kernspintomogtaphie (MRT)                                                              1
Patienten ohne Termin werden in den ersten 30 Minuten der Öffnungszeiten angenommen    1
Name: count, dtype: int64

In [746]:
df['health_specialty:paediatrics'].value_counts()

health_specialty:paediatrics
main    4
yes     2
Name: count, dtype: int64

In [747]:
df['home_visit'].value_counts()

home_visit
yes    1
Name: count, dtype: int64

In [748]:
df['fixme'].value_counts()

fixme
Name der Praxis / des Arztes?                                              2
genaue position, level                                                     2
Hier hat möglicherweise der Betreiber der gewechselt, Website ist down.    1
type                                                                       1
Name: count, dtype: int64

In [749]:
df['addr:inclusion'].value_counts()

addr:inclusion
actual    3
Name: count, dtype: int64

In [750]:
df['deaf:description:de'].value_counts()

deaf:description:de
Patient*innen werden visuell aufgerufen.    2
Name: count, dtype: int64

In [751]:
df['health_specialty:psychotherapy'].value_counts()

health_specialty:psychotherapy
yes    2
Name: count, dtype: int64

In [752]:
df['health_specialty:traditional_chinese_medicine'].value_counts()

health_specialty:traditional_chinese_medicine
partial    1
Name: count, dtype: int64

In [753]:
df['addr:place'].value_counts()

addr:place
Garbátyplatz            2
Emser Platz             1
Heinrich-Heine-Platz    1
Berlin                  1
Name: count, dtype: int64

In [754]:
df['opening_hours:dr_weinhold'].value_counts()

opening_hours:dr_weinhold
Mo,We 08:00-12:00, 13:30-16:00; Tu 08:00-12:00; Th 08:00-12:00, 14:00-17:00; Fr 08:00-12:00    1
Name: count, dtype: int64

In [755]:
df['health_specialty:sports_medicine'].value_counts()

health_specialty:sports_medicine
additional    3
Name: count, dtype: int64

In [756]:
df['opening_hours:note'].value_counts()

opening_hours:note
Wednesday: only for private insurance    1
Name: count, dtype: int64

In [757]:
df['note:de'].value_counts()

note:de
Privatpraxis                                   1
Terminvereinbarungen Mo, Tu, Fr 12:30-13:00    1
Name: count, dtype: int64

In [758]:
df['name:de'].value_counts()

name:de
Arztpraxis Driesener Straße                                                 1
HNO-Praxis                                                                  1
AID Friedrichshain                                                          1
Hautarztpraxis Dr. Hasert                                                   1
Dr. med. Katja Rebell                                                       1
Spreemedizin MVZ Berlin                                                     1
Orthomed Berlin – Bartholomäus Gabrys - Orthopädie Friedrichshain           1
Praxis für Kinder- und Jugendpsychatrie MVZ Greven, Treuter und Kollegen    1
Name: count, dtype: int64

In [759]:
df['opening_hours:note:de'].value_counts()

opening_hours:note:de
Fr Termine nach Vereinbarung    1
Name: count, dtype: int64

In [760]:
df['start_date'].value_counts()

start_date
2003-09-03    1
1993          1
2014-10       1
2020-10       1
2023-03-01    1
2005          1
1921..1923    1
2013          1
Name: count, dtype: int64

In [761]:
df['language:tr'].value_counts()

language:tr
yes    3
Name: count, dtype: int64

In [762]:
df['opening_hours:office'].value_counts()

opening_hours:office
Mo-Th 09:00-12:00,15:00-18:00; We 10:00-12:00; Fr 09:00-12:00    1
Name: count, dtype: int64

In [763]:
df['survey:date'].value_counts()

survey:date
2022-03-02    3
Name: count, dtype: int64

In [764]:
df['building'].value_counts()

building
yes           34
commercial    11
civic          4
healthcare     3
office         2
clinic         2
house          1
apartments     1
Name: count, dtype: int64

In [765]:
df['ele'].value_counts()

ele
20    3
Name: count, dtype: int64

In [766]:
df['type'].value_counts()

type
multipolygon    5
Name: count, dtype: int64

In [767]:
df['check_date:opening_hours'].value_counts()


check_date:opening_hours
2022-11-27    5
2025-04-24    5
2024-08-10    4
2025-10-15    4
2025-10-16    4
             ..
2023-03-31    1
2024-10-16    1
2022-07-19    1
2025-05-05    1
2023-10-10    1
Name: count, Length: 216, dtype: int64

In [768]:
columns_to_drop=['osm_type','operator','level', 'short_name','health_facility:type', 'health_specialty:internal_medicine', 'medical_system:western', 'check_date', 'image', 
                 'outdoor_seating', 'dispensing', 'office', 'comment','health_specialty:obstetrics','health_specialty:reproductive_medicine', 'source','healthcare:alternative',
                 'operator:type','internet_access', 'toilets','not:brand:wikidata', 'health_specialty:ophthalmology', 'min_age', 'operator:start_date', 'health_specialty:gynaecology', 'emergency',
                 'health_specialty:orthopaedics','opening_hours:signed', 'health_specialty:dermatology','contact:fax', 'fax', 'health_specialty:family_medicine','name:en', 
                 'health_specialty:surgery', 'alt_name', 'health_specialty:venereology', 'addr:housename', 'health_specialty:paediatrics','home_visit', 'addr:inclusion', 
                 'blind:description:de', 'deaf:description:de', 'health_specialty:psychosomatic_medicine', 'health_specialty:psychotherapy',
                 'health_specialty:traditional_chinese_medicine', 'addr:place', 'payment:cash', 'payment:credit_cards', 'payment:debit_cards', 'contact:facebook', 'name:ru', 
                 'opening_hours:reception', 'operator:wikidata', 'health_specialty:emergency_medicine', 'health_specialty:sports_medicine', 'payment:mastercard', 'payment:visa',
                 'layer', 'opening_hours:note', 'note:de', 'health_specialty:diagnostic_radiology', 'health_specialty:dentistry', 'health_specialty:maxillofacial_surgery', 
                 'access', 'doctor', 'instagram', 'health_specialty:allergology', 'health_specialty:pulmonology', 'disease:hiv', 'disease:stds', 'start_date', 'blind:description', 
                 'operator:wikipedia', 'health_specialty:gastroenterology', 'internet_access:fee', 'internet_access:ssid', 'addr:floor', 'elevator',  'language:en', 'language:es',
                 'language:fr', 'language:tr', 'wheelchair:description:de', 'air_conditioning', 'health_service:prevention', 'health_service:test', 'lgbtq:trans',  
                 'health_specialty:cardiology', 'healthcare:speciality:de', 'health_specialty:paediatric_gastroenterology', 'beauty', 'opening_hours:covid19', 'name:ja', 'branch',
                 'name:signed', 'old_name', 'payment:american_express', 'payment:coins', 'payment:maestro', 'payment:notes', 'payment:telephone_cards', 'capacity', 'day_surgery', 
                 'health_specialty:pain_medicine', 'health_specialty:andrology', 'health_specialty:urology', 'health_specialty:optometry', 'survey:date', 'counselling_type:nutrition', 
                 'health_service:counselling', 'health_specialty:acupuncture', 'health_specialty:homoeopathy', 'health_specialty:mind_body_intervention','health_specialty:naturopathy', 
                 'health_specialty:osteopathy', 'health_specialty:phythotherapy', 'insurance:health', 'health_specialty:adult_psychiatry', 'health_specialty:neurology', 'reservation', 
                 'noname', 'payment:privat', 'contact:instagram', 'health_specialty:neonatology', 'health_specialty:ear_nose_throat', 'social_facility:for',  'language:de', 
                 'language:ku', 'language:ru', 'language:ar', 'language:th', 'fixme:type', 'health_service:examination', 'smoking', 'building', 'building:levels', 'neighborhood', 
                 'roof:shape', 'height', 'roof:levels', 'roof:colour', 'social_facility', 'wikidata', 'wikimedia_commons', 'deaf:description', 'official_name', 'brand', 'brand:wikidata', 
                 'wikipedia', 'roof:height', 'heritage', 'heritage:operator', 'lda:criteria', 'ref:lda', 'building:colour', 'ele', 'health_specialty:anesthesiology', 
                 'health_specialty:hand_surgery', 'health_specialty:trauma_surgery', 'not:operator:wikidata', 'work_accident', 'building:material', 'type', 'building:parts', 
                  'contact:city', 'contact:country', 'contact:housenumber', 'contact:postcode', 'contact:street', 'contact:suburb','check_date:opening_hours']
df.drop(columns=columns_to_drop, inplace=True, errors='ignore')
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1657 entries, 0 to 1656
Data columns (total 39 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   addr:city                  1146 non-null   object 
 1   addr:housenumber           1269 non-null   object 
 2   addr:postcode              1210 non-null   object 
 3   addr:street                1272 non-null   object 
 4   amenity                    1657 non-null   object 
 5   healthcare                 1647 non-null   object 
 6   healthcare:speciality      1437 non-null   object 
 7   name                       1611 non-null   object 
 8   opening_hours              1129 non-null   object 
 9   website                    662 non-null    object 
 10  osm_id                     1657 non-null   int64  
 11  longitude                  1591 non-null   float64
 12  latitude                   1591 non-null   float64
 13  addr:country               755 non-null    objec

Now, consolidate related columns (e.g., 'website', 'contact:website') into a single unified column.

In [769]:
   
all_filled = df[['website', 'contact:website','heritage:website','url']].notnull().all(axis=1)

print(df[all_filled][['website', 'contact:website','heritage:website','url']])

Empty DataFrame
Columns: [website, contact:website, heritage:website, url]
Index: []


In [770]:
df['website'] = df['website'].fillna(df['contact:website'])
df['website'] = df['website'].fillna(df['heritage:website'])
df['website'] = df['website'].fillna(df['url'])
df.drop(columns=['contact:website','heritage:website','url'], inplace=True, errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1657 entries, 0 to 1656
Data columns (total 36 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   addr:city                  1146 non-null   object 
 1   addr:housenumber           1269 non-null   object 
 2   addr:postcode              1210 non-null   object 
 3   addr:street                1272 non-null   object 
 4   amenity                    1657 non-null   object 
 5   healthcare                 1647 non-null   object 
 6   healthcare:speciality      1437 non-null   object 
 7   name                       1611 non-null   object 
 8   opening_hours              1129 non-null   object 
 9   website                    910 non-null    object 
 10  osm_id                     1657 non-null   int64  
 11  longitude                  1591 non-null   float64
 12  latitude                   1591 non-null   float64
 13  addr:country               755 non-null    objec

In [771]:
all_filled = df[['contact:phone', 'phone','phone_1','contact:mobile','mobile']].notnull().all(axis=1)

print(df[all_filled][['contact:phone', 'phone','phone_1','contact:mobile','mobile']])

Empty DataFrame
Columns: [contact:phone, phone, phone_1, contact:mobile, mobile]
Index: []


In [772]:
df['phone'] = df['phone'].fillna(df['contact:phone'])
df['phone'] = df['phone'].fillna(df['phone'])
df['phone'] = df['phone'].fillna(df['phone_1'])
df['phone'] = df['phone'].fillna(df['contact:mobile'])
df['phone'] = df['phone'].fillna(df['mobile'])
df.drop(columns=['contact:phone','phone','phone_1','contact:mobile','mobile'], inplace=True, errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1657 entries, 0 to 1656
Data columns (total 31 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   addr:city                  1146 non-null   object 
 1   addr:housenumber           1269 non-null   object 
 2   addr:postcode              1210 non-null   object 
 3   addr:street                1272 non-null   object 
 4   amenity                    1657 non-null   object 
 5   healthcare                 1647 non-null   object 
 6   healthcare:speciality      1437 non-null   object 
 7   name                       1611 non-null   object 
 8   opening_hours              1129 non-null   object 
 9   website                    910 non-null    object 
 10  osm_id                     1657 non-null   int64  
 11  longitude                  1591 non-null   float64
 12  latitude                   1591 non-null   float64
 13  addr:country               755 non-null    objec

In [773]:
all_filled = df[['email', 'contact:email']].notnull().all(axis=1)

print(df[all_filled][['email', 'contact:email']])

                              email                  contact:email
1331  karlshorst@ihre-hno-aerzte.de  karlshorst@ihre-hno-aerzte.de
1622    praxis@mvz-kinderexperts.de    praxis@mvz-kinderexperts.de


In [774]:
df.drop(columns=['contact:email'], inplace=True, errors='ignore')
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1657 entries, 0 to 1656
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   addr:city                  1146 non-null   object 
 1   addr:housenumber           1269 non-null   object 
 2   addr:postcode              1210 non-null   object 
 3   addr:street                1272 non-null   object 
 4   amenity                    1657 non-null   object 
 5   healthcare                 1647 non-null   object 
 6   healthcare:speciality      1437 non-null   object 
 7   name                       1611 non-null   object 
 8   opening_hours              1129 non-null   object 
 9   website                    910 non-null    object 
 10  osm_id                     1657 non-null   int64  
 11  longitude                  1591 non-null   float64
 12  latitude                   1591 non-null   float64
 13  addr:country               755 non-null    objec

In [775]:
all_filled = df[['website', 'opening_hours:url']].notnull().all(axis=1)

print(df[all_filled][['website', 'opening_hours:url']])

                                                website  \
6                 http://www.arztpraxis-mehdi-zadeh.de/   
227                       https://www.praxis-jessen.de/   
618                     https://www.berghafenpraxis.de/   
620                           http://www.hno-ratmann.de   
835                       https://praxis-zehlendorf.de/   
1029              https://www.kinderarzt-zimmermann.de/   
1421  https://www.kinderaerzte-im-netz.de/aerzte/ber...   

                                      opening_hours:url  
6     http://www.arztpraxis-mehdi-zadeh.de/sprechstu...  
227                http://www.praxis-jessen.de/#kontakt  
618           https://www.berghafenpraxis.de/unser-team  
620         http://www.hno-ratmann.de/sprechzeiten.html  
835       https://praxis-zehlendorf.de/oeffnungszeiten/  
1029              https://www.kinderarzt-zimmermann.de/  
1421  https://www.kinderaerzte-im-netz.de/aerzte/ber...  


In [776]:
df.drop(columns=['opening_hours:url'], inplace=True, errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1657 entries, 0 to 1656
Data columns (total 29 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   addr:city                  1146 non-null   object 
 1   addr:housenumber           1269 non-null   object 
 2   addr:postcode              1210 non-null   object 
 3   addr:street                1272 non-null   object 
 4   amenity                    1657 non-null   object 
 5   healthcare                 1647 non-null   object 
 6   healthcare:speciality      1437 non-null   object 
 7   name                       1611 non-null   object 
 8   opening_hours              1129 non-null   object 
 9   website                    910 non-null    object 
 10  osm_id                     1657 non-null   int64  
 11  longitude                  1591 non-null   float64
 12  latitude                   1591 non-null   float64
 13  addr:country               755 non-null    objec

In [777]:
all_filled = df[['name', 'name:de']].notnull().all(axis=1)

print(df[all_filled][['name', 'name:de']])

                                                   name  \
333                         Arztpraxis Driesener Straße   
346                                          HNO-Praxis   
350                                  AID Friedrichshain   
380                           Hautarztpraxis Dr. Hasert   
1139                              Dr. med. Katja Rebell   
1269                            Spreemedizin Berlin MVZ   
1394                                    Orthomed Berlin   
1574  Praxis für Kinder- und Jugendpsychatrie MVZ Gr...   

                                                name:de  
333                         Arztpraxis Driesener Straße  
346                                          HNO-Praxis  
350                                  AID Friedrichshain  
380                           Hautarztpraxis Dr. Hasert  
1139                              Dr. med. Katja Rebell  
1269                            Spreemedizin MVZ Berlin  
1394  Orthomed Berlin – Bartholomäus Gabrys - Orthop...  
1574

In [778]:
df.drop(columns=['name:de'], inplace=True, errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1657 entries, 0 to 1656
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   addr:city                  1146 non-null   object 
 1   addr:housenumber           1269 non-null   object 
 2   addr:postcode              1210 non-null   object 
 3   addr:street                1272 non-null   object 
 4   amenity                    1657 non-null   object 
 5   healthcare                 1647 non-null   object 
 6   healthcare:speciality      1437 non-null   object 
 7   name                       1611 non-null   object 
 8   opening_hours              1129 non-null   object 
 9   website                    910 non-null    object 
 10  osm_id                     1657 non-null   int64  
 11  longitude                  1591 non-null   float64
 12  latitude                   1591 non-null   float64
 13  addr:country               755 non-null    objec

In [779]:
df['addr:country'].unique()

array([nan, 'DE'], dtype=object)

In [780]:
df['addr:city'].unique()

array(['Berlin', nan], dtype=object)

In [781]:
df['healthcare'].unique()

array(['doctor', 'centre', 'clinic', 'doctor;alternative',
       'psychotherapist', 'dialysis', 'dentist', 'rehabilitation',
       'optometrist', 'physiotherapist', 'speech_therapist', nan],
      dtype=object)

In [782]:
df.drop(columns=['healthcare'], inplace=True, errors='ignore')

In [783]:
df['healthcare:speciality'].unique()

array(['ophthalmology', 'general',
       'general;gynaecology;paediatrics;internal;psychiatry;neurology;oncology;obstetrics',
       nan, 'general;internal', 'paediatrics', 'internal',
       'orthodontics;plastic_surgery;dental_oral_maxillo_facial_surgery',
       'orthopaedics', 'urology', 'general;haematology;oncology;internal',
       'radiology', 'nephrology', 'general;internal;infectious_diseases',
       'general;naturopathy', 'internal;oncology', 'otolaryngology',
       'surgery', 'neurology', 'internal;gastroenterology',
       'surgery;general', 'neurosurgery', 'gynaecology',
       'general;gastroenterology;optometry;gynaecology;cardiology;internal;dentist',
       'dermatology', 'surgery;general;orthopaedics;trauma;proctology',
       'pulmonology', 'general;sports_physician', 'behavior',
       'general;paediatrics', 'general;neurosurgery',
       'oncology;internal;haematology', 'child_psychiatry',
       'general; gynaecology; ophthalmology; orthopaedics; otolaryngolog

Let's rename the columns.

In [784]:
df.rename(columns={
    'addr:city': 'city',
    'addr:housenumber': 'housenumber',
    'addr:postcode': 'postcode',
    'addr:street': 'street',
    'healthcare:speciality': 'speciality',
    'addr:country': 'country',
    'addr:suburb': 'suburb',
    'osm_id': 'id',
    'toilets:wheelchair': 'toilets_wheelchair',
    'wheelchair:description': 'wheelchair_description',
    'contact:email': 'email1'
}, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1657 entries, 0 to 1656
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   city                       1146 non-null   object 
 1   housenumber                1269 non-null   object 
 2   postcode                   1210 non-null   object 
 3   street                     1272 non-null   object 
 4   amenity                    1657 non-null   object 
 5   speciality                 1437 non-null   object 
 6   name                       1611 non-null   object 
 7   opening_hours              1129 non-null   object 
 8   website                    910 non-null    object 
 9   id                         1657 non-null   int64  
 10  longitude                  1591 non-null   float64
 11  latitude                   1591 non-null   float64
 12  country                    755 non-null    object 
 13  suburb                     738 non-null    objec

In [785]:
df['note'].unique()

array([nan,
       'Allgemeinmedizin, Diab Point, Dermatologie, Diabetolog., Gynäkologie,Kardiologie; Logopädie, Neurologie, Orthopäd. Unfallchirurg, Orthopädie Schuhtechnik & Sanitätshaus, Physiotherapie, Podologie, SC Dental Labor, Schmerzpraxis,Urologie, Zahnarz',
       'Fachaerztin fuer Allgemeinmedizin',
       'Klinik für Allgemein-, Visceral- und Transplantationschirurgie Experimentelle Chirurgie und Regenerative Medizin Charité - Campus Virchow Universitätsmedizin Berlin Forschungshaus – Forum 4 Augustenburger Platz 1 13353 Berlin',
       'Dr. Christian Rubner, Monika Schneider, Manuela Stoppe, Dr. Harald Wolf',
       'Dr. med. Gerd Menzel, Dr. med. Simone Kühnlein, Jens Niermann',
       'Praxis für Urologie und Anti-Aging-Medizin',
       'im MVZ am Bahnhof Spandau', 'Keine Kassenleistungen',
       'treatment with and without health insurance', 'Anmeldung erbeten',
       "Erreichbarkeit von den Gropiuspassagen: Schild 'GesundheitsZentrum Gropiuspassagen' folgen'; 'direkt

In [786]:
all_filled = df[['description', 'description:de','description:en']].notnull().all(axis=1)

print(df[all_filled][['description', 'description:de','description:en']])

Empty DataFrame
Columns: [description, description:de, description:en]
Index: []


In [787]:
df['description'] = df['description'].fillna(df['description:de'])
df['description'] = df['description'].fillna(df['description:en'])
df.drop(columns=['description:de','description:en','note'], inplace=True, errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1657 entries, 0 to 1656
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   city                       1146 non-null   object 
 1   housenumber                1269 non-null   object 
 2   postcode                   1210 non-null   object 
 3   street                     1272 non-null   object 
 4   amenity                    1657 non-null   object 
 5   speciality                 1437 non-null   object 
 6   name                       1611 non-null   object 
 7   opening_hours              1129 non-null   object 
 8   website                    910 non-null    object 
 9   id                         1657 non-null   int64  
 10  longitude                  1591 non-null   float64
 11  latitude                   1591 non-null   float64
 12  country                    755 non-null    object 
 13  suburb                     738 non-null    objec

In [788]:
all_filled = df[['wheelchair_description', 'wheelchair:description:en']].notnull().all(axis=1)

print(df[all_filled][['wheelchair_description', 'wheelchair:description:en']])

Empty DataFrame
Columns: [wheelchair_description, wheelchair:description:en]
Index: []


In [789]:
df['wheelchair_description'] = df['wheelchair_description'].fillna(df['wheelchair:description:en'])
df.drop(columns=['wheelchair:description:en'], inplace=True, errors='ignore')

In [790]:
df_1=df[df['opening_hours:office'].notna()]
print(df_1)

        city housenumber postcode        street  amenity speciality  \
1081  Berlin          29    12163  Schloßstraße  doctors  neurology   

                   name opening_hours                           website  \
1081  Neurologie Berlin           NaN  https://www.neurologie-berlin.de   

              id  ...    suburb  wheelchair description email  \
1081  9548281680  ...  Steglitz         NaN         NaN   NaN   

     toilets_wheelchair wheelchair_description fixme  \
1081                NaN                    NaN   NaN   

     opening_hours:dr_weinhold opening_hours:note:de  \
1081                       NaN                   NaN   

                                   opening_hours:office  
1081  Mo-Th 09:00-12:00,15:00-18:00; We 10:00-12:00;...  

[1 rows x 23 columns]


In [791]:
all_filled = df[['opening_hours', 'opening_hours:dr_weinhold','opening_hours:office']].notnull().all(axis=1)

print(df[all_filled][['opening_hours', 'opening_hours:dr_weinhold','opening_hours:office']])

Empty DataFrame
Columns: [opening_hours, opening_hours:dr_weinhold, opening_hours:office]
Index: []


In [792]:
df['opening_hours'] = df['opening_hours'].fillna(df['opening_hours:dr_weinhold'])
df['opening_hours'] = df['opening_hours'].fillna(df['opening_hours:office'])
df.drop(columns=['opening_hours:dr_weinhold','opening_hours:office','opening_hours:note:de'], inplace=True, errors='ignore')

In [793]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1657 entries, 0 to 1656
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    1146 non-null   object 
 1   housenumber             1269 non-null   object 
 2   postcode                1210 non-null   object 
 3   street                  1272 non-null   object 
 4   amenity                 1657 non-null   object 
 5   speciality              1437 non-null   object 
 6   name                    1611 non-null   object 
 7   opening_hours           1131 non-null   object 
 8   website                 910 non-null    object 
 9   id                      1657 non-null   int64  
 10  longitude               1591 non-null   float64
 11  latitude                1591 non-null   float64
 12  country                 755 non-null    object 
 13  suburb                  738 non-null    object 
 14  wheelchair              583 non-null    

In [794]:
df['amenity'].unique()

array(['doctors', 'clinic'], dtype=object)

In [795]:
df['fixme'].unique()

array([nan, 'Name der Praxis / des Arztes?', 'genaue position, level',
       'Hier hat möglicherweise der Betreiber der gewechselt, Website ist down.',
       'type'], dtype=object)

I would like to inspect missing values in the 'name' column.

In [None]:
missing_names_df = df[df['name'].isna()]

# Define the specific columns to see
columns_to_show = [
    'website',
    'speciality', 
    'street', 
    'housenumber', 
    'postcode', 
    'city', 
    'suburb'
]

print(f"Found {len(missing_names_df)} objects without a name.")
print("Showing them with only 'name', 'website', and 'address' columns:")

print(missing_names_df[columns_to_show])

Found 46 objects without a name.
Showing them with only 'name', 'website', and 'address' columns:
                                                website  \
3                   https://www.drhenriettefriedrich.de   
51                                                  NaN   
76                   https://www.berliner-augenarzt.de/   
84                                                  NaN   
95                                                  NaN   
142                                                 NaN   
166                                                 NaN   
390                    https://www.mvz-berlin-rudow.de/   
391                    https://www.mvz-berlin-rudow.de/   
392                    https://www.mvz-berlin-rudow.de/   
466                                                 NaN   
477                                                 NaN   
522              https://www.gastroenterologie-horn.de/   
558                                                 NaN   
563              

Checking 'fixme' column.

In [797]:
# Find all rows where 'fixme' is NOT empty (.notna())
fixme_rows_df = df[df['fixme'].notna()]

# Define the columns
columns_to_show = [
    'name', 
    'speciality',
    'website', 
    'street', 
    'housenumber', 
    'postcode', 
    'city', 
    'suburb',
    'fixme' # Let's add the 'fixme' column itself to see the note
]

# Print the 'fixme_rows_df', showing only those columns
print(f"Found {len(fixme_rows_df)} objects with a 'fixme' tag:")
print("Showing them with 'name', 'website', 'address', and the 'fixme' note:")
print(fixme_rows_df[columns_to_show])

Found 6 objects with a 'fixme' tag:
Showing them with 'name', 'website', 'address', and the 'fixme' note:
                                         name        speciality  \
166                                       NaN  child_psychiatry   
305       Augenarzt Dr. med. Ina Kannengießer     ophthalmology   
466                                       NaN      radiotherapy   
471                 Dr. med. Wolfgang Heidler               NaN   
956                   HNO Praxis Susanne Holz    otolaryngology   
1431  Praxis für Oralchirurgie Thekla Wandelt               NaN   

                               website           street housenumber postcode  \
166                                NaN              NaN         NaN      NaN   
305             https://frohnaugen.de/  Zeltinger Platz         4-6      NaN   
466                                NaN      Kormoranweg          31      NaN   
471   https://www.dr-heidler-berlin.de  Rathener Straße           6    12627   
956         http://www.h

I'll check if there are any useless rows without any obligatory info and drop them.

In [798]:
# Define the columns that must ALL be empty
check_cols = ['name', 'website', 'street', 'housenumber']

# Find rows where ALL of these are empty (NaN)
# .isna() finds empty cells
# .all(axis=1) checks if ALL are True horizontally (for that row)
empty_rows_condition = df[check_cols].isna().all(axis=1)

empty_rows_df = df[empty_rows_condition]

# Define the *other* columns that might be interesting
# (to see if these rows are *completely* useless)
columns_to_show = [
    'name', 
    'website', 
    'street', 
    'housenumber',
    'amenity', 
    'speciality'  
]

print(f"Found {len(empty_rows_df)} rows where 'name', 'website', 'street', AND 'housenumber' are all empty:")
print(empty_rows_df[columns_to_show])

Found 14 rows where 'name', 'website', 'street', AND 'housenumber' are all empty:
     name website street housenumber  amenity        speciality
84    NaN     NaN    NaN         NaN  doctors       gynaecology
95    NaN     NaN    NaN         NaN  doctors      orthopaedics
166   NaN     NaN    NaN         NaN  doctors  child_psychiatry
477   NaN     NaN    NaN         NaN  doctors               NaN
558   NaN     NaN    NaN         NaN  doctors               NaN
717   NaN     NaN    NaN         NaN  doctors               NaN
771   NaN     NaN    NaN         NaN  doctors               NaN
881   NaN     NaN    NaN         NaN  doctors               NaN
1205  NaN     NaN    NaN         NaN  doctors               NaN
1219  NaN     NaN    NaN         NaN  doctors               NaN
1489  NaN     NaN    NaN         NaN  doctors      occupational
1544  NaN     NaN    NaN         NaN  doctors               NaN
1648  NaN     NaN    NaN         NaN   clinic               NaN
1651  NaN     NaN    N

In [799]:
# Get the row count *before* dropping
original_row_count = len(df)
rows_to_drop_count = len(empty_rows_df)

# The ~ (tilde) operator inverts the boolean condition.
# This means we "keep all rows that are NOT (~) in our empty condition".
df = df[~empty_rows_condition].copy()

# .copy() is important to prevent a 'SettingWithCopyWarning'

# --- Report the result ---
print(f"Original DataFrame size: {original_row_count} rows.")
print(f"Dropped {rows_to_drop_count} rows.")
print(f"New DataFrame size: {len(df)} rows.")
print("DataFrame has been successfully cleaned of these usless rows.")

Original DataFrame size: 1657 rows.
Dropped 14 rows.
New DataFrame size: 1643 rows.
DataFrame has been successfully cleaned of these usless rows.


Now we can drop 'fixme' column too.

In [800]:
df.drop(columns=['fixme'], inplace=True, errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1643 entries, 0 to 1656
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    1146 non-null   object 
 1   housenumber             1269 non-null   object 
 2   postcode                1210 non-null   object 
 3   street                  1272 non-null   object 
 4   amenity                 1643 non-null   object 
 5   speciality              1433 non-null   object 
 6   name                    1611 non-null   object 
 7   opening_hours           1130 non-null   object 
 8   website                 910 non-null    object 
 9   id                      1643 non-null   int64  
 10  longitude               1579 non-null   float64
 11  latitude                1579 non-null   float64
 12  country                 755 non-null    object 
 13  suburb                  738 non-null    object 
 14  wheelchair              582 non-null    objec

Since we have missing values in the longitude and latitude columns, we are using geocoding to fill them in.

In [801]:
from geopy.geocoders import Nominatim
from tqdm.auto import tqdm
import time

# 1. Initialize the geocoder
# 'user_agent' is just a unique name for your application (required)
geolocator = Nominatim(user_agent="my_doctors_analyzer_v1")

# 2. Find the indices of rows that need to be filled
indices_to_fill = df[df['latitude'].isnull()].index

print(f"Found {len(indices_to_fill)} rows without coordinates. Starting geocoding...")

# 3. Loop through these indices using TQDM for a progress bar
for index in tqdm(indices_to_fill, desc="Geocoding addresses"):
    
    # 4. Get the row and safely build the address
    row = df.loc[index]
    address_parts = []
    
    # Build the address, skipping NaN (empty) values
    if pd.notna(row['street']):
        address_parts.append(row['street'])
    if pd.notna(row['housenumber']):
        address_parts.append(str(row['housenumber'])) # use str just in case the house number is a number
    if pd.notna(row['postcode']):
        address_parts.append(str(row['postcode']))
        
    # Add the city, default to 'Berlin' if the city column is empty
    city = row['city'] if pd.notna(row['city']) else 'Berlin'
    address_parts.append(city)
    address_parts.append('Germany')
    
    address_string = ", ".join(address_parts)
    
    try:
        # 5. Send the request
        location = geolocator.geocode(address_string, timeout=10)
        
        if location:
            # 6. If the address is found, write the coordinates into the df
            df.loc[index, 'latitude'] = location.latitude
            df.loc[index, 'longitude'] = location.longitude
        else:
            print(f"  Warning: Address not found for: {address_string}")
            
    except Exception as e:
        print(f"  Error geocoding {address_string}: {e}")
    
    # 7. IMPORTANT: Pause for 1.1 seconds
    # Nominatim is a free service and requires a rate limit (no more than 1 request per second)
    time.sleep(1.1)

print("Geocoding finished. ✅")

Found 64 rows without coordinates. Starting geocoding...


Geocoding addresses:   0%|          | 0/64 [00:00<?, ?it/s]

Geocoding finished. ✅


In [802]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1643 entries, 0 to 1656
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    1146 non-null   object 
 1   housenumber             1269 non-null   object 
 2   postcode                1210 non-null   object 
 3   street                  1272 non-null   object 
 4   amenity                 1643 non-null   object 
 5   speciality              1433 non-null   object 
 6   name                    1611 non-null   object 
 7   opening_hours           1130 non-null   object 
 8   website                 910 non-null    object 
 9   id                      1643 non-null   int64  
 10  longitude               1643 non-null   float64
 11  latitude                1643 non-null   float64
 12  country                 755 non-null    object 
 13  suburb                  738 non-null    object 
 14  wheelchair              582 non-null    objec

Now we have all rows in 'longitude' and 'latitude' filled.

And let's to try reverse geocoding to fill in the missing values in the address columns.

In [803]:
# 1. Initialize the geocoder (using a new user_agent)
geolocator = Nominatim(user_agent="my_doctors_analyzer_v2")

# 2. Find rows where the street address is missing
# (We assume if the street is missing, the rest might be too)
indices_to_fill = df[df['street'].isnull()].index
print(f"Found {len(indices_to_fill)} rows with missing addresses. Starting reverse geocoding...")

# 3. Loop through these indices
for index in tqdm(indices_to_fill, desc="Reverse Geocoding"):
    
    # 4. Get the coordinates from the row
    row = df.loc[index]
    coordinates = f"{row['latitude']}, {row['longitude']}"
    
    try:
        # 5. Send the reverse geocoding request
        # We use language='en' to get consistent address keys (like 'road', 'city')
        location = geolocator.reverse(coordinates, language='en', timeout=10)
        
        if location and location.raw.get('address'):
            # This is the dictionary with all address parts
            address = location.raw['address']
            
            # 6. Safely fill ONLY the missing (NaN) values
            # We check each one so we don't overwrite existing good data
            
            if pd.isna(row['street']):
                df.loc[index, 'street'] = address.get('road')
            
            if pd.isna(row['housenumber']):
                df.loc[index, 'housenumber'] = address.get('house_number')
                
            if pd.isna(row['postcode']):
                df.loc[index, 'postcode'] = address.get('postcode')
                
            if pd.isna(row['suburb']):
                df.loc[index, 'suburb'] = address.get('suburb')

            if pd.isna(row['city']):
                # City can have fallbacks (like 'town' or 'village')
                city = address.get('city', address.get('town', address.get('village')))
                df.loc[index, 'city'] = city
        else:
            print(f"  Warning: Address not found for coordinates: {coordinates}")
            
    except Exception as e:
        print(f"  Error geocoding {coordinates}: {e}")
    
    # 7. IMPORTANT: Pause for 1.1 seconds to respect Nominatim's free policy
    time.sleep(1.1)

print("Reverse geocoding finished. ✅")

Found 371 rows with missing addresses. Starting reverse geocoding...


Reverse Geocoding:   0%|          | 0/371 [00:00<?, ?it/s]

Reverse geocoding finished. ✅


In [804]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1643 entries, 0 to 1656
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    1512 non-null   object 
 1   housenumber             1335 non-null   object 
 2   postcode                1571 non-null   object 
 3   street                  1639 non-null   object 
 4   amenity                 1643 non-null   object 
 5   speciality              1433 non-null   object 
 6   name                    1611 non-null   object 
 7   opening_hours           1130 non-null   object 
 8   website                 910 non-null    object 
 9   id                      1643 non-null   int64  
 10  longitude               1643 non-null   float64
 11  latitude                1643 non-null   float64
 12  country                 755 non-null    object 
 13  suburb                  1105 non-null   object 
 14  wheelchair              582 non-null    objec

In [805]:
# Save the final result to a file

# Define the path to the file inside the 'clean' folder
full_path = Path('../clean/doctors_clean.csv')

# Save the DataFrame using the constructed path
df.to_csv(full_path, index=False, encoding='utf-8-sig')