In [340]:
import pandas as pd
import json
from pathlib import Path

# --- Configure your files here ---
input_geojson_file = Path('../source/night_clubs_raw.geojson')  # The name of your downloaded GeoJSON
output_dir = Path('../source')
output_csv_file = output_dir / 'night_clubs.csv'
# ----------------------------------

# A list to store our flat data
flattened_data = []

# 1. Open and load the GeoJSON
with open(input_geojson_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# 2. Iterate over each "club" in the 'features' list
for feature in data['features']:
    # 3. Copy all "properties" (name, website, etc.)
    # This is already almost a finished row
    properties = feature['properties']
    
    # 4. Get the geometry (coordinates)
    geometry = feature['geometry']
    
    # Add coordinates to our row.
    # Make sure it's a 'Point' and has 'coordinates'
    if geometry and geometry['type'] == 'Point' and 'coordinates' in geometry:
        # In GeoJSON, the order is [longitude, latitude]
        properties['longitude'] = geometry['coordinates'][0]
        properties['latitude'] = geometry['coordinates'][1]
    
    # 5. Add the assembled row to our main list
    flattened_data.append(properties)

# 6. Create a DataFrame (table) from our list
df = pd.DataFrame(flattened_data)

# 7. Save the DataFrame to a CSV
# index=False - to avoid adding an extra column with row numbers
# encoding='utf-8' - for correct display of characters (e.g., German letters)
df.to_csv(output_csv_file, index=False, encoding='utf-8')

print(f"File '{output_csv_file}' was successfully created!")

File '..\source\night_clubs.csv' was successfully created!


In [341]:
df.head()

Unnamed: 0,@id,amenity,building,name,phone,website,wheelchair,building:levels,email,opening_hours:signed,...,name:es,delivery,takeaway,payment:cards,karaoke,disused:amenity,disused:name,charge:conditional,fee,nudism
0,way/23278633,nightclub,yes,Roadrunners Rock & Motor Club,+49 30 78082991,http://www.roadrunners-paradise.de,limited,,,,...,,,,,,,,,,
1,way/24248500,nightclub,residential,Werk9,+49 30 20165823,https://www.werk9.de/,limited,1.0,info@werk9.de,no,...,,,,,,,,,,
2,way/24283864,nightclub,commercial,Pride Warehouse,,,limited,1.0,,,...,,,,,,,,,,
3,way/36908987,nightclub,industrial,Gretchen,,,yes,1.0,gretchen@gretchen-club.de,,...,,,,,,,,,,
4,way/41474936,nightclub,school,Duncker,,,yes,1.0,,no,...,,,,,,,,,,


In [342]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 137 columns):
 #    Column                         Dtype  
---   ------                         -----  
 0    @id                            object 
 1    amenity                        object 
 2    building                       object 
 3    name                           object 
 4    phone                          object 
 5    website                        object 
 6    wheelchair                     object 
 7    building:levels                object 
 8    email                          object 
 9    opening_hours:signed           object 
 10   roof:shape                     object 
 11   smoking                        object 
 12   toilets:wheelchair             object 
 13   wheelchair:description         object 
 14   addr:city                      object 
 15   addr:country                   object 
 16   addr:housenumber               object 
 17   addr:postcode                  ob

In [343]:
df['amenity'].unique()

array(['nightclub'], dtype=object)

In [344]:
df['building'].unique()

array(['yes', 'residential', 'commercial', 'industrial', 'school',
       'viaduct', 'boat', nan], dtype=object)

In [345]:
df['building:levels'].unique()

array([nan, '1', '2', '3', '5'], dtype=object)

In [346]:
df['opening_hours:signed'].unique()


array([nan, 'no'], dtype=object)

In [347]:
df['roof:shape'].unique()

array([nan, 'flat', 'gabled', 'hipped'], dtype=object)

In [348]:
df['toilets:wheelchair'].unique()

array([nan, 'yes', 'no'], dtype=object)

In [349]:
df['wheelchair:description'].unique()

array([nan,
       'Haben für den Eingang eine Rampe, die muss bei Bedarf hingelegt werden. Eine behindertengerechte Toilette ist vorhanden.',
       'Tribüne rollstuhlgerecht bei Konzerten', 'enge Toilette',
       'Durchs Tor kann man ebenerdig, kein Rolli-WC',
       'Stufe am Haupteingang, Rampe vorhanden. Für das rolligerechte WC bitte vor Ort nach dem Schlüssel fragen.',
       'WC war gerade kaputt, ansonsten im EG alles ok erreichbar',
       'Befindet sich im 1. Stock. Achtung: Ignoriert das Nichtraucherschutzgesetz.',
       'eine Rollstuhlrampe wird auf Anfrage ausgelegt',
       'Es gibt zwar eine rollstuhlgerechte Toilette, aber auch Treppen am Eingang. Man kann wohl hinten rum über einen Lastenaufzug...',
       'Rampe am Eingang wird auf Nachfrage hingelegt;Drinnen: Rollstuh auf ein Podest heben zum Konzert schauen',
       'keine Stufen, alles mit Rampen und Fahrstühlen zu erreichen, lediglich die Rampen im Garten lassen sich nur mit guter Armfunktion bewältigen, WCs si

In [350]:
df['addr:suburb'].unique()

array([nan, 'Friedrichshain', 'Prenzlauer Berg', 'Weißensee',
       'Gesundbrunnen', 'Mitte', 'Rummelsburg', 'Lichtenberg',
       'Kreuzberg', 'Tempelhof', 'Halensee', 'Neukölln', 'Charlottenburg',
       'Schöneberg', 'Moabit', 'Wedding', 'Märkisches Viertel',
       'Alt-Treptow', 'Buckow', 'Spandau', 'Niederschöneweide',
       'Charlottenburg-Nord'], dtype=object)

In [351]:
df['roof:colour'].unique()

array([nan, '#74817e', '#3b3d46', '#414141'], dtype=object)

In [352]:
df['roof:levels'].unique()

array([nan, '0', '1'], dtype=object)

In [353]:
df['alt_name'].unique()

array([nan, 'Dunckerclub', 'Postbahnhof am Ostbahnhof',
       'KitKat Club;KitKatClub', 'Havanna Club', 'Hole Berlin',
       'Berghain Panorama Bar', 'Club OST'], dtype=object)

In [354]:
df['check_date'].unique()

array([nan, '2025-01-26', '2024-08-28', '2023-11-03', '2025-06-30',
       '2023-08-29', '2024-11-26', '2023-11-01', '2024-08-17',
       '2023-06-04', '2025-07-03', '2025-08-26', '2023-11-10',
       '2024-03-03', '2024-06-10', '2025-05-03', '2023-07-23',
       '2024-09-01', '2025-09-28', '2023-05-23', '2024-09-08',
       '2025-09-13', '2024-07-27', '2024-06-16', '2025-07-02',
       '2024-11-02', '2025-06-21'], dtype=object)

In [355]:
df['check_date:opening_hours'].unique()

array([nan, '2025-01-26', '2025-10-08', '2025-01-20', '2024-11-26',
       '2025-10-14', '2025-03-10', '2023-11-29', '2025-08-21',
       '2023-11-17', '2024-02-19', '2025-09-28', '2023-04-02'],
      dtype=object)

In [356]:
df['image'].unique()

array([nan, 'https://www.dunckerclub.de/bilder/duncker_aussen.jpg',
       'https://media04.berliner-woche.de/event/2016/05/18/6/16526_XXL.png',
       'https://www.schwuz.de/wp-content/uploads/2022/03/2013-gruendung-schwuz-verwaltungs-gmbh-berlin.jpg',
       'https://www.theclubmap.com/wp-content/uploads/2021/09/ritter-butzke-club-berlin-7.jpg',
       'https://groove.de/wp-content/uploads/2019/05/prince-charles-final.jpg',
       'https://i.imgur.com/zRbEgb0.jpeg'], dtype=object)

In [357]:
df['start_date'].unique()

array([nan, '1989', '2009', '2014', '2007-09', '1905', '1997', '2009-10',
       '2014-12', '2007', '2010'], dtype=object)

In [358]:
df['operator'].unique()

array([nan, 'KULT. Verein für Jugendkultur e.V.',
       'Kreuzberg Musik Produktion GmbH', 'Badehaus Musik GmbH',
       'Sisyphos Event GmbH',
       'Neue Flutgraben Betriebsgesellschaft mbH & Co KG',
       'Berghain OstGut GmbH', 'Katerclub GmbH', 'Maaya Event GmbH',
       'Zur Wilden Renate oHG', 'Insomnia GbR', 'Ballhaus Berlin GmbH',
       'SchwuZ Kulturveranstaltungs GmbH',
       'Ritter Butzke Production GmbH', 'BRICKS Gastronomie GmbH',
       'Volksbühne am Rosa-Luxemburg-Platz',
       'Michael Sonntag, Michael Klatt', 'Milchmädchen Musikkultur GmbH',
       '8MM Bar Veranstaltungs GmbH', 'Javid und Ücel GbR',
       'Humberto Castillo', 'Tresor Berlin GmbH', 'about blank eG',
       'Alexander Schulzendorff', 'Mein Haus am See GmbH',
       "Spreewies'n GmbH & Co. Beach Betriebsgesellschaft KG",
       'Ingo Kaatz'], dtype=object)

In [359]:
df['previously'].unique()

array([nan, 'Suicide Circus'], dtype=object)

In [360]:
df['drink:club-mate'].unique()

array([nan, 'yes', 'served'], dtype=object)

In [361]:
df['leisure'].unique()

array([nan, 'music_venue', 'cultural_institution'], dtype=object)

In [362]:
df['live_music'].unique()

array([nan, 'yes'], dtype=object)

In [363]:
df['operator:type'].unique()

array([nan, 'business'], dtype=object)

In [364]:
df['owner'].unique()

array([nan, 'Kreuzberg Musik Produktion GmbH'], dtype=object)

In [365]:
df['wikidata'].unique()

array([nan, 'Q105762275', 'Q21036515', 'Q21036516', 'Q112861701',
       'Q115794233', 'Q2658747', 'Q1724204', 'Q47163174', 'Q552863',
       'Q100256393', 'Q1103846', 'Q62011644', 'Q21036513', 'Q2211685',
       'Q322135', 'Q1285955', 'Q834245', 'Q21036533', 'Q105762300',
       'Q130601665', 'Q561850', 'Q678186', 'Q23906461', 'Q136975',
       'Q805252', 'Q116142643'], dtype=object)

In [366]:
df['wikimedia_commons'].unique()

array([nan, 'Category:Astra Kulturhaus', 'Category:Remili (ship, 1906)',
       'File:20230916 xl 1534-Festsaal Kreuzberg 2.jpg',
       'Category:Cassiopeia (Berlin)', 'Category:Matrix (club)',
       'Category:Lido (Berlin)', 'Category:Clärchens Ballhaus',
       'Category:Postbahnhof (venue)', 'Category:KitKatClub',
       'File:Ballhaus Berlin Chausseestraße.jpg', 'Category:SchwuZ',
       'Category:SO36', 'Category:Ballhaus Spandau'], dtype=object)

In [367]:
df['layer'].unique()

array([nan, '2', '1'], dtype=object)

In [368]:
df['source'].unique()

array([nan, 'Geoportal Berlin / Hauskoordinaten', 'survey',
       'Solikon 2015;survey'], dtype=object)

In [369]:
df['seamark:hulk:category'].unique()
df['seamark:hulk:category'].value_counts()

seamark:hulk:category
casino_boat    1
Name: count, dtype: int64

In [370]:
df['seamark:type'].unique()

array([nan, 'hulk'], dtype=object)

In [371]:
df['addr:housename'].unique()

array([nan, 'Bogen 47', 'Bogen 49', 'Europa-Center', 'SO36'], dtype=object)

In [372]:
df['addr:place'].unique()

array([nan, 'Stadtbahnbogen', 'Richard-Wagner-Platz'], dtype=object)

In [373]:
df['building:part'].unique()

array([nan, 'yes'], dtype=object)

In [374]:
df['culture'].unique()

array([nan, 'open_air'], dtype=object)

In [375]:
df['open_air'].unique()

array([nan, 'yes'], dtype=object)

In [376]:
df['addr:floor'].unique()

array([nan, '0'], dtype=object)

In [377]:
df['bar'].unique()

array([nan, 'yes'], dtype=object)

In [378]:
df['level'].unique()

array([nan, '0', '1', '20', '-1'], dtype=object)

In [379]:
df['ref:vatin'].unique()

array([nan, 'DE310804941'], dtype=object)

In [380]:
df['toilets'].unique()

array([nan, 'yes'], dtype=object)

In [381]:
df['nohousenumber'].unique()

array([nan, 'yes'], dtype=object)

In [382]:
df['club'].unique()

array([nan, 'culture'], dtype=object)

In [383]:
df['brewery'].unique()

array([nan, 'Krombacher;Starnberger'], dtype=object)

In [384]:
df['official_name'].unique()

array([nan, 'Metropol Berlin'], dtype=object)

In [385]:
df['loc_name'].unique()

array([nan, 'Renate'], dtype=object)

In [386]:
df['opening_hours:description'].unique()

array([nan, '"At night. since 23.00h till early in the morning."'],
      dtype=object)

In [387]:
df['outdoor_seating'].unique()

array([nan, 'yes', 'no'], dtype=object)

In [388]:
df['note'].unique()

array([nan, 'subsidiary of KitKat'], dtype=object)

In [389]:
df['kinky'].unique()
df['kinky'].value_counts()

kinky
yes    2
Name: count, dtype: int64

In [390]:
df['access'].unique()

array([nan, 'customers'], dtype=object)

In [391]:
df['entrance'].unique()

array([nan, 'yes', 'main'], dtype=object)

In [392]:
df['brothel:swingerclub'].unique()
df['brothel:swingerclub'].value_counts()

brothel:swingerclub
events    1
Name: count, dtype: int64

In [393]:
df['description'].unique()
df['description'].value_counts()

description
Erotic Nightclub                           1
concert collective;concert venue;events    1
ATM inside                                 1
rentable                                   1
2nd address: S-Bahn Bogen 46               1
Name: count, dtype: int64

In [394]:
df['min_age'].unique()

array([nan, '18'], dtype=object)

In [395]:
df['mobile'].unique()

array([nan, '+49 177 2333878'], dtype=object)

In [396]:
df['lgbtq'].unique()
df['lgbtq'].value_counts()

lgbtq
primary    3
Name: count, dtype: int64

In [397]:
df['url'].unique()

array([nan, 'https://am-to-pm.metro.bar/'], dtype=object)

In [398]:
df['music_genre'].unique()

array([nan, 'schlager'], dtype=object)

In [399]:
df['name:ja'].unique()

array([nan, 'ハーフェンバー', 'ウィークエンド・クラブ'], dtype=object)

In [400]:
df['cruising'].unique()
df['cruising'].value_counts()

cruising
yes    1
Name: count, dtype: int64

In [401]:
df['unused:amenity'].unique()

array([nan, 'nightclub'], dtype=object)

In [402]:
df['mapillary'].unique()

array([nan, '458394502129423', '371476558518986'], dtype=object)

In [403]:
df['identity'].unique()
df['identity'].value_counts()

identity
solidarity_economy    1
Name: count, dtype: int64

In [404]:
df['provides'].unique()
df['provides'].value_counts()

provides
culture;spaces    1
Name: count, dtype: int64

In [405]:
df['fixme'].unique()

array([nan,
       'Website ist down, die Betreibergesellschaft insolvent, existiert der Club noch?'],
      dtype=object)

In [406]:
df['name:es'].unique()

array([nan, 'Conejo Blanco'], dtype=object)

In [407]:
df['delivery'].unique()
df['delivery'].value_counts()

delivery
no    1
Name: count, dtype: int64

In [408]:
df['takeaway'].value_counts()

takeaway
no    1
Name: count, dtype: int64

In [409]:
df['karaoke'].value_counts()

karaoke
yes    1
Name: count, dtype: int64

In [410]:
df['disused:amenity'].value_counts()

disused:amenity
restaurant    1
Name: count, dtype: int64

In [411]:
df['disused:name'].value_counts()

disused:name
Marin    1
Name: count, dtype: int64

In [412]:
df['fee'].value_counts()

fee
yes    1
Name: count, dtype: int64

In [413]:
df['charge:conditional'].value_counts()

charge:conditional
no @ (female;couple)    1
Name: count, dtype: int64

In [414]:
df['nudism'].value_counts()

nudism
yes    1
Name: count, dtype: int64

In [415]:
df['addr:country'].unique()

array([nan, 'DE'], dtype=object)

In [416]:
df['source_ref:url'].unique()

array([nan, 'https://metropol-berlin.de/contact'], dtype=object)

We can drop 'amenity' column as it contains just 'nightclub' category, 'addr:country' as it contains only 'DE','building', 'building:levels', 'opening_hours:signed', 'roof:shape', 'smoking', 'roof:colour', 'roof:levels', 'alt_name', 'building:colour', 'building:material', 'image', 'start_date', 'year_of_construction', 'operator', 'previously', 'contact:fax', 'contact:facebook', 'contact:instagram', 'contact:twitter', 'drink:club-mate', 'operator:type', 'owner', 'wikidata', 'wikimedia_commons', 'layer', 'source', 'seamark:type', 'addr:housename', 'addr:place', 'building:part', 'contact:ra', 'culture', 'name:etymology:wikidata', 'name:etymology:wikipedia', 'wikipedia', 'addr:floor', 'bar', 'level', 'payment:cash', 'payment:cash', 'payment:credit_cards', 'payment:debit_cards', 'payment:mastercard', 'payment:visa', 'ref:vatin', 'toilets', 'nohousenumber', 'club', 'brewery', 'official_name','old_contact:facebook', 'old_contact:ra', 'old_name:2015--2024-01-22', 'old_operator', 'old_wikidata', 'wifi', 'check_date:smoking', 'old_name', 'check_date:currency:XBT', 'currency:XBT', 'loc_name', 'name:en', 'opening_hours:description', 'payment:lightning', 'payment:lightning_contactless', 'payment:onchain', 'note', 'access', 'entrance', 'brothel:swingerclub', 'kinky', 'description','fax', 'min_age', 'mobile', 'facebook', 'instagram', 'lgbtq', 'short_name', 'internet_access', 'music_genre', 'name:ja', 'payment:free', 'seamark:hulk:category', 'cruising', 'lgbtq:cruising', 'lgbtq:men', 'opening_hours:office', 'unused:amenity', 'contact:whatsapp', 'alt_name_1', 'mapillary', 'air_conditioning', 'contact:qype', 'contact:yelp', 'identity', 'provides', 'name:es', 'delivery','takeaway', 'payment:cards', 'karaoke', 'disused:amenity', 'disused:name', 'fee', 'charge:conditional', 'nudism' columns as we don't need them

In [417]:
columns_to_drop = [
    'amenity', 'addr:country', 'building', 'building:levels', 'opening_hours:signed', 'roof:shape',
    'smoking', 'roof:colour', 'roof:levels', 'alt_name', 'building:colour',
    'building:material', 'image', 'start_date', 'year_of_construction', 'operator',
    'previously', 'contact:fax', 'contact:facebook', 'contact:instagram',
    'contact:twitter', 'drink:club-mate', 'operator:type', 'owner', 'wikidata',
    'wikimedia_commons', 'layer', 'source', 'seamark:type', 'addr:housename',
    'addr:place', 'building:part', 'contact:ra', 'culture', 'name:etymology:wikidata',
    'name:etymology:wikipedia', 'wikipedia', 'addr:floor', 'bar', 'level',
    'payment:cash', 'payment:cash', 'payment:credit_cards', 'payment:debit_cards',
    'payment:mastercard', 'payment:visa', 'ref:vatin', 'toilets', 'nohousenumber',
    'club', 'brewery', 'official_name', 'old_contact:facebook',
    'old_contact:ra', 'old_name:2015--2024-01-22', 'old_operator', 'old_wikidata',
    'wifi', 'check_date:smoking', 'old_name', 'check_date:currency:XBT',
    'currency:XBT', 'loc_name', 'name:en', 'opening_hours:description',
    'payment:lightning', 'payment:lightning_contactless', 'payment:onchain', 'note',
    'access', 'entrance', 'brothel:swingerclub', 'kinky', 'description', 'fax',
    'min_age', 'mobile', 'facebook', 'instagram', 'lgbtq', 'short_name',
    'internet_access', 'music_genre', 'name:ja', 'payment:free',
    'seamark:hulk:category', 'cruising', 'lgbtq:cruising', 'lgbtq:men',
    'opening_hours:office', 'unused:amenity', 'contact:whatsapp', 'alt_name_1',
    'mapillary', 'air_conditioning', 'contact:qype', 'contact:yelp', 'identity',
    'provides', 'name:es', 'delivery', 'takeaway', 'payment:cards', 'karaoke',
    'disused:amenity', 'disused:name', 'fee', 'charge:conditional', 'nudism'
]


# inplace=True modifies the df directly (no need for df = df.drop(...))
# errors='ignore' prevents a crash if a column in the list doesn't exist
df.drop(columns=columns_to_drop, inplace=True, errors='ignore')


print("Columns successfully dropped. Remaining columns:")
df.info()

Columns successfully dropped. Remaining columns:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   @id                       143 non-null    object 
 1   name                      142 non-null    object 
 2   phone                     23 non-null     object 
 3   website                   53 non-null     object 
 4   wheelchair                90 non-null     object 
 5   email                     18 non-null     object 
 6   toilets:wheelchair        36 non-null     object 
 7   wheelchair:description    11 non-null     object 
 8   addr:city                 103 non-null    object 
 9   addr:housenumber          108 non-null    object 
 10  addr:postcode             111 non-null    object 
 11  addr:street               109 non-null    object 
 12  addr:suburb               82 non-null     object 
 13  contact:phone   

Let's check the 'fixme' column – it only has one value: "Website ist down, die Betreibergesellschaft insolvent, existiert der Club noch?". I will manually check if the club still exists.

In [418]:
columns_to_show = [
    'name',
    'addr:city',
    'addr:housenumber',
    'addr:postcode',
    'addr:street',
    'addr:suburb',
    'contact:website',
    'fixme'  # Including 'fixme' itself to see the problem note
]

# 2. Filter the DataFrame for rows where 'fixme' is not null
df_with_fixme = df[df['fixme'].notnull()]

# 3. (FIX) Find which of your desired columns *actually exist* in the DataFrame
existing_columns_to_show = [col for col in columns_to_show if col in df_with_fixme.columns]

# 4. Print only the existing columns from the filtered result
# We use .to_string() to ensure ALL rows are printed, not just a summary
print(df_with_fixme[existing_columns_to_show].to_string())

                     name addr:city addr:housenumber addr:postcode              addr:street addr:suburb                  contact:website                                                                            fixme
106  BlackWhite Danceclub    Berlin              261         12351  Johannisthaler Chaussee      Buckow  http://blackwhite-danceclub.de/  Website ist down, die Betreibergesellschaft insolvent, existiert der Club noch?


The website is still unavailable and the information on Facebook hasn't been updated in a long time. I will delete this row and drop column 'fixme'.

In [419]:
# We get the index from your original filter
index_to_delete = df[df['fixme'].notnull()].index

# inplace=True modifies df directly
df.drop(index_to_delete, inplace=True, errors='ignore')

# axis=1 means we are dropping a column, not a row
df.drop('fixme', axis=1, inplace=True, errors='ignore')

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 142 entries, 0 to 142
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   @id                       142 non-null    object 
 1   name                      141 non-null    object 
 2   phone                     23 non-null     object 
 3   website                   53 non-null     object 
 4   wheelchair                89 non-null     object 
 5   email                     18 non-null     object 
 6   toilets:wheelchair        36 non-null     object 
 7   wheelchair:description    11 non-null     object 
 8   addr:city                 102 non-null    object 
 9   addr:housenumber          107 non-null    object 
 10  addr:postcode             110 non-null    object 
 11  addr:street               108 non-null    object 
 12  addr:suburb               81 non-null     object 
 13  contact:phone             35 non-null     object 
 14  contact:website

I will also delete columns that have too few values for analysis.

In [420]:
columns_to_drop = ['leisure', 'open_air', 'outdoor_seating', 'check_date', 'check_date:opening_hours']

# inplace=True modifies the df directly
# errors='ignore' prevents an error if they are already dropped
df.drop(columns=columns_to_drop, inplace=True, errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 142 entries, 0 to 142
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   @id                     142 non-null    object 
 1   name                    141 non-null    object 
 2   phone                   23 non-null     object 
 3   website                 53 non-null     object 
 4   wheelchair              89 non-null     object 
 5   email                   18 non-null     object 
 6   toilets:wheelchair      36 non-null     object 
 7   wheelchair:description  11 non-null     object 
 8   addr:city               102 non-null    object 
 9   addr:housenumber        107 non-null    object 
 10  addr:postcode           110 non-null    object 
 11  addr:street             108 non-null    object 
 12  addr:suburb             81 non-null     object 
 13  contact:phone           35 non-null     object 
 14  contact:website         46 non-null     object 

I would like to inspect id column.

In [421]:
df['@id'].nunique()

142

In [422]:
df['@id'].unique()

array(['way/23278633', 'way/24248500', 'way/24283864', 'way/36908987',
       'way/41474936', 'way/42163697', 'way/45001954', 'way/45001955',
       'way/72984386', 'way/108243893', 'way/118327391', 'way/119172796',
       'way/123019464', 'way/123750257', 'way/136852555', 'way/136852556',
       'way/148829988', 'way/167777311', 'way/172900722', 'way/277805717',
       'way/286165789', 'way/291111616', 'way/293386107', 'way/342600129',
       'way/355476495', 'way/379305718', 'way/684464913',
       'way/1081145396', 'node/259845846', 'node/262818281',
       'node/339749949', 'node/360111407', 'node/373095675',
       'node/383210462', 'node/410135945', 'node/429735547',
       'node/461623768', 'node/475153994', 'node/485354535',
       'node/485640732', 'node/492849637', 'node/621429565',
       'node/652884290', 'node/667842214', 'node/670069373',
       'node/729294555', 'node/736787851', 'node/739305494',
       'node/821646254', 'node/898983650', 'node/910429971',
       'node/

As it already unique I will leave it as is.

In [423]:
df.rename(columns={
    '@id': 'id',
    'name': 'club_name',
    'toilets:wheelchair': 'toilets_wheelchair',
    'wheelchair:description': 'wheelchair_description',
    'addr:city': 'city',
    'addr:housenumber': 'house_num',
    'addr:postcode': 'postcode',
    'addr:street': 'street',
    'addr:suburb': 'suburb',
    'contact:phone': 'contact_phone',
    'contact:website': 'website1',
    'contact:mobile': 'mobile_phone',
    'contact:email': 'email1'
}, inplace=True)

In [424]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 142 entries, 0 to 142
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      142 non-null    object 
 1   club_name               141 non-null    object 
 2   phone                   23 non-null     object 
 3   website                 53 non-null     object 
 4   wheelchair              89 non-null     object 
 5   email                   18 non-null     object 
 6   toilets_wheelchair      36 non-null     object 
 7   wheelchair_description  11 non-null     object 
 8   city                    102 non-null    object 
 9   house_num               107 non-null    object 
 10  postcode                110 non-null    object 
 11  street                  108 non-null    object 
 12  suburb                  81 non-null     object 
 13  contact_phone           35 non-null     object 
 14  website1                46 non-null     object 

I would like to check columns phone contact_phone and mobile_phone.

In [425]:
has_any_phone = df[['phone', 'mobile_phone','contact_phone']].notnull().any(axis=1)

print(df[has_any_phone][['phone', 'mobile_phone','contact_phone']])

                 phone       mobile_phone        contact_phone
0      +49 30 78082991                NaN                  NaN
1      +49 30 20165823                NaN                  NaN
3                  NaN                NaN      +49 30 25922702
4                  NaN                NaN       +49 30 4459509
5                  NaN   +49 176 10051189                  NaN
7                  NaN                NaN      +49 30 20056767
9      +49 30 92092120                NaN                  NaN
12                 NaN                NaN      +49 30 25933042
19    +49 30 403655630                NaN                  NaN
22                 NaN                NaN      +49 30 47361686
23                 NaN                NaN     +49 30 297766770
25    +49 30 403678560                NaN                  NaN
27                 NaN                NaN      +49 30 65832595
28        +49306946602                NaN                  NaN
30                 NaN                NaN      +49 30 2

I want to see only the rows where all columns have values.

In [426]:
all_phones_filled = df[['phone', 'mobile_phone','contact_phone']].notnull().all(axis=1)

print(df[all_phones_filled][['phone', 'mobile_phone','contact_phone']])

Empty DataFrame
Columns: [phone, mobile_phone, contact_phone]
Index: []


Since the values in these columns don't overlap, I will combine them into one column 'phone'.

In [427]:
df['phone'] = df['phone'].fillna(df['mobile_phone'])
df['phone'] = df['phone'].fillna(df['contact_phone'])

And drop columns 'mobile_phone' and 'contact_phone'

In [428]:
df.drop(columns=['mobile_phone','contact_phone'], inplace=True, errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 142 entries, 0 to 142
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      142 non-null    object 
 1   club_name               141 non-null    object 
 2   phone                   59 non-null     object 
 3   website                 53 non-null     object 
 4   wheelchair              89 non-null     object 
 5   email                   18 non-null     object 
 6   toilets_wheelchair      36 non-null     object 
 7   wheelchair_description  11 non-null     object 
 8   city                    102 non-null    object 
 9   house_num               107 non-null    object 
 10  postcode                110 non-null    object 
 11  street                  108 non-null    object 
 12  suburb                  81 non-null     object 
 13  website1                46 non-null     object 
 14  opening_hours           53 non-null     object 

Now I'll inspect columns 'website' and 'website1'

In [429]:
website = df[['website', 'website1','source_ref:url','url']].notnull().any(axis=1)

print(df[website][['website', 'website1','source_ref:url','url']])

                                website                        website1  \
0    http://www.roadrunners-paradise.de                             NaN   
1                 https://www.werk9.de/                             NaN   
3                                   NaN   https://www.gretchen-club.de/   
4                                   NaN     https://www.dunckerclub.de/   
5                                   NaN            https://www.yaam.de/   
..                                  ...                             ...   
132           https://fitzroy-berlin.de                             NaN   
134                                 NaN      https://falscherfisch.net/   
137           https://kreuzwerk.berlin/                             NaN   
139            https://moechtegern.com/                             NaN   
142                                 NaN  https://brickhouse-berlin.com/   

    source_ref:url  url  
0              NaN  NaN  
1              NaN  NaN  
3              NaN  N

I want to see only the rows where all columns have values.

In [430]:
all_filled = df[['website', 'website1','source_ref:url','url']].notnull().all(axis=1)

print(df[all_filled][['website', 'website1','source_ref:url','url']])

Empty DataFrame
Columns: [website, website1, source_ref:url, url]
Index: []


There are no such rows – I will fill the 'website' column with the missing values from the 'website1', 'source_ref:url' and 'url' columns and then drop them.

In [431]:
df['website'] = df['website'].fillna(df['website1'])
df['website'] = df['website'].fillna(df['source_ref:url'])
df['website'] = df['website'].fillna(df['url'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 142 entries, 0 to 142
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      142 non-null    object 
 1   club_name               141 non-null    object 
 2   phone                   59 non-null     object 
 3   website                 100 non-null    object 
 4   wheelchair              89 non-null     object 
 5   email                   18 non-null     object 
 6   toilets_wheelchair      36 non-null     object 
 7   wheelchair_description  11 non-null     object 
 8   city                    102 non-null    object 
 9   house_num               107 non-null    object 
 10  postcode                110 non-null    object 
 11  street                  108 non-null    object 
 12  suburb                  81 non-null     object 
 13  website1                46 non-null     object 
 14  opening_hours           53 non-null     object 

In [432]:
df.drop(columns=['website1','source_ref:url','url'], inplace=True, errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 142 entries, 0 to 142
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      142 non-null    object 
 1   club_name               141 non-null    object 
 2   phone                   59 non-null     object 
 3   website                 100 non-null    object 
 4   wheelchair              89 non-null     object 
 5   email                   18 non-null     object 
 6   toilets_wheelchair      36 non-null     object 
 7   wheelchair_description  11 non-null     object 
 8   city                    102 non-null    object 
 9   house_num               107 non-null    object 
 10  postcode                110 non-null    object 
 11  street                  108 non-null    object 
 12  suburb                  81 non-null     object 
 13  opening_hours           53 non-null     object 
 14  email1                  19 non-null     object 

Same with 'email' and 'email1' columns.

In [433]:
has_email = df[['email', 'email1']].notnull().any(axis=1)

print(df[has_email][['email', 'email1']])

                              email                                 email1
1                     info@werk9.de                                    NaN
3         gretchen@gretchen-club.de                                    NaN
6    contact@lokschuppen-berlin.com                                    NaN
7                               NaN                   info@astra-berlin.de
12                              NaN               info@badehaus-berlin.com
14              club@larkberlin.com                                    NaN
20              support@berghain.de                                    NaN
22                              NaN   info@katerblau.de;hello@katerblau.de
23                              NaN                          info@maaya.de
25          info@metropol-berlin.de                                    NaN
30                              NaN                 hallo@matrix-berlin.de
35         info@claerchensball.haus                                    NaN
37                       

In [434]:
all_email_filled = df[['email', 'email1']].notnull().all(axis=1)

print(df[all_filled][['email', 'email1']])

Empty DataFrame
Columns: [email, email1]
Index: []


There is no such rows, I'll drop column 'email1'

In [435]:
df.drop(columns=['email1'], inplace=True, errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 142 entries, 0 to 142
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      142 non-null    object 
 1   club_name               141 non-null    object 
 2   phone                   59 non-null     object 
 3   website                 100 non-null    object 
 4   wheelchair              89 non-null     object 
 5   email                   18 non-null     object 
 6   toilets_wheelchair      36 non-null     object 
 7   wheelchair_description  11 non-null     object 
 8   city                    102 non-null    object 
 9   house_num               107 non-null    object 
 10  postcode                110 non-null    object 
 11  street                  108 non-null    object 
 12  suburb                  81 non-null     object 
 13  opening_hours           53 non-null     object 
 14  live_music              10 non-null     object 

In [436]:
%pip install geopy tqdm

Note: you may need to restart the kernel to use updated packages.


Since we have missing values in the longitude and latitude columns, we are using geocoding to fill them in.

In [437]:
import pandas as pd
from geopy.geocoders import Nominatim
from tqdm.auto import tqdm
import time

# 1. Initialize the geocoder
# 'user_agent' is just a unique name for your application (required)
geolocator = Nominatim(user_agent="my_club_analyzer_v1")

# 2. Find the indices of rows that need to be filled
indices_to_fill = df[df['latitude'].isnull()].index

print(f"Found {len(indices_to_fill)} rows without coordinates. Starting geocoding...")

# 3. Loop through these indices using TQDM for a progress bar
for index in tqdm(indices_to_fill, desc="Geocoding addresses"):
    
    # 4. Get the row and safely build the address
    row = df.loc[index]
    address_parts = []
    
    # Build the address, skipping NaN (empty) values
    if pd.notna(row['street']):
        address_parts.append(row['street'])
    if pd.notna(row['house_num']):
        address_parts.append(str(row['house_num'])) # use str just in case the house number is a number
    if pd.notna(row['postcode']):
        address_parts.append(str(row['postcode']))
        
    # Add the city, default to 'Berlin' if the city column is empty
    city = row['city'] if pd.notna(row['city']) else 'Berlin'
    address_parts.append(city)
    address_parts.append('Germany')
    
    address_string = ", ".join(address_parts)
    
    try:
        # 5. Send the request
        location = geolocator.geocode(address_string, timeout=10)
        
        if location:
            # 6. If the address is found, write the coordinates into the df
            df.loc[index, 'latitude'] = location.latitude
            df.loc[index, 'longitude'] = location.longitude
        else:
            print(f"  Warning: Address not found for: {address_string}")
            
    except Exception as e:
        print(f"  Error geocoding {address_string}: {e}")
    
    # 7. IMPORTANT: Pause for 1.1 seconds
    # Nominatim is a free service and requires a rate limit (no more than 1 request per second)
    time.sleep(1.1)

print("Geocoding finished. ✅")

Found 28 rows without coordinates. Starting geocoding...


Geocoding addresses:   0%|          | 0/28 [00:00<?, ?it/s]

Geocoding finished. ✅


In [438]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 142 entries, 0 to 142
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      142 non-null    object 
 1   club_name               141 non-null    object 
 2   phone                   59 non-null     object 
 3   website                 100 non-null    object 
 4   wheelchair              89 non-null     object 
 5   email                   18 non-null     object 
 6   toilets_wheelchair      36 non-null     object 
 7   wheelchair_description  11 non-null     object 
 8   city                    102 non-null    object 
 9   house_num               107 non-null    object 
 10  postcode                110 non-null    object 
 11  street                  108 non-null    object 
 12  suburb                  81 non-null     object 
 13  opening_hours           53 non-null     object 
 14  live_music              10 non-null     object 

Now I want to try reverse geocoding to fill in the missing values in the address columns.

In [439]:
# 1. Initialize the geocoder (using a new user_agent)
geolocator = Nominatim(user_agent="my_club_analyzer_v2")

# 2. Find rows where the street address is missing
# (We assume if the street is missing, the rest might be too)
indices_to_fill = df[df['street'].isnull()].index
print(f"Found {len(indices_to_fill)} rows with missing addresses. Starting reverse geocoding...")

# 3. Loop through these indices
for index in tqdm(indices_to_fill, desc="Reverse Geocoding"):
    
    # 4. Get the coordinates from the row
    row = df.loc[index]
    coordinates = f"{row['latitude']}, {row['longitude']}"
    
    try:
        # 5. Send the reverse geocoding request
        # We use language='en' to get consistent address keys (like 'road', 'city')
        location = geolocator.reverse(coordinates, language='en', timeout=10)
        
        if location and location.raw.get('address'):
            # This is the dictionary with all address parts
            address = location.raw['address']
            
            # 6. Safely fill ONLY the missing (NaN) values
            # We check each one so we don't overwrite existing good data
            
            if pd.isna(row['street']):
                df.loc[index, 'street'] = address.get('road')
            
            if pd.isna(row['house_num']):
                df.loc[index, 'house_num'] = address.get('house_number')
                
            if pd.isna(row['postcode']):
                df.loc[index, 'postcode'] = address.get('postcode')
                
            if pd.isna(row['suburb']):
                df.loc[index, 'suburb'] = address.get('suburb')

            if pd.isna(row['city']):
                # City can have fallbacks (like 'town' or 'village')
                city = address.get('city', address.get('town', address.get('village')))
                df.loc[index, 'city'] = city
        else:
            print(f"  Warning: Address not found for coordinates: {coordinates}")
            
    except Exception as e:
        print(f"  Error geocoding {coordinates}: {e}")
    
    # 7. IMPORTANT: Pause for 1.1 seconds to respect Nominatim's free policy
    time.sleep(1.1)

print("Reverse geocoding finished. ✅")

Found 34 rows with missing addresses. Starting reverse geocoding...


Reverse Geocoding:   0%|          | 0/34 [00:00<?, ?it/s]

Reverse geocoding finished. ✅


In [440]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 142 entries, 0 to 142
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      142 non-null    object 
 1   club_name               141 non-null    object 
 2   phone                   59 non-null     object 
 3   website                 100 non-null    object 
 4   wheelchair              89 non-null     object 
 5   email                   18 non-null     object 
 6   toilets_wheelchair      36 non-null     object 
 7   wheelchair_description  11 non-null     object 
 8   city                    135 non-null    object 
 9   house_num               108 non-null    object 
 10  postcode                140 non-null    object 
 11  street                  141 non-null    object 
 12  suburb                  114 non-null    object 
 13  opening_hours           53 non-null     object 
 14  live_music              10 non-null     object 

In [441]:
# Save the final result to a file

# Define the path to the file inside the 'clean' folder
full_path = Path('../clean/night_clubs_clean.csv')

# Save the DataFrame using the constructed path
df.to_csv(full_path, index=False, encoding='utf-8-sig')