In [1]:
import pandas as pd
import seaborn as sns  #load dataset
import numpy as np #statistical calculations
import matplotlib.pyplot as plt # for visualization
import psycopg2
from sqlalchemy import create_engine

In [5]:
# Load the raw Berlin pool listing. The separator and encoding are written out
# explicitly; both are the pandas defaults, so parsing is unchanged.
df = pd.read_csv("pools_berlin_citymap.csv", sep=",", encoding="utf-8")

In [6]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Bad-ID,Name,Badtyp,Straße,Postleitzahl,Ort,Breitengrad,Längengrad,eu_badegewaesser,Öffnungsstunden pro Jahr,Baujahr,Ganzjährig geöffnet,Name des Eigentümers,Ermäßigung Kind,Ermäßigung Familie,Ermäßigung Behinderte
0,472,Strandbad Lübars,Naturbad,Am Freibad 9,13469,Berlin,52.61824,13.33519,,,,nein,Henry Arzig,ja,ja,ja
1,473,Kleine Schwimmhalle Wuhlheide,Hallenbad,An der Wuhlheide 161,12459,Berlin,52.45993,13.53965,nein,,1990.0,ja,,ja,nein,ja
2,474,Kombibad Mariendorf,Kombibad,Ankogelweg 95,12107,Berlin,52.41972,13.40154,nein,,1974.0,ja,,ja,ja,ja
3,475,Schwimmhalle Anton-Saefkow-Platz,Hallenbad,Anton-Saefkow-Platz 1,10369,Berlin,52.53093,13.47184,nein,,1980.0,ja,,ja,nein,
4,476,Stadtbad Kreuzberg - Baerwaldbad,Hallenbad,Baerwaldstraße 64-67,10961,Berlin,52.49451,13.40432,nein,,1955.0,ja,Förderverein,ja,ja,ja


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Bad-ID,Postleitzahl,Breitengrad,Längengrad,Öffnungsstunden pro Jahr,Baujahr
count,144.0,144.0,144.0,144.0,0.0,49.0
mean,5096.506944,12565.152778,52.494014,13.364897,,1960.877551
std,4010.713202,1276.171872,0.055093,0.138273,,25.381779
min,472.0,10115.0,52.37322,13.114743,,1896.0
25%,1043.75,12105.0,52.445267,13.255845,,1950.0
50%,6987.5,12573.0,52.49232,13.35015,,1970.0
75%,8500.25,13587.0,52.539145,13.452025,,1980.0
max,10510.0,14199.0,52.62868,13.735059,,2000.0


In [8]:
# (row count, column count) of the loaded frame.
df.shape

(144, 16)

In [9]:
# Translation table: German source header -> English snake_case column name.
# Key order follows the column order of the source file.
rename_map = {
    # identity
    "Bad-ID": "pool_id",
    "Name": "name",
    "Badtyp": "pool_type",
    # address and coordinates
    "Straße": "street",
    "Postleitzahl": "postal_code",
    "Ort": "city",
    "Breitengrad": "latitude",
    "Längengrad": "longitude",
    # operation
    "eu_badegewaesser": "eu_bathing_water",
    "Öffnungsstunden pro Jahr": "annual_opening_hours",
    "Baujahr": "year_built",
    "Ganzjährig geöffnet": "open_all_year",
    "Name des Eigentümers": "owner_name",
    # discounts
    "Ermäßigung Kind": "discount_children",
    "Ermäßigung Familie": "discount_family",
    "Ermäßigung Behinderte": "discount_disabled",
}

In [10]:
# Apply the German -> English header translation. The result is rebound to `df`
# instead of using inplace=True: same outcome, but no in-place mutation of a frame
# that earlier cells already displayed. Headers missing from the frame are ignored
# by rename's default, so re-running the cell is harmless.
df = df.rename(columns=rename_map)

In [11]:
# Check the first five rows now that the headers are in English.
df.head(n=5)

Unnamed: 0,pool_id,name,pool_type,street,postal_code,city,latitude,longitude,eu_bathing_water,annual_opening_hours,year_built,open_all_year,owner_name,discount_children,discount_family,discount_disabled
0,472,Strandbad Lübars,Naturbad,Am Freibad 9,13469,Berlin,52.61824,13.33519,,,,nein,Henry Arzig,ja,ja,ja
1,473,Kleine Schwimmhalle Wuhlheide,Hallenbad,An der Wuhlheide 161,12459,Berlin,52.45993,13.53965,nein,,1990.0,ja,,ja,nein,ja
2,474,Kombibad Mariendorf,Kombibad,Ankogelweg 95,12107,Berlin,52.41972,13.40154,nein,,1974.0,ja,,ja,ja,ja
3,475,Schwimmhalle Anton-Saefkow-Platz,Hallenbad,Anton-Saefkow-Platz 1,10369,Berlin,52.53093,13.47184,nein,,1980.0,ja,,ja,nein,
4,476,Stadtbad Kreuzberg - Baerwaldbad,Hallenbad,Baerwaldstraße 64-67,10961,Berlin,52.49451,13.40432,nein,,1955.0,ja,Förderverein,ja,ja,ja


In [20]:
# Column dtypes and non-null counts.
# NOTE(review): the saved output below lists 15 columns (no annual_opening_hours) and
# shows postal_code as object, and this cell's execution count (20) is higher than the
# cells that follow it (17, 19). The output appears to come from a later kernel state,
# possibly including a cell that is no longer in the notebook — confirm with
# Restart Kernel -> Run All and refresh the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pool_id            144 non-null    int64  
 1   name               144 non-null    object 
 2   pool_type          144 non-null    object 
 3   street             141 non-null    object 
 4   postal_code        144 non-null    object 
 5   city               144 non-null    object 
 6   latitude           144 non-null    float64
 7   longitude          144 non-null    float64
 8   eu_bathing_water   111 non-null    object 
 9   year_built         49 non-null     float64
 10  open_all_year      114 non-null    object 
 11  owner_name         63 non-null     object 
 12  discount_children  111 non-null    object 
 13  discount_family    108 non-null    object 
 14  discount_disabled  105 non-null    object 
dtypes: float64(3), int64(1), object(11)
memory usage: 17.0+ KB


In [17]:
# Data cleaning

# Number of rows whose pool_id repeats an earlier row (0 means every id is unique).
df['pool_id'].duplicated().sum()




np.int64(0)

In [19]:
# Store postal codes as text. German postal codes have five digits, so left-pad with
# zeros: a code that was parsed as an integer would otherwise lose a leading zero.
# Codes that already have five characters (all Berlin codes) are unchanged.
df['postal_code'] = df['postal_code'].astype(str).str.zfill(5)
# Title-case the city name so spelling variants such as "BERLIN"/"berlin" collapse to "Berlin".
df['city'] = df['city'].str.title()


In [21]:
# Convert yes/no or German terms to Boolean.
# Only the values listed here map to True; everything else — 'nein', any other text,
# and missing values — maps to False. The literal True also matches entries that are
# already boolean.
yes_values = ['Ja', 'Yes', 'yes', 'ja', True]
# Vectorised membership test; gives the same result as the element-wise
# `True if x in yes_values else False`.
df['open_all_year'] = df['open_all_year'].isin(yes_values)


In [23]:
# Keep only rows whose coordinates fall inside the valid latitude / longitude ranges
# (bounds inclusive); rows with a missing coordinate fail the test and are dropped too.
lat_in_range = df['latitude'].between(-90, 90)
lon_in_range = df['longitude'].between(-180, 180)
df = df[lat_in_range & lon_in_range]


In [28]:
# Columns needed for the map export, in output order.
mapping_cols = [
    'pool_id', 'name', 'pool_type', 'street', 'postal_code', 'city',
    'latitude', 'longitude', 'open_all_year'
]
# Take an explicit copy of the subset: later cells assign into `df`, and assigning into
# a column-sliced frame raises SettingWithCopyWarning on pandas without Copy-on-Write.
df = df[mapping_cols].copy()


In [29]:
# Replace missing street entries with the placeholder text "Unknown".
df['street'] = df['street'].mask(df['street'].isna(), "Unknown")


In [30]:
# Final look at the first five cleaned rows before exporting.
df.head(n=5)

Unnamed: 0,pool_id,name,pool_type,street,postal_code,city,latitude,longitude,open_all_year
0,472,Strandbad Lübars,Naturbad,Am Freibad 9,13469,Berlin,52.61824,13.33519,False
1,473,Kleine Schwimmhalle Wuhlheide,Hallenbad,An der Wuhlheide 161,12459,Berlin,52.45993,13.53965,True
2,474,Kombibad Mariendorf,Kombibad,Ankogelweg 95,12107,Berlin,52.41972,13.40154,True
3,475,Schwimmhalle Anton-Saefkow-Platz,Hallenbad,Anton-Saefkow-Platz 1,10369,Berlin,52.53093,13.47184,True
4,476,Stadtbad Kreuzberg - Baerwaldbad,Hallenbad,Baerwaldstraße 64-67,10961,Berlin,52.49451,13.40432,True


In [32]:
# Write the cleaned frame to disk without the index column. UTF-8 is the pandas
# default and is spelled out here because the data contains umlauts.
df.to_csv("berlin_pools_clean.csv", index=False, encoding="utf-8")
