# Notebook to scrape dynamic webpage from Copenhagen Municipality
This notebook contains the code used to scrape information about exercise facilities, collected and published by Copenhagen Municipality.
Since the table displayed on the website is generated dynamically according to filters from an interactive map, Selenium was needed to render the page and access the desired data.

In [15]:
# Import libraries

import requests # http requests
import re # regular expressions
from bs4 import BeautifulSoup # xml parsing
import pandas as pd
import regex as re
import os
import time
from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from tqdm import tqdm
import numpy as np

In [22]:
# Page to scrape: the 'motion/motionslisten' widget view on kbhkort.kk.dk
url = (
    'https://kbhkort.kk.dk/spatialmap'
    '?page=widget-view&name=motion/motionslisten'
)

In [63]:
# Initialise a dictionary to hold the scraped data (one list per output column)
data = {'type':[], 'activity':[], 'location':[], 'website':[], 'gender':[], 'age':[], 'special':[], 'address':[]}

# Absolute XPath shared by all <li> entries of the results list
entries_xpath = '/html/body/div/div[4]/div/ul/li'


def find_optional(driver, xpath):
    """Return the element located by `xpath`, or None if the page has no such element."""
    try:
        return driver.find_element(By.XPATH, xpath)
    except NoSuchElementException:
        return None


options = ChromeOptions()           # Get Chrome options
options.add_argument('--headless')  # This stops an actual browser from being open and shown
# NOTE(review): passing the driver path positionally is only accepted by older Selenium
# releases; confirm against the installed version before upgrading.
driver = Chrome('libraries/chromedriver-win64/chromedriver.exe', options=options)

try:
    driver.get(url)

    print("Driver and URL passed. Wait a second...")
    # Implicit wait: element lookups keep polling for up to 20 s before giving up.
    driver.implicitly_wait(20)

    count = len(driver.find_elements(By.XPATH, entries_xpath))
    print(f'There is a total of {count} entries.')

    # At the time of writing, there were 606 entries (2023-11-16)
    for i in tqdm(range(1, count+1)):
        entry = f'{entries_xpath}[{i}]'
        details = f'{entry}/div[2]/div[2]'

        # Activity type/category - indicated by the associated icon; its `src` attribute is collected.
        icon_src = driver.find_element(By.XPATH, f'{entry}/div[1]/img').get_attribute('src')

        # Name/title of the activity
        activity = driver.find_element(By.XPATH, f'{entry}/div[2]/div[1]/strong[1]').text

        # Location of activity - not all entries list a location
        location_el = find_optional(driver, f'{entry}/div[2]/div[1]/strong[2]')
        location = location_el.text if location_el is not None else None

        # Website - not all entries link to one
        site_el = find_optional(driver, f'{details}/span[1]/a')
        website = site_el.get_attribute('href') if site_el is not None else None

        # Gender and age group
        gender = driver.find_element(By.XPATH, f'{details}/span[4]').text
        age = driver.find_element(By.XPATH, f'{details}/span[5]').text

        # Some entries carry an extra "| Særlig ..." field in span[6], which pushes the
        # address to span[7]; otherwise span[6] itself is taken as the address.
        special, address = None, None
        sixth = find_optional(driver, f'{details}/span[6]')
        if sixth is not None:
            sixth_text = sixth.text  # read once; each .text access queries the browser
            if sixth_text.startswith("| Særlig"):
                special = sixth_text
                seventh = find_optional(driver, f'{details}/span[7]')
                address = seventh.text if seventh is not None else None
            else:
                address = sixth_text

        # Append the whole record at once so every column keeps the same length
        row = {'type': icon_src, 'activity': activity, 'location': location, 'website': website,
               'gender': gender, 'age': age, 'special': special, 'address': address}
        for column, value in row.items():
            data[column].append(value)
finally:
    # Always shut the browser down, even when a lookup above raises
    driver.quit()

Driver and URL passed. Wait a second...
There is a total of 606 entries.


100%|██████████| 606/606 [29:31<00:00,  2.92s/it]  


In [80]:
# Turn the scraped column lists into a dataframe and preview the first rows
df_raw = pd.DataFrame(data)
df_raw.head()

Unnamed: 0,type,activity,location,website,gender,age,special,address
0,https://kbhkort.kk.dk/images/ikoner/suf/sundhe...,Styrke- og grundtræning,SOS Motion,http://www.sosmotion.dk/,Køn: Begge,| Aldersgruppe: Alle,,"Mødested: Sundhedshus Østerbro, Randersgade 60..."
1,https://kbhkort.kk.dk/images/ikoner/suf/sundhe...,Træningspavillion,,,Køn: Begge,| Aldersgruppe: Alle,,"Mødested: Kvægtorvsgade, 1710 KBH V"
2,https://kbhkort.kk.dk/images/ikoner/suf/sundhe...,Kondisti,Valbyparken,,Køn: Begge,| Aldersgruppe: Alle,,"Mødested: Tudsemindevej, 2450 Valby"
3,https://kbhkort.kk.dk/images/ikoner/suf/sundhe...,Nærgymnastik,LOFskolen,https://lofskolen.dk/kurser/motion-og-sundhed/...,Køn: Begge,| Aldersgruppe: Alle,| Særlig målgruppe: Målrettet personer der har...,"Mødested: Østerbrogade 240, 2100 København Ø"
4,https://kbhkort.kk.dk/images/ikoner/suf/sundhe...,Floorball for kvinder 65+ år,BK Skjold,https://www.bkskjold.dk/klub/boldklubben-skjol...,Køn: Kvinder,| Aldersgruppe: Seniorer,,"Mødested: Nørrebrogade 208, 2200 Kbh. N"


In [81]:
# Save the dataframe. The encoding 'utf-16' had to be used to ensure the danish special characters of 'æ, ø, å' were present.
# Using the default encoding did not work.

PATH = '../data/raw_data/'
os.makedirs(PATH, exist_ok=True)  # create the output folder on first run
# os.path.join avoids the doubled separator that PATH + "/..." produced
path = os.path.join(PATH, "kbh_facilities_v2.csv")
df_raw.to_csv(path, encoding='utf-16', index=False)

# Read in existing CSV:

In [300]:
# Load the saved scrape with the UTF-16 encoding it was written in.
# Missing values are replaced by the string 'None' so every cell holds text.
df = (
    pd.read_csv('../data/raw_data/kbh_facilities_v2.csv', encoding='utf-16')
    .fillna(value='None')
)

n_entries = len(df)
n_activity_names = df["activity"].nunique()
print(f'The {n_entries} entries consist of {n_activity_names} unique activity names.')
df.head()

The 606 entries consist of 365 unique activity names.


Unnamed: 0,type,activity,location,website,gender,age,special,address
0,https://kbhkort.kk.dk/images/ikoner/suf/sundhe...,Styrke- og grundtræning,SOS Motion,http://www.sosmotion.dk/,Køn: Begge,| Aldersgruppe: Alle,,"Mødested: Sundhedshus Østerbro, Randersgade 60..."
1,https://kbhkort.kk.dk/images/ikoner/suf/sundhe...,Træningspavillion,,,Køn: Begge,| Aldersgruppe: Alle,,"Mødested: Kvægtorvsgade, 1710 KBH V"
2,https://kbhkort.kk.dk/images/ikoner/suf/sundhe...,Kondisti,Valbyparken,,Køn: Begge,| Aldersgruppe: Alle,,"Mødested: Tudsemindevej, 2450 Valby"
3,https://kbhkort.kk.dk/images/ikoner/suf/sundhe...,Nærgymnastik,LOFskolen,https://lofskolen.dk/kurser/motion-og-sundhed/...,Køn: Begge,| Aldersgruppe: Alle,| Særlig målgruppe: Målrettet personer der har...,"Mødested: Østerbrogade 240, 2100 København Ø"
4,https://kbhkort.kk.dk/images/ikoner/suf/sundhe...,Floorball for kvinder 65+ år,BK Skjold,https://www.bkskjold.dk/klub/boldklubben-skjol...,Køn: Kvinder,| Aldersgruppe: Seniorer,,"Mødested: Nørrebrogade 208, 2200 Kbh. N"


# Data Cleaning
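The scraped descriptions contain too many distinct activity names to analyse directly (365 unique names across 606 entries), so the raw values are normalised and the activities grouped into a few broader categories below.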

In [301]:
# Clean a separate copy so the loaded frame `df` stays as read from disk
df_test = df.copy(deep=True)

In [302]:
# Lookup tables translating the raw scraped strings into short labels.

# Icon URL -> activity type. Every icon sits in the same folder and ends in '_26x26.png'.
icon_folder = 'https://kbhkort.kk.dk/images/ikoner/suf/sundhed/'
icon_labels = {
    'boldspil': 'ball_sports',
    'dans': 'dance',
    'fitness': 'fitness',
    'gym': 'gym',
    'kampsport': 'martial_arts',
    'loeb': 'running',
    'natur': 'nature',
    'svoemning': 'swimming',
    'udemotion': 'outdoors',
    'yoga': 'yoga',
}
dict_type = {f'{icon_folder}{stem}_26x26.png': label for stem, label in icon_labels.items()}

# Gender and age-group fields, including the label text they are scraped with
dict_gender = {f'Køn: {danish}': english
               for danish, english in (('Begge', 'both'), ('Kvinder', 'women'), ('Mænd', 'men'))}
dict_age = {f'| Aldersgruppe: {danish}': english
            for danish, english in (('Alle', 'all'), ('Seniorer', 'seniors'))}

# Special target groups: the different spellings of an age limit collapse to one label
pan_focus = 'PAN har fokus på inklusion af mennesker med et særligt fokus på seksuel mangfoldighed og kønsdiversitet'
dict_special = {
    '+ 65 år': '65+',
    '+60': '60+',
    '+60 år': '60+',
    '+65': '65+',
    '+65 år': '65+',
    '65+ år': '65+',
    'Kvinder 45 +': '45+',
    'Kvinder 65+ år': '65+',
    'Mænd 65 år+': '65+',
    pan_focus + '.': pan_focus,  # only the trailing full stop is removed
    'mænd +65 år': '65+',
}

# Empty address strings become the same 'None' marker used for missing values
dict_address = {'': 'None'}

# Remove the field labels that precede the scraped text
special_label = '| Særlig målgruppe: '
df_test["special"] = df_test["special"].str.replace(special_label, '', regex=False)

address_without_label = df_test["address"].str.removeprefix("Mødested:")
df_test["address"] = address_without_label.str.strip()

# Rows whose location reads 'Kommunal park' get their activity and location values exchanged
is_municipal_park = df_test["location"] == 'Kommunal park'
swapped_values = df_test.loc[is_municipal_park, ["location", "activity"]].to_numpy()
df_test.loc[is_municipal_park, ["activity", "location"]] = swapped_values

In [303]:
# Translate the raw values column by column using the lookup tables, then preview the last rows
lookup_by_column = {
    "type": dict_type,
    "gender": dict_gender,
    "age": dict_age,
    "special": dict_special,
    "address": dict_address,
}
df_test = df_test.replace(lookup_by_column)
df_test.tail()

Unnamed: 0,type,activity,location,website,gender,age,special,address
601,yoga,Yoga,AH Sport,https://www.ahsport.dk/,both,all,,Støberiet
602,yoga,Yoga,SheZone,https://www.shezone.dk/,women,all,,"Jagtvej 34, 2200 København N"
603,gym,Motionsgymnastik,Idrætsforeningen Kæmperne,https://ifk98.dk/default.aspx,both,all,,
604,ball_sports,Floorball,PACT/Copenhagen Floorball Club,https://pact.dk/motionsfloorball-pact-copenhag...,both,all,,"Julius Andersens Vej 1, 2500 Valby"
605,yoga,Yoga,Aktive ældre på Nørrebro,https://aktiv.kk.dk/seniorklubber/noerrebro/ak...,both,seniors,65+,"Mjølnerparken 48, st. th, 2200 Kbh. N"


In [304]:
# Keyword fragments grouped by the broad category they indicate
keywords_by_category = {
    'health': ['fysio', 'hjært', 'nær', 'puls', 'hjert', 'mind', 'knæ', 'ryg', 'senior', 'stabil', 'mobil'],
    'sport': ['bold', 'ball', 'tennis', 'minton', 'golf', 'cricket', 'volley'],
    'fitness': ['yoga', 'træn', 'gym', 'motion', 'fitness', 'cyk', 'løb', 'stav', 'cross', 'ro', 'kajak',
                'zumba', 'pilates', 'kamp', 'svøm', 'spin', 'kondi'],
    'recreation': ['billiard', 'billard', 'dart', 'dans', 'dance', 'bowl', 'gå', 'walk', 'spil', 'petanque',
                   'park', 'have'],
}

# Flatten into a fragment -> category lookup, in the order the fragments are listed above
dict_activity = {
    keyword: category
    for category, keywords in keywords_by_category.items()
    for keyword in keywords
}

def categorize_activity(name, keyword_to_category=None):
    """Return the broad category for an activity name.

    The lower-cased name is checked against each keyword fragment in the
    lookup's insertion order; the first fragment found as a substring decides
    the category. Names containing none of the fragments are labelled "other".

    Parameters
    ----------
    name : activity name; converted with ``str`` before matching.
    keyword_to_category : optional mapping of fragment -> category.
        Defaults to the notebook-level ``dict_activity``.
    """
    if keyword_to_category is None:
        keyword_to_category = dict_activity
    lowered = str(name).lower()
    for keyword, category in keyword_to_category.items():
        # The fragments are plain text, so a substring test is enough (no regex needed)
        if keyword in lowered:
            return category
    # Explicit fallback: an unmatched name is always "other", even if the name
    # itself happens to be spelled like one of the category labels.
    return "other"


# Element-wise mapping works for any index, unlike positional `series[i]` access
df_test["category"] = df_test["activity"].map(categorize_activity)


In [311]:
# Place `category` directly after `activity`, then preview the last ten rows
column_order = ['type', 'activity', 'category', 'location', 'website', 'gender', 'age', 'special', 'address']
df_test = df_test.loc[:, column_order]
df_test.tail(10)

Unnamed: 0,type,activity,category,location,website,gender,age,special,address
596,yoga,Mindful bevægelse for krop og sind,health,Idrætsforeningen Kæmperne,https://ifk98.dk/default.aspx,both,all,,Dansekapellet
597,running,Cykeltur - roligt tempo,fitness,Idrætsforeningen Kæmperne,https://ifk98.dk/default.aspx,both,all,,
598,swimming,Varmtvandsgymnastik,fitness,Idrætsforeningen Kæmperne,https://ifk98.dk/default.aspx,both,all,,
599,ball_sports,Bordtennis,sport,Idrætsforeningen Kæmperne,https://ifk98.dk/default.aspx,both,all,,Remisen
600,ball_sports,Mix Volley,sport,Idrætsforeningen Kæmperne,https://ifk98.dk/default.aspx,both,all,,"Øster Farimagsgade Skole, Multisalen"
601,yoga,Yoga,fitness,AH Sport,https://www.ahsport.dk/,both,all,,Støberiet
602,yoga,Yoga,fitness,SheZone,https://www.shezone.dk/,women,all,,"Jagtvej 34, 2200 København N"
603,gym,Motionsgymnastik,fitness,Idrætsforeningen Kæmperne,https://ifk98.dk/default.aspx,both,all,,
604,ball_sports,Floorball,sport,PACT/Copenhagen Floorball Club,https://pact.dk/motionsfloorball-pact-copenhag...,both,all,,"Julius Andersens Vej 1, 2500 Valby"
605,yoga,Yoga,fitness,Aktive ældre på Nørrebro,https://aktiv.kk.dk/seniorklubber/noerrebro/ak...,both,seniors,65+,"Mjølnerparken 48, st. th, 2200 Kbh. N"


In [309]:
# Disabled exploratory checks (value counts for gender, age and activity), kept inside a
# string literal so none of them run. The literal is the cell's only expression, so the
# notebook simply echoes the text back as the cell output.
""" np.unique(df_test["gender"], return_counts = True)
np.unique(df_test["age"], return_counts = True)
len(np.unique(df_test["activity"], return_counts = True)[0]), np.unique(df_test["activity"], return_counts = True)[0] """

' np.unique(df_test["gender"], return_counts = True)\nnp.unique(df_test["age"], return_counts = True)\nlen(np.unique(df_test["activity"], return_counts = True)[0]), np.unique(df_test["activity"], return_counts = True)[0] '