# Setup

In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Notes to self

**goal**  
Create 3 datasets: 
1. For every party in every municipality the number of seats they have. 
2. For every municipality contact info + number of seats in city council. 
3. For every city councillor, their function, party, url, and municipality. 

**to do**  
- visit page for every municipality
- collect data
- write data to correct file

# Collect links to scrape

In [10]:
# Request the overview page that lists the municipalities.
# The timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() stops right away on an HTTP error instead of parsing
# an error page as if it were the overview.
r = requests.get('https://almanak.overheid.nl/Gemeenten', timeout=10)
r.raise_for_status()
# parse HTML
soup = BeautifulSoup(r.content, 'html.parser')
# extract the div that contains the links I need
inputLinks = soup.find('div', {'data-roo-element': 'organisationtype-content'})
if inputLinks is None:
    # soup.find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError further down.
    raise RuntimeError("Could not find the 'organisationtype-content' div on the overview page")
# Build the complete link for every anchor in the div. href=True skips anchors
# that have no href attribute, which would otherwise raise a KeyError.
links = ['https://almanak.overheid.nl' + a['href']
         for a in inputLinks.find_all('a', href=True)]

In [11]:
# sanity check: how many links were collected from the overview page
len(links)

355

# Scrape data for every municipality

In [97]:
# Four separate, empty accumulators:
#   failed - links for which the request did not succeed
#   munDf  - one row per municipality
#   funcDf - one row per person listed in a functions table
#   ccDf   - one row per party in a city council table
failed, munDf, funcDf, ccDf = [], [], [], []

# Headers that are sent along with every request to the website.
headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15'
headers['Referer'] = 'https://almanak.overheid.nl/Gemeenten/'
headers['Cookie'] = 'stg_returning_visitor=Wed%2C%2023%20Sep%202020%2016:48:16%20GMT; stg_externalReferrer=; stg_last_interaction=Thu%2C%2024%20Sep%202020%2008:45:38%20GMT; _pk_id.1a96e6f9-01b9-4565-8580-046a1491ea57.4951=362487a154ff97cd.1600877147.3.1600937120.1600937116.; stg_traffic_source_priority=1; _pk_ses.1a96e6f9-01b9-4565-8580-046a1491ea57.4951=*'

# Patterns used while parsing a municipality page, compiled once up front.
# The number that directly precedes the word 'zetel(s)' in the council heading.
_SEATS_RE = re.compile(r'(\d+)\s*zetel', re.IGNORECASE)
# Anything that is within brackets, e.g. the party in 'Name (Party)'.
_PARTY_RE = re.compile(r'\((.*)\)')


def _cell_text(parent, label, strip=True):
    """Return the text of the td in *parent* whose data-before equals *label*.

    Returns None when *parent* is None or holds no such cell, so a missing
    table or field simply becomes an empty value in the output. Surrounding
    whitespace is removed unless *strip* is False.
    """
    if parent is None:
        return None
    cell = parent.find('td', {'data-before': label})
    if cell is None:
        return None
    return cell.text.strip() if strip else cell.text


def _without_general_suffix(text):
    """Remove the ' (algemeen)' marker from *text*; None is passed through."""
    return None if text is None else text.replace(' (algemeen)', '')


def _parse_seats(heading_text):
    """Extract the number of council seats from the 'gemeenteraad' heading text.

    NOTE(review): the previous fixed slice [-10:-8] implies the heading ends in
    '(NN zetels)' -- confirm against the live page. The regex also handles a
    one- or three-digit count, where the fixed slice would return e.g. '(9'.
    When the pattern is not found the old slice is used as a fallback.
    """
    text = heading_text.strip()
    match = _SEATS_RE.search(text)
    if match:
        return match.group(1)
    return text[-10:-8]


# loop over list with links for every municipality
for url in links:
    # Request the webpage. A timeout or connection problem raises an exception;
    # record the link as failed and carry on instead of losing the whole run.
    try:
        r = requests.get(url,
                         timeout=5,
                         headers=headers)
    except requests.RequestException as exc:
        print('Request failed: ' + url + ' (' + str(exc) + ')')
        failed.append(url)
        continue
    # if the server did not answer with 200, let me know and save the failed link
    if r.status_code != 200:
        print('Request failed: ' + url)
        failed.append(url)
        continue

    # create soup
    soup = BeautifulSoup(r.content, 'html.parser')

    # Every lookup below stores None when the element is not on the page, so a
    # single missing field never breaks the scraper.

    # municipality name: the title of the first abbr HTML-element
    abbr = soup.find('abbr')
    munName = abbr.get('title') if abbr is not None else None

    # number of seats: taken from the h2 HTML-element with id 'gemeenteraad'
    heading = soup.find('h2', {'id': 'gemeenteraad'})
    seats = _parse_seats(heading.text) if heading is not None else None

    # parse data from address info table (ai is None when the table is absent)
    ai = soup.find('table', {'data-roo-element': 'element-adresgegevens'})
    visitingAdress = _cell_text(ai, 'Bezoekadres')
    mailingAdress = _cell_text(ai, 'Postadres')
    # for these three, ' (algemeen)' is removed from the stripped text
    telephone = _without_general_suffix(_cell_text(ai, 'Telefoon'))
    website = _without_general_suffix(_cell_text(ai, 'Internet'))
    email = _without_general_suffix(_cell_text(ai, 'E-mail'))

    # create row for municipality table export
    munDf.append([munName,
                  seats,
                  visitingAdress,
                  mailingAdress,
                  telephone,
                  website,
                  email])

    # parse data from functions table; no table means no rows to loop over
    functionsTable = soup.find('table', {'data-roo-element': 'element-functies'})
    functionRows = functionsTable.find_all('tr') if functionsTable is not None else []
    for row in functionRows:
        cell = row.find('td')
        if cell is None:
            # a row without a td has no names in it: nothing to store
            continue
        # the function name is stored in the data-before attribute of the td
        func = cell.get('data-before')
        if func is not None:
            func = func.strip()
        # the td text holds the people for this function, separated by commas
        namesAndParties = cell.text.strip()
        for entry in namesAndParties.split(','):
            entry = entry.strip()
            # party: whatever is within brackets, None when there are no brackets
            partyMatch = _PARTY_RE.search(entry)
            partyCouncillor = partyMatch.group(1) if partyMatch else None
            # name: the first line of the entry with \r removed (this is what
            # the earlier regex r'(.*)' selected, since '.' stops at a newline)
            nameCouncillor = entry.split('\n', 1)[0].replace('\r', '')
            # create row for functions table export: municipality name,
            # name councillor, party councillor, function councillor
            funcDf.append([munName, nameCouncillor, partyCouncillor, func])

    # parse data from city council table; no table means no rows to loop over
    cc = soup.find('table', {'data-roo-element': 'element-gemeenteraad'})
    # all tr-elements found in cc except the first item of the list [1:]
    councilRows = cc.find_all('tr')[1:] if cc is not None else []
    for row in councilRows:
        # text of the td with data-before 'Partij' / 'Aantal zetels' (not stripped)
        partyCC = _cell_text(row, 'Partij', strip=False)
        seatsCC = _cell_text(row, 'Aantal zetels', strip=False)
        # create row with municipality name, party and number of seats
        ccDf.append([munName, partyCC, seatsCC])

# Note how the indentation ends here. Everything that follows is code that should be executed
# outside of our for-loops, and outside of our if/else and try/except setups. Meaning:
# the following code is executed only once, after the code above has finished all of its loops,
# however many tries and/or exceptions were executed along the way.

# Create dataframe with municipality data. The column names are passed at
# construction time so this also works when the list of rows is empty
# (assigning 7 names to a 0-column dataframe afterwards raises a ValueError).
munDf = pd.DataFrame(munDf,
                     columns=['municipalityName',
                              'noSeatsCityCouncil',
                              'visitingAdress',
                              'mailingAdress',
                              'telephone',
                              'website',
                              'email'])
# remove tabs and line breaks from all values: both the literal two-character
# spellings (backslash + t/n/r) and the real control characters
munDf.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=True)
# save to csv
munDf.to_csv('Dutch municipalities data.csv',
             index=False)

# Create dataframe with functions data. The column names are passed at
# construction time so this also works when the list of rows is empty
# (assigning 4 names to a 0-column dataframe afterwards raises a ValueError).
funcDf = pd.DataFrame(funcDf,
                      columns=['municipalityName',
                               'nameCouncillor',
                               'partyCouncillor',
                               'function'])
# save to csv
funcDf.to_csv('City councillors per municipality.csv',
             index=False)

# Create dataframe with city council data (seats per party). The column names
# are passed at construction time so this also works when the list of rows is
# empty (assigning 3 names to a 0-column dataframe afterwards raises a ValueError).
ccDf = pd.DataFrame(ccDf,
                    columns=['municipalityName',
                             'partyName',
                             'numberOfSeats'])
# save to csv
ccDf.to_csv('Seats per party per municipality.csv',
             index=False)
        