# libraries

In [2]:
# libraries 
import pandas as pd 
import os
import numpy as np
import time
import random
from csv import reader
import time

# plots:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure


# scraping
from bs4 import BeautifulSoup
from lxml import etree #for using XPATH with beautifulsoup
import requests

#JSON
import json

# GeoJSON
import geopandas as gpd

# regular expression
import re 

# concurrent futures - boosts the process of scraping utilicing the CPU better
# two main classes- 
#       1) executor class: manages all the threads and workload
#       2) futures class: creates a little instance and manages data coming back
import concurrent.futures

# libraries: 
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# NOTES ON THE WEBSITE:

* MAXIMUM 20 PROPERTIES PER PAGE
* MAXIMUM 500 PAGES
* This means that the maximum result per query is 10.000 (20*500), thankfully the hidden api gives out the total result number so I can use that number to update the query parameters inorder to get the result below 10.000 - then scrape

**NOTES ON THE QUERY PARAMETER**

Here are all the query parameters after using the filters on the website - example is for the kommune called Aarhus

* municipalities: Aarhus
* per_page: 20
* page: 1
* priceMax: 17900000
* priceMin: 6700000
* yearSoldFrom: 2000
* yearSoldTo: 2005
* addressTypes: villa,condo,terraced house,farm,hobby farm
* areaMin: 80
* areaMax: 180
* sold: true
* sortAscending: false
* sortBy: soldDate


I created the list of all the kommunes as a CSV file- the data is taken from danish wikipedia : https://da.wikipedia.org/wiki/Kommuner_i_Danmark_efter_indbyggertal

# Below is the python version of the curl bash of the hidden API

In [None]:
headers = {
    'authority': 'api.boligsiden.dk',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'en-GB,en;q=0.9,en-US;q=0.8',
    'origin': 'https://www.boligsiden.dk',
    'referer': 'https://www.boligsiden.dk/',
    'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57',
    'x-api-key': 'GWD0fljZzkc8GOLV',
}

params = {
    'municipalities': 'Aarhus',
    'per_page': '20',
    'page': '1',
    'priceMax': '17900000',
    'priceMin': '6700000',
    'yearSoldFrom': '2000',
    'yearSoldTo': '2005',
    'addressTypes': 'villa,condo,terraced house,farm,hobby farm',
    'areaMin': '80',
    'areaMax': '180',
    'sold': 'true',
    'sortAscending': 'false',
    'sortBy': 'soldDate',
}

response = requests.get('https://api.boligsiden.dk/search/addresses', params=params, headers=headers)

# Parameter overview 

* municipalities: 
    * 98 in total - loop over the csv file contianing all the names 
---------------------------
* per_page: 
    * the maximum properties per page is 20 
---------------------------    
* page: 
    * the maximum page is 500
---------------------------

* priceMax': '17900000',
    * maximum 25000000 (25 million)
---------------------------

* priceMin': '6700000'
    * minimum 0
---------------------------

* yearSoldFrom': '2000',
    * minimum: 1998
---------------------------

* yearSoldTo': '2005',
    * maximum: 2023
---------------------------

* addressTypes': 'villa,condo,terraced house,farm,hobby farm',
    * the following types are : 
        * Villa,
        * ejerlejliged rækkehus,
        * fritidsbolig  
        * landejendom 
    * But for the api its in english as follows : 
        * villa,
        * condo
        * terraced house 
        * hobby farm    
---------------------------

* areaMin': '80',
    * minimum is 0

---------------------------

* areaMax': '180',
    * maximum is 300
---------------------------

* sold': 'true',
    * either 'false' or 'true'

---------------------------

* sortAscending': 'false',
    * either 'false' or 'true'

---------------------------

* sortBy': 'soldDate',
    * can take values of: 
        * 'soldDate'
        * 'soldPrice'
        * 'address'


In [None]:
2315 villa
3596 ejerlejighed
807 rækkehus
92 fritidsbolig
10 landejendom

In [28]:
2315+3596+807+92+10

6820

------------------------
# Plan: 

* filter for each 98 municipalities (kommunes) 
* loop through the years 1995 to 2023 increment by one year - meaning 29 years
* if the totalHits < 10.000 (500 max pages * 20 per page) results then Scrape
* if the **totalHits > 10.000** then 
    * **filter** for boligtype i.e. property type: villa, condo, terraced house and hobby farm
        * if the **totalHits is still > 10.000**  then: 
            * **filter** by price by incrementing by 500.000 thousan i.s. 0-500.000, 500.001-1.000.000 etc.
                * if the **totalHits is still > 10.000** then:
                 * filter for storrelse i.e. size by 20 m^2 i.e. 0-20, 21-40, 41-60 etc.

from analysis the site and testing this should be enough !

--------------

## 1) Read in the csv file of all the kommunes 

In [54]:
dk_kommune = pd.read_csv('Denmark_kommune.csv', header=0)
dk_kommune

Unnamed: 0,Navn
0,København
1,Aarhus
2,Aalborg
3,Odense
4,Vejle
...,...
93,Langeland
94,Ærø
95,Samsø
96,Fanø


In [55]:
len(dk_kommune)

98

## set up the header and parameter for the HTTP request 

In [None]:
# change user agent for each request randomly
def get_user_agent():
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.50',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
        'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57'
    ]
    user_agent = user_agent_list[random.randint(0,len(user_agent_list)-1)]
   
    return user_agent

In [43]:
user_agent = get_user_agent()
headers = {
    'authority': 'api.boligsiden.dk',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'en-GB,en;q=0.9,en-US;q=0.8',
    'origin': 'https://www.boligsiden.dk',
    'referer': 'https://www.boligsiden.dk/',
    'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'user-agent': user_agent,
    'x-api-key': 'GWD0fljZzkc8GOLV',
}
headers

{'authority': 'api.boligsiden.dk',
 'accept': 'application/json, text/plain, */*',
 'accept-language': 'en-GB,en;q=0.9,en-US;q=0.8',
 'origin': 'https://www.boligsiden.dk',
 'referer': 'https://www.boligsiden.dk/',
 'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
 'sec-ch-ua-mobile': '?0',
 'sec-ch-ua-platform': '"Windows"',
 'sec-fetch-dest': 'empty',
 'sec-fetch-mode': 'cors',
 'sec-fetch-site': 'same-site',
 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
 'x-api-key': 'GWD0fljZzkc8GOLV'}

In [None]:

params = {
    'municipalities': 'Aarhus',
    'per_page': '20',
    'page': '600',
    #'priceMax': '17900000',
    #'priceMin': '6700000',
    'yearSoldFrom': '2005',
    'yearSoldTo': '2005',
    #'addressTypes': 'villa,condo,terraced house,farm,hobby farm',
    #'areaMin': '80',
    #'areaMax': '180',
    'sold': 'true',
    'sortAscending': 'false',
    'sortBy': 'soldDate',
}

response = requests.get('https://api.boligsiden.dk/search/addresses', params=params, headers=headers)
response

In [56]:
for kommune in dk_kommune:
    print(dk_kommune)
    

         Navn
0   København
1      Aarhus
2     Aalborg
3      Odense
4       Vejle
..        ...
93  Langeland
94        Ærø
95      Samsø
96       Fanø
97       Læsø

[98 rows x 1 columns]


In [93]:
dk_kommune['Navn'][0]

'København'

In [68]:
parameter_year = list(range(1995,2024))
parameter_housingType = ['villa','condo','terraced house', 'hobby farm']
parameter_price= list(range(0,25500000,500000)) # increment by 500.000 dkk
parameter_size = list(range(0,320,20)) # increment by 20m^2

In [94]:
len(parameter_price)

51

In [71]:
# change user agent for each request randomly
def get_user_agent():
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.50',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
        'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57'
    ]
    user_agent = user_agent_list[random.randint(0,len(user_agent_list)-1)]
   
    return user_agent

def get_header():
    
    # get user-agent
    user_agent = get_user_agent()

     # set up header     
    header = {
        'authority': 'api.boligsiden.dk',
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'en-GB,en;q=0.9,en-US;q=0.8',
        'origin': 'https://www.boligsiden.dk',
        'referer': 'https://www.boligsiden.dk/',
        'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'user-agent': user_agent,
        'x-api-key': 'GWD0fljZzkc8GOLV',
    }
    return header

In [35]:
def check_totalHits(kommune):
    
    municipal = kommune
    
    for year in parameter_year: 
        
        # set parameters
        params = {
            'municipalities': municipal,
            'per_page': '20',
            'page': '1',
            #'priceMax': '17900000',
            #'priceMin': '6700000',
            'yearSoldFrom': str(year),
            'yearSoldTo': str(year),
            #'addressTypes': 'villa,condo,terraced house,farm,hobby farm',
            #'areaMin': '80',
            #'areaMax': '180',
            'sold': 'true',
            'sortAscending': 'false',
            'sortBy': 'soldDate',
        }
        
        # make request
        response = requests.get('https://api.boligsiden.dk/search/addresses', params=params, headers=headers)

        # get total hits
        totalhit = response.json()['totalHits']
        
        # If total hits > 10.000 filter for housing type
        if(totalhit< 10.000):
            start_scrape()
        else:
            for housing_type in parameter_housingType:
                #set up the parameter
                params = {
                    'municipalities': municipal,
                    'per_page': '20',
                    'page': '1',
                    #'priceMax': '17900000',
                    #'priceMin': '6700000',
                    'yearSoldFrom': str(year),
                    'yearSoldTo': str(year),
                    'addressTypes': housing_type,
                    #'areaMin': '80',
                    #'areaMax': '180',
                    'sold': 'true',
                    'sortAscending': 'false',
                    'sortBy': 'soldDate',
                }
                
                # make request
                response = requests.get('https://api.boligsiden.dk/search/addresses', params=params, headers=headers)

                # get total hits
                totalhit = response.json()['totalHits']
                
                # if totalhit is still < 10.000 then scrape else filter for price
                if(totalhit<0):
                    start_scrape()
                else: 
                    for i in range(0,len(parameter_price)):
                        # get max min price ranges
                        minprice = str(parameter_price[i])
                        maxprice = str(parameter_price[i+1])
                        
                        #set up the parameter
                        params = {
                            'municipalities': municipal,
                            'per_page': '20',
                            'page': '1',
                            'priceMax': maxprice,
                            'priceMin': minprice,
                            'yearSoldFrom': str(year),
                            'yearSoldTo': str(year),
                            'addressTypes': housing_type,
                            #'areaMin': '80',
                            #'areaMax': '180',
                            'sold': 'true',
                            'sortAscending': 'false',
                            'sortBy': 'soldDate',
                        }

                        # make request
                        response = requests.get('https://api.boligsiden.dk/search/addresses', params=params, headers=headers)

                        # get total hits
                        totalhit = response.json()['totalHits']
                        
                        # Scrape: 
                        start_scrape()

                        

    

In [82]:
headers = get_header()
headers

{'authority': 'api.boligsiden.dk',
 'accept': 'application/json, text/plain, */*',
 'accept-language': 'en-GB,en;q=0.9,en-US;q=0.8',
 'origin': 'https://www.boligsiden.dk',
 'referer': 'https://www.boligsiden.dk/',
 'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
 'sec-ch-ua-mobile': '?0',
 'sec-ch-ua-platform': '"Windows"',
 'sec-fetch-dest': 'empty',
 'sec-fetch-mode': 'cors',
 'sec-fetch-site': 'same-site',
 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
 'x-api-key': 'GWD0fljZzkc8GOLV'}

# ---- code scrape

In [3]:
headers = {
    'authority': 'api.boligsiden.dk',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'en-GB,en;q=0.9,en-US;q=0.8',
    'origin': 'https://www.boligsiden.dk',
    'referer': 'https://www.boligsiden.dk/',
    'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57',
    #'x-api-key': 'GWD0fljZzkc8GOLV',
}

params = {
    'municipalities': 'Aarhus',
    'per_page': '20',
    'page': '1',
    #'priceMax': '25000000',
    #'priceMin': '0',
    'yearSoldFrom': '2015',
    'yearSoldTo': '2015',
    #'addressTypes': 'villa,condo,terraced house,farm,hobby farm',
    #'areaMin': '0',
    #'areaMax': '300',
    'sold': 'true',
    'sortAscending': 'false',
    'sortBy': 'soldDate',
}

response = requests.get('https://api.boligsiden.dk/search/addresses', params=params, headers=headers)
response

<Response [200]>

In [12]:
response

<Response [200]>

In [4]:
a = response.json()['addresses']
a

[{'_links': {'self': {'href': '/addresses/0a3f50c3-4286-32b8-e044-0003ba298018'}},
  'addressID': '0a3f50c3-4286-32b8-e044-0003ba298018',
  'buildings': [{'bathroomCondition': 'Badeværelse i enheden',
    'buildingName': '(UDFASES) Række-, kæde- eller dobbelthus (lodret adskillelse mellem enhederne).',
    'buildingNumber': '1',
    'externalWallMaterial': 'Mursten',
    'heatingInstallation': 'Fjernvarme/blokvarme',
    'housingArea': 122,
    'kitchenCondition': 'Eget køkken med afløb',
    'numberOfBathrooms': 1,
    'numberOfFloors': 1,
    'numberOfKitchens': 1,
    'numberOfToilets': 2,
    'roofingMaterial': 'Betontagsten',
    'supplementaryHeating': '(UDFASES) Bygningen har ingen supplerende varme',
    'toiletCondition': 'Vandskyllende toilet i enheden',
    'totalArea': 122,
    'yearBuilt': 1983,
    'yearRenovated': 1994}],
  'cases': None,
  'city': {'name': 'Åbyhøj', 'slug': 'aabyhoej'},
  'cityName': 'Åbyhøj',
  'coordinates': {'lat': 56.15482, 'lon': 10.150557, 'type':

In [63]:
len(a)

20

In [6]:
response.json()['totalHits']

9281

In [None]:
Aarhus: 
2000-2005 : 38842
2006-2010 : 31741
2011-2016 : 35173
2017-2022 : 41012
2023      :  1327

In [27]:
38842+31741+35173+41012+1327

148095

In [33]:
12094/20

604.7

In [8]:
a[1]

{'_links': {'self': {'href': '/addresses/48284790-bf16-61c5-e044-0003ba298018'}},
 'addressID': '48284790-bf16-61c5-e044-0003ba298018',
 'addressType': 'villa',
 'buildings': [{'bathroomCondition': 'Badeværelse i enheden',
   'buildingName': 'Fritliggende enfamilieshus (parcelhus)',
   'buildingNumber': '1',
   'externalWallMaterial': 'Mursten',
   'heatingInstallation': 'Fjernvarme/blokvarme',
   'housingArea': 189,
   'kitchenCondition': 'Eget køkken med afløb',
   'numberOfBathrooms': 2,
   'numberOfFloors': 1,
   'numberOfKitchens': 1,
   'numberOfToilets': 2,
   'roofingMaterial': 'Tegl',
   'supplementaryHeating': '(UDFASES) Bygningen har ingen supplerende varme',
   'toiletCondition': 'Vandskyllende toilet i enheden',
   'totalArea': 240,
   'yearBuilt': 2009}],
 'cases': None,
 'city': {'name': 'Solbjerg', 'slug': 'solbjerg'},
 'cityName': 'Solbjerg',
 'coordinates': {'lat': 56.044743, 'lon': 10.09765, 'type': 'EPSG4326'},
 'daysOnMarket': {'realtors': []},
 'energyLabelImprove

In [11]:
year_reverse = list(range(2024,1995,-1))

In [12]:
year_reverse

[2024,
 2023,
 2022,
 2021,
 2020,
 2019,
 2018,
 2017,
 2016,
 2015,
 2014,
 2013,
 2012,
 2011,
 2010,
 2009,
 2008,
 2007,
 2006,
 2005,
 2004,
 2003,
 2002,
 2001,
 2000,
 1999,
 1998,
 1997,
 1996]

In [16]:
list_json_properties = []
for i in a: 
    print(len(i))
    list_json_properties.append(i)

25
27
28
27
27
27
28
27
27
26
27
28
26
27
27
26
26
27
27
26


# The PLAN
Scrape each property by the API and simply merge all the JSON for each property into a list instead of trying to predetermine the columns. That means I will simply append multiple JSON objects into a list where a single JSON object is from a single property

### Setting up the User-agent and Header methods 

get_user_agent() - gives a random user-agent from a pre-defined user-agent list 

get_header() - calls the User-agent and constructs and returns a header 

In [42]:
# change user agent for each request randomly
def get_user_agent():
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.50',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
        'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57'
    ]
    user_agent = user_agent_list[random.randint(0,len(user_agent_list)-1)]
   
    return user_agent

def get_header():
    
    # get user-agent
    user_agent = get_user_agent()

     # set up header     
    header = {
        'authority': 'api.boligsiden.dk',
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'en-GB,en;q=0.9,en-US;q=0.8',
        'origin': 'https://www.boligsiden.dk',
        'referer': 'https://www.boligsiden.dk/',
        'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'user-agent': user_agent,
        'x-api-key': 'GWD0fljZzkc8GOLV',
    }
    return header

### Setting up the start_scrape() function

In [86]:
def start_scrape(stage, totalhit,parameter_list):
    print('====== start_scrape() function ON')
    
    global session
    
    # record
    global property_recordlist
    
    # Calculate the number of pages the program needs to loop over
    # The maximum pages is 500 with 20 properties per page 
    # I add 1 to the page to mage sure that we take all properties
    # the pre-condition for the start_scrape() is that it doesn't exceed the 500 limit
    number_pages = round(totalhit/20)+1
    
    for page in range(1,number_pages+1):
        print(f'======= Scraping Page: {page} out of {number_pages} pages')
        ###########################       Stage == 1 : only year is in the parameter_list 
        if(stage == 1):
            #---------- scrape
            # get Header 
            the_header = get_header()
            
            # set up parameter: only Year
            params = {
                'municipalities': parameter_list[0],
                'per_page': '20',
                'page': str(page),
                #'priceMax': maxprice,
                #'priceMin': minprice,
                'yearSoldFrom': str(parameter_list[1]), #parameter_list[1] has the year
                'yearSoldTo': str(parameter_list[1]),
                #'addressTypes': housing_type,
                #'areaMin': '80',
                #'areaMax': '180',
                'sold': 'true',
                'sortAscending': 'false',
                'sortBy': 'soldDate',
            }
            
            # make the Request to the API
            response = requests.get('https://api.boligsiden.dk/search/addresses', params=params, headers=the_header)
            
            # get all the properties from the JSON
            property_json_list = response.json()['addresses']
            
            # add each property JSON individually into the recordlist such that 1 index number == 1 property
            for prop in range(0,len(property_json_list)):
                property_recordlist.append(property_json_list[prop])
            
        ####################################  Stage == 2 : parameters: year, housing type
        elif(stage == 2):
            #scrape
            #---------- scrape
        
            # get Header 
            the_header = get_header()
            
            # set up parameter: only Year
            params = {
                'municipalities': parameter_list[0],
                'per_page': '20',
                'page': str(page),
                #'priceMax': maxprice,
                #'priceMin': minprice,
                'yearSoldFrom': str(parameter_list[1]), #parameter_list[1] has the year
                'yearSoldTo': str(parameter_list[1]),
                'addressTypes': parameter_list[2], # parameter_list[2] contains the housing type
                #'areaMin': '80',
                #'areaMax': '180',
                'sold': 'true',
                'sortAscending': 'false',
                'sortBy': 'soldDate',
            }
            
            # make the Request to the API
            response = requests.get('https://api.boligsiden.dk/search/addresses', params=params, headers=the_header)
            
            # get all the properties from the JSON
            property_json_list = response.json()['addresses']
            
            # add each property JSON individually into the recordlist such that 1 index number == 1 property
            for prop in range(0,len(property_json_list)):
                property_recordlist.append(property_json_list[prop])
                
        ################################## Stage == 3 : parameters: year, housing type, prize
        elif(stage == 3):
            #scrape
            #scrape
            #---------- scrape
        
            # get Header 
            the_header = get_header()
            
            # set up parameter: only Year
            params = {
                'municipalities': parameter_list[0],
                'per_page': '20',
                'page': str(page),
                'priceMax': parameter_list[3],
                'priceMin': parameter_list[3],
                'yearSoldFrom': str(parameter_list[1]), #parameter_list[1] has the year
                'yearSoldTo': str(parameter_list[1]),
                'addressTypes': parameter_list[2], # parameter_list[2] contains the housing type
                #'areaMin': '80',
                #'areaMax': '180',
                'sold': 'true',
                'sortAscending': 'false',
                'sortBy': 'soldDate',
            }
            
            # make the Request to the API
            response = requests.get('https://api.boligsiden.dk/search/addresses', params=params, headers=the_header)
            
            # get all the properties from the JSON
            property_json_list = response.json()['addresses']
            
            # add each property JSON individually into the recordlist such that 1 index number == 1 property
            for prop in range(0,len(property_json_list)):
                property_recordlist.append(property_json_list[prop])
            
        ################################## Stage == 4 : parameters: year, housing type, prize and size
        elif(stage == 4):
            #---------- scrape
        
            # get Header 
            the_header = get_header()
            
            # set up parameter: only Year
            params = {
                'municipalities': parameter_list[0],
                'per_page': '20',
                'page': str(page),
                'priceMax': parameter_list[4],
                'priceMin': parameter_list[3],
                'yearSoldFrom': str(parameter_list[1]), #parameter_list[1] has the year
                'yearSoldTo': str(parameter_list[1]),
                'addressTypes': parameter_list[2], # parameter_list[2] contains the housing type
                'areaMin': str(parameter_list[5]),
                'areaMax': str(parameter_list[6]),
                'sold': 'true',
                'sortAscending': 'false',
                'sortBy': 'soldDate',
            }
            
            # make the Request to the API
            response = requests.get('https://api.boligsiden.dk/search/addresses', params=params, headers=the_header)
            
            # get all the properties from the JSON
            property_json_list = response.json()['addresses']
            
            # add each property JSON individually into the recordlist such that 1 index number == 1 property
            for prop in range(0,len(property_json_list)):
                property_recordlist.append(property_json_list[prop])

In [52]:
parameter_list = [1,2,3,4,5,6]
yeartest = 2020
housetest= "villa"
prizetestmax= 1000000
prizetestmin= 0

In [58]:
parameter_list[0] = yeartest
parameter_list[1] = housetest
parameter_list[2] = prizetestmax
parameter_list[3] = prizetestmin

In [59]:
parameter_list

[2020, 'villa', 1000000, 0, 5, 6]

In [68]:
asd = list(range(0,3))
asd

[0, 1, 2]

In [73]:
list_globe = []

def testfunc():
    global list_globe
    
    a = list(range(0,10))
    for i in a:
        print(i)
        list_globe.append(i)
testfunc()

0
1
2
3
4
5
6
7
8
9


In [74]:
list_globe

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

### Setting up a function to check if total number of properties are less or equal to 10.000 (i.e. the maximum results the website/API gives) 

check_totalHits() takes a string of a kommune (municipality in Denmark) then loops over the 4 stages where each stage involves setting a parameter for the API request. 

The stages are as follows: 

* Stage 1: Parameter  Year
    * We filter for years. That is, for a given year if the total number of the search result is less than 10.000 then we carry on and scrape all pages. If the search result (totalhits) exceeds 10.000 then we move to stage 2 
* Stage 2: Parameters Year and housing type
    * we filter for both years and housing type. For each year and each housing type we scrape only if the search result (totalhits) is less than 10.000. If its greater than 10.000 then we move to stage 3. 
* Stage 3: Parameters Year, Housing type and Prize 
    * if the search result, after filtering for years, housing type and prize range, exceeds the 10.000 mark then we move to stage 4 
* Stage 4: Parameters Year, Housing type, Prize and Size
    * This is the final step - after manual testing this stage will most likely not be needed but i have it here just in case. Otherwise, it seems that the three stages above will be enough 

the list of the first parameter which is the  years and makes a request to the API - if a result of a particular year for a particular kommune gives result greater than the maximum result capacity of the API, which is 10.000,  it will set another parameter

In [77]:
def check_totalHits(municipal):
    
    global session
    global parameter_year
    global parameter_housingType
    global parameter_price
    global parameter_size

    # stage variable tell the start_scrape() function which parameters must be added for the HTTP request to the API
    stage = 0
    
    # keep all the values of those parameters necessary
    # -- the list has 7 values , for index zero (value 0) comes the municipality string, for index 1 comes the year,
    # -- for index 2 comes the property type, for index 3 and 4 comes minimum and maximum price range
    # -- and for index 5 and 6 comes minimum and maximum size range 
    parameter_list= [municipal,1,2,3,4,5,6]
    
    
#     municipal = kommune
    print(f'-------------------------------------------- kommune: {municipal}')
    for year in parameter_year: 
        print('--------------------------STAGE:1 - YEAR')
        print(f'------------ Year: {year}')
        # set parameters
        params = {
            'municipalities': municipal,
            'per_page': '20',
            'page': '1',
            #'priceMax': '17900000',
            #'priceMin': '6700000',
            'yearSoldFrom': str(year),
            'yearSoldTo': str(year),
            #'addressTypes': 'villa,condo,terraced house,farm,hobby farm',
            #'areaMin': '80',
            #'areaMax': '180',
            'sold': 'true',
            'sortAscending': 'false',
            'sortBy': 'soldDate',
        }
        
        # get the header
        theHeader = get_header()
        
        # make request
        response = requests.get('https://api.boligsiden.dk/search/addresses', params=params, headers=theHeader)

        # get total hits
        totalhit = response.json()['totalHits']
        print(f'------------Total hits: {totalhit}')
        
        
        # if totalhits < 10000 == True then scrape 
        if(totalhit< 10000):
            # add the parameter year to the parameter list at index 1 (value 1)
            parameter_list[1] = year
            
            print('######## SCRAPING: YEAR #########')
            stage = 1
            start_scrape(stage,totalhit,parameter_list)
        
        # If total hits > 10.000 filter for housing type
        else:
            for housing_type in parameter_housingType:
                
                print('--------------------------STAGE:2 - YEAR & HOUSING TYPE')
                print(f'------------ Year: {year}')
                print(f'------------ Housing type: {housing_type}')
                
                #set up the parameter
                params = {
                    'municipalities': municipal,
                    'per_page': '20',
                    'page': '1',
                    #'priceMax': '17900000',
                    #'priceMin': '6700000',
                    'yearSoldFrom': str(year),
                    'yearSoldTo': str(year),
                    'addressTypes': housing_type,
                    #'areaMin': '80',
                    #'areaMax': '180',
                    'sold': 'true',
                    'sortAscending': 'false',
                    'sortBy': 'soldDate',
                }
                
                # get the header
                theHeader = get_header()
                
                # make request
                response = requests.get('https://api.boligsiden.dk/search/addresses', params=params, headers=theHeader)
                print(f'------------ HTTP response: {response}')
                
                # get total hits
                totalhit = response.json()['totalHits']
                print(f'------------Total hits: {totalhit}')
                
                # if totalhit is still < 10.000 then scrape else filter for price
                if(totalhit<10000):
                    # add the parameter housing_type to the parameter list 
                    # at index 2 (value 2)
                    parameter_list[2] = housing_type
                    
                    print('######## SCRAPING: YEAR & HOUSING TYPE #########')
                    stage = 2
                    start_scrape(stage,totalhit,parameter_list)
                else: 
                    for i in range(0,len(parameter_price)):
                        
                        print('--------------------------STAGE:3 - YEAR & HOUSING TYPE & PRICE RANGE')
                        print(f'------------ Year: {year}')
                        print(f'------------ Housing type: {housing_type}')               
                
                        # get max min price ranges
                        minprice = str(parameter_price[i])
                        print(f'------------ Minimum price: {minprice}')
                        
                        maxprice = str(parameter_price[i+1])
                        print(f'------------ Maximum price: {maxprice}')
                        
                        #set up the parameter
                        params = {
                            'municipalities': municipal,
                            'per_page': '20',
                            'page': '1',
                            'priceMax': maxprice,
                            'priceMin': minprice,
                            'yearSoldFrom': str(year),
                            'yearSoldTo': str(year),
                            'addressTypes': housing_type,
                            #'areaMin': '80',
                            #'areaMax': '180',
                            'sold': 'true',
                            'sortAscending': 'false',
                            'sortBy': 'soldDate',
                        }
                        
                        # get the header
                        theHeader = get_header()

                        # make request
                        response = requests.get('https://api.boligsiden.dk/search/addresses', params=params, headers=theHeader)
                        print(f'------------ HTTP response: {response}')

                        # get total hits
                        totalhit = response.json()['totalHits']
                        print(f'------------Total hits: {totalhit}')
                        
                        # if totalhit is still < 10.000 then scrape else filter for size
                        if(totalhit<10000):
                            # add the parameter year to the parameter list 
                            # minprice = index 3 (value 3) & maxprice = index 4 (value 4)
                            parameter_list[3]= minprice
                            parameter_list[4]= maxprice
                            
                            print('######## SCRAPING: YEAR & HOUSING TYPE & PRICE RANGE #########')
#                             stage = 3
#                             start_scrape(stage,totalhit,parameter_list)

                        else:
                            for i in range(0,len(parameter_size)):
                
                                print('--------------------------STAGE:4 - YEAR & HOUSING TYPE & PRICE RANGE & SIZE')
                                print(f'------------ Year: {year}')
                                print(f'------------ Housing type: {housing_type}') 
                                print(f'------------ Minimum price: {minprice}')
                                print(f'------------ Maximum price: {maxprice}')
                    
                                # get max min price ranges 
                                minsize = str(parameter_size[i])
                                print(f'------------ Minimum Size: {minsize}')
                                
                                maxsize = str(parameter_size[i+1])
                                print(f'------------ Maximum Size: {maxsize}')
                                
                                # get the header
                                theHeader = get_header()

                                #set up the parameter
                                params = {
                                    'municipalities': municipal,
                                    'per_page': '20',
                                    'page': '1',
                                    'priceMax': maxprice,
                                    'priceMin': minprice,
                                    'yearSoldFrom': str(year),
                                    'yearSoldTo': str(year),
                                    'addressTypes': housing_type,
                                    #'areaMin': '80',
                                    #'areaMax': '180',
                                    'sold': 'true',
                                    'sortAscending': 'false',
                                    'sortBy': 'soldDate',
                                }

                                # make request
                                response = requests.get('https://api.boligsiden.dk/search/addresses', params=params, headers=theHeader)
                                print(f'------------ HTTP response: {response}')
                                
                                # get total hits
                                totalhit = response.json()['totalHits']
                                print(f'------------Total hits: {totalhit}')
                                
                                # add the parameter year to the parameter list 
                                # minprice = index 5 (value 5) & maxsize = index 6 (value 6)
                                parameter_list[5] = minsize
                                parameter_list[6] = maxsize

                                # Last stage - Scrape: (Should be enough) 
                                stage = 4
                                print('######## SCRAPING: YEAR & HOUSING TYPE & PRICE RANGE & SIZE RANGE#########')
                                start_scrape(stage,totalhit,parameter_list)

In [62]:
parameter_list= [0,1,2,3,4,5,6]

### main program

In [None]:
# ------------ Setting up the parameters 
parameter_year = list(range(2023,1994, -1))
parameter_housingType = ['villa','condo','terraced house', 'hobby farm']
parameter_price= list(range(0,25500000,500000)) # increment by 500.000 dkk
parameter_size = list(range(0,320,20)) # increment by 20m^2

# --------------- make the first request 


#  --------------- Call the check_totalHits and start scraping 

# --------------- get the csv file with the kommunes and make the test for total hits 
dk_kommune = pd.read_csv('Denmark_kommune.csv', header=0)

# convert to list
kommun_list = dk_kommune.Navn.values.tolist()

# call check_totalHits() function: 
# for kommune in kommun_list:
#     property_recordlist = []
    
#     check_totalHits(kommune)

############# Test code for: Aarhus
property_recordlist = []
municipal = "Aarhus"
check_totalHits(municipal)


############## Test2: speed up

propertyList = [] # as list keep record of all properties and their attributes

with concurrent.futures.ThreadPoolExecutor(max_workers=35) as executor: 
    with requests.Session() as session:
        retry = Retry(connect=3, backoff_factor=0.5) # max 3 retries to the same link with a 0.5 sec delay
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        # run script
        executor.map(check_totalHits,municipal)


In [91]:
len(property_recordlist)

1387

In [92]:
1327-1387

-60

In [1]:
type(property_recordlist)

NameError: name 'property_recordlist' is not defined

In [None]:
property_recordlist[0]

In [96]:
test_normalize = pd.json_normalize(property_recordlist[0])
test_normalize

Unnamed: 0,addressID,addressType,buildings,cases,cityName,door,entryAddressID,floor,gstkvhx,houseNumber,...,province.regionCode,province.slug,road.municipalityCode,road.name,road.roadCode,road.roadID,road.slug,zip.name,zip.slug,zip.zipCode
0,0a3f50c4-75b0-32b8-e044-0003ba298018,condo,"[{'basementArea': 294, 'bathroomCondition': 'B...",,Aarhus C,tv,0a3f5097-090b-32b8-e044-0003ba298018,4,07518455___1__4__tv,1,...,1082,oestjylland,751,Thorvaldsensgade,8455,fc104510-38d8-4438-8ff6-ae14956352a8,thorvaldsensgade,Aarhus C,aarhus-c,8000


In [97]:
test_normalize.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 53 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   addressID                                  1 non-null      object 
 1   addressType                                1 non-null      object 
 2   buildings                                  1 non-null      object 
 3   cases                                      0 non-null      object 
 4   cityName                                   1 non-null      object 
 5   door                                       1 non-null      object 
 6   entryAddressID                             1 non-null      object 
 7   floor                                      1 non-null      object 
 8   gstkvhx                                    1 non-null      object 
 9   houseNumber                                1 non-null      object 
 10  isPublic                      

In [102]:
pd.set_option('display.max_columns',None)
test_normalize

Unnamed: 0,addressID,addressType,buildings,cases,cityName,door,entryAddressID,floor,gstkvhx,houseNumber,isPublic,latestValuation,livingArea,propertyNumber,registrations,roadName,slug,zipCode,_links.self.href,city.name,city.slug,coordinates.lat,coordinates.lon,coordinates.type,daysOnMarket.realtors,energyLabelImprovement.improvementCase,energyLabelImprovement.sparEnergiLink,latestSoldCaseDescription.body,latestSoldCaseDescription.date,latestSoldCaseDescription.title,municipality.churchTaxPercentage,municipality.councilTaxPercentage,municipality.districtPlanFrom,municipality.districtPlanPdfUrl,municipality.landValueTaxLevelPerThousand,municipality.masterPlanPdfUrl,municipality.municipalityCode,municipality.name,municipality.numberOfSchools,municipality.population,municipality.slug,province.name,province.provinceCode,province.regionCode,province.slug,road.municipalityCode,road.name,road.roadCode,road.roadID,road.slug,zip.name,zip.slug,zip.zipCode
0,0a3f50c4-75b0-32b8-e044-0003ba298018,condo,"[{'basementArea': 294, 'bathroomCondition': 'B...",,Aarhus C,tv,0a3f5097-090b-32b8-e044-0003ba298018,4,07518455___1__4__tv,1,True,1950000,85,788347,"[{'amount': 2240000, 'area': 85, 'date': '2023...",Thorvaldsensgade,thorvaldsensgade-1-4-tv-8000-aarhus-c-07518455...,8000,/addresses/0a3f50c4-75b0-32b8-e044-0003ba298018,Aarhus C,aarhus-c,56.155323,10.198574,EPSG4326,[],FALLBACK,https://sparenergi.dk/forbruger/boligen/traeng...,I den gamle bydel i Aarhus finder du nogle af ...,2019-03-26,Stor 3-værelses delelejlighed med spisekøkken ...,0.74,24.52,2023-01-11,https://dokument.plandata.dk/20_11126371_16734...,24.58,https://dokument.plandata.dk/12_11026896_16729...,751,Aarhus,81,355238,aarhus,Østjylland,DK042,1082,oestjylland,751,Thorvaldsensgade,8455,fc104510-38d8-4438-8ff6-ae14956352a8,thorvaldsensgade,Aarhus C,aarhus-c,8000


In [None]:
property_recordlist[0]

In [38]:
kommun_list = dk_kommune.Navn.values.tolist()
kommun_list

['København',
 'Aarhus',
 'Aalborg',
 'Odense',
 'Vejle',
 'Esbjerg',
 'Frederiksberg',
 'Randers',
 'Silkeborg',
 'Viborg',
 'Horsens',
 'Kolding',
 'Roskilde',
 'Herning',
 'Næstved',
 'Slagelse',
 'Gentofte',
 'Sønderborg',
 'Holbæk',
 'Gladsaxe',
 'Skanderborg',
 'Hjørring',
 'Helsingør',
 'Køge',
 'Guldborgsund',
 'Svendborg',
 'Aabenraa',
 'Holstebro',
 'Frederikshavn',
 'Lyngby-Taarbæk',
 'Rudersdal',
 'Ringkøbing-Skjern',
 'Haderslev',
 'Høje-Taastrup',
 'Hillerød',
 'Hvidovre',
 'Faaborg-Midtfyn',
 'Fredericia',
 'Greve',
 'Ballerup',
 'Varde',
 'Favrskov',
 'Kalundborg',
 'Hedensted',
 'Frederikssund',
 'Vordingborg',
 'Egedal',
 'Skive',
 'Syddjurs',
 'Thisted',
 'Tårnby',
 'Vejen',
 'Rødovre',
 'Ikast-Brande',
 'Furesø',
 'Mariagerfjord',
 'Fredensborg',
 'Gribskov',
 'Assens',
 'Middelfart',
 'Lolland',
 'Bornholm',
 'Jammerbugt',
 'Faxe',
 'Brøndby',
 'Norddjurs',
 'Tønder',
 'Brønderslev',
 'Vesthimmerland',
 'Ringsted',
 'Odsherred',
 'Nyborg',
 'Halsnæs',
 'Rebild',
 '

In [39]:
kommun_list[1]

'Aarhus'