In [1]:
# libraries 
import pandas as pd 
import os
import numpy as np
import time
import random
from csv import reader
import time

# plots:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure


# scraping
from bs4 import BeautifulSoup
from lxml import etree #for using XPATH with beautifulsoup
import requests
import numpy as np

#JSON
import json

# GeoJSON
# import geopandas as gpd

# regular expression
import re 

# concurrent futures - speeds up scraping by running requests concurrently in a pool of worker threads
# two main classes- 
#       1) executor class: manages all the threads and workload
#       2) futures class: creates a little instance and manages data coming back
import concurrent.futures

# libraries: 
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# The plan :


* Filter for properties that match the following criteria:
    * **housing type:** Villa (detached house) , Rækkehus (raðhús) , Ejerlejlighed (condo)
    * **Time:** 2012-2023
    * **Sale type:** normal sale (called "Almindeligt frit salg" on boliga.dk)
    * ***according to Boliga.dk the result is 1.028.612 properties*** 
    
* First, I scrape the API that returns the search results.
    * They don't seem to restrict the results, i.e. the properties amount to 1.028.612 with 50 properties per page, which means **25.573** pages that they allow searching in.
    
    * The parameters it takes are:
        * searchTab: 1
        * propertyType: 1,2,3
        * saleType: 1
        * salesDateMin: 2012
        * salesDateMax: 2023
        * sort: date-d
        * page: 2
        
    * Property types are encoded as numbers:
        * property type = 1 is **Villa**
        * property type = 2 is **Rækkehus**
        * property type = 3 is **Ejerlejlighed**
        * property type = 6 is **Landejendom** (farm house) 


# Setting up User agent and header 

In [2]:
# change user agent for each request randomly
def get_user_agent():
    """Pick one browser User-Agent string at random.

    Returns:
        str: one entry of the hard-coded pool below, drawn with the
        module-level ``random`` generator on every call.
    """
    # Fixed pool of User-Agent strings to rotate through.
    candidates = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.50',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
        'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57',
    )
    return random.choice(candidates)

def get_header():
    """Assemble the HTTP request headers for one API call.

    Returns:
        dict: a new header mapping on every call. Every field is a fixed
        string except 'user-agent', which is drawn via get_user_agent().
    """
    # Static, browser-like part of the header set.
    # NOTE(review): 'authority' and 'origin' name boligsiden.dk while
    # 'referer' names boliga.dk - confirm this mix is intended.
    # NOTE(review): the sec-ch-ua* hints always describe Edge 113 on Windows,
    # whatever user agent is drawn below - confirm this is acceptable.
    headers = {
        'authority': 'api.boligsiden.dk',
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'en-GB,en;q=0.9,en-US;q=0.8',
        'origin': 'https://www.boligsiden.dk',
        'referer': 'https://www.boliga.dk/',
        'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
    }

    # The rotating part goes last, so the key order stays as before.
    # No 'x-api-key' header is sent (it is disabled); if one is ever needed,
    # read it from the environment instead of writing it into the notebook.
    headers['user-agent'] = get_user_agent()
    return headers

# scrape page 

In [None]:
# def scrape_page(page):
#     global dataframes_list
    
#     print('-------------------------------------')
#     print(f'---- page number {page} of (510.670 boliger solgt')
#     print('-------------------------------------')

#     # Set up the header
#     ########################
#     main_header = get_header()

#     # Set up the url of the API
#     ###########################
#     main_url = f'https://api.boliga.dk/api/v2/sold/search/results?searchTab=1&propertyType=1,2,3,6&saleType=1&salesDateMin=2012&salesDateMax=2023&sort=date-d&page={page}'

#     # Make the HTTPS Request 
#     ########################
#     response = session.get(main_url, headers=main_header)

#     if response.status_code == 200:
        
#         print('--------------------- scraping !')

#         # Convert to JSON 
#         #################
#         page_result = response.json()['results']

#         # loop over each object and construct the dataframe
#         ####################################################
#         dataframes = [pd.DataFrame([prop]) for prop in page_result]

#         # append to the dataframes_list 
#         ###############################
#         dataframes_list.extend(dataframes)
#     else:
#         print()
#         print('############################################################')
#         print('------------------- BAD REQUEST, NOT 200 ------------------')
#         print('############################################################')
#         print()

    
    

In [5]:

def scrape_page(page, timeout=30):
    """Download one page of sold-property search results and store its rows.

    Each entry of the response's 'results' list becomes a one-row DataFrame
    that is appended to the module-level ``dataframes_list``. The request is
    sent through the module-level ``session`` with headers from get_header().

    Args:
        page: page number substituted into the ``page`` query parameter.
        timeout: seconds handed to ``session.get``. Without a timeout a
            stalled connection would block its worker thread indefinitely.

    Returns:
        None. A non-200 answer is reported on stdout and the page is skipped.

    Raises:
        requests.RequestException: the request itself failed (including a
            timeout); the failing page is printed before re-raising.
        ValueError, KeyError: the body was not JSON or had no 'results' key;
            the failing page is printed before re-raising.
    """
    print('-------------------------------------')
    print(f'---- page number {page} of 25.573 pages')
    print('-------------------------------------')

    # Set up the header
    ########################
    main_header = get_header()

    # Set up the url of the API
    ###########################
    # NOTE(review): the plan text in this notebook describes propertyType
    # 1,2,3 and the years 2012-2023, while this URL asks for 1,2,3,6 and
    # 2006-2011 - confirm which query is intended for this run.
    main_url = f'https://api.boliga.dk/api/v2/sold/search/results?searchTab=1&propertyType=1,2,3,6&saleType=1&salesDateMin=2006&salesDateMax=2011&sort=date-d&page={page}'

    try:
        # Make the HTTPS Request
        ########################
        response = session.get(main_url, headers=main_header, timeout=timeout)

        if response.status_code != 200:
            print()
            print('############################################################')
            print('------------------- BAD REQUEST, NOT 200 ------------------')
            print(f'---- page {page} answered with status {response.status_code}')
            print('############################################################')
            print()
            return

        print('--------------------- scraping !')

        # Convert to JSON
        #################
        page_result = response.json()['results']
    except (requests.RequestException, ValueError, KeyError) as exc:
        # Name the page before propagating: a caller that never inspects the
        # worker's result would otherwise lose this failure without a trace.
        print(f'---- page {page} failed: {exc!r}')
        raise

    # Build the page's one-row frames first, then add them in a single call.
    dataframes = [pd.DataFrame([prop]) for prop in page_result]
    dataframes_list.extend(dataframes)

    
    

10213.4

# Main function 

The total number of pages is 25.573.

In [None]:

###############################################
# Main loop:
# --------- request every one of the 25.573 result pages concurrently
###############################################

# Page count as stated in this notebook.
# NOTE(review): confirm it against the API for the query used in scrape_page.
TOTAL_PAGES = 25573
MAX_WORKERS = 10

# Filled by scrape_page with one-row DataFrames.
dataframes_list = []

# One shared session for all workers, configured before any request is sent.
session = requests.Session()

# Retry failed connection attempts up to 3 times. urllib3 derives the pause
# between attempts from backoff_factor; it is not a fixed 0.5 s delay.
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

# Pages are 1-based; the upper bound is inclusive so the last page is fetched.
page_numbers = list(range(1, TOTAL_PAGES + 1))

# Pages whose worker raised, kept so they can be inspected or re-run later.
failed_pages = []

with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    future_to_page = {
        executor.submit(scrape_page, page_number): page_number
        for page_number in page_numbers
    }

    # Ask every future for its result: an exception raised inside a worker is
    # only surfaced when the result is requested, otherwise it is lost.
    for future in concurrent.futures.as_completed(future_to_page):
        try:
            future.result()
        except Exception as exc:
            failed_pages.append(future_to_page[future])
            print(f'---- page {future_to_page[future]} raised {exc!r}')

print(f'---- finished: {len(failed_pages)} page(s) raised an exception (see failed_pages)')
    
    


In [None]:
# Number of one-row DataFrames collected so far (scrape_page adds one per
# property found in each page's 'results').
len(dataframes_list)

# ---- Save data as multiple chunks 

In [None]:
# Write the DataFrames collected in "dataframes_list" to disk in chunks, so
# that only one chunk has to be concatenated in memory at a time.

# Number of DataFrames combined into each CSV file
# (adjust based on your system's available memory)
chunk_size = 10000

# Index used in the first output file name. It starts at 81 rather than 1.
# NOTE(review): presumably so the files of an earlier run are not
# overwritten - confirm before re-running.
counter = 81

# Ceiling division: number of files this run writes, and the index of the last.
total_chunks = -(-len(dataframes_list) // chunk_size)
last_counter = counter + total_chunks - 1

for i in range(0, len(dataframes_list), chunk_size):
    print(f" -------- chunk {counter} out of {last_counter}")

    chunk = dataframes_list[i:i + chunk_size]
    combined_df = pd.concat(chunk, ignore_index=True)

    # Save each chunk to a CSV file with a unique name.
    # Raw f-string: the backslashes of the Windows path must stay literal
    # instead of being parsed as (invalid) escape sequences.
    csv_filename = rf'D:\Thesis\Properties\Denmark\RE_due_scraping_properties\Boliga_dk\Step_1_Scrape_Property_ID_Links\data\combined_data_chunk_{counter}.csv'
    combined_df.to_csv(csv_filename, index=False, encoding='utf-8')

    counter += 1
    
    