This script uses multithreading to scrape house-listing URLs from Immoweb and saves them to a local file. It then scrapes the details of each listing concurrently, builds a DataFrame from the results, and exports it to a CSV file, producing a dataset of real estate listings with their attributes.

In [14]:
import requests
from bs4 import BeautifulSoup
import re
import json
import time
import threading
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import sys

# Function to scrape URLs from Immoweb
def scrape_urls(page_num, timeout=30):
    """
    Scrape URLs from Immoweb for house listings.

    Besides returning the URLs, this appends them (one per line) to
    "full_list.txt" in the current working directory.

    Args:
    page_num (int): Page number to scrape.
    timeout (float): Seconds to wait for the server before giving up.

    Returns:
    list: List of URLs scraped from the page.

    Raises:
    requests.RequestException: If the search page cannot be fetched in time.
    """
    base_url = f"https://www.immoweb.be/en/search/house/for-sale?countries=BE&page={page_num}&orderBy=relevance"
    # Without a timeout a stalled connection would block the calling thread forever.
    r = requests.get(base_url, timeout=timeout)
    soup = BeautifulSoup(r.content, "html.parser")

    # An anchor without an href yields None; skip those so that the file
    # write below and later page requests only ever see real URL strings.
    urls = []
    for elem in soup.find_all("a", attrs={"class": "card__title-link"}):
        href = elem.get('href')
        if href:
            urls.append(href)

    # Append URLs to a local file for future reference
    with open("full_list.txt", "a") as f:
        f.writelines(url + '\n' for url in urls)
    return urls

# Function to perform concurrent scraping of URLs
def thread_scraping():
    """
    Perform scraping of URLs using multiple threads.

    One thread is started per search result page; each thread appends the
    URLs it found to a shared list.

    Returns:
    list: All URLs scraped from the pages (order depends on which thread
    finishes first).
    """
    full_list_url = []
    num_pages = 2
    threads = []
    start_time = time.time()

    def collect(page_num):
        # Runs inside a worker thread: scrape one page and store its URLs.
        full_list_url.extend(scrape_urls(page_num))

    # Create and start threads for scraping each page
    for i in range(1, num_pages + 1):
        # Pass the page number through args so it is bound when the thread is
        # created. A lambda reading the loop variable would look it up only
        # when it runs and could scrape the wrong (or the same) page.
        t = threading.Thread(target=collect, args=(i,))
        threads.append(t)
        t.start()

    # Wait for all threads to complete
    for t in threads:
        t.join()

    end_time = time.time()
    execution_time = end_time - start_time

    # Display scraping completion message and execution time
    print("Scraping completed!")
    print("Total URLs scraped:", len(full_list_url))
    print("Total time:", execution_time, "seconds")
    return full_list_url

# Function to report progress during scraping
def reporting(str, i):
    """
    Write a one-line progress update to stdout.

    The line ends with a carriage return instead of a newline, so the cursor
    goes back to the start of the line and the next write overwrites it.

    Args:
    str (str): Label printed in front of the counter.
    i (int): Counter value shown after the label.
    """
    # NOTE: the parameter name shadows the builtin `str`; it is kept because
    # it is part of the function's signature.
    progress_line = str + ' %d\r' % i
    out = sys.stdout
    out.write(progress_line)
    # Flush right away so the update is visible even without a newline.
    out.flush()

# Function to create a global counter for use in list comprehension
def counter():
    """
    Advance the module-level ``counters`` value by one step.

    A value below 1 is reset to 1; any other value is incremented by 1.
    The global ``counters`` must already exist when this is called.
    """
    global counters
    counters = 1 if counters < 1 else counters + 1

# Function to scrape detailed information from individual house pages
def scrape_house(url, timeout=30):
    """
    Scrapes all the info from a house listing.

    The listing data is read from the ``window.classified = {...}`` JSON
    assignment found in the first <script> inside the "main-container" div.

    Args:
    url (str): URL of the house listing.
    timeout (float): Seconds to wait for the server before giving up.

    Returns:
    dict: Information scraped from the house listing, or an empty dict when
    the page cannot be fetched or its embedded JSON cannot be extracted.
    """
    # Fetch: any network-level failure (including a timeout) skips this listing.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return {}

    # Parse: the expected failures are a missing container/script tag
    # (AttributeError), no regex match (IndexError) and malformed JSON
    # (json.JSONDecodeError, a ValueError subclass).
    try:
        house_page = BeautifulSoup(response.text, 'html.parser')
        regex = r"window.classified = (\{.*\})"
        script = house_page.find('div', attrs={"id": "main-container"}).script.text
        script = json.loads(re.findall(regex, script)[0])
    except (AttributeError, IndexError, TypeError, ValueError):
        return {}

    # Extract relevant information from the script
    final_dictionary = {'url': url}

    try:
        final_dictionary['region'] = script['property']['location']['region']
    except (KeyError, TypeError):
        # Key missing, or an intermediate value is not a mapping (e.g. null).
        final_dictionary['region'] = 'UNKNOWN'

    # Extract other key-value pairs similarly...

    return final_dictionary

# Function to create a DataFrame from scraped data
def create_dataframe():
    """
    Scrapes info from house pages and creates a pandas DataFrame.

    Collects the listing URLs with thread_scraping(), scrapes each listing
    with scrape_house() on a pool of 5 worker threads, writes the table in
    CSV format to a file named "data_output" in the current working
    directory and returns it.

    Returns:
    pandas.DataFrame: One row per listing URL, in the order the URLs were
    returned by thread_scraping(). A listing for which scrape_house()
    returned an empty dict contributes a row without values.
    """
    houses_links = thread_scraping()

    print("")
    print("Scraping individual pages...")
    start_time = time.time()

    # Scraping info from individual house pages concurrently
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = []
        for submitted, url in enumerate(houses_links, start=1):
            futures.append(executor.submit(scrape_house, url))
            # Count from the loop itself so the reported figure matches the
            # number of pages handed to the pool (no off-by-one).
            reporting("Individual pages scraped:", submitted)
            # Short pause between submissions; presumably there to avoid
            # hammering the site.
            time.sleep(.2)
        results = [future.result() for future in futures]

    if futures:
        # reporting() ends its line with '\r'; move to a fresh line so the
        # next message does not overwrite the progress display.
        print("")

    df = pd.DataFrame(results)

    # Define path for data export. A plain file name joined with pathlib works
    # on every platform; the former r'.\data_output' literal only resolved to
    # this location on Windows.
    csv_path = Path.cwd() / "data_output"

    # Export DataFrame to CSV file
    df.to_csv(csv_path, index=True)

    end_time = time.time()
    execution_time = end_time - start_time

    # Display completion message and execution time
    print("Scraping completed!")
    print("Total time spent scraping:", execution_time, "seconds")
    return df

# Initialize counter for the counter function.
# counter() bumps the value before it is read, so start at 0 for the first
# call to yield 1 (starting at 1 made the progress figure one too high).
counters = 0

# Call function to create DataFrame from scraped data
dataset = create_dataframe()
print(dataset)

# Display column names of the DataFrame
column_names = dataset.columns
print(column_names)


Scraping completed!
Total URLs scraped: 120
Total time: 1.099456548690796 seconds

Scraping individual pages...
Scraping completed!raped: 121
Total time spent scraping: 25.06033444404602 seconds
                                                   url    region
0    https://www.immoweb.be/en/classified/house/for...  Wallonie
1    https://www.immoweb.be/en/classified/house/for...  Wallonie
2    https://www.immoweb.be/en/classified/house/for...  Wallonie
3    https://www.immoweb.be/en/classified/new-real-...  Flanders
4    https://www.immoweb.be/en/classified/house/for...  Flanders
..                                                 ...       ...
115  https://www.immoweb.be/en/classified/house/for...  Flanders
116  https://www.immoweb.be/en/classified/house/for...  Wallonie
117  https://www.immoweb.be/en/classified/flat-stud...  Brussels
118  https://www.immoweb.be/en/classified/apartment...  Flanders
119  https://www.immoweb.be/en/classified/house/for...  Flanders

[120 rows x 2 columns]
I

Here's the part of the code that implements multithreading:

In [13]:
# Function to perform concurrent scraping of URLs
def thread_scraping():
    """
    Perform scraping of URLs using multiple threads.

    One thread is started per search result page; each thread appends the
    URLs it found to a shared list.

    Returns:
    list: All URLs scraped from the pages (order depends on which thread
    finishes first).
    """
    full_list_url = []  # Initialize an empty list to store all scraped URLs
    num_pages = 2  # Define the number of pages to scrape
    threads = []  # Initialize a list to store thread objects
    start_time = time.time()  # Record the start time of the scraping process

    def collect(page_num):
        # Runs inside a worker thread: scrape one page and store its URLs.
        full_list_url.extend(scrape_urls(page_num))

    # Create and start threads for scraping each page
    for i in range(1, num_pages + 1):
        # Pass the page number through args so it is bound when the thread is
        # created. A lambda reading the loop variable would look it up only
        # when it runs and could scrape the wrong (or the same) page.
        t = threading.Thread(target=collect, args=(i,))
        threads.append(t)  # Append the thread object to the list of threads
        t.start()  # Start the thread

    # Wait for all threads to complete
    for t in threads:
        t.join()  # Wait for each thread to finish its task

    end_time = time.time()  # Record the end time of the scraping process
    execution_time = end_time - start_time  # Calculate the total execution time

    # Display scraping completion message and execution time
    print("Scraping completed!")
    print("Total URLs scraped:", len(full_list_url))
    print("Total time:", execution_time, "seconds")

    return full_list_url  # Return the list of all scraped URLs


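This refactored code uses ThreadPoolExecutor both for scraping the listing URLs and for scraping the detailed information from the individual house pages, so both steps run concurrently on threads while collecting data from the Immoweb website.
The ThreadPoolExecutor and threading.Thread classes in Python both provide ways to run tasks concurrently using threads. However, ThreadPoolExecutor is often the more efficient and convenient choice: it reuses a fixed pool of worker threads instead of creating one thread per task, and it returns results directly through futures or map(). Note that in the runs shown here, most of the speed-up in the second step comes from dropping the 0.2-second pause between submissions (120 pages × 0.2 s ≈ 24 s), not from the executor itself.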

In [15]:
import requests
from bs4 import BeautifulSoup
import re
import json
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import sys

def scrape_urls(page_num, timeout=30):
    """
    Scrape the listing URLs from one Immoweb search result page.

    Besides returning the URLs, this appends them (one per line) to
    "full_list.txt" in the current working directory.

    Args:
    page_num (int): Page number to scrape.
    timeout (float): Seconds to wait for the server before giving up.

    Returns:
    list: List of URLs scraped from the page.

    Raises:
    requests.RequestException: If the search page cannot be fetched in time.
    """
    base_url = f"https://www.immoweb.be/en/search/house/for-sale?countries=BE&page={page_num}&orderBy=relevance"
    # Without a timeout a stalled connection would block a pool worker forever.
    r = requests.get(base_url, timeout=timeout)
    soup = BeautifulSoup(r.content, "html.parser")

    # An anchor without an href yields None; drop those so that the file
    # write below and later page requests only ever see real URL strings.
    hrefs = (elem.get('href') for elem in soup.find_all("a", attrs={"class": "card__title-link"}))
    urls = [href for href in hrefs if href]

    with open("full_list.txt", "a") as f:
        f.writelines(url + '\n' for url in urls)
    return urls

def thread_scraping():
    """
    Collect listing URLs from the search result pages using a thread pool.

    Each page number is handed to scrape_urls() on a pool worker and the
    per-page lists are concatenated in page order.

    Returns:
    list: URLs gathered from all pages.
    """
    num_pages = 2
    page_numbers = range(1, num_pages + 1)
    started = time.time()

    with ThreadPoolExecutor() as pool:
        # map() yields the per-page lists in the order of the page numbers.
        full_list_url = [
            url
            for page_urls in pool.map(scrape_urls, page_numbers)
            for url in page_urls
        ]

    elapsed = time.time() - started

    print("Scraping completed!")
    print("Total URLs scraped:", len(full_list_url))
    print("Total time:", elapsed, "seconds")
    return full_list_url

def scrape_house(url, timeout=30):
    """
    Scrape the information of a single house listing.

    The listing data is read from the ``window.classified = {...}`` JSON
    assignment found in the first <script> inside the "main-container" div.

    Args:
    url (str): URL of the house listing.
    timeout (float): Seconds to wait for the server before giving up.

    Returns:
    dict: Information scraped from the house listing, or an empty dict when
    the page cannot be fetched or its embedded JSON cannot be extracted.
    """
    # Fetch: any network-level failure (including a timeout) skips this listing.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return {}

    # Parse: the expected failures are a missing container/script tag
    # (AttributeError), no regex match (IndexError) and malformed JSON
    # (json.JSONDecodeError, a ValueError subclass).
    try:
        house_page = BeautifulSoup(response.text, 'html.parser')
        regex = r"window.classified = (\{.*\})"
        script = house_page.find('div', attrs={"id": "main-container"}).script.text
        script = json.loads(re.findall(regex, script)[0])
    except (AttributeError, IndexError, TypeError, ValueError):
        return {}

    final_dictionary = {'url': url}

    try:
        final_dictionary['region'] = script['property']['location']['region']
    except (KeyError, TypeError):
        # Key missing, or an intermediate value is not a mapping (e.g. null).
        final_dictionary['region'] = 'UNKNOWN'

    # Extract other key-value pairs similarly...

    return final_dictionary

def create_dataframe():
    """
    Scrape every listing and return the results as a pandas DataFrame.

    Collects the listing URLs with thread_scraping(), scrapes each listing
    with scrape_house() on a thread pool, writes the table in CSV format to a
    file named "data_output" in the current working directory and returns it.

    Returns:
    pandas.DataFrame: One row per listing URL, in the order the URLs were
    returned by thread_scraping(). A listing for which scrape_house()
    returned an empty dict contributes a row without values.
    """
    houses_links = thread_scraping()

    print("")
    print("Scraping individual pages...")
    start_time = time.time()

    with ThreadPoolExecutor() as executor:
        # map() is lazy; collect the results into a list while the pool is
        # still open instead of handing the iterator to pandas.
        results = list(executor.map(scrape_house, houses_links))
    df = pd.DataFrame(results)

    # A plain file name joined with pathlib works on every platform; the
    # former r'.\data_output' literal only resolved to this location on Windows.
    csv_path = Path.cwd() / "data_output"

    df.to_csv(csv_path, index=True)

    end_time = time.time()
    execution_time = end_time - start_time

    print("Scraping completed!")
    print("Total time spent scraping:", execution_time, "seconds")
    return df

# NOTE(review): no function defined in this cell reads `counters`; it looks
# like a leftover from the version that tracked progress with counter().
# Confirm nothing else depends on it before removing.
counters = 1
# Run the whole pipeline: collect URLs, scrape each listing, export the CSV.
dataset = create_dataframe()
print(dataset)
# Show which attributes ended up as columns of the DataFrame.
column_names = dataset.columns
print(column_names)


Scraping completed!
Total URLs scraped: 120
Total time: 1.6385235786437988 seconds

Scraping individual pages...
Scraping completed!
Total time spent scraping: 12.10824179649353 seconds
                                                   url    region
0    https://www.immoweb.be/en/classified/house/for...  Wallonie
1    https://www.immoweb.be/en/classified/house/for...  Wallonie
2    https://www.immoweb.be/en/classified/house/for...  Wallonie
3    https://www.immoweb.be/en/classified/new-real-...  Flanders
4    https://www.immoweb.be/en/classified/house/for...  Flanders
..                                                 ...       ...
115  https://www.immoweb.be/en/classified/apartment...  Flanders
116  https://www.immoweb.be/en/classified/apartment...  Brussels
117  https://www.immoweb.be/en/classified/apartment...  Brussels
118  https://www.immoweb.be/en/classified/new-real-...  Flanders
119  https://www.immoweb.be/en/classified/house/for...  Wallonie

[120 rows x 2 columns]
Index(['ur