In [4]:
## FOR CODE TRIAL 
import requests
from bs4 import BeautifulSoup
import re
import json
import time
import threading
from collections import defaultdict
import pandas as pd
from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor, as_completed
from pathlib import Path
import sys
import numpy as np

# Function to scrape URLs
def scrape_urls(page_num):
    """Collect the listing URLs shown on one Immoweb search-results page.

    The URLs are also appended to ``full_list.txt`` in the current working
    directory, one per line.

    Args:
        page_num: Page number substituted into the ``page`` query parameter.

    Returns:
        list: The ``href`` of every ``<a class="card__title-link">`` element
        found on the page, in document order. Anchors without an ``href``
        are skipped.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    base_url = f"https://www.immoweb.be/en/search/house-and-apartment/for-sale?countries=BE&page={page_num}&orderBy=relevance"
    # A timeout keeps a stalled connection from blocking its thread forever.
    r = requests.get(base_url, timeout=30)
    soup = BeautifulSoup(r.content, "html.parser")

    urls = []
    for elem in soup.find_all("a", attrs={"class": "card__title-link"}):
        href = elem.get('href')
        # elem.get() returns None when the attribute is missing; skipping it
        # avoids a TypeError when the newline is appended below.
        if href:
            urls.append(href)

    # Save URLs to file - full_list.txt (local storage).
    # This function is run from several threads at once, so the page is
    # written with a single call to reduce interleaving between pages.
    with open("full_list.txt", "a") as f:
        f.write(''.join(url + '\n' for url in urls))
    return urls

def thread_scraping(num_pages=2):
    """Scrape listing URLs from the first ``num_pages`` search pages in parallel.

    One thread is started per page and each thread calls ``scrape_urls`` with
    its own page number. A short summary (URL count and elapsed time) is
    printed once every thread has finished.

    Args:
        num_pages: How many result pages to fetch, starting at page 1.
            Defaults to 2.

    Returns:
        list: All URLs returned by ``scrape_urls``, ordered by page number.
        A page whose thread failed contributes nothing.
    """
    pages = range(1, num_pages + 1)
    per_page = {}

    def _collect(page):
        # Each thread stores its result under its own page number.
        per_page[page] = scrape_urls(page)

    start_time = time.time()  # Start timer

    # The page number is passed through ``args`` so it is bound when the
    # thread is created. A lambda closing over the loop variable would read
    # it only when the thread runs, possibly after the loop has moved on.
    threads = [threading.Thread(target=_collect, args=(page,)) for page in pages]
    for t in threads:
        t.start()

    # Wait for all threads to complete
    for t in threads:
        t.join()

    execution_time = time.time() - start_time  # Stop timer

    # Flatten in page order so the result does not depend on thread timing.
    full_list_url = []
    for page in pages:
        full_list_url.extend(per_page.get(page, []))

    print("Scraping completed!")
    print("Total URLs scraped:", len(full_list_url))
    print("Total time:", execution_time, "seconds")
    return full_list_url

# NOTE(review): the return value of this call is discarded, and
# create_dataframe() below calls thread_scraping() again. The search pages are
# therefore fetched twice per run, and scrape_urls() appends to full_list.txt
# both times.
thread_scraping()


def reporting(str, i):
    """Write a one-line progress message to stdout.

    The message is ``str`` followed by a space, ``i`` formatted as an integer
    and a carriage return (no newline), so the next message written to the
    same terminal line overwrites it. Stdout is flushed immediately.
    """
    line = str + ' %d\r' % i
    out = sys.stdout
    out.write(line)
    out.flush()

# Module-level tally that counter() advances
counters = 1
def counter():
    """Advance the global ``counters`` tally by one.

    A value below 1 is reset to 1 instead of being incremented. The function
    returns None, so it can be called purely for its side effect.
    """
    global counters
    counters = 1 if counters < 1 else counters + 1

# Each entry: (output key, path of keys inside the parsed ``window.classified``
# object, value stored when that path cannot be followed). The order of the
# entries is the order of the keys in the returned dictionary.
_LISTING_FIELDS = (
    ('id', ('id',), 'UNKNOWN'),
    ('region', ('property', 'location', 'region'), 'UNKNOWN'),
    ('province', ('property', 'location', 'province'), 'UNKNOWN'),
    ('locality', ('property', 'location', 'locality'), 'UNKNOWN'),
    ('zip_code', ('property', 'location', 'postalCode'), 'UNKNOWN'),
    ('Longitude', ('property', 'location', 'longitude'), 'UNKNOWN'),
    ('Latitude', ('property', 'location', 'latitude'), 'UNKNOWN'),
    ('property_type', ('property', 'type'), 'UNKNOWN'),
    ('property_subtype', ('property', 'subtype'), 'UNKNOWN'),
    ('price', ('price', 'mainValue'), 'UNKNOWN'),
    ('number_rooms', ('property', 'bedroomCount'), 'UNKNOWN'),
    ('living_area', ('property', 'netHabitableSurface'), 'UNKNOWN'),
    ('kitchen', ('property', 'kitchen', 'type'), 0),
    ('furnished', ('transaction', 'sale', 'isFurnished'), 'UNKNOWN'),
    ('fireplace', ('property', 'fireplaceCount'), 0),
    ('terrace', ('property', 'hasTerrace'), 0),
    ('terrace_area', ('property', 'terraceSurface'), 0),
    ('garden', ('property', 'hasGarden'), 0),
    ('garden_area', ('property', 'gardenSurface'), 0),
    ('surface_land', ('property', 'land', 'surface'), "UNKNOWN"),
    ('number_facades', ('property', 'building', 'facadeCount'), "UNKNOWN"),
    ('swimming_pool', ('property', 'hasSwimmingPool'), 0),
    ('building_state', ('property', 'building', 'condition'), 'UNKNOWN'),
)


def _dig(data, path, default):
    """Follow ``path`` through nested containers, returning ``default`` on a miss."""
    node = data
    for key in path:
        try:
            node = node[key]
        except (KeyError, IndexError, TypeError):
            # Missing key, or an intermediate value (e.g. None) that cannot
            # be indexed.
            return default
    return node


def scrape_house(url):
    """Scrape the details of one house listing.

    The listing page embeds its data as a ``window.classified = {...}``
    JavaScript assignment inside the ``main-container`` div; that object is
    parsed as JSON and the fields of interest are copied out of it.

    Args:
        url: Address of the listing page.

    Returns:
        dict: ``'url'`` plus one entry per row of ``_LISTING_FIELDS``. A field
        whose path is absent from the page data holds that row's fallback
        (``'UNKNOWN'`` or ``0``); a field that is present is copied as is,
        including ``None``. An empty dict is returned when the page cannot be
        fetched or the embedded data cannot be located or parsed.
    """
    # Get the house listing and make a soup
    try:
        # A timeout keeps a stalled connection from blocking a worker forever.
        response = requests.get(url, timeout=30)
        house_page = BeautifulSoup(response.text, 'html.parser')
    except Exception:
        # Best effort: any fetch/parse failure yields an empty record.
        # Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return {}

    # Get the hidden info from the java script
    try:
        regex = r"window.classified = (\{.*\})" # Only captures what's between brackets
        script = house_page.find('div',attrs={"id":"main-container"}).script.text
        script = json.loads(re.findall(regex, script)[0])
    except Exception:
        # Container/script tag missing, no regex match, or invalid JSON.
        return {}

    final_dictionary = {'url': url}
    for column, path, fallback in _LISTING_FIELDS:
        final_dictionary[column] = _dig(script, path, fallback)
    return final_dictionary

def create_dataframe():
    """Scrape every listing page and collect the results in a DataFrame.

    The listing URLs come from ``thread_scraping()``. Each URL is handed to
    ``scrape_house`` on a pool of five worker threads, with a 0.2 s pause
    between submissions. The resulting DataFrame is written to the
    module-level ``csv_path`` (which must be defined before this function is
    called) and a timing summary is printed.

    Returns:
        pandas.DataFrame: One row per URL, in the order the URLs were
        returned by ``thread_scraping()``. Listings for which
        ``scrape_house`` returned an empty dict contribute a row without
        values.
    """
    # Fetch all URLs
    houses_links = thread_scraping()

    print("")
    print("Scraping individual pages...")
    start_time = time.time()  # Start timer

    # Scrape info from house pages concurrently
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = []
        # A local 1-based count replaces the global counter, which started at
        # 1 and was incremented before reporting, so the first page showed 2.
        for queued, url in enumerate(houses_links, start=1):
            futures.append(executor.submit(scrape_house, url))
            reporting("Individual pages scraped:", queued)
            time.sleep(.2)  # Throttle how fast requests are queued
        # Collect in submission order so rows line up with houses_links.
        results = [future.result() for future in futures]

    df = pd.DataFrame(results)

    # Export our dataset to a csv
    df.to_csv(csv_path, index = True)

    end_time = time.time()  # Stop timer
    execution_time = end_time - start_time

    print("Scraping completed!                        ")
    print("Total time spent scraping:", execution_time, "seconds")
    return df

# Reset the module-level progress tally before scraping
counters = 1

# Output locations, all anchored on the current working directory
cwd = Path.cwd()
output_folder = (cwd / 'data_output').resolve()
csv_filename = "house_apart_sale.csv"
csv_path = (output_folder / csv_filename).resolve()
url_path = (cwd / './full_list.txt').resolve()

# The output folder must exist before create_dataframe() writes the CSV
output_folder.mkdir(parents=True, exist_ok=True)

dataset = create_dataframe()
print("Original DataFrame:")
print(dataset)

# Show which region labels were scraped before they are replaced by codes
print("Unique values in 'region' column before recoding:")
print(dataset['region'].unique())

# Columns recoded as 1/0 flags
binary_columns = ['furnished', 'terrace', 'garden', 'swimming_pool']


def _flag_to_int(value):
    """Map a value whose upper-cased str() is 'TRUE'/'FALSE' to 1/0, anything else to None."""
    text = str(value).upper()
    if text == 'TRUE':
        return 1
    if text == 'FALSE':
        return 0
    return None


for column in binary_columns:
    dataset[column] = dataset[column].apply(_flag_to_int)

# Columns recoded through a label -> integer lookup
tria_columns = ['region']

# Integer code for each region label; labels missing from the lookup are
# not mapped to any code
region_mapping = {'Brussels': 1, 'Wallonie': 2, 'Flanders': 3, '': None}

for column in tria_columns:
    recoded = dataset[column].map(region_mapping)
    dataset[column] = recoded

# Write the recoded DataFrame next to the raw export
csv_output_path = output_folder.joinpath('recoded_dataset.csv')
dataset.to_csv(csv_output_path, index=False)

print("\nDataFrame with 'furnished' column recoded:")
print(dataset.head())

Scraping completed!
Total URLs scraped: 120
Total time: 1.8549697399139404 seconds
Scraping completed!
Total URLs scraped: 120
Total time: 1.329416036605835 seconds

Scraping individual pages...
Scraping completed!                        
Total time spent scraping: 28.6515212059021 seconds
Original DataFrame:
                                                   url        id    region  \
0    https://www.immoweb.be/en/classified/new-real-...   9466117  Flanders   
1    https://www.immoweb.be/en/classified/new-real-...  11155862  Flanders   
2    https://www.immoweb.be/en/classified/new-real-...  11155383  Flanders   
3    https://www.immoweb.be/en/classified/apartment...  11157252  Flanders   
4    https://www.immoweb.be/en/classified/new-real-...  11156389  Flanders   
..                                                 ...       ...       ...   
115  https://www.immoweb.be/en/classified/house/for...  11159289  Wallonie   
116  https://www.immoweb.be/en/classified/villa/for...  11159288 

In [None]:
# Columns recoded through a label -> integer lookup.
# NOTE(review): if 'region' was already recoded by the first cell, its values
# are no longer keys of region_mapping and mapping them again will not
# reproduce the codes - confirm this cell is only run on freshly scraped data.
tria_columns = ['region']

# Integer code for each region label
region_mapping = {'Brussels': 1, 'Wallonie': 2, 'Flanders': 3, '': None}

for column in tria_columns:
    recoded = dataset[column].map(region_mapping)
    dataset[column] = recoded

# Write the whole DataFrame to the output folder
csv_output_path = output_folder.joinpath('recoded_dataset.csv')
dataset.to_csv(csv_output_path, index=False)

# Show the first rows after recoding
print("\nDataFrame with 'region' column recoded to 1/2/3/None:")
print(dataset.head())


In [None]:
# Print unique values in the 'furnished' column before recoding
print("Unique values in 'furnished' column before recoding:")
print(dataset['furnished'].unique())

# Columns whose values are replaced below (operates on the global `dataset`)
binary_columns = ['furnished']

# Replace the exact strings 'TRUE'/'FALSE' with 1/0 and '' / ' ' with None.
# NOTE(review): scrape_house() stores script['transaction']['sale']['isFurnished']
# as parsed from JSON, or 'UNKNOWN' when it is missing - confirm the column
# really holds the strings 'TRUE'/'FALSE'; any other value is not a key of
# this replacement dict.
for column in binary_columns:
    dataset[column] = dataset[column].replace({'TRUE': 1, 'FALSE': 0, '': None, ' ': None})

# Save the entire DataFrame to a CSV file
csv_output_path = output_folder / 'recoded_dataset.csv'
dataset.to_csv(csv_output_path, index=False)

# Print unique values in the 'furnished' column after recoding
print("\nDataFrame with 'furnished' column recoded:")
print(dataset['furnished'].unique())

In [None]:
# replace the True False 
import pandas as pd
import numpy as np

# Columns that other cells of this notebook recode as 1/0 flags.
# This cell only defines the list; nothing here applies it to a DataFrame.
binary_columns = ['furnished', 'terrace', 'garden', 'swimming_pool']



In [None]:
## working code with ID 
## FOR CODE TRIAL 
import requests
from bs4 import BeautifulSoup
import re
import json
import time
import threading
from collections import defaultdict
import pandas as pd
from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor, as_completed
from pathlib import Path
import sys
import numpy as np

# Function to scrape URLs
def scrape_urls(page_num):
    """Collect the listing URLs shown on one Immoweb search-results page.

    The URLs are also appended to ``full_list.txt`` in the current working
    directory, one per line.

    Args:
        page_num: Page number substituted into the ``page`` query parameter.

    Returns:
        list: The ``href`` of every ``<a class="card__title-link">`` element
        found on the page, in document order. Anchors without an ``href``
        are skipped.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    base_url = f"https://www.immoweb.be/en/search/house-and-apartment/for-sale?countries=BE&page={page_num}&orderBy=relevance"
    # A timeout keeps a stalled connection from blocking its thread forever.
    r = requests.get(base_url, timeout=30)
    soup = BeautifulSoup(r.content, "html.parser")

    urls = []
    for elem in soup.find_all("a", attrs={"class": "card__title-link"}):
        href = elem.get('href')
        # elem.get() returns None when the attribute is missing; skipping it
        # avoids a TypeError when the newline is appended below.
        if href:
            urls.append(href)

    # Save URLs to file - full_list.txt (local storage).
    # This function is run from several threads at once, so the page is
    # written with a single call to reduce interleaving between pages.
    with open("full_list.txt", "a") as f:
        f.write(''.join(url + '\n' for url in urls))
    return urls

def thread_scraping(num_pages=2):
    """Scrape listing URLs from the first ``num_pages`` search pages in parallel.

    One thread is started per page and each thread calls ``scrape_urls`` with
    its own page number. A short summary (URL count and elapsed time) is
    printed once every thread has finished.

    Args:
        num_pages: How many result pages to fetch, starting at page 1.
            Defaults to 2.

    Returns:
        list: All URLs returned by ``scrape_urls``, ordered by page number.
        A page whose thread failed contributes nothing.
    """
    pages = range(1, num_pages + 1)
    per_page = {}

    def _collect(page):
        # Each thread stores its result under its own page number.
        per_page[page] = scrape_urls(page)

    start_time = time.time()  # Start timer

    # The page number is passed through ``args`` so it is bound when the
    # thread is created. A lambda closing over the loop variable would read
    # it only when the thread runs, possibly after the loop has moved on.
    threads = [threading.Thread(target=_collect, args=(page,)) for page in pages]
    for t in threads:
        t.start()

    # Wait for all threads to complete
    for t in threads:
        t.join()

    execution_time = time.time() - start_time  # Stop timer

    # Flatten in page order so the result does not depend on thread timing.
    full_list_url = []
    for page in pages:
        full_list_url.extend(per_page.get(page, []))

    print("Scraping completed!")
    print("Total URLs scraped:", len(full_list_url))
    print("Total time:", execution_time, "seconds")
    return full_list_url

# NOTE(review): the return value of this call is discarded, and
# create_dataframe() below calls thread_scraping() again. The search pages are
# therefore fetched twice per run, and scrape_urls() appends to full_list.txt
# both times.
thread_scraping()


def reporting(str, i):
    """Write a one-line progress message to stdout.

    The message is ``str`` followed by a space, ``i`` formatted as an integer
    and a carriage return (no newline), so the next message written to the
    same terminal line overwrites it. Stdout is flushed immediately.
    """
    line = str + ' %d\r' % i
    out = sys.stdout
    out.write(line)
    out.flush()

# Module-level tally that counter() advances
counters = 1
def counter():
    """Advance the global ``counters`` tally by one.

    A value below 1 is reset to 1 instead of being incremented. The function
    returns None, so it can be called purely for its side effect.
    """
    global counters
    counters = 1 if counters < 1 else counters + 1

# Each entry: (output key, path of keys inside the parsed ``window.classified``
# object, value stored when that path cannot be followed). The order of the
# entries is the order of the keys in the returned dictionary.
_LISTING_FIELDS = (
    ('id', ('id',), 'UNKNOWN'),
    ('region', ('property', 'location', 'region'), 'UNKNOWN'),
    ('province', ('property', 'location', 'province'), 'UNKNOWN'),
    ('locality', ('property', 'location', 'locality'), 'UNKNOWN'),
    ('zip_code', ('property', 'location', 'postalCode'), 'UNKNOWN'),
    ('Longitude', ('property', 'location', 'longitude'), 'UNKNOWN'),
    ('Latitude', ('property', 'location', 'latitude'), 'UNKNOWN'),
    ('property_type', ('property', 'type'), 'UNKNOWN'),
    ('property_subtype', ('property', 'subtype'), 'UNKNOWN'),
    ('price', ('price', 'mainValue'), 'UNKNOWN'),
    ('number_rooms', ('property', 'bedroomCount'), 'UNKNOWN'),
    ('living_area', ('property', 'netHabitableSurface'), 'UNKNOWN'),
    ('kitchen', ('property', 'kitchen', 'type'), 0),
    ('furnished', ('transaction', 'sale', 'isFurnished'), 'UNKNOWN'),
    ('fireplace', ('property', 'fireplaceCount'), 0),
    ('terrace', ('property', 'hasTerrace'), 0),
    ('terrace_area', ('property', 'terraceSurface'), 0),
    ('garden', ('property', 'hasGarden'), 0),
    ('garden_area', ('property', 'gardenSurface'), 0),
    ('surface_land', ('property', 'land', 'surface'), "UNKNOWN"),
    ('number_facades', ('property', 'building', 'facadeCount'), "UNKNOWN"),
    ('swimming_pool', ('property', 'hasSwimmingPool'), 0),
    ('building_state', ('property', 'building', 'condition'), 'UNKNOWN'),
)


def _dig(data, path, default):
    """Follow ``path`` through nested containers, returning ``default`` on a miss."""
    node = data
    for key in path:
        try:
            node = node[key]
        except (KeyError, IndexError, TypeError):
            # Missing key, or an intermediate value (e.g. None) that cannot
            # be indexed.
            return default
    return node


def scrape_house(url):
    """Scrape the details of one house listing.

    The listing page embeds its data as a ``window.classified = {...}``
    JavaScript assignment inside the ``main-container`` div; that object is
    parsed as JSON and the fields of interest are copied out of it.

    Args:
        url: Address of the listing page.

    Returns:
        dict: ``'url'`` plus one entry per row of ``_LISTING_FIELDS``. A field
        whose path is absent from the page data holds that row's fallback
        (``'UNKNOWN'`` or ``0``); a field that is present is copied as is,
        including ``None``. An empty dict is returned when the page cannot be
        fetched or the embedded data cannot be located or parsed.
    """
    # Get the house listing and make a soup
    try:
        # A timeout keeps a stalled connection from blocking a worker forever.
        response = requests.get(url, timeout=30)
        house_page = BeautifulSoup(response.text, 'html.parser')
    except Exception:
        # Best effort: any fetch/parse failure yields an empty record.
        # Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return {}

    # Get the hidden info from the java script
    try:
        regex = r"window.classified = (\{.*\})" # Only captures what's between brackets
        script = house_page.find('div',attrs={"id":"main-container"}).script.text
        script = json.loads(re.findall(regex, script)[0])
    except Exception:
        # Container/script tag missing, no regex match, or invalid JSON.
        return {}

    final_dictionary = {'url': url}
    for column, path, fallback in _LISTING_FIELDS:
        final_dictionary[column] = _dig(script, path, fallback)
    return final_dictionary

def create_dataframe():
    """Scrape every listing page and collect the results in a DataFrame.

    The listing URLs come from ``thread_scraping()``. Each URL is handed to
    ``scrape_house`` on a pool of five worker threads, with a 0.2 s pause
    between submissions. The resulting DataFrame is written to the
    module-level ``csv_path`` (which must be defined before this function is
    called) and a timing summary is printed.

    Returns:
        pandas.DataFrame: One row per URL, in the order the URLs were
        returned by ``thread_scraping()``. Listings for which
        ``scrape_house`` returned an empty dict contribute a row without
        values.
    """
    # Fetch all URLs
    houses_links = thread_scraping()

    print("")
    print("Scraping individual pages...")
    start_time = time.time()  # Start timer

    # Scrape info from house pages concurrently
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = []
        # A local 1-based count replaces the global counter, which started at
        # 1 and was incremented before reporting, so the first page showed 2.
        for queued, url in enumerate(houses_links, start=1):
            futures.append(executor.submit(scrape_house, url))
            reporting("Individual pages scraped:", queued)
            time.sleep(.2)  # Throttle how fast requests are queued
        # Collect in submission order so rows line up with houses_links.
        results = [future.result() for future in futures]

    df = pd.DataFrame(results)

    # Export our dataset to a csv
    df.to_csv(csv_path, index = True)

    end_time = time.time()  # Stop timer
    execution_time = end_time - start_time

    print("Scraping completed!                        ")
    print("Total time spent scraping:", execution_time, "seconds")
    return df

# Reset the module-level progress tally before scraping
counters = 1

# Output locations, all anchored on the current working directory
cwd = Path.cwd()
output_folder = (cwd / 'data_output').resolve()
csv_filename = "house_apart_sale.csv"
csv_path = (output_folder / csv_filename).resolve()
url_path = (cwd / './full_list.txt').resolve()

# The output folder must exist before create_dataframe() writes the CSV
output_folder.mkdir(parents=True, exist_ok=True)

dataset = create_dataframe()
print(dataset)

# List the columns that were scraped
column_names = dataset.columns
print(column_names)

# CSV written by create_dataframe(), given relative to the working directory
csv_file_path = './data_output/house_apart_sale.csv'

# Load the exported data back into a fresh DataFrame
df = pd.read_csv(csv_file_path)

# Columns recoded as 1/0 flags
binary_columns = ['furnished']

# Upper-cased, stripped text -> integer code; text missing from the lookup is
# not mapped to any code
flag_codes = {'TRUE': 1, 'FALSE': 0, '': None}
for column in binary_columns:
    normalised = df[column].astype(str).str.upper().str.strip()
    df[column] = normalised.map(flag_codes)

# Show the first rows after recoding
print(df.head())