## Scraping with iframe

This section scrapes the first page of the AI4Belgium website based on the content inside the `<iframe>` tag.


In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def create_driver(url, headless=True):
    """Start a Chrome WebDriver and load ``url`` in it.

    Args:
        url: Full URL of the page to open.
        headless: When true, Chrome is started with ``--headless=new``.

    Returns:
        The WebDriver instance with ``url`` loaded. The caller owns it and is
        responsible for shutting it down.

    Raises:
        Exception: Whatever ``driver.get`` raises is re-raised after the
            browser has been shut down.
    """
    print("Setting up WebDriver...")
    # ChromeDriverManager resolves the ChromeDriver binary path for us.
    service = Service(ChromeDriverManager().install())

    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless=new")

    driver = webdriver.Chrome(service=service, options=chrome_options)

    print(f"Navigating to {url}")
    try:
        driver.get(url)
    except Exception:
        # The caller never receives the driver on failure, so it has to be
        # shut down here or the browser process would be left running.
        driver.quit()
        raise
    return driver

def click_cookie(driver, cookie_xpath, wait_seconds=5):
    """Best-effort click on the cookie banner button located by ``cookie_xpath``.

    Args:
        driver: WebDriver whose current page may show the banner.
        cookie_xpath: XPath of the button to click.
        wait_seconds: Seconds to pause before looking for the button, giving
            the banner time to load.

    Returns:
        None. A message is printed telling whether the button was clicked.
    """
    # Fixed pause so the GDPR banner has time to appear before we look for it.
    time.sleep(wait_seconds)
    try:
        driver.find_element(By.XPATH, cookie_xpath).click()
        print("Optional cookies rejected")
    except Exception:
        # A missing or unclickable banner is not fatal. ``Exception`` (rather
        # than a bare ``except``) lets Ctrl-C / SystemExit still propagate.
        print("No GDPR cookie banner found! Continue ...")

def _parse_company_item(company):
    """Turn one company card element into a record dict, or None to skip it."""
    url = None
    try:
        link_element = company.find_element(By.TAG_NAME, "a")
        # Extract the URL (href)
        url = link_element.get_attribute("href")
        text_header = company.find_element(By.TAG_NAME, "h2")
        name = text_header.find_element(By.TAG_NAME, "a").text
    except Exception as e:
        # Report what was collected before the failure so the card can be identified.
        print(f"Warning: Could not fully extract name and url (url found so far: {url}). Error: {e}")
        return None

    try:
        # The logo is the image inside the card's link.
        image_src = link_element.find_element(By.TAG_NAME, "img").get_attribute("src")
    except Exception:
        # NOTE(review): a card without a logo is dropped entirely, as before.
        # Confirm whether it should instead be kept with logo=None.
        print("Warning: Could not fully extract data from image.")
        return None

    try:
        headers = company.find_elements(By.TAG_NAME, "h4")
        header_info = company.find_elements(By.TAG_NAME, "span")
    except Exception as e:
        print(f"Warning: Could not fully extract data from headers. Error: {e}")
        return None

    company_data = {"name": name, "url": url, "logo": image_src}
    # The first <span> is skipped; the remaining spans are paired in order with
    # the <h4> labels (presumably the first span is not a detail value — TODO confirm).
    for header, info in zip(headers, header_info[1:]):
        company_data[header.text.lower()] = info.text
    return company_data


def extract_company_data(root_url, company_item_css, iframe_css=None):
    """Scrape the company cards shown on ``root_url``.

    Args:
        root_url: Page to open in a fresh browser.
        company_item_css: CSS selector of a company card. A trailing
            `` > div:nth-child(1)`` is widened to `` > div `` so that every
            card is matched instead of only the first one.
        iframe_css: CSS selector of the iframe holding the cards, or None when
            the cards are directly in the page.

    Returns:
        A list with one dict per company: ``name``, ``url``, ``logo`` plus one
        entry per lower-cased ``<h4>`` label found in the card. Any error
        during scraping is printed and the records collected so far are
        returned.
    """
    driver = create_driver(root_url)
    dataset = []
    try:
        cookie_xpath = '//*[@id="fedconsent"]/div[1]/div/div/div/div/ul/li[3]/button'
        click_cookie(driver, cookie_xpath)
        wait = WebDriverWait(driver, 10)  # Use a generous wait time

        if iframe_css is not None:
            # Wait for the iframe and switch the driver context into it.
            print("Attempting to locate and switch to the iframe...")
            wait.until(
                EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, iframe_css))
            )
            print("✅ Successfully switched to the iframe! WARNING: Script is build to only scrape first page!")

        item_selector = company_item_css.replace(" > div:nth-child(1)", " > div ")
        company_items = driver.find_elements(By.CSS_SELECTOR, item_selector)
        print(f"✅ Found the company item element(s)! Count: {len(company_items)}")

        for company in company_items:
            record = _parse_company_item(company)
            if record is not None:
                dataset.append(record)
    except Exception as e:
        print(f"❌ Scraping failed after switching to iframe. Error: {e}")
    finally:
        try:
            if iframe_css is not None:
                # Switch back to the main document context
                driver.switch_to.default_content()
                print("Switched back to default content.")
        finally:
            # Always end the browser session, with or without an iframe;
            # otherwise every call leaves a Chrome process behind.
            driver.quit()
    return dataset



def save_list_dict_to_csv(dataset, output_file_path):
    """Write a list of record dicts to a CSV file and return it as a DataFrame.

    Args:
        dataset: List of dicts, one per row.
        output_file_path: Destination path of the CSV file.

    Returns:
        The DataFrame built from ``dataset``.
    """
    frame = pd.DataFrame(data=dataset)
    # The row index is not written to the file.
    frame.to_csv(path_or_buf=output_file_path, index=False)
    return frame


In [2]:
# Page that embeds the company overview, and the selectors used to reach the cards.
root_url = "https://bosa.belgium.be/nl/AI4Belgium/observatorium"
company_item_css = "div.grid.grid-cols-4.gap-3.mb-2 > div:nth-child(1)"
iframe_css = "#theme-content > div.field.field--name-field-blocks.field--type-entity-reference-revisions.field--label-hidden > ul > li:nth-child(4) > div > div > div > div > p:nth-child(4) > div > iframe"

# Scrape the cards inside the iframe and store them as CSV.
dataset = extract_company_data(
    root_url=root_url,
    company_item_css=company_item_css,
    iframe_css=iframe_css,
)
df = save_list_dict_to_csv(
    dataset=dataset,
    output_file_path="../data/first_page_iframe.csv",
)

Setting up WebDriver...
Navigating to https://bosa.belgium.be/nl/AI4Belgium/observatorium
Optional cookies rejected
Attempting to locate and switch to the iframe...
✅ Found the company item element(s)! Count: 25
Switched back to default content.


## Scraping from underlying HTML pages
In the previous section we noticed that the startup information can be scraped directly from the underlying URL:

`https://community.ai4belgium.be/en/ai-landscape?nav=0&page={i}`

and loop through the pages.

In [None]:
def scrape_pages_ai4belgium(pages=18, output_file_path="../data/raw_scraped_dataset.csv"):
    """Scrape pages 1..``pages`` of the AI landscape listing into one CSV.

    Args:
        pages: Number of listing pages to visit, starting at page 1.
        output_file_path: Where the combined CSV is written.

    Returns:
        A DataFrame with one row per scraped company across all pages.
    """
    company_item_css = "div.grid.grid-cols-4.gap-3.mb-2 > div:nth-child(1)"
    # Collect plain records and build the frame once at the end; concatenating
    # a growing DataFrame on every page re-copies all earlier rows each time.
    records = []
    for page_num in range(1, pages + 1):
        suburl = f"https://community.ai4belgium.be/en/ai-landscape?nav=0&page={page_num}"
        print(suburl)
        records.extend(extract_company_data(suburl, company_item_css))

    combined_data = pd.DataFrame(records)
    combined_data.to_csv(output_file_path, index=False)
    return combined_data

# Run the crawl with the default page count; the combined result is written to
# ../data/raw_scraped_dataset.csv and a new browser is started for every page.
scrape_pages_ai4belgium()

https://community.ai4belgium.be/en/ai-landscape?nav=0&page=1
Setting up WebDriver...
Navigating to https://community.ai4belgium.be/en/ai-landscape?nav=0&page=1
No GDPR cookie banner found! Continue ...
Attempting to locate and switch to the iframe...
✅ Found the company item element(s)! Count: 25
https://community.ai4belgium.be/en/ai-landscape?nav=0&page=2
Setting up WebDriver...
Navigating to https://community.ai4belgium.be/en/ai-landscape?nav=0&page=2
No GDPR cookie banner found! Continue ...
Attempting to locate and switch to the iframe...
✅ Found the company item element(s)! Count: 25
https://community.ai4belgium.be/en/ai-landscape?nav=0&page=3
Setting up WebDriver...
Navigating to https://community.ai4belgium.be/en/ai-landscape?nav=0&page=3
No GDPR cookie banner found! Continue ...
Attempting to locate and switch to the iframe...
✅ Found the company item element(s)! Count: 25
https://community.ai4belgium.be/en/ai-landscape?nav=0&page=4
Setting up WebDriver...
Navigating to https:/

## Scraping company address

1. Using the https://kbopub.economie.fgov.be/ website that contains information about registered entities.
2. Google search for those companies that don't have their legal entity registered

If the company is not listed in the Belgian registry and their website is not active, we will exclude it from the dataset. 

In [21]:
import requests
from lxml import etree
from bs4 import BeautifulSoup

def extract_from_dom_tree_CBE(url, xpath, timeout=30):
    """Fetch ``url`` and read an address from the text nodes selected by ``xpath``.

    Args:
        url: Search-result page to download.
        xpath: XPath ending in ``text()``; ``[1]`` and ``[2]`` are appended to
            read the street line and the "zip<nbsp>city" line respectively.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        ``[street, zip_code, city]``, or ``[None, None, None]`` when the page
        holds no address in the expected shape.

    Raises:
        requests.RequestException: If the page cannot be downloaded.
    """
    response = requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
        # Without a timeout a stalled server would block the lookup forever.
        timeout=timeout,
    )
    soup = BeautifulSoup(response.content, "html.parser")
    # Convert to etree for XPath
    dom = etree.HTML(str(soup))
    try:
        # [0] takes the first match when several rows are returned.
        street = dom.xpath(xpath + '[1]')[0].replace("\xa0", " ")
        # Split on the first non-breaking space only, so a city that itself
        # contains one does not make the whole address unusable.
        zip_code, city = dom.xpath(xpath + '[2]')[0].split("\xa0", 1)
        zip_code = zip_code.strip()
        city = city.replace("\xa0", " ").strip()
    except Exception:
        # No result row, or a row that does not have the expected two lines.
        street, zip_code, city = None, None, None
    return [street, zip_code, city]

def extract_address_CBE(company_name):
    """Look up a company's address on the CBE public search (kbopub).

    The exact establishment-name search is tried first. When it errors out or
    yields no address, the general name search is used instead.

    Args:
        company_name: Name to search for.

    Returns:
        ``[street, zip_code, city]``; all three are None when neither search
        returns an address.

    Raises:
        Exception: Whatever the general (fallback) search raises, e.g. a
            network error.
    """
    from urllib.parse import quote

    # Escape the name so characters such as '&', '#' or spaces cannot break
    # the query string.
    encoded_name = quote(str(company_name), safe="")

    # We use the exact name search
    cbe_url = f"https://kbopub.economie.fgov.be/kbopub/zoeknaamexactform.html?natuurlijkPersoon=vestiging&searchWord=&firmName=&_oudeBenaming=on&establishmentname={encoded_name}&rechtsvormFonetic=ALL&firstName=&postcode=&postgemeente1=&filterEnkelActieve=true&_filterEnkelActieve=on&actionNPRP=Search"
    exact_name_xpath = '//*[@id="vestiginglist"]/tbody/tr/td[6]/text()'
    try:
        address = extract_from_dom_tree_CBE(cbe_url, exact_name_xpath)
    except Exception:
        address = [None, None, None]
    if address[0] is not None:
        return address

    # If that fails or finds nothing, we use the general name search. An empty
    # result does not raise, so it has to be checked for explicitly.
    cbe_url = f"https://kbopub.economie.fgov.be/kbopub/zoeknaamfonetischform.html?lang=en&searchWord={encoded_name}&_oudeBenaming=on&pstcdeNPRP=&postgemeente1=&ondNP=true&_ondNP=on&ondRP=true&_ondRP=on&rechtsvormFonetic=ALL&vest=true&_vest=on&filterEnkelActieve=true&_filterEnkelActieve=on&actionNPRP=Search"
    general_name_xpath = '//*[@id="onderneminglistfonetisch"]/tbody/tr/td[6]/text()'
    return extract_from_dom_tree_CBE(cbe_url, general_name_xpath)

# Quick check of the address lookup for a single company name.
extract_address_CBE("AdShot")

[None, None, None]

In [29]:
import numpy as np
# OpenStreetMap API
from geopy.geocoders import Nominatim

# We use the address column to obtain the geocodes for each company
# NOTE(review): "http" is a generic user agent; Nominatim's usage policy asks
# for one that identifies the application — consider a project-specific value.
geolocator = Nominatim(user_agent="http", timeout=10)
geocoded_address = geolocator.geocode("Granbonpré,1348,Ottignies-Louvain-la-Neuve")
if geocoded_address is None:
    # geocode() returns None when the service finds no match for the query.
    print("No geocoding result found for the given address.")
else:
    print(f"Latitude:{geocoded_address.latitude}, Longitude:{geocoded_address.longitude}")
    print(geocoded_address.raw)

Latitude:50.66389, Longitude:4.6387666
{'place_id': 95913698, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright', 'osm_type': 'way', 'osm_id': 661016916, 'lat': '50.6638900', 'lon': '4.6387666', 'class': 'highway', 'type': 'unclassified', 'place_rank': 26, 'importance': 0.05338644464789803, 'addresstype': 'road', 'name': 'Granbonpré', 'display_name': 'Granbonpré, Parc Fleming, Louvain-la-Neuve, Ottignies-Louvain-la-Neuve, Nivelles, Brabant wallon, Wallonie, 1348, België / Belgique / Belgien', 'boundingbox': ['50.6626136', '50.6654084', '4.6371223', '4.6406426']}
