# Belgian AI Landscape

## Part 1: Scraping and storing data

### 1. Import Modules

In [2]:
import time
import numpy as np
import pandas as pd
from scrapeData import scrape_pages_ai4belgium, extract_address_CBE, extract_address_google_search
from concurrent.futures import ThreadPoolExecutor

### 2. Scrape company information from AI4Belgium

In [None]:
# Scrape the company listing from the AI4Belgium website into a DataFrame.
# NOTE(review): a later cell reads "../data/raw_scraped_dataset.csv", but no cell shown
# here writes that file — presumably scrape_pages_ai4belgium() saves it; confirm.
ai_df = scrape_pages_ai4belgium()

https://community.ai4belgium.be/en/ai-landscape?nav=0&page=1
Setting up WebDriver...
Navigating to https://community.ai4belgium.be/en/ai-landscape?nav=0&page=1
No GDPR cookie banner found! Continue ...
Attempting to locate and switch to the iframe...
✅ Found the company item element(s)! Count: 25
https://community.ai4belgium.be/en/ai-landscape?nav=0&page=2
Setting up WebDriver...
Navigating to https://community.ai4belgium.be/en/ai-landscape?nav=0&page=2
No GDPR cookie banner found! Continue ...
Attempting to locate and switch to the iframe...
✅ Found the company item element(s)! Count: 25
https://community.ai4belgium.be/en/ai-landscape?nav=0&page=3
Setting up WebDriver...
Navigating to https://community.ai4belgium.be/en/ai-landscape?nav=0&page=3
No GDPR cookie banner found! Continue ...
Attempting to locate and switch to the iframe...
✅ Found the company item element(s)! Count: 25
https://community.ai4belgium.be/en/ai-landscape?nav=0&page=4
Setting up WebDriver...
Navigating to https:/

In [None]:
# Preview the first rows of the scraped company data.
ai_df.head()

Unnamed: 0,name,url,logo,categories,creation date,region
0,6Wolves,http://www.6wolves.ai/,https://community.ai4belgium.be/ai-landscape//...,Healthcare & biotech,2019,Brussels
1,ActiveMe,http://activeme.be/,https://community.ai4belgium.be/ai-landscape//...,"Social (Education, Civic Tech, Entertainment, ...",2012,Wallonia
2,Adshot,http://adshot.io/,https://community.ai4belgium.be/ai-landscape//...,Image & video processing,2016,Flanders
3,Adunio,http://www.adun.io/,https://community.ai4belgium.be/ai-landscape//...,Sales & Marketing,2019,Flanders
4,Agilytic,http://www.agilytic.be/,https://community.ai4belgium.be/ai-landscape//...,Services,2015,Wallonia


### 3. Search for company address

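> **Note:** Threading (`ThreadPoolExecutor`, imported above) is intended to speed up this step; the cell below currently performs the lookups one at a time with `.apply`.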

In [21]:
# Start from the raw scrape stored on disk.
ai_df = pd.read_csv("../data/raw_scraped_dataset.csv")

# Look up every company by name; one lookup result is collected per row
# (expected to be a (street, zip_code, city) tuple — confirm against scrapeData).
results_series = ai_df["name"].apply(extract_address_CBE)

# Expand each result into its own columns and label the three address parts.
new_cols_df = results_series.apply(pd.Series)
new_cols_df.columns = ["street", "zip_code", "city"]

# Append the address columns to the company data, aligned on the shared index.
ai_df = pd.concat([ai_df, new_cols_df], axis="columns")

In [22]:
# Show the first ten companies together with the newly added address columns.
display(ai_df.head(10))

Unnamed: 0,name,url,logo,categories,creation date,region,street,zip_code,city
0,6Wolves,http://www.6wolves.ai/,https://community.ai4belgium.be/ai-landscape//...,Healthcare & biotech,2019.0,Brussels,,,
1,ActiveMe,http://activeme.be/,https://community.ai4belgium.be/ai-landscape//...,"Social (Education, Civic Tech, Entertainment, ...",2012.0,Wallonia,Granbonpré,1348.0,Ottignies-Louvain-la-Neuve
2,Adshot,http://adshot.io/,https://community.ai4belgium.be/ai-landscape//...,Image & video processing,2016.0,Flanders,,,
3,Adunio,http://www.adun.io/,https://community.ai4belgium.be/ai-landscape//...,Sales & Marketing,2019.0,Flanders,Luchthavenlaan 27 / 56,1800.0,Vilvoorde
4,Agilytic,http://www.agilytic.be/,https://community.ai4belgium.be/ai-landscape//...,Services,2015.0,Wallonia,Clos de Rambouillet,1410.0,Waterloo
5,Aividens,http://www.aividens.com/,https://community.ai4belgium.be/ai-landscape//...,Legal & FinTech,2019.0,Brussels,Avenue des Volontaires 19,1160.0,Auderghem
6,Alberts,http://alberts.be/,https://community.ai4belgium.be/ai-landscape//...,"Agritech, Cleantech & energy",2015.0,Flanders,Nerviërsstraat 49,1730.0,Asse
7,Allthingsblue,http://allthingsblue.eu/,https://community.ai4belgium.be/ai-landscape//...,HR & Skills,2019.0,Flanders,,,
8,AlterEdu,http://www.alteredu.be/,https://community.ai4belgium.be/ai-landscape//...,HR & Skills,2019.0,Flanders,Eliaertsstraat 32,2140.0,Antwerpen
9,Amethix,http://amethix.com/,https://community.ai4belgium.be/ai-landscape//...,"RPA, Internal Data & Intelligence",2017.0,Flanders,Rue de l'Enseignement 25,1000.0,Bruxelles


In [24]:
# A company counts as "address missing" when the lookup produced no street.
missing_address_mask = ai_df["street"].isna()
na_df = ai_df[missing_address_mask]

# The mean of a boolean mask is the fraction of True values; unlike len(a) / len(b)
# it gives NaN instead of raising ZeroDivisionError when ai_df is empty.
missing_share = missing_address_mask.mean()

# Use the "%" format spec instead of round(x, 2) * 100: multiplying the rounded ratio
# by 100 can print float artefacts such as 28.999999999999996 for a share of 0.29.
print(f"Address information missing for {missing_share:.1%} records.")
display(na_df)

Address information missing for 24.0% records.


Unnamed: 0,name,url,logo,categories,creation date,region,street,zip_code,city
0,6Wolves,http://www.6wolves.ai/,https://community.ai4belgium.be/ai-landscape//...,Healthcare & biotech,2019.0,Brussels,,,
2,Adshot,http://adshot.io/,https://community.ai4belgium.be/ai-landscape//...,Image & video processing,2016.0,Flanders,,,
7,Allthingsblue,http://allthingsblue.eu/,https://community.ai4belgium.be/ai-landscape//...,HR & Skills,2019.0,Flanders,,,
10,AMIA Systems,http://www.amia-systems.com/,https://community.ai4belgium.be/ai-landscape//...,Industrial,2014.0,Brussels,,,
11,Amigrow,https://amigrowfarming.com/,https://community.ai4belgium.be/ai-landscape//...,"Agritech, Cleantech & energy",2018.0,Flanders,,,
...,...,...,...,...,...,...,...,...,...
381,VeriFlix,http://www.veriflix.ai/en/,https://community.ai4belgium.be/ai-landscape//...,Image & video processing,2016.0,Wallonia,,,
386,WEPOC,https://wepoc.be/,https://community.ai4belgium.be/ai-landscape//...,Services,2019.0,Wallonia,,,
389,AI Blackbelt,https://www.aiblackbelt.com/,https://community.ai4belgium.be/ai-landscape//...,"Social (Education, Civic Tech, Entertainment, ...",2018.0,Brussels,,,
390,La Scientotheque,https://lascientotheque.be/,https://community.ai4belgium.be/ai-landscape//...,"Social (Education, Civic Tech, Entertainment, ...",,Brussels,,,


We will drop the missing data for now and save as our clean CSV.

In [29]:
# Keep every company that has a complete address. A bare dropna() would also discard
# companies whose address is known but that lack an unrelated field (e.g. 'creation date'),
# so the drop is restricted to the three address columns.
final_df = ai_df.dropna(subset=["street", "zip_code", "city"])
print(f'Final dataset Size: {len(final_df)} companies.')
# index=False keeps the DataFrame index out of the saved CSV.
final_df.to_csv("../data/clean_scraped_dataset.csv", index = False)

Final dataset Size: 279 companies.


### 4. Next Steps

1. Automate filling in missing values. Idea: Check other websites for scraping.
2. Visualize the gathered data on a map.
3. Create Streamlit App and deploy.

To address the missing values, we opted to manually input the missing addresses while working on automating this process.

To get our data ready for our app, we wanted to preprocess the addresses to also extract the zipcode and city as separate columns for further filtering.

## Part 2: Geocoding and visualizing data

In [1]:
import pandas as pd
from geocodeAddress import create_address_column, geocode_address

# Load the cleaned company list from disk.
final_df = pd.read_csv("../data/clean_scraped_dataset.csv")

# Add the 'address' column, then geocode each address string
# (each result is expected to be a (lat, lon) pair — confirm against geocodeAddress).
final_df = create_address_column(final_df)
geocoded_address_series = final_df["address"].apply(geocode_address)

# Spread each geocoding result over two coordinate columns.
new_cols_df = geocoded_address_series.apply(pd.Series)
new_cols_df.columns = ["lat", "lon"]

# Attach the coordinates to the company data, aligned on the shared index.
geocoded_df = pd.concat([final_df, new_cols_df], axis="columns")

In [2]:
# Save the geocoded companies to CSV (index omitted) and preview the first rows.
geocoded_df.to_csv("../data/geocoded_dataset.csv", index = False)
display(geocoded_df.head())

Unnamed: 0,name,url,logo,categories,creation date,region,street,zip_code,city,address,lat,lon
0,ActiveMe,http://activeme.be/,https://community.ai4belgium.be/ai-landscape//...,"Social (Education, Civic Tech, Entertainment, ...",2012.0,Wallonia,Granbonpré,1348,Ottignies-Louvain-la-Neuve,Granbonpré 1348 Ottignies-Louvain-la-Neuve,50.66389,4.638767
1,Adunio,http://www.adun.io/,https://community.ai4belgium.be/ai-landscape//...,Sales & Marketing,2019.0,Flanders,Luchthavenlaan 27 / 56,1800,Vilvoorde,Luchthavenlaan 27 / 56 1800 Vilvoorde,50.922049,4.443258
2,Agilytic,http://www.agilytic.be/,https://community.ai4belgium.be/ai-landscape//...,Services,2015.0,Wallonia,Clos de Rambouillet,1410,Waterloo,Clos de Rambouillet 1410 Waterloo,50.727426,4.388866
3,Aividens,http://www.aividens.com/,https://community.ai4belgium.be/ai-landscape//...,Legal & FinTech,2019.0,Brussels,Avenue des Volontaires 19,1160,Auderghem,Avenue des Volontaires 19 1160 Auderghem,50.826944,4.404133
4,Alberts,http://alberts.be/,https://community.ai4belgium.be/ai-landscape//...,"Agritech, Cleantech & energy",2015.0,Flanders,Nerviërsstraat 49,1730,Asse,Nerviërsstraat 49 1730 Asse,50.916292,4.18998


In [3]:
# Smoke-test the interactive map helper on the geocoded dataset.
import pandas as pd
from interactiveMap import get_location_interactive

# Reload the saved file rather than reusing the in-memory frame from the previous cell.
geocoded_df = pd.read_csv("../data/geocoded_dataset.csv")
get_location_interactive(geocoded_df)