In [None]:
# ====================================================
# 00 - Scraping EU Funding & Tenders Portal
# Goal: Reproducibly load the raw dataset (via public link),
#       build topic URLs, scrape topic descriptions,
#       and save results under data/processed/
# ====================================================

In [None]:
# --- 1) Setup (Chrome + Selenium) ---

!pip -q install selenium webdriver_manager gdown

#!wget -q https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
#!sudo dpkg -i google-chrome-stable_current_amd64.deb || true
#!sudo apt-get -y -qq install -f


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
# --- 2) Download raw CSV from public Google Drive link ---

import os, sys
import pandas as pd

# Public Drive file id from your link:
# https://drive.google.com/file/d/1K8Wb2p4l6Lr7-UxgXPQ1m8iajeYAK3IT/view?usp=sharing
DRIVE_FILE_ID = "1K8Wb2p4l6Lr7-UxgXPQ1m8iajeYAK3IT"

# Folders are relative to the notebook's working directory.
RAW_DIR = "../data/raw"
PROC_DIR = "../data/processed"
os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(PROC_DIR, exist_ok=True)

RAW_PATH = os.path.join(RAW_DIR, "project.csv")

# Download with gdown (handles Drive specifics & big files).
# The file on disk acts as a cache: it is only fetched when missing.
if not os.path.exists(RAW_PATH):
    import gdown
    print("Downloading raw CSV from Google Drive…")
    downloaded = gdown.download(id=DRIVE_FILE_ID, output=RAW_PATH, quiet=False)

    # Depending on the gdown version a failed download may return None instead
    # of raising, so verify that a non-empty file really landed on disk.
    download_ok = (
        downloaded is not None
        and os.path.exists(RAW_PATH)
        and os.path.getsize(RAW_PATH) > 0
    )
    if not download_ok:
        # Remove an empty leftover so the next run retries instead of
        # treating it as a valid cached file.
        if os.path.exists(RAW_PATH):
            os.remove(RAW_PATH)
        raise RuntimeError(
            f"Download of Drive file {DRIVE_FILE_ID} failed; nothing usable at {RAW_PATH}"
        )
else:
    print(f"Found existing raw file at {RAW_PATH}")

# Load CSV (semicolon-delimited in your pipeline).
# NOTE: on_bad_lines='skip' silently drops malformed lines, so the row count
# printed below can be lower than the number of lines in the file.
df_raw = pd.read_csv(RAW_PATH, delimiter=";", encoding="utf-8", on_bad_lines='skip')
print(f"Loaded raw dataframe: {df_raw.shape[0]} rows, {df_raw.shape[1]} columns")
display(df_raw.head(3))

Downloading raw CSV from Google Drive…


Downloading...
From: https://drive.google.com/uc?id=1K8Wb2p4l6Lr7-UxgXPQ1m8iajeYAK3IT
To: /Users/timschnelzer/Developer/tender-matching-horizon-europe/data/raw/project.csv
100%|██████████| 41.8M/41.8M [00:01<00:00, 25.6MB/s]


Loaded raw dataframe: 18110 rows, 20 columns


Unnamed: 0,id,acronym,status,title,startDate,endDate,totalCost,ecMaxContribution,legalBasis,topics,ecSignatureDate,frameworkProgramme,masterCall,subCall,fundingScheme,objective,contentUpdateDate,rcn,grantDoi,keywords
0,101234994,OPTIMALMINE,SIGNED,OPTIMALMINE: slope optimal design for a paradi...,2025-09-01,2029-08-31,0,1072140,HORIZON.1.2,HORIZON-MSCA-2024-SE-01-01,2025-07-10,HORIZON,HORIZON-MSCA-2024-SE-01,HORIZON-MSCA-2024-SE-01,HORIZON-TMA-MSCA-SE,The European Union is currently addressing cha...,2025-07-25 11:08:05,274682,10.3030/101234994,"mine optimisation, rock slope engineering, o..."
1,101232577,HSAFE,SIGNED,Innovative high-sensitivity avalanche field-ef...,2025-09-01,2029-08-31,0,1618230,HORIZON.1.2,HORIZON-MSCA-2024-SE-01-01,2025-07-22,HORIZON,HORIZON-MSCA-2024-SE-01,HORIZON-MSCA-2024-SE-01,HORIZON-TMA-MSCA-SE,"The focus of the HSAFE project, which aligns w...",2025-07-25 11:08:05,274696,10.3030/101232577,"Field-effect transistor-based biosensors, Canc..."
2,101236527,DRU,SIGNED,Democratic Roles of Universities (DRU): Practi...,2026-02-01,2030-01-31,0,1593180,HORIZON.1.2,HORIZON-MSCA-2024-SE-01-01,2025-07-10,HORIZON,HORIZON-MSCA-2024-SE-01,HORIZON-MSCA-2024-SE-01,HORIZON-TMA-MSCA-SE,DRU’s objective is to find new ways that unive...,2025-07-25 11:08:04,274676,10.3030/101236527,"Universities, Citizen science, Civic engagement"


In [None]:
# --- 3) Build topic-detail URLs ---

# Fail fast when the raw export lacks the column the URLs are derived from.
assert "topics" in df_raw.columns, "Column 'topics' not found in raw CSV."

# Normalise the column: drop missing values, coerce to text, trim whitespace.
topics_clean = df_raw["topics"].dropna().astype(str).str.strip()

# Keep only non-blank IDs, de-duplicated in order of first appearance.
topic_ids = topics_clean[topics_clean != ""].unique().tolist()
print(f"Unique topic IDs: {len(topic_ids)}")

BASE = "https://ec.europa.eu/info/funding-tenders/opportunities/portal/screen/opportunities/topic-details/"
PARAMS = "?isExactMatch=true&status=31094501,31094502,31094503"  # generic open/closed filters

# Map every topic ID to its portal detail page: BASE + id + query string.
topic_urls = {}
for tid in topic_ids:
    topic_urls[tid] = BASE + tid + PARAMS
print("Example URL:", next(iter(topic_urls.values())))

Unique topic IDs: 2225
Example URL: https://ec.europa.eu/info/funding-tenders/opportunities/portal/screen/opportunities/topic-details/HORIZON-MSCA-2024-SE-01-01?isExactMatch=true&status=31094501,31094502,31094503


In [None]:
# --- 4) Scrape topic descriptions with Selenium ---

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException, NoSuchElementException,
    ElementClickInterceptedException, ElementNotInteractableException
)
import time

# XPath of the "Topic description" card; the content and button lookups are
# derived from it so the three selectors cannot drift apart.
CARD_XPATH = "//eui-card[.//eui-card-header-title[normalize-space()='Topic description']]"
CONTENT_XPATH = CARD_XPATH + "//eui-card-content"
SHOW_MORE_XPATH = CONTENT_XPATH + "//button"

PAGE_TIMEOUT_S = 20       # max wait for the description card to appear
SHOW_MORE_TIMEOUT_S = 3   # max wait for an expand button inside the card
EXPAND_PAUSE_S = 0.5      # pause after clicking the expand button
REQUEST_PAUSE_S = 0.3     # pause between two topic pages


def _scrape_topic_description(driver, wait, url):
    """Load one topic page and return the text of its "Topic description" card.

    Parameters
    ----------
    driver : selenium WebDriver used to load the page.
    wait : WebDriverWait bound to ``driver``; used to wait for the card.
    url : str, topic-details URL to open.

    Returns
    -------
    str
        The ``.text`` of the card content element (may be empty).

    Raises
    ------
    TimeoutException
        If the description card does not appear within the wait's timeout.
    """
    driver.get(url)

    # Block until the card exists in the DOM; raises TimeoutException otherwise.
    wait.until(EC.presence_of_element_located((By.XPATH, CARD_XPATH)))

    # Best effort: click the button inside the card content (if any) to expand it.
    try:
        show_more = WebDriverWait(driver, SHOW_MORE_TIMEOUT_S).until(
            EC.element_to_be_clickable((By.XPATH, SHOW_MORE_XPATH))
        )
        try:
            show_more.click()
        except (ElementClickInterceptedException, ElementNotInteractableException):
            # Fall back to a JavaScript click when the native click is blocked.
            driver.execute_script("arguments[0].click();", show_more)
        time.sleep(EXPAND_PAUSE_S)
    except (TimeoutException, NoSuchElementException):
        pass  # No button present

    return driver.find_element(By.XPATH, CONTENT_XPATH).text


scraped = {}   # topic_id -> description text
errors = {}    # topic_id -> reason the description could not be scraped

total_topics = len(topic_urls)
print(f"Start scraping {total_topics} topic pages…")

driver = None
try:
    chrome_opts = webdriver.ChromeOptions()
    chrome_opts.add_argument("--headless=new")
    chrome_opts.add_argument("--no-sandbox")
    chrome_opts.add_argument("--disable-dev-shm-usage")
    chrome_opts.add_experimental_option('excludeSwitches', ['enable-logging'])

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_opts)

    wait = WebDriverWait(driver, PAGE_TIMEOUT_S)

    for i, (topic_id, url) in enumerate(topic_urls.items(), start=1):
        print(f"[{i}/{total_topics}] {topic_id}")
        try:
            content = _scrape_topic_description(driver, wait, url)
            if content.strip():
                scraped[topic_id] = content
            else:
                # A card that is present but has no text is not a usable
                # description; record it as an error instead of a success.
                errors[topic_id] = "Empty topic description"

        except TimeoutException:
            errors[topic_id] = "Timeout while locating description"
        except Exception as e:
            # Keep going on any other failure, but record it for the error report.
            errors[topic_id] = f"Unexpected error: {e}"

        # Be a good citizen
        time.sleep(REQUEST_PAUSE_S)

finally:
    # Always release the browser, even when the loop is interrupted.
    if driver:
        driver.quit()
        print("WebDriver closed.")

print(f"Scraped OK: {len(scraped)} | Errors: {len(errors)}")

Start scraping 2225 topic pages…
[1/2225] HORIZON-MSCA-2024-SE-01-01
[2/2225] HORIZON-CL3-2024-CS-01-02
[3/2225] HORIZON-MSCA-2024-DN-01-01
[4/2225] HORIZON-CL3-2024-DRS-01-02
[5/2225] HORIZON-CL6-2024-ZEROPOLLUTION-02-2-two-stage
[6/2225] HORIZON-MSCA-2024-PF-01-01
[7/2225] HORIZON-JU-EUROHPC-2025-AI-01-IBA-01
[8/2225] HORIZON-CL3-2024-SSRI-01-02
[9/2225] HORIZON-JTI-CLEANH2-2024-05-02
[10/2225] HORIZON-WIDERA-2024-TALENTS-03-01
[11/2225] HORIZON-EIC-2024-PATHFINDERCHALLENGES-01-05
[12/2225] ERC-2024-ADG
[13/2225] HORIZON-EIC-2024-ACCELERATORCHALLENGES-01
[14/2225] HORIZON-WIDERA-2024-ERA-02-01
[15/2225] HORIZON-WIDERA-2024-TALENTS-02-01
[16/2225] ERC-2025-POC
[17/2225] HORIZON-CL6-2024-FARM2FORK-02-3-two-stage
[18/2225] ERC-2024-COG
[19/2225] HORIZON-CL6-2024-FARM2FORK-02-2-two-stage
[20/2225] HORIZON-CL3-2024-FCT-01-08
[21/2225] HORIZON-CL3-2024-FCT-01-06
[22/2225] HORIZON-CL3-2024-FCT-01-02
[23/2225] HORIZON-CL3-2024-CS-01-01
[24/2225] HORIZON-CL3-2024-INFRA-01-03
[25/2225] HORIZON

KeyboardInterrupt: 

In [None]:
# --- 5) Save outputs under data/processed/ ---

import pandas as pd

# Explicit column lists keep the frames well-formed even when a dict is empty
# (e.g. the scrape was interrupted before any page succeeded); without them
# sort_values("topics") raises KeyError on a column-less frame.

# 1) Topic descriptions
df_topics = pd.DataFrame(
    list(scraped.items()),
    columns=["topics", "topic_description"],
).sort_values("topics")
topics_out = os.path.join(PROC_DIR, "topic_descriptions.csv")
df_topics.to_csv(topics_out, index=False, encoding="utf-8")
print("Saved:", topics_out)

# 2) Scrape errors (if any)
if errors:
    df_err = pd.DataFrame(
        list(errors.items()),
        columns=["topics", "error"],
    ).sort_values("topics")
    err_out = os.path.join(PROC_DIR, "topic_description_errors.csv")
    df_err.to_csv(err_out, index=False, encoding="utf-8")
    print("Saved:", err_out)

# 3) (Optional) Persist minimal mapping of topics->URL for reuse
#    (only for the topics that were scraped successfully)
df_url = pd.DataFrame(
    [(k, topic_urls[k]) for k in scraped],
    columns=["topics", "topic_url"],
).sort_values("topics")
url_out = os.path.join(PROC_DIR, "topic_urls.csv")
df_url.to_csv(url_out, index=False, encoding="utf-8")
print("Saved:", url_out)

In [None]:
# --- Step 6: Merge scraped descriptions into the DataFrame ---

import os

# Same processed-data folder as the loading step ("../data/processed");
# a bare "data/processed" would create a second folder next to the notebook.
PROC_DIR = "../data/processed"
os.makedirs(PROC_DIR, exist_ok=True)

# The inputs are the objects produced earlier in this notebook:
# `df_raw` from the loading cell and `scraped` (topic_id -> text) from the scrape.
if 'df_raw' not in globals():
    raise RuntimeError("df_raw not found. Run the loading/parsing step first.")
if 'scraped' not in globals():
    raise RuntimeError("scraped not found. Run the scraping step first.")

print("Merging scraped descriptions into the DataFrame…")

scraped_df = pd.DataFrame(
    list(scraped.items()),
    columns=['topics', 'topics_description']
)

# Left-join on 'topics': every project row is kept, rows without a scraped
# description get NaN. validate='m:1' asserts each topic ID appears at most
# once on the right-hand side.
# NOTE: the keys of `scraped` are whitespace-stripped topic IDs, so a raw
# 'topics' value with surrounding whitespace will not find a match.
df_merged = pd.merge(df_raw, scraped_df, on='topics', how='left', validate='m:1')

print("Merge complete. Head:")
display(df_merged.head())
print(f"Rows with scraped descriptions: {df_merged['topics_description'].notna().sum()} / {len(df_merged)}")

# Step 7: Filter out projects with missing topic descriptions

In [None]:
print("Filtering out projects with missing topic descriptions…")

# Boolean mask of rows that carry a (non-null) scraped description;
# .copy() detaches the result from df_merged.
has_description = df_merged['topics_description'].notna()
df_filtered = df_merged.loc[has_description].copy()

print("Filtering complete.")
print(f"Original rows: {df_merged.shape[0]}")
print(f"Kept rows (with topic descriptions): {df_filtered.shape[0]}")
display(df_filtered.head())

In [None]:
# --- Step 8: Save the filtered DataFrame to the project folder ---

import re, ast
import numpy as np

# Resolve project root (assumes notebooks/ and data/ at same level)
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
PROC_DIR = os.path.join(DATA_DIR, "processed")
os.makedirs(PROC_DIR, exist_ok=True)

# Input/Output paths
IN_PATH  = os.path.join(PROC_DIR, "filtered_project_data_with_descriptions.csv")
OUT_PATH = os.path.join(PROC_DIR, "cleaned_project_data.csv")

# Google Drive file (public)
# NOTE(review): this rebinds DRIVE_FILE_ID to a different id than the raw-CSV
# download cell uses — presumably a hosted copy of the filtered file; confirm.
DRIVE_FILE_ID = "18FQxIKGF32Qi-RZtLFVg7HHcFRsFMBOF"

print("Project root:", PROJECT_ROOT)
print("Input file  :", IN_PATH)
print("Output file :", OUT_PATH)

# Actually persist the filtered frame, as the step title promises; nothing
# else in this notebook writes IN_PATH. Guarded so the cell still runs when
# the filter step has not been executed in this kernel.
if "df_filtered" in globals():
    df_filtered.to_csv(IN_PATH, index=False, encoding="utf-8")
    print("Saved:", IN_PATH)
else:
    print("df_filtered not found in memory; nothing saved to", IN_PATH)
