# Scraping Websites

Notes:

1) Importing the first_step_name_and_description_url.xlsx ("name" and "description url")
2) Finding each school's website and its "Organigramme" page with DuckDuckGo. We run two searches: "{name}" and "{name} organigramme"
3) We save the results in excel
4) Browsing the website to find email addresses and, in particular, the email domain name
5) From the organigramme page, we fetch the name and the email of each person. If an email is associated with each person -> ok
If no email is associated with a person -> we build candidate emails and test them.
6) We save them.

In [6]:
from collections import deque
from selenium import webdriver
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import (
    TimeoutException,
    StaleElementReferenceException,
    NoSuchElementException,
    WebDriverException,
)
from openai import OpenAI
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urlunparse
from collections import Counter

import os
import random
import psutil
import time 
import re 
import pandas as pd 
import json 

load_dotenv()

True

In [7]:
# API keys are read from the process environment; the load_dotenv() call in
# the import cell runs first, so values may come from a .env file.
# os.getenv returns None for any variable that is not set.
deep_seek_api_key = os.getenv("DEEPSEEK_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")
# Passed to the OpenAI-compatible client created in ScrapingStaff.__init__.
qwen_api_key = os.getenv("QWEN_API_KEY")

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import os
import random
import time
import psutil
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options


def thread_worker(row):
    """Search one school's website and organigramme page in a dedicated browser.

    Each call builds its own ``FetchWebsites`` instance (and therefore its own
    Chrome driver) so that threads never share a browser.

    Args:
        row: a pandas Series with at least the "name" and "url" entries; its
            index label (``row.name``) is echoed back under the "index" key.

    Returns:
        A dict with the keys "index", "name", "description url", "website"
        and "organigramme_page". The last two hold the first DuckDuckGo
        result URL, or None when the search found nothing.

    Raises:
        KeyError: when "name" or "url" is missing from ``row``.
    """
    # Read the row BEFORE launching Chrome: a malformed row must not leave an
    # orphan browser behind (the try/finally below only starts afterwards).
    name = row["name"]
    desc_url = row["url"]

    fetcher = FetchWebsites()
    try:
        time.sleep(random.uniform(1, 5))  # stagger startup
        print(f"[Thread] 🔍 Searching for: {name}")

        website = fetcher.search_duckduckgo(name)

        print(f"Found Website: {website}")

        # Pause between the two queries so they do not look machine-issued.
        fetcher._human_delay(2, 5)

        organigramme = fetcher.search_duckduckgo(f"{name} organigramme")

        print(f"Found Organigramme: {organigramme}")

        return {
            "index": row.name,
            "name": name,
            "description url": desc_url,
            "website": website,
            "organigramme_page": organigramme
        }
    finally:
        # Always release the browser, whether the searches succeeded or raised.
        fetcher.clean_selenium()


class FetchWebsites:
    """Look up school websites and "organigramme" pages through DuckDuckGo.

    Every instance owns one Selenium Chrome driver started with a randomly
    chosen user agent. ``run`` distributes the rows of an Excel sheet to
    ``thread_worker``, which builds a separate instance (and browser) per row.
    """

    # Page loaded before every query so that the search box named "q" exists.
    SEARCH_URL = "https://duckduckgo.com/?kl=fr-fr"

    def __init__(self):
        """Pick the user-agent pool and start the Chrome driver."""
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_0_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15"
        ]
        self.driver = self._init_browser()

    def _init_browser(self):
        """Initialize Selenium Chrome driver with random user agent."""
        chrome_options = Options()
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        random_user_agent = random.choice(self.user_agents)
        chrome_options.add_argument(f"user-agent={random_user_agent}")
        return webdriver.Chrome(options=chrome_options)

    def clean_selenium(self):
        """Quit the driver if there is one; quitting errors are only printed."""
        if getattr(self, "driver", None):
            try:
                self.driver.quit()
            except Exception as e:
                print(f"Error quitting WebDriver: {e}")

    def _human_delay(self, min_t=0.5, max_t=1.5):
        """Sleep for a random duration between ``min_t`` and ``max_t`` seconds."""
        time.sleep(random.uniform(min_t, max_t))

    def _human_scroll(self):
        """Scroll down by a random 200-600 px, then pause briefly."""
        scroll_height = random.randint(200, 600)
        self.driver.execute_script(f"window.scrollBy(0, {scroll_height});")
        self._human_delay(0.5, 1)

    def _human_mouse_move(self):
        """Move the pointer to a random offset relative to <body>, then pause."""
        actions = ActionChains(self.driver)
        body = self.driver.find_element(By.TAG_NAME, "body")
        actions.move_to_element_with_offset(body, random.randint(0, 500), random.randint(0, 300)).perform()
        self._human_delay(0.3, 0.7)

    def search_duckduckgo(self, query, retries=2):
        """Return the URL of the first organic DuckDuckGo result for ``query``.

        Args:
            query: text typed into the search box, one character at a time.
            retries: number of extra attempts after the first one.

        Returns:
            The ``href`` of the first organic result, or None when every
            attempt either raised or produced no organic result.
        """
        for attempt in range(retries + 1):
            try:
                # A fresh driver sits on a blank page, so the search page must
                # be loaded before the "q" input can be located.
                self.driver.get(self.SEARCH_URL)
                self._human_delay(1, 2)
                box = self.driver.find_element(By.NAME, "q")
                box.clear()
                for char in query:
                    box.send_keys(char)
                    self._human_delay(0.05, 0.2)
                box.send_keys(Keys.RETURN)
                self._human_delay(2, 3)
                self._human_scroll()
                if random.random() > 0.5:
                    self._human_mouse_move()
                results = self.driver.find_elements(By.CSS_SELECTOR, "li[data-layout='organic'] a[data-testid='result-title-a']")
                if results:
                    return results[0].get_attribute("href")
            except Exception as e:
                print(f"⚠️ Search failed for '{query}' attempt {attempt+1}: {e}")
                # Quit the broken browser before replacing it; otherwise each
                # failed attempt leaves a Chrome process running.
                self.clean_selenium()
                self.driver = self._init_browser()
                self._human_delay(2, 4)
        return None

    def run(self, input_file, output_file, max_workers=3):
        """Search every row of ``input_file`` and checkpoint into ``output_file``.

        Rows whose index already appears in the "index" column of an existing
        ``output_file`` are skipped, so an interrupted run can be resumed. The
        whole result list is rewritten after each completed row. A row whose
        worker raises is only reported, not written, so it stays pending for
        the next run.

        The searches themselves run in ``thread_worker``, which opens its own
        browser per row; the driver of this instance is not used here.

        Args:
            input_file: Excel file with one row per school.
            output_file: Excel file receiving one row per finished school.
            max_workers: number of concurrent worker threads.
        """
        df = pd.read_excel(input_file)
        nb_rows = len(df)

        if os.path.exists(output_file):
            out_df = pd.read_excel(output_file)
            done_idx = set(out_df["index"])
            results = out_df.to_dict("records")
            print(f"▶️ Resuming from {len(done_idx)}/{nb_rows} already completed.")
        else:
            results, done_idx = [], set()

        pending = df[~df.index.isin(done_idx)]

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Map each future back to its row so failures can name the school.
            futures = {executor.submit(thread_worker, row): row for _, row in pending.iterrows()}

            for future in as_completed(futures):
                row = futures[future]
                try:
                    result = future.result()
                    results.append(result)
                    pd.DataFrame(results).to_excel(output_file, index=False)
                    print(f"💾 Saved {len(results)}/{nb_rows} (just finished {row['name']})")
                    time.sleep(random.uniform(1, 3))  # small cooldown
                except Exception as e:
                    print(f"⚠️ Error processing {row['name']}: {e}")

        print(f"✅ All results saved to {output_file}")


In [None]:
# Creating the instance starts a Chrome driver (see FetchWebsites.__init__).
# NOTE(review): run() delegates every search to thread_worker, which opens its
# own browser per row, so this instance's driver appears unused and is never
# quit here — consider calling fetchWebsites.clean_selenium() afterwards.
fetchWebsites = FetchWebsites()

# Reads the school list and writes/resumes the search results workbook.
fetchWebsites.run("first_step_name_and_description_url.xlsx", "websites_schools.xlsx")

In [None]:
class ScrapingStaff:
    """Collect e-mail addresses and staff members for one school website.

    Calling the instance crawls ``website`` with Selenium to harvest e-mail
    addresses, asks the Qwen model to turn the staff page into structured
    records, and returns the staff list, the most frequent e-mail domain and
    the set of addresses found. The browser is always released at the end of
    the call.
    """

    # Process names treated as browser leftovers by clean_selenium. "cat" is
    # kept from the original list; it is only ever matched against processes
    # spawned by this interpreter, never against the whole machine.
    _BROWSER_PROCESS_NAMES = frozenset({
        "chromedriver",
        "chrome",
        "google-chrome",
        "chromium",
        "geckodriver",
        "msedgedriver",
        "chrome_crashpad",
        "cat",
    })

    # Link suffixes that are never queued for crawling.
    _SKIPPED_SUFFIXES = (".js", ".css", ".jpg", ".jpeg", ".png", ".pdf")

    def __init__(self, website, website_staff):
        """Start a browser and remember the two entry URLs.

        Args:
            website: root URL of the school site. Anything that is not a
                string (e.g. the NaN pandas yields for an empty cell) is
                stored as "".
            website_staff: URL believed to hold the organigramme; normalized
                the same way.
        """
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_0_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
        ]

        self.driver = self._build_driver()
        self.emails = set()
        self.qwen_client = OpenAI(
            api_key=qwen_api_key,
            base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
        )
        self.website = self._clean_url(website)
        self.website_staff = self._clean_url(website_staff)
        self.visited_subpages = set()

    @staticmethod
    def _clean_url(value):
        """Return ``value`` stripped when it is a string, otherwise ""."""
        return value.strip() if isinstance(value, str) else ""

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a stripped string, mapping None to ""."""
        return "" if value is None else str(value).strip()

    def _build_driver(self):
        """Create a Chrome driver with the shared flags and a random user agent."""
        chrome_options = Options()
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        random_user_agent = random.choice(self.user_agents)
        chrome_options.add_argument(f"user-agent={random_user_agent}")
        return webdriver.Chrome(options=chrome_options)

    def restart_browser(self):
        """Fully restarts the Selenium browser."""
        self.clean_selenium()
        self.driver = self._build_driver()

    def clean_selenium(self):
        """Quit the driver, then kill browser processes this interpreter spawned.

        Only descendants of the current Python process are considered, so
        unrelated programs on the machine (the user's own Chrome, any ``cat``)
        are left alone.
        """
        # Snapshot the descendants BEFORE quitting: once chromedriver exits,
        # leftovers may be re-parented and no longer show up as children.
        try:
            spawned = psutil.Process().children(recursive=True)
        except psutil.Error:
            spawned = []

        if getattr(self, "driver", None):
            try:
                self.driver.quit()  # Gracefully quit the browser
            except Exception as e:
                print(f"Error quitting WebDriver: {e}")

        # Kill and properly reap whatever survived the graceful quit.
        for proc in spawned:
            try:
                if proc.name().lower() not in self._BROWSER_PROCESS_NAMES:
                    continue
                proc.kill()
                try:
                    proc.wait(timeout=5)  # reap the process
                except (psutil.TimeoutExpired, psutil.NoSuchProcess):
                    pass
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue

    def get_emails(self, text):
        """Return the set of e-mail-looking strings found in ``text``.

        Matches whose final suffix is an image or PDF extension (for example
        ``logo@2x.png``) are discarded.
        """
        email_regex = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
        invalid_extensions = {"jpg", "jpeg", "png", "gif", "webp", "bmp", "svg", "tiff", "pdf"}
        return {
            email
            for email in re.findall(email_regex, text)
            if email.rsplit(".", 1)[-1].lower() not in invalid_extensions
        }

    def extract_visible_text(self, html):
        """Return the text of ``html`` without script/style/meta/svg/noscript content."""
        soup = BeautifulSoup(html, "html.parser")
        for hidden in soup(["script", "style", "meta", "svg", "noscript"]):
            hidden.decompose()
        return soup.get_text(separator=" ")

    def crawl_site_depth(self, max_depth=1):
        """
        Crawls a site using Selenium to collect e-mail addresses and subpages.
        Limits crawling to `max_depth` hierarchical levels.

        Visited URLs are added to ``self.visited_subpages`` and addresses to
        ``self.emails``. Only links on the same host as ``self.website`` are
        followed. A URL that triggers a WebDriver error is retried once after
        a browser restart. Nothing is crawled when ``self.website`` is empty.
        """
        if not self.website:
            return

        site_netloc = urlparse(self.website).netloc
        queue = deque([(self.website, 0)])
        failed_urls = set()

        while queue:
            url, depth = queue.popleft()
            normalized_url = url.split("#")[0]

            if normalized_url in self.visited_subpages or depth > max_depth:
                continue

            try:
                self.driver.set_page_load_timeout(10)
                self.driver.get(normalized_url)

                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )

                # Scroll to the bottom, then wait, to give lazily loaded
                # content a chance to render before the source is read.
                self.driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);"
                )

                time.sleep(2)

                html = self.driver.page_source
                self.visited_subpages.add(normalized_url)
                soup = BeautifulSoup(html, "html.parser")

                visible_text = self.extract_visible_text(html)
                new_emails = self.get_emails(visible_text)

                if new_emails:
                    print(f"📧 Emails found: {new_emails}")
                    self.emails.update(new_emails)

                for link in soup.find_all("a", href=True):
                    href = link["href"].split("#")[0]
                    # Relative links are relative to the page they appear on,
                    # not to the site root.
                    absolute_link = urljoin(normalized_url, href)

                    if absolute_link in self.visited_subpages:
                        continue

                    if absolute_link.endswith(self._SKIPPED_SUFFIXES):
                        continue
                    if absolute_link.startswith("mailto:") or "javascript:void" in absolute_link:
                        continue

                    if urlparse(absolute_link).netloc == site_netloc and depth + 1 <= max_depth:
                        print(absolute_link)
                        queue.append((absolute_link, depth + 1))

            except (TimeoutException, WebDriverException) as e:
                print(f"⚠️ WebDriver error on {normalized_url}: {e}")

                if normalized_url not in failed_urls:
                    failed_urls.add(normalized_url)
                    print("🔁 Restarting browser and retrying once...")
                    self.restart_browser()
                    queue.append((url, depth))  # Requeue the same URL just once
                else:
                    print("❌ Already retried once. Skipping permanently.")
                continue

            except Exception as e:
                print(f"⚠️ Unexpected error loading {url}: {e}")
                continue

        return

    def _complete(self, messages):
        """Send ``messages`` to qwen-plus in JSON mode and return the stripped reply text."""
        llm_response = self.qwen_client.chat.completions.create(
            model="qwen-plus",
            messages=messages,
            response_format={"type": "json_object"},
            temperature=0,
            max_tokens=8192,
        )
        return llm_response.choices[0].message.content.strip()

    @staticmethod
    def _repair_messages(raw_reply):
        """Build the follow-up prompt asking the model to fix malformed JSON."""
        return [
            {
                "role": "system",
                "content": """
                        You are an AI specialized in returning JSON in a valid format.
                        Do NOT modify the content—only ensure the JSON is properly formatted.
                        """,
            },
            {
                "role": "user",
                "content": f"Fix the JSON format:\n{raw_reply}",
            },
        ]

    def _request_json(self, messages):
        """Return the model's reply parsed as JSON, or None on any failure.

        When the first reply is not valid JSON the model is asked once to
        repair it. Failures are printed, never raised.
        """
        raw_reply = ""
        try:
            raw_reply = self._complete(messages)
            return json.loads(raw_reply)
        except json.JSONDecodeError:
            try:
                return json.loads(self._complete(self._repair_messages(raw_reply)))
            except Exception as e:
                print(f"⚠️ LLM JSON Parsing Error: {e}")
        except Exception as e:
            print(f"⚠️ Error processing {self.website_staff}: {e}")
        return None

    def find_url_staff_llm(self):
        """Ask the model which visited URL most likely lists the staff.

        Returns:
            The chosen URL, or "" when nothing was crawled, the request
            failed, or the reply does not hold a string under "staff_url".
        """
        print(self.visited_subpages)

        # Without candidates there is nothing to choose from; skip the call.
        if not self.visited_subpages:
            return ""

        messages = [
            {
                "role": "system",
                "content": """
                    You are an assistant that selects the most relevant webpage URL for finding staff or organizational structure information.

                    From the provided list of URLs, return ONLY the single best candidate URL that is most likely to contain staff, team, personnel, or organigramme information.

                    - Output must be valid JSON in this format:

                    {
                    "staff_url": "https://example.com/..."
                    }

                    - If no suitable URL is found, return:

                    {
                    "staff_url": ""
                    }

                    Do not include explanations or any text outside the JSON.
        """,
            },
            {
                "role": "user",
                "content": f"Visited Urls:\n{self.visited_subpages}",
            },
        ]

        staff_url_obj = self._request_json(messages)
        if staff_url_obj is not None:
            print(staff_url_obj, "yeahhhhh")

        # The model may answer with valid JSON of the wrong shape.
        if not isinstance(staff_url_obj, dict):
            return ""
        staff_url = staff_url_obj.get("staff_url")
        return staff_url.strip() if isinstance(staff_url, str) else ""

    def extract_staff_llm(self):
        """Load ``self.website_staff`` and let the model extract its staff list.

        Returns:
            A dict containing a "staff" key as returned by the model, or
            ``{"staff": []}`` when the page could not be loaded, the request
            failed, or the reply has another shape.
        """
        try:
            self.driver.get(self.website_staff)

            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            html = self.driver.page_source
        except (TimeoutException, WebDriverException) as e:
            # An unreachable staff page means "no staff found here", which
            # lets extract_staff fall back to another candidate page.
            print(f"⚠️ WebDriver error on {self.website_staff}: {e}")
            return {"staff": []}

        visible_text = self.extract_visible_text(html)

        messages = [
            {
                "role": "system",
                "content": """
                You are an information extraction assistant. 
                Your task is to analyze the provided webpage text and extract a list of staff members.

                Return the result in valid JSON with the following structure only:

                {
                "staff": [
                    {"name": "Full Name", "function": "Job Title or Role", "email": "Email"},
                    ...
                ]
                }

                - If any field is missing (e.g., no email), leave it as an empty string "".
                - Do not include extra text, explanations, or formatting outside of the JSON.
                - Preserve names and job titles as they appear in the text.
                - Extract only staff-related information (ignore unrelated text).
        """,
            },
            {
                "role": "user",
                "content": f"PAGE TEXT:\n{visible_text}",
            },
        ]

        result = self._request_json(messages)
        if isinstance(result, dict) and "staff" in result:
            return result
        return {"staff": []}

    def extract_staff(self):
        """Return the staff records as a list of name/function/email dicts.

        The configured staff page is tried first when it is on the same root
        domain as the website. If that yields nothing, the model picks a
        candidate among the crawled pages and extraction runs once more.
        Entries without a name are dropped; missing or null fields become "".
        """
        results = []

        def same_root(url1, url2):
            """Check if two URLs belong to the same root domain."""
            netloc1 = urlparse(url1).netloc.lower()
            netloc2 = urlparse(url2).netloc.lower()
            return netloc1 == netloc2 or netloc1.endswith("." + netloc2) or netloc2.endswith("." + netloc1)

        # 1) First try current website_staff
        staff_data = {}
        if self.website_staff and same_root(self.website, self.website_staff):
            print("NOPE")
            staff_data = self.extract_staff_llm()

        # 2) If no staff OR website_staff outside root → ask LLM to find a better page
        if (not isinstance(staff_data, dict) or not staff_data.get("staff")):
            # Try finding a better staff page and re-run once
            candidate = self.find_url_staff_llm()
            print("candidate : ", candidate)
            if candidate:
                self.website_staff = candidate
                staff_data = self.extract_staff_llm()

        staff_entries = staff_data.get("staff") if isinstance(staff_data, dict) else None
        if isinstance(staff_entries, list):
            for person in staff_entries:
                if isinstance(person, dict) and person.get("name"):
                    # JSON null arrives as None, which has no .strip().
                    results.append({
                        "name": self._as_text(person.get("name")),
                        "function": self._as_text(person.get("function")),
                        "email": self._as_text(person.get("email")),
                    })

        return results

    def get_most_common_domain(self):
        """Return ``(domain, count)`` for the most frequent e-mail domain.

        Domains are compared in lower case. Returns ``(None, 0)`` when no
        address has been collected.
        """
        domains = [
            email.split("@")[1].lower().strip()
            for email in self.emails
            if "@" in email
        ]

        if not domains:
            return None, 0

        most_common_domain, count = Counter(domains).most_common(1)[0]
        return most_common_domain, count

    def __call__(self):
        """Run crawl + staff extraction and return ``(staff, domain, emails)``.

        The browser is released even when a step raises.
        """
        print(self.website, self.website_staff)

        try:
            self.crawl_site_depth()

            staff = self.extract_staff()

            print(staff)

            print(self.emails)

            domain, count = self.get_most_common_domain()

            print(domain, count)

            return staff, domain, self.emails
        finally:
            self.clean_selenium()
        
        

In [None]:
# Load the website / organigramme URLs found by the search step.
websites_schools = pd.read_excel("websites_schools.xlsx")

# Result columns start out empty and are filled row by row below.
for result_column in ("staff", "domain", "emails"):
    websites_schools[result_column] = None

for idx, row in websites_schools.iterrows():
    name, description_url, website, website_staff = (
        row["name"],
        row["description url"],
        row["website"],
        row["organigramme_page"],
    )

    # limit for testing: only the row with index 1 is processed
    if idx != 1:
        continue

    scraping_staff = ScrapingStaff(website, website_staff)
    staff, domain, emails = scraping_staff()
    print(staff, domain, emails)

    # Lists and sets are stored through str() so they fit in one Excel cell.
    websites_schools.at[idx, "staff"] = str(staff)
    websites_schools.at[idx, "domain"] = domain
    websites_schools.at[idx, "emails"] = str(emails)

    # Rewrite the results workbook as soon as a row has been processed.
    websites_schools.to_excel("websites_schools_results.xlsx", index=False)