# Scraping Websites

Notes:

1) Import first_step_name_and_description_url.xlsx (columns "name" and "description url").
2) Detect each school's website and its "Organigramme" page with DuckDuckGo by running two searches: "{name}" and "{name} organigramme".
3) We save the results in excel
4) Browse each website to find email addresses and, in particular, the email domain name.
5) From the organigramme page, we fetch the name and the email of each person. If an email is associated with each person -> ok.
If no email is associated with a person -> we build candidate email addresses and test them.
6) We save them.

In [None]:
from collections import deque
from selenium import webdriver
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import (
    TimeoutException,
    StaleElementReferenceException,
    NoSuchElementException,
    WebDriverException,
)
from openai import OpenAI
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urlunparse

import os
import random
import psutil
import time 
import re 
import pandas as pd 

load_dotenv()

In [None]:
# Provider API keys looked up in the process environment; each one is None
# when the corresponding variable is not set.
deep_seek_api_key, openai_api_key, qwen_api_key = (
    os.environ.get(variable)
    for variable in ("DEEPSEEK_API_KEY", "OPENAI_API_KEY", "QWEN_API_KEY")
)

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import os
import random
import time
import psutil
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options


def thread_worker(row):
    """Look up the website and organigramme page for one input row.

    Each call builds its own ``FetchWebsites`` instance, so every worker
    thread drives a separate browser.

    Args:
        row: One row of the input table as yielded by ``DataFrame.iterrows()``.
            Must provide the ``"name"`` and ``"url"`` fields; ``row.name`` is
            the row's index label and is echoed back so finished rows can be
            recognised on resume.
            NOTE(review): the notes at the top of the notebook call the second
            column "description url" while this reads ``row["url"]`` - confirm
            the actual column name in the spreadsheet.

    Returns:
        dict with the keys ``"index"``, ``"name"``, ``"description url"``,
        ``"website"`` and ``"organigramme_page"``. The last two are whatever
        ``search_duckduckgo`` returned for the respective query.

    Raises:
        KeyError: if ``row`` lacks ``"name"`` or ``"url"``.
    """
    # Read the row BEFORE launching a browser: a missing column must fail
    # fast instead of leaving an orphaned Chrome session behind.
    name = row["name"]
    desc_url = row["url"]

    fetcher = FetchWebsites()
    try:
        time.sleep(random.uniform(1, 5))  # stagger startup
        print(f"[Thread] 🔍 Searching for: {name}")

        website = fetcher.search_duckduckgo(name)

        print(f"Found Website: {website}")

        # Pause between the two queries issued for the same school.
        fetcher._human_delay(2, 5)

        organigramme = fetcher.search_duckduckgo(f"{name} organigramme")

        print(f"Found Organigramme: {organigramme}")

        return {
            "index": row.name,
            "name": name,
            "description url": desc_url,
            "website": website,
            "organigramme_page": organigramme
        }
    finally:
        # Always release the browser, whether the searches succeeded or not.
        fetcher.clean_selenium()


class FetchWebsites:
    """Find websites through DuckDuckGo with a headless Chrome session.

    An instance owns at most one Selenium driver. The driver is created the
    first time ``driver`` is read, so an instance that only orchestrates
    ``run()`` never launches a browser of its own.
    """

    # Backing field of the ``driver`` property. The class-level default keeps
    # ``clean_selenium`` safe on instances whose ``__init__`` did not run.
    _driver = None

    def __init__(self):
        # Pool of user-agent strings; one is picked at random per browser.
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_0_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15"
        ]
        self._driver = None

    @property
    def driver(self):
        """Return this instance's Selenium driver, launching it on first use."""
        if self._driver is None:
            self._driver = self._init_browser()
        return self._driver

    @driver.setter
    def driver(self, value):
        self._driver = value

    def _init_browser(self):
        """Initialize Selenium Chrome driver with random user agent."""
        chrome_options = Options()
        chrome_options.add_argument("--headless=new")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        random_user_agent = random.choice(self.user_agents)
        chrome_options.add_argument(f"user-agent={random_user_agent}")
        return webdriver.Chrome(options=chrome_options)

    @staticmethod
    def _spawned_processes(driver):
        """Return the driver-service process and its descendants.

        Returns an empty list when the driver exposes no service process or
        when that process can no longer be inspected.
        """
        service = getattr(driver, "service", None)
        pid = getattr(getattr(service, "process", None), "pid", None)
        if pid is None:
            return []
        try:
            root = psutil.Process(pid)
            return root.children(recursive=True) + [root]
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            return []

    def clean_selenium(self):
        """Quit this instance's browser and kill only what it spawned.

        Only the process tree below this instance's own driver service is
        touched, so browsers owned by other worker threads (or by the user)
        keep running. Calling it again, or before any browser was launched,
        is a no-op.
        """
        driver, self._driver = self._driver, None
        if driver is None:
            return

        # Snapshot the process tree first: after quit() the parent is gone and
        # stragglers could no longer be found through it.
        spawned = self._spawned_processes(driver)

        try:
            driver.quit()
        except Exception as e:
            print(f"Error quitting WebDriver: {e}")

        if not spawned:
            return

        # Give the processes a moment to exit on their own, then force-kill
        # whatever is still alive.
        _, alive = psutil.wait_procs(spawned, timeout=3)
        for proc in alive:
            try:
                proc.kill()
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue

    def _human_delay(self, min_t=0.5, max_t=1.5):
        """Sleep for a random duration between ``min_t`` and ``max_t`` seconds."""
        time.sleep(random.uniform(min_t, max_t))

    def _human_scroll(self):
        """Scroll the page down by a random 200-600 px, then pause briefly."""
        scroll_height = random.randint(200, 600)
        self.driver.execute_script(f"window.scrollBy(0, {scroll_height});")
        self._human_delay(0.5, 1)

    def _human_mouse_move(self):
        """Move the pointer to a random offset relative to <body>, then pause."""
        actions = ActionChains(self.driver)
        body = self.driver.find_element(By.TAG_NAME, "body")
        actions.move_to_element_with_offset(body, random.randint(0, 500), random.randint(0, 300)).perform()
        self._human_delay(0.3, 0.7)

    def search_duckduckgo(self, query, retries=2):
        """Return the URL of the first organic DuckDuckGo result for ``query``.

        Args:
            query: Text typed into the search box, one character at a time.
            retries: Extra attempts after the first one (``retries + 1`` in
                total). An attempt is repeated both when it raises and when it
                yields no organic result.

        Returns:
            The ``href`` of the first organic result, or ``None`` when every
            attempt failed or came back empty.
        """
        attempts = retries + 1
        for attempt in range(attempts):
            try:
                self.driver.get("https://duckduckgo.com/?kl=fr-fr")
                self._human_delay(1, 2)
                box = self.driver.find_element(By.NAME, "q")
                box.clear()
                for char in query:
                    box.send_keys(char)
                    self._human_delay(0.05, 0.2)
                box.send_keys(Keys.RETURN)
                self._human_delay(2, 3)
                self._human_scroll()
                if random.random() > 0.5:
                    self._human_mouse_move()
                results = self.driver.find_elements(By.CSS_SELECTOR, "li[data-layout='organic'] a[data-testid='result-title-a']")
                if results:
                    return results[0].get_attribute("href")
            except Exception as e:
                print(f"⚠️ Search failed for '{query}' attempt {attempt+1}: {e}")
                # Shut the failed browser down instead of merely replacing the
                # reference (which leaked one Chrome per retry); the next
                # attempt launches a fresh one through the ``driver`` property.
                self.clean_selenium()
                if attempt + 1 < attempts:
                    self._human_delay(2, 4)
        return None

    @staticmethod
    def _checkpoint_path(output_file):
        """Return the temporary sibling path used for atomic checkpoint writes.

        The original extension is kept as the final suffix
        (``out.xlsx`` -> ``out.tmp.xlsx``) because pandas selects the Excel
        writer engine from the file extension.
        """
        root, ext = os.path.splitext(output_file)
        return f"{root}.tmp{ext}"

    def run(self, input_file, output_file, max_workers=3):
        """Process every pending row of ``input_file`` and checkpoint results.

        Rows whose index label already appears in the ``"index"`` column of an
        existing ``output_file`` are skipped, so an interrupted run can be
        resumed. After each finished row the full result list is written to a
        temporary file and moved over ``output_file``.

        Args:
            input_file: Excel file with one row per entity to look up.
            output_file: Excel file receiving the accumulated results.
            max_workers: Number of worker threads, i.e. concurrent browsers.
        """
        df = pd.read_excel(input_file)
        nb_rows = len(df)

        if os.path.exists(output_file):
            out_df = pd.read_excel(output_file)
            done_idx = set(out_df["index"])
            results = out_df.to_dict("records")
            print(f"▶️ Resuming from {len(done_idx)}/{nb_rows} already completed.")
        else:
            results, done_idx = [], set()

        pending = df[~df.index.isin(done_idx)]
        tmp_file = self._checkpoint_path(output_file)

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {executor.submit(thread_worker, row): row for _, row in pending.iterrows()}

            for future in as_completed(futures):
                row = futures[future]
                try:
                    result = future.result()
                    results.append(result)
                    # Write-then-replace so a crash mid-write cannot corrupt
                    # the checkpoint that a later run resumes from.
                    pd.DataFrame(results).to_excel(tmp_file, index=False)
                    os.replace(tmp_file, output_file)
                    print(f"💾 Saved {len(results)}/{nb_rows} (just finished {row['name']})")
                    time.sleep(random.uniform(1, 3))  # small cooldown
                except Exception as e:
                    # Best effort: a failed row is reported and left out of the
                    # checkpoint, so the next run picks it up again.
                    print(f"⚠️ Error processing {row['name']}: {e}")

        print(f"✅ All results saved to {output_file}")


In [None]:
# Orchestrating instance: run() fans the rows out to worker threads and
# checkpoints the results to the output workbook.
fetchWebsites = FetchWebsites()

try:
    fetchWebsites.run("first_step_name_and_description_url.xlsx", "websites_schools.xlsx")
finally:
    # Release whatever browser this instance holds, even if run() raised.
    fetchWebsites.clean_selenium()

In [None]:
# def get_emails(self, text):
#     """Extracts and filters valid emails from the given text."""
#     email_regex = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
#     possible_emails = set(re.findall(email_regex, text))
#     valid_tlds = {
#         "com",
#         "org",
#         "net",
#         "edu",
#         "gov",
#         "eu",
#         "br",
#         "fr",
#         "de",
#         "es",
#         "pl",
#         "it",
#         "uk",
#         "ru",
#         "in",
#         "ch",
#     }
#     invalid_extensions = {"jpg", "jpeg", "png", "gif", "webp", "bmp", "svg", "tiff"}

#     valid_emails = set()

#     for email in possible_emails:
#         domain_parts = email.split(".")
#         tld = domain_parts[-1].lower()

#         if tld not in invalid_extensions:
#             valid_emails.add(email)

#     return valid_emails

# def extract_visible_text(html):
#     soup = BeautifulSoup(html, "html.parser")
#     for script in soup(["script", "style"]):
#         script.decompose()
#     return soup.get_text(separator=" ")

# def crawl_site_depth(base_url, max_depth=1):
#     """
#     Crawls a site using Selenium to extract emails, subpages, and external links.
#     Limits crawling to `max_depth` hierarchical levels.
#     """
#     visited_subpages = set()
#     queue = deque([(base_url, 0)])
#     failed_urls = set()

#     while queue:
#         url, depth = queue.popleft()
#         normalized_url = url.split("#")[0]
        
#         print(normalized_url)

#         if normalized_url in visited_subpages or depth > max_depth:
#             continue

#         try:
#             driver.set_page_load_timeout(10)
#             driver.get(normalized_url)
            
#             WebDriverWait(driver, 10).until(
#                 EC.presence_of_element_located((By.TAG_NAME, "body"))
#             )
            

#             driver.execute_script(
#                 "window.scrollTo(0, document.body.scrollHeight);"
#             )
        
#             time.sleep(2)
                            
#             html = driver.page_source
#             visited_subpages.add(normalized_url)
#             soup = BeautifulSoup(html, "html.parser")
                            
#             visible_text = extract_visible_text(html)                
#             new_emails = get_emails(visible_text)
                            
#             if new_emails:
#                 print(f"📧 Emails found: {new_emails}")
#                 emails.update(new_emails)
            
            
#             VIDEO_KEYWORDS = ["youtube", "vimeo", "dailymotion", "wistia", "player.", "video"]
            
#             for iframe in soup.find_all("iframe", src=True): #job boards in iframe
#                 iframe_src = iframe["src"].split("#")[0]
#                 iframe_url = urljoin(normalized_url, iframe_src)
#                 parsed_iframe_url = urlparse(iframe_url)

#                 if any(keyword in iframe_url.lower() for keyword in VIDEO_KEYWORDS):
#                     continue
                
#                 if iframe_url in visited_subpages:
#                     continue
                
#                 if any(iframe_url.endswith(ext) for ext in [".js", ".css", ".jpg", ".jpeg", ".png", ".pdf"]):
#                     continue
                
#                 if iframe_url.startswith("mailto:") or "javascript:void" in iframe_url:
#                     continue
                
#                 if parsed_iframe_url.netloc == urlparse(base_url).netloc:
#                     if depth + 1 <= max_depth:
#                         queue.append((iframe_url, depth + 1))
#                 else:
#                     external_urls.add(iframe_url)
    
#             for link in soup.find_all("a", href=True):
#                 # absolute_link = urljoin(base_url, link["href"])
#                 href = link["href"].split("#")[0]
#                 absolute_link = urljoin(base_url, href)
#                 parsed_url = urlparse(absolute_link)

#                 if absolute_link in visited_subpages:
#                     continue
                
#                 if any(absolute_link.endswith(ext) for ext in [".js", ".css", ".jpg", ".jpeg", ".png", ".pdf"]):
#                     continue
#                 if absolute_link.startswith("mailto:") or "javascript:void" in absolute_link:
#                     continue

#                 if parsed_url.netloc == urlparse(base_url).netloc:
#                     if depth + 1 <= max_depth:
#                         queue.append((absolute_link, depth + 1))
#                 else:
#                     external_urls.add(absolute_link)

#         except (TimeoutException, WebDriverException) as e:
#             print(f"⚠️ WebDriver error on {normalized_url}: {e}")
            
#             if normalized_url not in failed_urls:
#                 failed_urls.add(normalized_url)
#                 print("🔁 Restarting browser and retrying once...")
#                 restart_browser()
#                 queue.append((url, depth))  # Requeue the same URL just once
#             else:
#                 print("❌ Already retried once. Skipping permanently.")
#             continue

#         except Exception as e:
#             print(f"⚠️ Unexpected error loading {url}: {e}")
#             continue

#     return list(visited_subpages)
