In [6]:
import nest_asyncio
from flask import Flask, request, render_template, render_template_string, send_file
import pandas as pd
import os
from duckduckgo_search import DDGS
import requests
from bs4 import BeautifulSoup
from io import BytesIO
import threading
import re
import json
from together import Together
import openai  
from groq import Groq
from openai import OpenAI
import tiktoken
from urllib.parse import urlparse, urljoin
from itertools import chain
import concurrent.futures
from functools import partial
from time import sleep
import time
import unicodedata
import logging
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [7]:
api_key = os.getenv("api_key")
TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY")
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY") 
SERPER_API_KEY = os.getenv("SERPER_API_KEY") 

# 1- AVAILABLE LLM WITH FALLBACK LOGIC

In [10]:
def call_ai_model(prompt):  
    # Try OpenRouter
    model = ""
    try:
        client = OpenAI(
          base_url="https://openrouter.ai/api/v1",
          api_key= OPENROUTER_API_KEY,
        ) 
        response = client.chat.completions.create(
            model="mistralai/mistral-small-3.2-24b-instruct:free",
            messages=[{"role": "user", "content": [ {"type": "text", "text": prompt}]}])
        # print("✅ Used OpenRouter")
        return response.choices[0].message.content

    except Exception as e:
        # print(f"⚠️ OpenRouter AI failed")
        print ("")

    # Try Groq
    try:
        groq_client = Groq(api_key=GROQ_API_KEY)
        response = groq_client.chat.completions.create(
            model="compound-beta-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=1,
            max_completion_tokens=3000,
            top_p=1,
            stream=False,
            stop=None
        )
        # print("✅ Used Groq")
        return response.choices[0].message.content

    except Exception as e:
        # print(f"⚠️ Groq failed: {e}")
        print ("")

    # Try Together AI
    try:
        os.environ["TOGETHER_API_KEY"] = TOGETHER_API_KEY
        together_client = Together()
        response = together_client.chat.completions.create(
            model="deepseek-ai/DeepSeek-V3",
            messages=[{"role": "user", "content": prompt}]
        )
        # print("✅ Used Together AI")
        return response.choices[0].message.content

    except Exception as e:
        print(f"❌ All models failed: {e}")
        return "Sorry, no AI model is available right now."

# 2- LINKS OF THE SUPPLIERS WITH SERPER API

# 2-1 Serper Search function

In [13]:
def perform_search(url, headers, q, max_results):
    # JSON payload with the search query q
    payload = {"q": q}
    try:
        # API call to url serper with json query
        response = requests.post(url, headers=headers, json=payload)
        response.raise_for_status()
        data = response.json()
        # Extract the list of organic search results (ignores ads or sponsored content) and limits them to max_results.
        organic_results = data.get("organic", [])[:max_results]

        # Liste des domaines / extensions à exclure
        excluded_domains = ["facebook.com", "instagram.com", "linkedin.com", "youtube.com", "twitter.com"]
        excluded_extensions = [".pdf", ".doc", ".xls", ".xlsx", ".ppt", ".pptx"]

        filtered_results = []
        for item in organic_results:
            link = item.get("link", "").lower()

            # Exclure si extension interdite
            if any(link.endswith(ext) for ext in excluded_extensions):
                continue

            # Exclure si domaine interdit (on regarde si l'URL contient le domaine)
            if any(domain in link for domain in excluded_domains):
                continue

            filtered_results.append({"title": item.get("title", ""), "href": item.get("link", "")})

            # Stopper dès qu'on atteint max_results après filtrage
            if len(filtered_results) >= max_results:
                break

        return filtered_results

    except Exception as e:
        print("Serper API call failed:", e)
        return []


# 2-2 Test scrapability

In [17]:
def is_scrapable(url, min_text_length=100, min_tags=5):
    """Détermine si une page nécessite JS (Selenium) ou peut être scrapée par requests.

    Retourne :
    - "requests" si scrapable sans JS,
    - "selenium" si JS nécessaire,
    - "impossible" en cas d’erreur ou d’échec.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.199 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }

    def fetch(url, timeout):
        return requests.get(url, timeout=timeout, headers=headers)

    try:
        # Premier essai timeout court
        response = fetch(url, timeout=5)

        if response.status_code != 200:
            logger.warning(f"❌ {url} returned status {response.status_code}")
            return "impossible"

    except requests.exceptions.Timeout:
        logger.warning(f"⏳ Timeout 5s sur {url}, nouvelle tentative avec timeout 10s...")
        try:
            response = fetch(url, timeout=10)
            if response.status_code != 200:
                logger.warning(f"❌ {url} returned status {response.status_code} après retry")
                return "impossible"
        except Exception as e:
            logger.error(f"⚠️ Erreur d'accès à {url} après retry 10s: {e}")
            return "impossible"

    except Exception as e:
        logger.error(f"⚠️ Erreur d'accès à {url}: {e}")
        return "impossible"

    # Analyse du contenu
    soup = BeautifulSoup(response.text, "html.parser")
    visible_text = soup.get_text(strip=True)
    tag_count = len(soup.find_all(["div", "p", "span", "section"]))

    if len(visible_text) < min_text_length or tag_count < min_tags:
        logger.info(f"⚠️ {url} semble nécessiter JavaScript → Selenium recommandé.")
        return "selenium"

    logger.info(f"✅ {url} est scrapable sans JavaScript.")
    return "requests"


# 2-3 Main Search

In [20]:
def search_with_serper(input_ma, input_com, max_domain): 
    url = "https://google.serper.dev/search"
    headers = {
        "X-API-KEY": SERPER_API_KEY,
        "Content-Type": "application/json"
    }

    preferred_sites = ["alstena.com", "aval.ma", "abratex.ma", "sdfi.ma", "ogicom.ma",
                       "polindus.ma", "ft2e.ma", "cfimmaroc.com", "mtsindustrie.ma"]

    site_query = " OR ".join([f"site:{site}" for site in preferred_sites])
    prioritized_query = f"{input_ma} ({site_query})"

    # query_fr = f"{input_com} site:.fr"
    query_ma = f"{input_com} site:.ma"
    
    seen_sites = set()
    filtered_results = []
    deduplicated_results = []

    results_pr = perform_search(url, headers, prioritized_query, max_domain)
    results_ma = perform_search(url, headers, query_ma, max_domain)
    # results_fr = perform_search(url, headers, query_fr, max_domain)
    max_domain = max_domain*2
    results_others = perform_search(url, headers, input_com, max_domain)
    all_results = results_pr + results_ma + results_others

    print("✅ All URLs Serper Search Results :", len (all_results))
    for u in all_results:
        print(u["href"])
    
# Checking scrapability
    print("🔍 Checking scrapability for search results...")
    for result in all_results:
        href = result.get("href")
        mode = is_scrapable(href)
        if mode == "requests":
            filtered_results.append(result)
        elif mode == "selenium":
            # html, text = scrape_with_selenium(url)
            print(f"YOU HAVE TO USE SELENIUM")
        else:
            print("URL not accessible")
# Avoid redundance
    for resulta in filtered_results:
        href = resulta.get("href")
        parsed = urlparse(href)
        base_url = f"{parsed.scheme}://{parsed.netloc}"
        if base_url not in seen_sites:
            seen_sites.add(base_url)
            deduplicated_results.append(resulta)

    print("✅ Filtered URLs with total :", len (filtered_results))
    print("✅ Filtered and deduplicated URLs ready for scraping with total :", len (deduplicated_results))
    for r in deduplicated_results:
        print(r["href"])

    return deduplicated_results

# 3-EXTRACT INFO FROM HTML

# 3-1 Kerix.net Scraping

In [24]:
def normalize_query(search_query):
    # Convert to lowercase
    query = search_query.lower()

    # Remove common French prepositions/articles
    query = re.sub(r"\b(de|du|des|la|le|l'|d'|au|aux|en|et)\b", "", query)

    # Normalize accents: ç → c, é → e, etc.
    query = unicodedata.normalize('NFKD', query).encode('ascii', 'ignore').decode('utf-8')

    # Replace remaining non-alphanum with dashes
    query = re.sub(r"[’'\s]+", "-", query)

    # Remove multiple consecutive dashes
    query = re.sub(r"-+", "-", query)

    # Strip leading/trailing dashes
    return query.strip("-")


In [26]:
def extract_kerix_phone(soup):
    """Extract Moroccan phone numbers from page"""
    phone_text = soup.get_text()
    phones = re.findall(r"(?:\+212|0)[\s-]?[5-7]\d[\s-]?\d{3}[\s-]?\d{3}", phone_text)
    return phones[0] if phones else None

def extract_kerix_email(soup):
    """Extract email from contact section"""
    for link in soup.select('a[href^="mailto:"]'):
        if "@" in link.get_text():
            return link.get_text().strip()
    return None

def extract_kerix_address(soup):
    """Extract address from Kerix.net profile pages"""
    # Method 1: Check the specific address container
    if address_div := soup.select_one('div.col-lg-6 p.card-text'):
        # Clean up the address text
        address = ' '.join(address_div.get_text(strip=True, separator=' ').split())
        return address
    
    # Method 2: Fallback to general card text search
    for p in soup.select('p.card-text'):
        text = p.get_text(strip=True)
        if any(x in text.lower() for x in ["bd", "rue", "av.", "casablanca", "maroc"]):
            return ' '.join(text.split())  # Normalize whitespace

    return None

In [28]:
def get_kerix_contacts(search_query, max_links, delay=3):

    base_url = "https://www.kerix.net"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept-Language": "fr-FR,fr;q=0.9",
    }

    # Step 1: Search page
    # formatted_query = re.sub(r"[’'\s]", "-", search_query.lower())
    formatted_query = normalize_query(search_query)
    search_url = f"{base_url}/fr/annuaire-entreprise/{formatted_query}.html"
    
    try:
        time.sleep(delay)
        response = requests.get(search_url, headers=headers, timeout=10)
        response.raise_for_status()
    except Exception as e:
        print(f"❌ Kerix Search failed: {e}")
        return []

    # Step 2: Extract company links using PRECISE selectors
    soup = BeautifulSoup(response.text, 'html.parser')
    company_links = []
    
    # NEW: Correct selector based on actual HTML
    for card in soup.select('div.card.mt-2'):  # Matches the company cards exactly
        # Method 1: Get link from title
        if title := card.select_one('h5.card-title a[href*="/fr/annuaire-entreprise/"]'):
            company_links.append(urljoin(base_url, title['href']))
        
        # Method 2: Alternative fallback (the "Voir plus" button)
        elif voir_plus := card.select_one('a.btn-success[href*="/fr/annuaire-entreprise/"]'):
            company_links.append(urljoin(base_url, voir_plus['href']))
        if len(company_links) >= max_links:
            break

    # Step 3: Scrape company pages with contact extraction
    results = []
    for url in company_links:
        try:
            time.sleep(delay)
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            company_soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract key information
            company_info = {
                "part_number": "Not identified",
                "name": company_soup.select_one('h1').get_text(strip=True) if company_soup.select_one('h1') else None,
                "email": extract_kerix_email(company_soup),
                "phone": extract_kerix_phone(company_soup),              
                "address": extract_kerix_address(company_soup),
                "price": "Demander un devis",
                "url": url,
            }
            results.append(company_info)
            
        except Exception as e:
            print(f"⚠️ Failed to process {url}: {e}")
            continue

    # Final Output
    if results:
        print("\n📋 Résultats extraits depuis Kerix.net :")
        for res in results:
            print("-" * 80)
            for key, value in res.items():
                print(f"{key.capitalize():<10}: {value}")
    else:
        print("❗ Aucun résultat trouvé sur Kerix.net.")
        
    return results

# 3-2 Using LLM

In [31]:
def extract_supplier_info_from_html(text, url):
    prompt = f"""
You are a structured information extraction agent. Analyze the following extracted text from an industrial materials or tools web page ({url}) and extract the supplier and product information in **strict JSON format**.

Extract and return these exact fields:
{{
  "part_number": "Product reference such as SKU, NSN, ref, model number, etc.",
  "name": "Company or supplier name. If the name is not found, extract the domain name from the URL (e.g., 'example.com') and use it.",
  "email": "Professional business email address",
  "phone": "Phone or WhatsApp number with area/country code if available",
  "address": "Physical or business address of the supplier",
  "price": "Price of the product (with currency symbol or code if mentioned)",
  "url": "Use the provided URL exactly as given"
}}

⚠️ Extraction Guidelines:
- Only include fields **explicitly found** in the text — do **not guess** or infer missing data.
- If a field is **not present**, return it as an empty string "".
- Do not include explanations, comments, or any extra text.
- Focus only on supplier contact details and product identifiers/prices.
- Ignore navigation menus, disclaimers, ads, or general company descriptions.

Your response must be a **valid JSON object**. No markdown, no code blocks, and no human language output.

Here is the extracted text:
{text[:5000]}
"""
    content = call_ai_model(prompt)   
    try:
        json_str = re.search(r"\{.*\}", content, re.DOTALL).group()
        resultat= json.loads(json_str)
        logger.debug(f"✅ voici le résultat de passage au LLM de l'Url: {url}")
        print (resultat)
        return resultat
        
    except Exception as e:
        print(f"⚠️ Failed to parse response for URL: {url}\nRaw content:\n{content}\nError: {e}")
        return {
            "part_number": "",
            "name": "",
            "email": "",
            "phone": "",
            "address": "",
            "price": "",
            "url": url
        }

# 4-INFO FROM QUERY USING LLM

In [34]:
def extract_vehicle_and_part(user_query):
    prompt = f"""
You are a multilingual smart assistant.

The user is looking for an industrial material. Based on this request:
"{user_query}"

Return the result as a JSON object with the following fields:

- "category": General category like "tools", "equipment", "fasteners", "hydraulic parts", etc.
- "item": Specific English name of the material (e.g., "torque wrench", "abrasive paper")
- "item_french": French translation of "item" (e.g., "clé dynamométrique", "papier abrasif")
- "context": Optional detail like usage, target machine, or size.

Return ONLY valid JSON, like this:
{{
  "category": "Tools",
  "item": "Abrasive paper",
  "item_french": "Papier abrasif",
  "context": "for automotive bodywork sanding"
}}
If a field is not mentioned, return it as an empty string "".
"""

    content = call_ai_model(prompt)

    # Clean code block formatting if present
    content_clean = re.sub(r"```json|```", "", content)

    try:
        json_match = re.search(r"\{.*\}", content_clean, re.DOTALL)
        if json_match:
            parsed = json.loads(json_match.group())
            item = parsed.get("item", "")
            item_french = parsed.get("item_french", "")
            category = parsed.get("category", "")
            context = parsed.get("context", "")
            # print(f"item: {item} | item_french: {item_french} | category: {category} | context: {context}")
        else:
            raise ValueError("No valid JSON found in response")

    except Exception as e:
        print("⚠️ JSON parsing failed. Raw content:", content)
        item = item_french = category = context = ""

    return item, item_french, category, context


# 5-TEXT FROM HTML USING DOM TREE WITH BEAUTIFULSOUP

In [37]:
def extract_relevantText_from_HTML (sub_url):

    sub_page = fetch_protected_page(sub_url)
    soup = BeautifulSoup(sub_page.text, "html.parser")
    
    
# ➤ DOM-Aware content extraction from main page
    supplier_sections = []
    price_sections = []
    reference_sections = []

    # Define contact-related keywords
    contact_keywords = ["contact", "info", "email", "mail", "phone", "tel", "adresse", "address", "whatsapp"]
    # Loop through candidate tags
    for tag in ["footer", "address", "header", "section", "div"]:
        for section in soup.find_all(tag):
            classes = " ".join(section.get("class", [])).lower()
            id_attr = section.get("id", "").lower()
            
            # Only keep blocks where id/class contains contact-related info
            if any(keyword in classes or keyword in id_attr for keyword in contact_keywords):
                contact_section = section.get_text(separator=" ", strip=True)
                supplier_sections.append(contact_section)  
    
    # Search for price candidates
    for tag in soup.find_all(["div", "span", "p", "strong"]):
        class_name = " ".join(tag.get("class", []))
        id_name = tag.get("id", "")
    
        if any(keyword in class_name.lower() or keyword in id_name.lower() for keyword in ["price", "cost", "amount", "tarif", "prix"]):
            price_text = tag.get_text(strip=True)
            if any(c in price_text for c in "$€£DH") or any(char.isdigit() for char in price_text):        
                price_sections.append(price_text)
    
    # Search for part number candidates
    for tag in soup.find_all(["div", "span", "p", "li", "td", "th"]):
        text = tag.get_text(strip=True).lower()
        if any(kw in text for kw in ["part number", "référence", "sku", "ref", "nsn"]):
            reference = tag.get_text(strip=True)
            reference_sections.append(reference)
    
    all_sections = list(dict.fromkeys(chain(supplier_sections, price_sections, reference_sections)))

    text = "\n".join(all_sections)
    # print (text)

    return soup, text

# 6-FETCHING

In [40]:
def build_search_query(item, category, context):

    item_part = f"{item} suppliers" if item else "industrial material suppliers"
    category_part = f"in the {category} category" if category else ""
    context_part = f"for {context}" if context else "for industrial applications"

    input_ma = item
    # Assemble the full query
    input_com = f"{item_part}"
    if category_part:
        input_com += f" {category_part}"
    input_com += f" {context_part}"
    
    print(input_com)
    return input_ma, input_com


# 6.1 Smart Direct Request

In [43]:
def smart_request(url, headers=None, max_retries=5):
    delay = 1
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response
            else:
                print(f"[{response.status_code}] Non-200 response. Retry...")
        except requests.RequestException as e:
            print(f"❌ Error: {e}. Retry in {delay}s...")

        time.sleep(delay + random.uniform(0, 1))
        delay *= 2  # Backoff exponentiel

    print("❌ Échec après plusieurs tentatives.")
    return None


# 6.2 Use of Selenium

In [46]:
def scrape_with_selenium(url):
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import time

    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)
        time.sleep(5)  # wait for JS to load; adjust if needed

        html = driver.page_source
        driver.quit()

        # Create a fake Response object
        response = Response()
        response.status_code = 200
        response._content = html.encode("utf-8")
        response.url = url

        return response

    except Exception as e:
        driver.quit()
        raise e


# 6.3 Unified fetching and Use of ScraperAPI

In [49]:
def fetch_protected_page(url, use_selenium=True):
    """
    Try fetching a protected page:
      1. Smart requests
      2. ScraperAPI
      3. Selenium (optional)
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/115.0.0.0 Safari/537.36"
    }

    # 1️⃣ Try direct smart request
    response = smart_request(url, headers)
    if response is not None:
        return response


    # 2️⃣ Try ScraperAPI
    logger.info("⏱️ Passage à ScraperAPI...")
    payload = {"api_key": api_key, "url": url}
    scraper_url = "http://api.scraperapi.com"

    try:
        resp = requests.get(scraper_url, params=payload, timeout=15)
        if resp.status_code == 200:
            return resp
    except Exception as e:
        logger.error(f"ScraperAPI a échoué: {e}")

    # 3️⃣ Selenium fallback (optional)
    if use_selenium:
        logger.info("⚡ Passage à Selenium...")
        resp = scrape_with_selenium(url)
        if resp and resp.status_code == 200:
            return resp

    # If all fails
    logger.error(f"❌ Impossible de récupérer {url}")
    return "", ""

# 7- GET THE LINKS AND SUBLINKS

In [52]:
def get_contact_links(soup, base_url):
    # contact_links = {base_url}  # Avoid duplicates
    contact_links = set()  # Avoid duplicates
     
    for link in soup.find_all("a", href=True):
        href = link["href"].lower()
        if any(kw in href for kw in ["contact", "about", "a-propos", "entreprise"]):
            full_url = requests.compat.urljoin(base_url, href)
            contact_links.add(full_url)
    
    contact_links_list = list(contact_links)
    logger.debug(f"✅ voici le principal Url: {base_url}")
    for link in contact_links_list[:4]:
        print(f"⚠️ voici Ses subUrl à analyser: {link}")
        
    return contact_links_list

# 8-PARALLEL SCRAPING

In [55]:
def scrape_supplier(url, max_subpages=4):
    """Scrape a supplier's main URL + subpages, aggregate text, and extract info once."""

    # Extract main page
    soup, main_text = extract_relevantText_from_HTML(url)

    # Find subpage links
    contact_links = get_contact_links(soup, url)[:max_subpages]

    # Extract texts from subpages (in parallel)
    sub_texts = scrape_subpages_texts(contact_links)

    # Combine all texts
    all_texts = [main_text] + sub_texts
    grouped_text = "\n".join(all_texts)

    # Call AI **once** with grouped text
    supplier_info = extract_supplier_info_from_html(grouped_text, url)
    supplier_info["url"] = url  # Always include base URL

    return supplier_info


def scrape_subpages_texts(urls, delay=0.5, timeout=10):
    """Return list of texts extracted from subpages (no AI calls here)."""
    texts = []
    if not urls:
        return texts

    with concurrent.futures.ThreadPoolExecutor(max_workers=len(urls)) as executor:
        futures = {}
        for url in urls:
            futures[executor.submit(extract_relevantText_from_HTML, url)] = url
            sleep(delay)

        for future in concurrent.futures.as_completed(futures):
            try:
                _, sub_text = future.result(timeout=timeout)
                if sub_text:
                    texts.append(sub_text)
            except Exception as e:
                print(f"⚠️ Subpage scrape failed ({futures[future]}): {e}")

    return texts


# 8- INDEX()

In [58]:
# Configuration du logger
logger = logging.getLogger("scraper_logger")
logger.setLevel(logging.DEBUG)  # Affiche tout, y compris les debug

# Supprimer les handlers existants (important en Jupyter)
if logger.hasHandlers():
    logger.handlers.clear()

# Handler pour afficher dans la console
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)

# Format des messages
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter)

# Ajout du handler au logger
logger.addHandler(console_handler)

In [60]:
app = Flask(__name__)

@app.route("/", methods=["GET", "POST"])
def index():
    global results
    results = []

    if request.method == "POST":
        user_query = request.form["query"]
        logger.info(f"🔍 Nouvelle requête reçue : {user_query}")
        
        try:
            item, item_french, category, context = extract_vehicle_and_part(user_query)
            logger.debug(f"Extraction réussie: item: {item} | item_french: {item_french} | category: {category} | context: {context}")   
        except Exception as e:
            logger.error(f"Erreur dans extract_vehicle_and_part: {e}")
            return render_template("error.html", error="Erreur d’analyse de la requête.")

        try:
            search_query_ma, search_query_com = build_search_query(item, category, context)
            search_results = search_with_serper(search_query_ma, search_query_com, 5)
            logger.info(f"🧠 Résultats Serper récupérés: {len(search_results)} liens")
        except Exception as e:
            logger.error(f"Erreur dans search_with_serper: {e}")
            return render_template("error.html", error="Erreur pendant la recherche Serper.")

        # Kerix.net scrapping
        try:
            contacts = get_kerix_contacts(item_french, max_links=5)
            logger.info(f"📞 Contacts Kerix récupérés: {len(contacts)}")
            results = contacts
        except Exception as e:
            logger.warning(f"Échec récupération contacts Kerix: {e}")

        # Scraping des autres fournisseurs
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            future_to_url = {
                executor.submit(scrape_supplier, url["href"]): url["href"]
                for url in search_results
            }

            for future in concurrent.futures.as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    result = future.result()
                    if result:
                        # logger.info(f"✅ Résultat trouvé pour {url}: {result}")
                        results.append(result)
                except Exception as e:
                    logger.error(f"❌ Thread échoué pour {url}: {e}")
                    results.append({
                        "url": url,
                        "error": str(e)
                    })

        # logger.info(f"🎯 Total résultats collectés: {len(results)}")
        return render_template("result.html", results=results, query=user_query)
    
    logger.debug("Page d’accueil chargée (GET)")
    return render_template("index1.html")


# 9- EXPORT RESULTS TO EXCEL

In [63]:
@app.route("/download")
def download():
    df = pd.DataFrame(results)
    df = df[["part_number", "name", "email", "phone", "address", "url"]] 
    output = BytesIO()
    df.to_excel(output, index=False)
    output.seek(0)
    return send_file(output, download_name="suppliers.xlsx", as_attachment=True)

# 10- RUNNING

In [66]:
if __name__ == "__main__":
    app.run(debug=True, port=5002, use_reloader=False)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5002
Press CTRL+C to quit
2025-09-02 19:24:10,050 - DEBUG - Page d’accueil chargée (GET)
127.0.0.1 - - [02/Sep/2025 19:24:12] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [02/Sep/2025 19:24:17] "GET /static/icon_intelligent.png HTTP/1.1" 200 -
127.0.0.1 - - [02/Sep/2025 19:24:17] "GET /static/icon_efficient.png HTTP/1.1" 200 -
127.0.0.1 - - [02/Sep/2025 19:24:17] "GET /static/icon_accurate.png HTTP/1.1" 200 -
127.0.0.1 - - [02/Sep/2025 19:24:17] "GET /static/logo5.png HTTP/1.1" 304 -
127.0.0.1 - - [02/Sep/2025 19:24:17] "GET /static/background9.png HTTP/1.1" 304 -
127.0.0.1 - - [02/Sep/2025 19:24:18] "GET /favicon.ico HTTP/1.1" 404 -
2025-09-02 19:24:33,787 - INFO - 🔍 Nouvelle requête reçue : pompe centrifuge
2025-09-02 19:26:28,206 - DEBUG - Extraction réussie: item: Centrifugal pump | item_french: Pompe centrifuge | category: Equipment | context: Used for transferring fluids by converting rotational kinetic energy into hydrodynamic energy


Centrifugal pump suppliers in the Equipment category for Used for transferring fluids by converting rotational kinetic energy into hydrodynamic energy
✅ All URLs Serper Search Results : 12
https://mtsindustrie.ma/nos-solutions/solutions-de-pompage/
https://toubkal.imist.ma/bitstream/handle/123456789/25587/264-20-BELBSIR%20HAMZA.pdf?sequence=1
https://www.rotechpumps.com/types-of-centrifugal-pump/
https://www.waterpump-cn.com/product/cpmcentrifugal/
https://home.pumpsystemsacademy.com/blog/centrifugal-pumps
http://www.hiseamarine.com/marine-centrifugal-pump/
https://lincolnsuppliers.com/sanitary-centrifugal-pump/types-centrifugal-pumps-uses/
https://www.dxpe.com/centrifugal-pumps-how-they-work-and-what-they-involve/
https://en.wikipedia.org/wiki/Centrifugal_pump
https://arroyoprocess.com/centrifugal-pumps/
https://winstonengineering.com/guide-industrial-pump-types/
https://www.trilliumflow.com/tf-news/vertical-pump-types-functions/
🔍 Checking scrapability for search results...


2025-09-02 19:26:36,242 - INFO - ✅ https://mtsindustrie.ma/nos-solutions/solutions-de-pompage/ est scrapable sans JavaScript.
2025-09-02 19:26:36,395 - ERROR - ⚠️ Erreur d'accès à https://toubkal.imist.ma/bitstream/handle/123456789/25587/264-20-BELBSIR%20HAMZA.pdf?sequence=1: HTTPSConnectionPool(host='toubkal.imist.ma', port=443): Max retries exceeded with url: /bitstream/handle/123456789/25587/264-20-BELBSIR%20HAMZA.pdf?sequence=1 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006)')))


URL not accessible


2025-09-02 19:26:38,096 - INFO - ✅ https://www.rotechpumps.com/types-of-centrifugal-pump/ est scrapable sans JavaScript.
2025-09-02 19:26:41,274 - INFO - ✅ https://www.waterpump-cn.com/product/cpmcentrifugal/ est scrapable sans JavaScript.
2025-09-02 19:26:42,673 - INFO - ✅ https://home.pumpsystemsacademy.com/blog/centrifugal-pumps est scrapable sans JavaScript.
2025-09-02 19:26:44,519 - INFO - ✅ http://www.hiseamarine.com/marine-centrifugal-pump/ est scrapable sans JavaScript.
2025-09-02 19:26:45,810 - INFO - ✅ https://lincolnsuppliers.com/sanitary-centrifugal-pump/types-centrifugal-pumps-uses/ est scrapable sans JavaScript.
2025-09-02 19:26:46,770 - INFO - ✅ https://www.dxpe.com/centrifugal-pumps-how-they-work-and-what-they-involve/ est scrapable sans JavaScript.
2025-09-02 19:26:47,255 - INFO - ✅ https://en.wikipedia.org/wiki/Centrifugal_pump est scrapable sans JavaScript.
2025-09-02 19:26:52,105 - INFO - ✅ https://arroyoprocess.com/centrifugal-pumps/ est scrapable sans JavaScript.


✅ Filtered URLs with total : 11
✅ Filtered and deduplicated URLs ready for scraping with total : 11
https://mtsindustrie.ma/nos-solutions/solutions-de-pompage/
https://www.rotechpumps.com/types-of-centrifugal-pump/
https://www.waterpump-cn.com/product/cpmcentrifugal/
https://home.pumpsystemsacademy.com/blog/centrifugal-pumps
http://www.hiseamarine.com/marine-centrifugal-pump/
https://lincolnsuppliers.com/sanitary-centrifugal-pump/types-centrifugal-pumps-uses/
https://www.dxpe.com/centrifugal-pumps-how-they-work-and-what-they-involve/
https://en.wikipedia.org/wiki/Centrifugal_pump
https://arroyoprocess.com/centrifugal-pumps/
https://winstonengineering.com/guide-industrial-pump-types/
https://www.trilliumflow.com/tf-news/vertical-pump-types-functions/


2025-09-02 19:26:58,722 - INFO - 📞 Contacts Kerix récupérés: 0


❗ Aucun résultat trouvé sur Kerix.net.


2025-09-02 19:27:17,853 - ERROR - ❌ Thread échoué pour https://www.rotechpumps.com/types-of-centrifugal-pump/: name 'random' is not defined
2025-09-02 19:27:17,926 - DEBUG - ✅ voici le principal Url: http://www.hiseamarine.com/marine-centrifugal-pump/


❌ Error: HTTPSConnectionPool(host='www.rotechpumps.com', port=443): Read timed out. (read timeout=10). Retry in 1s...
⚠️ voici Ses subUrl à analyser: http://www.hiseamarine.com/contacts.asp


2025-09-02 19:27:18,082 - DEBUG - ✅ voici le principal Url: https://mtsindustrie.ma/nos-solutions/solutions-de-pompage/
2025-09-02 19:27:18,256 - DEBUG - ✅ voici le principal Url: https://www.waterpump-cn.com/product/cpmcentrifugal/
2025-09-02 19:27:18,402 - DEBUG - ✅ voici le principal Url: https://home.pumpsystemsacademy.com/blog/centrifugal-pumps


⚠️ voici Ses subUrl à analyser: https://mtsindustrie.ma/contact/
⚠️ voici Ses subUrl à analyser: https://mtsindustrie.ma/a-propos-de-nous/
⚠️ voici Ses subUrl à analyser: https://www.waterpump-cn.com/contact/
⚠️ voici Ses subUrl à analyser: https://www.waterpump-cn.com/about-us/
⚠️ voici Ses subUrl à analyser: https://home.pumpsystemsacademy.com/about
⚠️ voici Ses subUrl à analyser: https://home.pumpsystemsacademy.com/contact


2025-09-02 19:27:19,258 - DEBUG - ✅ voici le principal Url: https://lincolnsuppliers.com/sanitary-centrifugal-pump/types-centrifugal-pumps-uses/


⚠️ voici Ses subUrl à analyser: https://lincolnsuppliers.com/contact/


2025-09-02 19:27:22,353 - DEBUG - ✅ voici le résultat de passage au LLM de l'Url: http://www.hiseamarine.com/marine-centrifugal-pump/


{'part_number': '', 'name': 'Hi-Sea Group', 'email': 'manager@hiseamarine.com', 'phone': '+86-23-67956606', 'address': '', 'price': '', 'url': 'http://www.hiseamarine.com/marine-centrifugal-pump/'}


2025-09-02 19:27:22,853 - DEBUG - ✅ voici le résultat de passage au LLM de l'Url: https://home.pumpsystemsacademy.com/blog/centrifugal-pumps


{'part_number': '', 'name': 'home.pumpsystemsacademy.com', 'email': '', 'phone': '', 'address': '', 'price': '', 'url': 'https://home.pumpsystemsacademy.com/blog/centrifugal-pumps'}


2025-09-02 19:27:23,152 - DEBUG - ✅ voici le résultat de passage au LLM de l'Url: https://mtsindustrie.ma/nos-solutions/solutions-de-pompage/
2025-09-02 19:27:23,273 - DEBUG - ✅ voici le principal Url: https://www.dxpe.com/centrifugal-pumps-how-they-work-and-what-they-involve/


{'part_number': '', 'name': 'mtsindustrie.ma', 'email': 'contact@mtsindustrie.ma', 'phone': '+212 0523 32 69 66', 'address': 'Apt 8 Etg 2 Imm D Les Orchidees Mohammedia, MA 28820', 'price': '', 'url': 'https://mtsindustrie.ma/nos-solutions/solutions-de-pompage/'}
⚠️ voici Ses subUrl à analyser: https://www.dxpe.com/about-us/corporate-sustainability/
⚠️ voici Ses subUrl à analyser: https://www.dxpe.com/about-us/
⚠️ voici Ses subUrl à analyser: https://www.dxpe.com/contact-us/
⚠️ voici Ses subUrl à analyser: https://www.dxpe.com/contact


2025-09-02 19:27:23,639 - DEBUG - ✅ voici le principal Url: https://en.wikipedia.org/wiki/Centrifugal_pump


⚠️ voici Ses subUrl à analyser: https://en.wikipedia.org/wiki/wikipedia:contact_us
⚠️ voici Ses subUrl à analyser: https://en.wikipedia.org/wiki/wikipedia:about


2025-09-02 19:27:23,869 - DEBUG - ✅ voici le principal Url: https://arroyoprocess.com/centrifugal-pumps/


⚠️ voici Ses subUrl à analyser: https://arroyoprocess.com/contact-us-s30/
⚠️ voici Ses subUrl à analyser: https://arroyoprocess.com/contact-us/
⚠️ voici Ses subUrl à analyser: https://arroyoprocess.com/contact-us
⚠️ voici Ses subUrl à analyser: https://arroyoprocess.com/about-us/


2025-09-02 19:27:24,719 - DEBUG - ✅ voici le résultat de passage au LLM de l'Url: https://lincolnsuppliers.com/sanitary-centrifugal-pump/types-centrifugal-pumps-uses/
2025-09-02 19:27:24,999 - DEBUG - ✅ voici le résultat de passage au LLM de l'Url: https://www.waterpump-cn.com/product/cpmcentrifugal/


{'part_number': '', 'name': 'lincolnsuppliers.com', 'email': '', 'phone': '800-622-8425', 'address': '', 'price': '', 'url': 'https://lincolnsuppliers.com/sanitary-centrifugal-pump/types-centrifugal-pumps-uses/'}
{'part_number': '', 'name': 'waterpump-cn.com', 'email': 'sales18@elestarco.com', 'phone': '+86 18060570295', 'address': 'OFFICE:1301-03,SOHO BUILDING 1#,TAIHOT PLAZA,XINDIAN TOWN,JINAN DISTRICT,FUZHOU,FUJIAN,CHINA.', 'price': '$26.00', 'url': 'https://www.waterpump-cn.com/product/cpmcentrifugal/'}


2025-09-02 19:27:25,599 - DEBUG - ✅ voici le principal Url: https://www.trilliumflow.com/tf-news/vertical-pump-types-functions/


⚠️ voici Ses subUrl à analyser: https://www.trilliumflow.com/contact-us/
⚠️ voici Ses subUrl à analyser: https://trilliumflow.com/contact-us/
⚠️ voici Ses subUrl à analyser: https://www.trilliumflow.com/about/flowcast/
⚠️ voici Ses subUrl à analyser: https://www.trilliumflow.com/about/


2025-09-02 19:27:27,040 - DEBUG - ✅ voici le principal Url: https://winstonengineering.com/guide-industrial-pump-types/


⚠️ voici Ses subUrl à analyser: https://winstonengineering.com/contact-us/
⚠️ voici Ses subUrl à analyser: https://contact/
⚠️ voici Ses subUrl à analyser: https://winstonengineering.com/about-us/
⚠️ voici Ses subUrl à analyser: https://winstonengineering.com/about/#core


2025-09-02 19:27:27,871 - DEBUG - ✅ voici le résultat de passage au LLM de l'Url: https://en.wikipedia.org/wiki/Centrifugal_pump


{'part_number': '', 'name': 'Wikipedia', 'email': '', 'phone': '', 'address': '', 'price': '', 'url': 'https://en.wikipedia.org/wiki/Centrifugal_pump'}
❌ Error: HTTPSConnectionPool(host='contact', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000021BE1AD4E50>: Failed to resolve 'contact' ([Errno 11001] getaddrinfo failed)")). Retry in 1s...
⚠️ Subpage scrape failed (https://contact/): name 'random' is not defined


2025-09-02 19:27:31,200 - DEBUG - ✅ voici le résultat de passage au LLM de l'Url: https://www.dxpe.com/centrifugal-pumps-how-they-work-and-what-they-involve/


{'part_number': '', 'name': 'DXP', 'email': '', 'phone': '18008303973', 'address': '', 'price': '', 'url': 'https://www.dxpe.com/centrifugal-pumps-how-they-work-and-what-they-involve/'}


2025-09-02 19:27:31,482 - DEBUG - ✅ voici le résultat de passage au LLM de l'Url: https://arroyoprocess.com/centrifugal-pumps/


{'part_number': '', 'name': 'Arroyo Process Equipment', 'email': '', 'phone': '+1 (863) 533-9700', 'address': '1550 Centennial Blvd. Bartow, FL 33830, 6635 Hwy Ave, Jacksonville, FL 32254, 2647 West 81st Street, Hialeah, FL 33016', 'price': '', 'url': 'https://arroyoprocess.com/centrifugal-pumps/'}


2025-09-02 19:27:32,997 - DEBUG - ✅ voici le résultat de passage au LLM de l'Url: https://www.trilliumflow.com/tf-news/vertical-pump-types-functions/


{'part_number': '', 'name': 'Trillium Flow Technologies', 'email': '', 'phone': '+1 559 442 4000', 'address': '2495 S. Golden State Boulevard Fresno, CA 93706 USA', 'price': '', 'url': 'https://www.trilliumflow.com/tf-news/vertical-pump-types-functions/'}


2025-09-02 19:27:34,433 - DEBUG - ✅ voici le résultat de passage au LLM de l'Url: https://winstonengineering.com/guide-industrial-pump-types/
127.0.0.1 - - [02/Sep/2025 19:27:34] "POST / HTTP/1.1" 200 -


{'part_number': '', 'name': 'winstonengineering.com', 'email': '', 'phone': '', 'address': '', 'price': '', 'url': 'https://winstonengineering.com/guide-industrial-pump-types/'}


127.0.0.1 - - [02/Sep/2025 19:27:34] "GET /static/logo5.png HTTP/1.1" 304 -
127.0.0.1 - - [02/Sep/2025 19:27:35] "GET /static/background9.png HTTP/1.1" 304 -
