# Scraping companies from Welcome to the Jungle

https://www.welcometothejungle.com/fr/companies?page=1&query=&aroundQuery=France&refinementList%5Boffices.country_code%5D%5B%5D=FR

### Scraping Notes – *Welcome to the Jungle* (Companies)

* **Base URL:**
  `https://www.welcometothejungle.com/fr/companies?page=1&query=&aroundQuery=France&refinementList%5Boffices.country_code%5D%5B%5D=FR`
  → To paginate, update the `page=` parameter.

* **Companies List:**

  * Container: `<ul data-testid="companies-search-search-results">`
  * Each company entry: `<li data-testid="companies-search-search-results-list-item-wrapper">`

* **Company Name:**

  * Located inside:
    `<div class="sc-brzPDJ sc-eXZHrJ ewyPqM">`

* **Company Details Page:**

  * Clicking the `<a />` link inside the company card (`<div class="sc-cvHfgK fWNrn">`) opens the company’s details page.
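  * The website link is located in:
    `<p class="sc-izXThL kPYtCp sc-cVttbi dhUINZ wui-text">`
    → Contains an `<a>` tag → extract the `href`.
    (The scraping code below selects this link directly with `a[data-testid="showcase-header-website-link"]`.)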
  * The company category is located in:
    `<div data-testid="showcase-header-sector" />`
  


## Libraries

In [1]:
import time
import pandas as pd
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options


## Configure Chrome with a real user agent

In [2]:
# Chrome launch flags: open maximized, disable the Blink "AutomationControlled"
# feature, and send a desktop Chrome user-agent string.
options = Options()
for chrome_flag in (
    "--start-maximized",
    "--disable-blink-features=AutomationControlled",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
):
    options.add_argument(chrome_flag)

# Launch the Chrome session used by the scraping code.
driver = webdriver.Chrome(options=options)

def human_sleep(a=2, b=5):
    """Pause for a random duration drawn uniformly between ``a`` and ``b`` seconds.

    Used between browser actions so their timing looks less scripted.
    """
    pause_seconds = random.uniform(a, b)
    time.sleep(pause_seconds)
    
def accept_country_banner(driver):
    """Dismiss the country banner by clicking its 'Stay' button, if it appears.

    Args:
        driver: Selenium WebDriver whose current page may show the banner.

    Returns:
        None. This is best effort: a missing banner is ignored and a failed
        click is reported with a printed message; Selenium errors are not
        raised to the caller. Non-Selenium exceptions (for example
        ``KeyboardInterrupt``) are no longer swallowed.
    """
    # Imported locally so this function does not depend on an extra
    # top-level import being present.
    from selenium.common.exceptions import (
        NoSuchElementException,
        WebDriverException,
    )

    try:
        stay_button = driver.find_element(
            By.CSS_SELECTOR, 'button[data-testid="country-banner-stay-button"]'
        )
        stay_button.click()
    except NoSuchElementException:
        # Banner not present
        return
    except WebDriverException as exc:
        # The button was found but could not be clicked (or another Selenium
        # error occurred); keep going, but make the failure visible.
        print(f"⚠️ Could not dismiss country banner: {exc}")
        return

    print("✅ Country banner dismissed")
    human_sleep(2, 3)


## Find the company list

In [3]:
# Imported here so this cell stays self-contained.
from selenium.common.exceptions import WebDriverException

# One dict per company (name, detail_page, website, category), accumulated
# across all listing pages.
results = []

try:
    # NOTE(review): the 1..34 page range is hard-coded — presumably the number
    # of listing pages when this was written; confirm before re-running.
    for page in range(1, 35):
        print(f"\n--- Scraping page {page} ---")
        url = f"https://www.welcometothejungle.com/fr/companies?page={page}&query=&aroundQuery=France&refinementList%5Boffices.country_code%5D%5B%5D=FR"
        driver.get(url)
        human_sleep(4, 7)
        accept_country_banner(driver)

        # Locate the results list and its entries. Only Selenium errors are
        # treated as "nothing to scrape here"; anything else (for example
        # Ctrl-C) propagates so the run can actually be stopped.
        try:
            companies_list = driver.find_element(
                By.CSS_SELECTOR, 'ul[data-testid="companies-search-search-results"]'
            )
            companies = companies_list.find_elements(
                By.CSS_SELECTOR,
                'li[data-testid="companies-search-search-results-list-item-wrapper"]',
            )
        except WebDriverException:
            print(f"No companies found on page {page}")
            continue

        company_links = []

        # --- Stage 1: Collect all company names + their detail page URLs ---
        # Names and URLs are gathered as plain strings first, because Stage 2
        # navigates the browser away from this listing page.
        for idx, card in enumerate(companies):
            try:
                driver.execute_script("arguments[0].scrollIntoView(true);", card)
                human_sleep(1, 3)

                # Company name
                # NOTE(review): the `sc-…` class names look auto-generated and
                # may change between site builds — verify before re-running.
                name_div = card.find_element(
                    By.CSS_SELECTOR, "div.sc-brzPDJ.sc-eXZHrJ.ewyPqM"
                )
                company_name = name_div.text.strip()

                # Company detail page link
                link = card.find_element(By.CSS_SELECTOR, "div.sc-cvHfgK.fWNrn a")
                company_url = link.get_attribute("href")

                company_links.append(
                    {"name": company_name, "detail_page": company_url}
                )

            except Exception as e:
                # Skip this card only; the rest of the page is still processed.
                print(f"Error collecting company {idx}: {e}")

        # --- Stage 2: Visit each detail page and extract the website URL ---
        for entry in company_links:
            try:
                driver.get(entry["detail_page"])
                human_sleep(3, 6)

                # Extract external website link; None when it cannot be read.
                try:
                    website_block = driver.find_element(
                        By.CSS_SELECTOR,
                        'a[data-testid="showcase-header-website-link"]',
                    )
                    website_url = website_block.get_attribute("href")
                except WebDriverException:
                    website_url = None

                # Extract the sector/category text; None when it cannot be read.
                try:
                    sector_div = driver.find_element(
                        By.CSS_SELECTOR, 'div[data-testid="showcase-header-sector"] p'
                    )
                    category = sector_div.text.strip()
                except WebDriverException:
                    category = None

                results.append(
                    {
                        "name": entry["name"],
                        "detail_page": entry["detail_page"],
                        "website": website_url,
                        "category": category,
                    }
                )
                print(results[-1])

            except Exception as e:
                # Skip this company only; continue with the next detail page.
                print(f"Error visiting {entry['detail_page']}: {e}")

        # Checkpoint: rewrite the whole spreadsheet after every page so that
        # an interrupted run keeps everything scraped so far.
        df = pd.DataFrame(results)
        df.to_excel("scraped_companies.xlsx", index=False)
        print(f"💾 Progress saved after page {page} — total {len(results)} companies")
finally:
    # Always close the browser, even if the run is interrupted or fails.
    driver.quit()

print("Scraped companies:", results)


--- Scraping page 1 ---
✅ Country banner dismissed
{'name': 'Renaissance Studio', 'detail_page': 'https://www.welcometothejungle.com/fr/companies/renaissance-studio', 'website': 'https://www.rowads.app/', 'category': 'AdTech / MarTech, Intelligence artificielle / Machine Learning, Publicité'}
{'name': 'Foxglove', 'detail_page': 'https://www.welcometothejungle.com/fr/companies/foxglove', 'website': 'https://www.foxglove-partner.com/', 'category': 'Design, Digital, Digital Marketing / Data Marketing'}
{'name': 'Horizon', 'detail_page': 'https://www.welcometothejungle.com/fr/companies/orizon', 'website': None, 'category': 'Digital, Digital Marketing / Data Marketing, Publicité'}
{'name': 'Planon', 'detail_page': 'https://www.welcometothejungle.com/fr/companies/planon', 'website': 'https://planonsoftware.com/fr', 'category': 'IT / Digital, Logiciels, SaaS / Cloud Services'}
{'name': 'Eurowind Energy', 'detail_page': 'https://www.welcometothejungle.com/fr/companies/eurowindenergy', 'websit

In [4]:
# df = pd.DataFrame(results)

# df.to_excel("scraped_companies.xlsx", index=False)

# print("Saved to scraped_companies.xlsx")