# Scraping companies from Job Teaser

https://www.jobteaser.com/fr/companies

### Scraping Notes – *Job Teaser* (Companies)

* **Base URL:**
  `https://www.jobteaser.com/fr/companies?page=2`
  → To paginate, update the `page=` parameter.

* **Companies List:**

  * Container: `<ul class="PageContent_resultsItems__DPv4g">`
  * Each company entry: `<li class="PageContent_resultsItem__0ULaz">`

* **Company Name:**

  * Located inside:
    `<a class="CompanyCard_link__2iIGc">`
  * The URL of the company's detail page is the `href` of this `<a>` element.

* **Company Details Page:**

  * Company type is the text of `<p data-testid="company_header_business_type" />`
  * The website link is located in:
    `<a data-testid="company_header_website_link">`
   → extract the `href`.
  * The company category is located in:
    `<p data-testid="company_header_sector" />`
  


## Libraries

In [4]:
import time
import pandas as pd
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options


## Configure Chrome with a real user agent

In [5]:
# Command-line switches handed to Chrome: maximized window, the
# "AutomationControlled" Blink feature turned off, and a desktop Chrome
# user-agent string (presumably so the session looks like a regular browser).
chrome_flags = (
    "--start-maximized",
    "--disable-blink-features=AutomationControlled",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
)

options = Options()
for chrome_flag in chrome_flags:
    options.add_argument(chrome_flag)

# Launch the Chrome session shared by every cell below.
driver = webdriver.Chrome(options=options)

def human_sleep(a=2, b=5):
    """Pause for a random duration to simulate human behavior.

    The delay is drawn uniformly between *a* and *b* seconds.
    """
    pause_seconds = random.uniform(a, b)
    time.sleep(pause_seconds)
    
def accept_country_banner(driver):
    """Dismiss the banner on the current page if it is present.

    Looks for a ``<span>`` whose ``class`` attribute is exactly
    ``didomi-continue-without-agreeing`` and clicks it. The function is
    best-effort: a missing banner is silently ignored, and a failed click
    is reported on stdout without raising.

    Args:
        driver: Selenium WebDriver positioned on the page to check.
    """
    # find_elements (plural) returns an empty list when nothing matches, so
    # "banner not present" needs no exception handling at all.
    buttons = driver.find_elements(
        By.CSS_SELECTOR, 'span[class="didomi-continue-without-agreeing"]'
    )
    if not buttons:
        return

    try:
        buttons[0].click()
    except Exception as exc:
        # Stay best-effort, but make the failure visible. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt / SystemExit propagate.
        print(f"⚠️ Could not dismiss country banner: {exc}")
        return

    print("✅ Country banner dismissed")
    # Give the page a moment to settle after the banner closes.
    human_sleep(2, 3)


## Find the company list

In [6]:

# One dict per successfully scraped company, accumulated across all pages.
results = []

# Inclusive range of listing pages to visit.
FIRST_PAGE = 1
LAST_PAGE = 108


def _first_match(scope, css_selector):
    """Return the first element under *scope* matching *css_selector*, or None.

    Uses ``find_elements`` (plural), which yields an empty list when nothing
    matches, so a missing element does not have to be detected by catching
    an exception.
    """
    matches = scope.find_elements(By.CSS_SELECTOR, css_selector)
    return matches[0] if matches else None


def _text_of(scope, css_selector):
    """Return the stripped text of the first match, or None when absent."""
    element = _first_match(scope, css_selector)
    return element.text.strip() if element is not None else None


try:
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        print(f"\n--- Scraping page {page} ---")
        url = f"https://www.jobteaser.com/fr/companies?page={page}"
        driver.get(url)
        human_sleep(4, 7)
        accept_country_banner(driver)

        companies_list = _first_match(
            driver, 'ul.PageContent_resultsItems__DPv4g'
        )
        companies = (
            companies_list.find_elements(
                By.CSS_SELECTOR,
                'li.PageContent_resultsItem__0ULaz',
            )
            if companies_list is not None
            else []
        )
        if not companies:
            print(f"No companies found on page {page}")
            continue

        company_links = []

        # --- Stage 1: Collect all company names + their detail page URLs ---
        # Done before any navigation: once the driver leaves the listing
        # page, the <li> elements collected above can no longer be used.
        for idx, company in enumerate(companies):
            try:
                driver.execute_script("arguments[0].scrollIntoView(true);", company)
                human_sleep(1, 3)

                # The card's anchor carries both the name and the detail URL.
                name_a = company.find_element(
                    By.CSS_SELECTOR, "a.CompanyCard_link__2iIGc"
                )
                company_name = name_a.text.strip()
                company_url = name_a.get_attribute("href")

                company_links.append({"name": company_name, "detail_page": company_url})

            except Exception as e:
                print(f"Error collecting company {idx}: {e}")
                continue

        print(company_links)

        # --- Stage 2: Visit each detail page and extract the company details ---
        for company in company_links:
            try:
                driver.get(company["detail_page"])
                human_sleep(3, 6)

                # Each field is optional: a missing element yields None
                # instead of aborting the whole company.
                website_link = _first_match(
                    driver, 'a[data-testid="company_header_website_link"]'
                )
                website_url = (
                    website_link.get_attribute("href")
                    if website_link is not None
                    else None
                )
                category = _text_of(
                    driver, 'p[data-testid="company_header_sector"]'
                )
                business_type = _text_of(
                    driver, 'p[data-testid="company_header_business_type"]'
                )

                results.append(
                    {
                        "name": company["name"],
                        "detail_page": company["detail_page"],
                        "website": website_url,
                        "category": category,
                        "business_type": business_type,
                    }
                )

                print(results[-1])

            except Exception as e:
                print(f"Error visiting {company['detail_page']}: {e}")
                continue

        # Checkpoint after every page so an interrupted run keeps its progress.
        df = pd.DataFrame(results)
        df.to_excel("scraped_companies_job_teaser.xlsx", index=False)
        print(f"💾 Progress saved after page {page} — total {len(results)} companies")
finally:
    # Always close the browser, even when the run is interrupted or a page
    # load fails outside the per-company error handling.
    driver.quit()

print("Scraped companies:", results)


--- Scraping page 1 ---
✅ Country banner dismissed
[{'name': 'Alveus club', 'detail_page': 'https://www.jobteaser.com/fr/companies/alveus-club'}, {'name': 'XPO', 'detail_page': 'https://www.jobteaser.com/fr/companies/xpo-logistics-france'}, {'name': 'Ardian', 'detail_page': 'https://www.jobteaser.com/fr/companies/ardian'}, {'name': 'Boehringer Ingelheim', 'detail_page': 'https://www.jobteaser.com/fr/companies/boehringer-ingelheim-de'}, {'name': 'Systematic', 'detail_page': 'https://www.jobteaser.com/fr/companies/systematic'}, {'name': 'RATIER-FIGEAC COLLINS AEROSPACE', 'detail_page': 'https://www.jobteaser.com/fr/companies/collins-aerospace'}, {'name': 'Datadog', 'detail_page': 'https://www.jobteaser.com/fr/companies/datadog'}, {'name': 'Valeo', 'detail_page': 'https://www.jobteaser.com/fr/companies/valeo'}, {'name': 'AKOYA Consulting', 'detail_page': 'https://www.jobteaser.com/fr/companies/akoya'}, {'name': 'Intermarché & Netto', 'detail_page': 'https://www.jobteaser.com/fr/companies

In [7]:
# df = pd.DataFrame(results)

# df.to_excel("scraped_companies_job_teaser_stage_1.xlsx", index=False)

# print("Saved to scraped_companies.xlsx")