# Offers Scraper

In [1]:
import random
import time
import json
import logging
from pathlib import Path
from datetime import datetime
from dotenv import dotenv_values  # type: ignore

import requests  # type: ignore
from requests.adapters import HTTPAdapter, Retry  # type: ignore
from requests_cache import CachedSession  # type: ignore
from requests.exceptions import RetryError, RequestException  # type: ignore

In [2]:
# Read the settings stored in the project's .env file, one directory above
# the notebook's working directory.
env = dotenv_values("../.env")
# Endpoint the scraper will query. `.get` is used so a missing key does not
# raise here; presumably the value is then None and JobOffersScraper rejects
# it as an empty URL — confirm against the dotenv return type.
OFFERS_URL = env.get("OFFERS_URL")

In [3]:
# Root logging setup for the notebook: INFO and above, each record rendered
# as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format=" - ".join(("%(asctime)s", "%(levelname)s", "%(message)s")),
    level=logging.INFO,
)
# Module-level logger shared by the scraper class and the driver cell.
logger = logging.getLogger(__name__)

In [4]:
class JobOffersScraper:
    """Download paginated JSON from a single endpoint and save every page to disk.

    Each page is requested with ``GET base_url?page=<n>`` through a session
    that retries on 429/500/502/503/504 responses, and is written to
    ``<save_dir>/offers_page_<n>.json``.
    """

    def __init__(
        self,
        base_url: str,
        cache_name="fetcher_cache",
        cache_expire=60,
        use_cache=False,
        min_delay=1,
        max_delay=10,
        retries=5,
        backoff_range=(1, 7),
        save_dir="../data/raw/json",
        timeout=30,
    ):
        """Configure the scraper and build its HTTP session.

        Args:
            base_url: Endpoint to query; must be a non-empty string.
            cache_name: Cache name handed to ``CachedSession`` when caching
                is enabled.
            cache_expire: ``expire_after`` value for the cache.
            use_cache: Use a ``CachedSession`` instead of a plain
                ``requests.Session``.
            min_delay: Lower bound, in seconds, of the pause between pages.
            max_delay: Upper bound, in seconds, of the pause between pages.
            retries: Total number of retries per request.
            backoff_range: Two bounds; the retry backoff factor is drawn
                uniformly between them once, when the session is created.
            save_dir: Directory where page files are written.
            timeout: Timeout in seconds passed to every request. ``None``
                disables it (the request may then block indefinitely).

        Raises:
            ValueError: If ``base_url`` is empty, or the delay range is
                negative or inverted.
        """
        if not base_url:
            raise ValueError("URL is required and cannot be empty")
        # Fail fast: an invalid delay range would otherwise only surface in
        # fetch_pages, after the first page has already been downloaded.
        if min_delay < 0 or max_delay < min_delay:
            raise ValueError(
                "Delays must satisfy 0 <= min_delay <= max_delay, "
                f"got min_delay={min_delay}, max_delay={max_delay}"
            )
        self.base_url = base_url
        self.min_delay = min_delay
        self.max_delay = max_delay
        self.retries = retries
        self.backoff_range = backoff_range
        self.timeout = timeout
        self.save_dir = save_dir
        self.session = self._init_session(cache_name, cache_expire, use_cache)

    def _init_session(self, cache_name, cache_expire, use_cache):
        """Return a session with retrying adapters and a response-logging hook."""
        # The backoff factor is randomised once per session, not per request.
        backoff_factor = random.uniform(*self.backoff_range)

        retry_strategy = Retry(
            total=self.retries,
            backoff_factor=backoff_factor,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET"],
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)

        if use_cache:
            # NOTE(review): the former `use_cache=True` keyword is not a
            # documented CachedSession option, so it is no longer forwarded.
            session = CachedSession(cache_name, expire_after=cache_expire)
        else:
            session = requests.Session()

        # Apply the same retry policy to both URL schemes.
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        def log_response_hook(response, *args, **kwargs):
            logger.info("Hook: %s %s", response.status_code, response.url)

        session.hooks["response"] = [log_response_hook]

        return session

    def save_json(self, data: dict, page: int):
        """Write ``data`` as indented JSON to ``<save_dir>/offers_page_<page>.json``.

        The target directory is created when missing. The payload is
        serialised before the file is opened, so a serialisation error does
        not truncate a previously saved page.
        """
        serialized = json.dumps(data, indent=4)
        target_dir = Path(self.save_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        target_file = target_dir / f"offers_page_{page}.json"
        target_file.write_text(serialized, encoding="utf-8")

    def fetch_page(self, page: int = 1) -> dict:
        """Download one page, save it to disk and return the decoded JSON.

        Args:
            page: Value sent as the ``page`` query parameter.

        Returns:
            The decoded JSON body of the response.

        Raises:
            RuntimeError: If the retry budget is exhausted (``RetryError``).
            requests.exceptions.RequestException: For any other request
                failure, including timeouts and HTTP error statuses.
            Exception: Any other error (e.g. while decoding or saving) is
                logged and re-raised unchanged.
        """
        try:
            response = self.session.get(
                self.base_url,
                params={"page": page},
                timeout=self.timeout,
            )
            response.raise_for_status()
            logger.info("Download page: %s", page)
            # Decode once and reuse the result for both saving and returning.
            payload = response.json()
            self.save_json(payload, page)

            return payload
        except RetryError as e:
            # RetryError subclasses RequestException, so it is handled first.
            raise RuntimeError(f"All retries failed: {e}") from e
        except RequestException as e:
            logger.error("Request failed: %s", e)
            raise
        except Exception as e:
            logger.error("Error: %s", e)
            raise

    def fetch_pages(self, max_pages: int = 5):
        """Fetch pages ``1..max_pages`` in random order, pausing between requests.

        The pause is drawn uniformly from ``[min_delay, max_delay]`` seconds
        and is skipped after the final page. Nothing is fetched when
        ``max_pages`` is less than 1. Errors from ``fetch_page`` propagate
        and stop the run.
        """
        pages = list(range(1, max_pages + 1))
        random.shuffle(pages)

        last_index = len(pages) - 1
        for index, page in enumerate(pages):
            self.fetch_page(page)
            logger.info("Saved page: %s", page)
            if index < last_index:
                # uniform() accepts fractional delays, unlike randint().
                time.sleep(random.uniform(self.min_delay, self.max_delay))

In [None]:
# Store this run's pages in a directory named after today's date
# (YYYY-MM-DD) under ../data/raw.
today_str = f"{datetime.now():%Y-%m-%d}"
path_dir = "../data/raw/" + today_str

# Build the scraper for the configured endpoint and download pages 1-14.
job_offers_fetcher = JobOffersScraper(base_url=OFFERS_URL, save_dir=path_dir)
job_offers_fetcher.fetch_pages(max_pages=14)