diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..44b169c --- /dev/null +++ b/.editorconfig @@ -0,0 +1,19 @@ +# EditorConfig is awesome: https://EditorConfig.org + +root = true + +[*] +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true + +[*.py] +charset = utf-8 +indent_style = space +indent_size = 4 +max_line_length = 160 + +[*.yml] +indent_style = space +indent_size = 2 +trim_trailing_whitespace = true \ No newline at end of file diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..d21cf00 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,16 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" diff --git a/.github/workflows/lint-python.yml b/.github/workflows/lint-python.yml new file mode 100644 index 0000000..e315193 --- /dev/null +++ b/.github/workflows/lint-python.yml @@ -0,0 +1,59 @@ +name: Ruff + +on: + push: + paths: + - "**.py" + - .github/workflows/lint-python.yml + - pyproject.toml + pull_request: + paths: + - "**.py" + - pyproject.toml + workflow_dispatch: + +jobs: + ruff: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: + - "3.12" + - "3.11" + - "3.10" + - "3.9" + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + + - name: Install uv and create venv + run: | + pip install -U pip uv + uv venv + + - name: Enable macOS/Linux venv + if: runner.os != 'Windows' + run: | + source .venv/bin/activate + echo $PATH >> $GITHUB_PATH + + - name: Enable Windows venv + if: runner.os == 'Windows' + run: | + .venv\Scripts\activate + echo $env:path >> $ENV:GITHUB_PATH + + - name: Install dependencies + run: | + uv pip install -e . 
ruff ydiff + + - name: Lint the code with ruff + run: | + ruff check $(git ls-files '*.py') --diff | ydiff -s diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml new file mode 100644 index 0000000..8a27e69 --- /dev/null +++ b/.github/workflows/pytest.yml @@ -0,0 +1,64 @@ +name: Pytest + +on: + push: + paths: + - "**.py" + - .github/workflows/pytest.yml + - pyproject.toml + pull_request: + paths: + - "**.py" + - pyproject.toml + workflow_dispatch: + +jobs: + pytest: + strategy: + fail-fast: false + matrix: + os: + # - "ubuntu-latest" # doesn't work without headless + - "windows-latest" + - "macos-latest" + python-version: + - "3.12" + - "3.11" + - "3.10" + - "3.9" + runs-on: ${{ matrix.os }} + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + + - name: Install uv and create venv + run: | + pip install -U pip uv + uv venv + + - name: Enable macOS/Linux venv + if: runner.os != 'Windows' + run: | + source .venv/bin/activate + echo $PATH >> $GITHUB_PATH + + - name: Enable Windows venv + if: runner.os == 'Windows' + run: | + .venv\Scripts\activate + echo $env:path >> $ENV:GITHUB_PATH + + - name: Install dependencies + run: | + uv pip install -r requirements.txt + uv pip install -e . pytest pytest-cov + + - name: Test with pytest + run: | + pytest --junitxml=junit/test-results.xml --cov=requestium --cov-report=xml --cov-report=html diff --git a/.gitignore b/.gitignore index c63caa8..11e9b9b 100644 --- a/.gitignore +++ b/.gitignore @@ -100,6 +100,9 @@ ENV/ # mkdocs documentation /site +# Poetry +poetry.lock + # mypy .mypy_cache/ diff --git a/.travis.yml b/.travis.yml index e1a0970..6ff9a13 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,13 +1,13 @@ sudo: false language: python python: - - '2.7' - - '3.4' - - '3.5' - - '3.6' + - '3.9' + - '3.10' + - '3.11' + - '3.12' install: - pip install tox-travis - - wget -N https://chromedriver.storage.googleapis.com/99.0.4844.51/chromedriver_linux64.zip -P ~/ + - wget -N https://chromedriver.storage.googleapis.com/100.0.4896.20/chromedriver_linux64.zip -P ~/ - unzip ~/chromedriver_linux64.zip -d ~/ - rm ~/chromedriver_linux64.zip - sudo mv -f ~/chromedriver /usr/local/share/ @@ -24,7 +24,7 @@ deploy: on: all_branches: true tags: false - python: 3.6 + python: 3.12 # Real PyPI in tags (ie. GitHub releases) - provider: pypi distributions: sdist bdist_wheel --universal @@ -33,6 +33,6 @@ on: branch: master tags: true - python: 3.6 + python: 3.12 addons: chrome: stable diff --git a/README.md b/README.md index a0e75f3..304a73f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ ![Requestium](https://user-images.githubusercontent.com/14966348/32966130-8bb15b00-cbb7-11e7-9faf-85963ec5bd82.png) ======== -[![Build Status](https://travis-ci.org/tryolabs/requestium.svg?branch=master)](https://travis-ci.org/tryolabs/requestium) [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) Requestium is a Python library that merges the power of [Requests](https://github.com/requests/requests), [Selenium](https://github.com/SeleniumHQ/selenium), and [Parsel](https://github.com/scrapy/parsel) into a single integrated tool for automatizing web actions. 
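For readers skimming this changeset, the README's claim is easiest to see in code. The sketch below is a minimal, hypothetical round trip through the API this diff refactors; the chromedriver path and URL are placeholders, not values from this repository:

```python
# A hedged sketch of the Requests <-> Selenium round trip Requestium provides.
# Assumes chromedriver is installed; the path and URL below are illustrative.
from requestium import Session, By

session = Session(
    webdriver_path="/usr/local/bin/chromedriver",  # placeholder path
    default_timeout=15,
    webdriver_options={"arguments": ["headless=new"]},
)

# Plain Requests call; the wrapped response exposes Parsel's .xpath/.css/.re.
title = session.get("https://example.com").xpath("//title/text()").get()

# Move the session's cookies into the browser and continue with Selenium.
session.transfer_session_cookies_to_driver()
session.driver.get("https://example.com")
session.driver.ensure_element(By.TAG_NAME, "h1", state="visible")

# Come back to Requests, keeping the browser's cookies and user agent.
session.transfer_driver_cookies_to_session()
session.driver.quit()
```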
diff --git a/pyproject.toml b/pyproject.toml index 765bd58..a98626e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,7 @@ version = "0.3.0" description = "" authors = ["Joaquin Alori "] readme = "README.md" +license = "BSD-3-Clause" [tool.poetry.dependencies] python = ">=3.9,<=3.13" @@ -13,7 +14,15 @@ selenium = "^4.15.2" tldextract = "^5.1.1" [tool.poetry.group.dev.dependencies] -pytest = "^7.4.3" +pytest = "^7.4.4" + +[tool.ruff] +line-length = 160 + +[tool.ruff.lint] +select = ["E4", "E7", "E9", "F"] +extend-select = ["B", "W", "C"] +ignore = ["C400", "C401"] [build-system] requires = ["poetry-core"] diff --git a/requestium/__init__.py b/requestium/__init__.py index ff16f59..d8c3b44 100644 --- a/requestium/__init__.py +++ b/requestium/__init__.py @@ -1,5 +1,8 @@ +"""Requestium is a Python library that merges the power of Requests, Selenium, and Parsel into a single integrated tool for automatizing web actions.""" + from selenium.common import exceptions # noqa: F401 +from selenium.webdriver.common.by import By # noqa: F401 from selenium.webdriver.common.keys import Keys # noqa: F401 from selenium.webdriver.support.ui import Select # noqa: F401 -from .requestium import Session # noqa: F401 +from .requestium_session import Session # noqa: F401 diff --git a/requestium/requestium.py b/requestium/requestium.py deleted file mode 100644 index 6941f51..0000000 --- a/requestium/requestium.py +++ /dev/null @@ -1,412 +0,0 @@ -import functools -import time -import types -import warnings - -import requests -import tldextract -from parsel.selector import Selector -from selenium import webdriver -from selenium.common.exceptions import WebDriverException -from selenium.webdriver import ChromeService -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions -from selenium.webdriver.support.ui import WebDriverWait - - -class Session(requests.Session): - """Class that adds a Selenium Webdriver and helper methods to a Requests Session - - This session class is a normal Requests Session that has the ability to switch back - and forth between this session and a webdriver, allowing us to run js when needed. - - Cookie transfer is done with the 'transfer' methods. - - Header and proxy transfer is done only one time when the driver process starts. - - Some useful helper methods and object wrappings have been added. 
- """ - - def __init__(self, webdriver_path=None, headless=None, default_timeout=5, - webdriver_options={}, driver=None, **kwargs): - super(Session, self).__init__() - self.webdriver_path = webdriver_path - self.default_timeout = default_timeout - self.webdriver_options = webdriver_options - self._driver = driver - self._last_requests_url = None - - if self._driver is None: - self._driver_initializer = functools.partial(self._start_chrome_browser, headless=headless) - else: - for name in DriverMixin.__dict__: - name_private = name.startswith('__') and name.endswith('__') - name_function = isinstance(DriverMixin.__dict__[name], types.FunctionType) - name_in_driver = name in dir(self._driver) - if name_private or not name_function or name_in_driver: - continue - self._driver.__dict__[name] = DriverMixin.__dict__[name].__get__(self._driver) - setattr(self._driver, 'default_timeout', self.default_timeout) - - @property - def driver(self): - if self._driver is None: - self._driver = self._driver_initializer() - return self._driver - - def _start_chrome_browser(self, headless=False): - # TODO transfer of proxies and headers: Not supported by chromedriver atm. - # Choosing not to use plug-ins for this as I don't want to worry about the - # extra dependencies and plug-ins don't work in headless mode. :-( - chrome_options = webdriver.chrome.options.Options() - - if headless: - chrome_options.add_argument('headless=new') - - if 'binary_location' in self.webdriver_options: - chrome_options.binary_location = self.webdriver_options['binary_location'] - - if 'arguments' in self.webdriver_options: - if isinstance(self.webdriver_options['arguments'], list): - for arg in self.webdriver_options['arguments']: - chrome_options.add_argument(arg) - else: - raise Exception('A list is needed to use \'arguments\' option. Found {}'.format( - type(self.webdriver_options['arguments']))) - - if 'extensions' in self.webdriver_options: - if isinstance(self.webdriver_options['extensions'], list): - for arg in self.webdriver_options['extensions']: - chrome_options.add_extension(arg) - - if 'prefs' in self.webdriver_options: - prefs = self.webdriver_options['prefs'] - chrome_options.add_experimental_option('prefs', prefs) - - experimental_options = self.webdriver_options.get('experimental_options') - if isinstance(experimental_options, dict): - for name, value in experimental_options.items(): - chrome_options.add_experimental_option(name, value) - - # Create driver process - RequestiumChrome = type('RequestiumChrome', (DriverMixin, webdriver.Chrome), {}) - # Selenium updated webdriver.Chrome's arg and kwargs, to accept options, service, keep_alive - # since ChromeService is the only object where webdriver_path is mapped to executable_path, it must be - # initialized and passed in as a kwarg to RequestiumChrome so it can be passed in as a kwarg - # when passed into webdriver.Chrome in super(DriverMixin, self).__init__(*args, **kwargs) - service = ChromeService(executable_path=self.webdriver_path) - return RequestiumChrome(service=service, options=chrome_options, default_timeout=self.default_timeout) - - def transfer_session_cookies_to_driver(self, domain=None): - """Copies the Session's cookies into the webdriver - - Using the 'domain' parameter we choose the cookies we wish to transfer, we only - transfer the cookies which belong to that domain. The domain defaults to our last visited - site if not provided. 
- """ - if not domain and self._last_requests_url: - domain = tldextract.extract(self._last_requests_url).registered_domain - elif not domain and not self._last_requests_url: - raise Exception('Trying to transfer cookies to selenium without specifying a domain ' - 'and without having visited any page in the current session') - - # Transfer cookies - for c in [c for c in self.cookies if domain in c.domain]: - cookie = {'name': c.name, 'value': c.value, 'path': c.path, - 'expiry': c.expires, 'domain': c.domain} - - self.driver.ensure_add_cookie({k: v for k, v in cookie.items() if v is not None}) - - def transfer_driver_cookies_to_session(self, copy_user_agent=True): - if copy_user_agent: - self.copy_user_agent_from_driver() - - for cookie in self.driver.get_cookies(): - self.cookies.set(cookie['name'], cookie['value'], domain=cookie['domain']) - - def get(self, *args, **kwargs): - resp = super(Session, self).get(*args, **kwargs) - self._last_requests_url = resp.url - return RequestiumResponse(resp) - - def post(self, *args, **kwargs): - resp = super(Session, self).post(*args, **kwargs) - self._last_requests_url = resp.url - return RequestiumResponse(resp) - - def put(self, *args, **kwargs): - resp = super(Session, self).put(*args, **kwargs) - self._last_requests_url = resp.url - return RequestiumResponse(resp) - - def copy_user_agent_from_driver(self): - """ Updates requests' session user-agent with the driver's user agent - - This method will start the browser process if its not already running. - """ - selenium_user_agent = self.driver.execute_script("return navigator.userAgent;") - self.headers.update({"user-agent": selenium_user_agent}) - - -class RequestiumResponse(object): - """Adds xpath, css, and regex methods to a normal requests response object""" - - def __init__(self, response): - self.__class__ = type(response.__class__.__name__, - (self.__class__, response.__class__), - response.__dict__) - - @property - def selector(self): - """Returns the response text in a Selector - - We re-parse the text on each xpath, css, re call in case the encoding has changed. - """ - return Selector(text=self.text) - - def xpath(self, *args, **kwargs): - return self.selector.xpath(*args, **kwargs) - - def css(self, *args, **kwargs): - return self.selector.css(*args, **kwargs) - - def re(self, *args, **kwargs): - return self.selector.re(*args, **kwargs) - - def re_first(self, *args, **kwargs): - return self.selector.re_first(*args, **kwargs) - - -class DriverMixin(object): - """Provides helper methods to our driver classes - """ - - def __init__(self, *args, **kwargs): - self.default_timeout = kwargs.pop('default_timeout', None) - super(DriverMixin, self).__init__(*args, **kwargs) - - def try_add_cookie(self, cookie): - """Attempt to add the cookie. Suppress any errors, and simply - detect success or failure if the cookie was actually added. - """ - try: - self.add_cookie(cookie) - except Exception: - pass - return self.is_cookie_in_driver(cookie) - - def ensure_add_cookie(self, cookie, override_domain=None): - """Ensures a cookie gets added to the driver - - Selenium needs the driver to be currently at the domain of the cookie - before allowing you to add it, so we need to get through this limitation. - - The cookie parameter is a dict which must contain the keys (name, value, domain) and - may contain the keys (path, expiry). - - We first check that we aren't currently in the cookie's domain, if we aren't, we GET - the cookie's domain and then add the cookies to the driver. 
- - We can override the cookie's domain using 'override_domain'. The use for this - parameter is that sometimes GETting the cookie's domain redirects you to a different - sub domain, and therefore adding the cookie fails. So sometimes the user may - need to override the cookie's domain to a less strict one, Eg.: 'site.com' instead of - 'home.site.com', in this way even if the site redirects us to a subdomain, the cookie will - stick. If you set the domain to '', the cookie gets added with whatever domain the browser - is currently at (at least in chrome it does), so this ensures the cookie gets added. - - It also retries adding the cookie with a more permissive domain if it fails in the first - try, and raises an exception if that fails. The standard selenium behaviour in this case - was to not do anything, which was very hard to debug. - """ - if override_domain: - cookie['domain'] = override_domain - - cookie_domain = cookie['domain'] if cookie['domain'][0] != '.' else cookie['domain'][1:] - try: - browser_domain = tldextract.extract(self.current_url).fqdn - except AttributeError: - browser_domain = '' - if cookie_domain not in browser_domain: - # TODO Check if hardcoding 'http' causes trouble - # TODO Consider using a new proxy for this next request to not cause an anomalous - # request. This way their server sees our ip address as continuously having the - # same cookies and not have a request mid-session with no cookies - self.get('http://' + cookie_domain) - - cookie_added = self.try_add_cookie(cookie) - - # If we fail adding the cookie, retry with a more permissive domain - if not cookie_added: - cookie['domain'] = tldextract.extract(cookie['domain']).registered_domain - cookie_added = self.try_add_cookie(cookie) - if not cookie_added: - raise WebDriverException("Couldn't add the following cookie to the webdriver: {}".format(cookie)) - - def is_cookie_in_driver(self, cookie): - """We check that the cookie is correctly added to the driver - - We only compare name, value and domain, as the rest can produce false negatives. - We are a bit lenient when comparing domains. - """ - for driver_cookie in self.get_cookies(): - name_matches = cookie['name'] == driver_cookie['name'] - value_matches = cookie['value'] == driver_cookie['value'] - domain_matches = driver_cookie['domain'] in (cookie['domain'], '.' 
+ cookie['domain']) - if name_matches and value_matches and domain_matches: - return True - return False - - def ensure_element_by_id(self, selector, state="present", timeout=None): - return self.ensure_element(By.ID, selector, state, timeout) - - def ensure_element_by_name(self, selector, state="present", timeout=None): - return self.ensure_element(By.NAME, selector, state, timeout) - - def ensure_element_by_xpath(self, selector, state="present", timeout=None): - return self.ensure_element(By.XPATH, selector, state, timeout) - - def ensure_element_by_link_text(self, selector, state="present", timeout=None): - return self.ensure_element(By.LINK_TEXT, selector, state, timeout) - - def ensure_element_by_partial_link_text(self, selector, state="present", timeout=None): - return self.ensure_element(By.PARTIAL_LINK_TEXT, selector, state, timeout) - - def ensure_element_by_tag_name(self, selector, state="present", timeout=None): - return self.ensure_element(By.TAG_NAME, selector, state, timeout) - - def ensure_element_by_class_name(self, selector, state="present", timeout=None): - return self.ensure_element(By.CLASS_NAME, selector, state, timeout) - - def ensure_element_by_css_selector(self, selector, state="present", timeout=None): - return self.ensure_element(By.CSS_SELECTOR, selector, state, timeout) - - def ensure_element(self, locator: str, selector: str, state: str = "present", timeout=None): - """This method allows us to wait till an element appears or disappears in the browser - - The webdriver runs in parallel with our scripts, so we must wait for it everytime it - runs javascript. Selenium automatically waits till a page loads when GETing it, - but it doesn't do this when it runs javascript and makes AJAX requests. - So we must explicitly wait in that case. - - The 'locator' argument defines what strategy we use to search for the element. - It expects standard names from the By class in selenium.webdriver.common.by. - https://www.selenium.dev/selenium/docs/api/py/webdriver/selenium.webdriver.common.by.html - - The 'state' argument allows us to chose between waiting for the element to be visible, - clickable, present, or invisible. Presence is more inclusive, but sometimes we want to - know if the element is visible. Careful, its not always intuitive what Selenium considers - to be a visible element. We can also wait for it to be clickable, although this method - is a bit buggy in selenium, an element can be 'clickable' according to selenium and - still fail when we try to click it. - - More info at: http://selenium-python.readthedocs.io/waits.html - """ - locators_compatibility = { - 'link_text': By.LINK_TEXT, - 'partial_link_text': By.PARTIAL_LINK_TEXT, - 'tag_name': By.TAG_NAME, - 'class_name': By.CLASS_NAME, - 'css_selector': By.CSS_SELECTOR - } - if locator in locators_compatibility: - warnings.warn( - """ - Support for locator strategy names with underscores is deprecated. - Use strategies from Selenium's By class (importable from selenium.webdriver.common.by). 
- """, - DeprecationWarning - ) - locator = locators_compatibility[locator] - - if not timeout: - timeout = self.default_timeout - - if state == 'visible': - element = WebDriverWait(self, timeout).until( - expected_conditions.visibility_of_element_located((locator, selector)) - ) - elif state == 'clickable': - element = WebDriverWait(self, timeout).until( - expected_conditions.element_to_be_clickable((locator, selector)) - ) - elif state == 'present': - element = WebDriverWait(self, timeout).until( - expected_conditions.presence_of_element_located((locator, selector)) - ) - elif state == 'invisible': - WebDriverWait(self, timeout).until( - expected_conditions.invisibility_of_element_located((locator, selector)) - ) - element = None - else: - raise ValueError( - "The 'state' argument must be 'visible', 'clickable', 'present' " - "or 'invisible', not '{}'".format(state) - ) - - # We add this method to our element to provide a more robust click. Chromedriver - # sometimes needs some time before it can click an item, specially if it needs to - # scroll into it first. This method ensures clicks don't fail because of this. - if element: - element.ensure_click = functools.partial(_ensure_click, element) - return element - - @property - def selector(self): - """Returns the current state of the browser in a Selector - - We re-parse the site on each xpath, css, re call because we are running a web browser - and the site may change between calls""" - return Selector(text=self.page_source) - - def xpath(self, *args, **kwargs): - return self.selector.xpath(*args, **kwargs) - - def css(self, *args, **kwargs): - return self.selector.css(*args, **kwargs) - - def re(self, *args, **kwargs): - return self.selector.re(*args, **kwargs) - - def re_first(self, *args, **kwargs): - return self.selector.re_first(*args, **kwargs) - - -def _ensure_click(self): - """Ensures a click gets made, because Selenium can be a bit buggy about clicks - - This method gets added to the selenium element returned in '__ensure_element_by_xpath'. - We should probably add it to more selenium methods, such as all the 'find**' methods though. - - I wrote this method out of frustration with chromedriver and its problems with clicking - items that need to be scrolled to in order to be clickable. In '__ensure_element_by_xpath' we - scroll to the item before returning it, but chrome has some problems if it doesn't get some - time to scroll to the item. This method ensures chromes gets enough time to scroll to the item - before clicking it. I tried SEVERAL more 'correct' methods to get around this, but none of them - worked 100% of the time (waiting for the element to be 'clickable' does not work). - """ - - # We ensure the element is scrolled into the middle of the viewport to ensure that - # it is clickable. 
There are two main ways an element may not be clickable: - # - It is outside of the viewport - # - It is under a banner or toolbar - # This script solves both cases - script = ("var viewPortHeight = Math.max(" - "document.documentElement.clientHeight, window.innerHeight || 0);" - "var elementTop = arguments[0].getBoundingClientRect().top;" - "window.scrollBy(0, elementTop-(viewPortHeight/2));") - self.parent.execute_script(script, self) # parent = the webdriver - - for _ in range(10): - try: - self.click() - return - except WebDriverException as e: - exception_message = str(e) - time.sleep(0.2) - raise WebDriverException( - "Couldn't click item after trying 10 times, got error message: \n{}".format( - exception_message - ) - ) diff --git a/requestium/requestium_chrome.py b/requestium/requestium_chrome.py new file mode 100644 index 0000000..c9436e7 --- /dev/null +++ b/requestium/requestium_chrome.py @@ -0,0 +1,249 @@ +import functools +import time +import warnings +from typing import Optional, Union + +import tldextract +from parsel.selector import Selector, SelectorList +from selenium import webdriver +from selenium.common.exceptions import UnableToSetCookieException +from selenium.common.exceptions import WebDriverException +from selenium.webdriver.common.by import By +from selenium.webdriver.remote.webelement import WebElement +from selenium.webdriver.support import expected_conditions +from selenium.webdriver.support.ui import WebDriverWait + + +class RequestiumChrome(webdriver.Chrome): + """ + Provides helper methods to our driver classes + """ + + def __init__(self, *args, **kwargs) -> None: + self.default_timeout = kwargs.pop("default_timeout", None) + super(RequestiumChrome, self).__init__(*args, **kwargs) + + def try_add_cookie(self, cookie: dict[str, str]) -> bool: + """ + Attempt to add the cookie. Suppress any errors, and simply + detect success or failure if the cookie was actually added. + """ + try: + self.add_cookie(cookie) + except UnableToSetCookieException: + pass + return self.is_cookie_in_driver(cookie) + + def ensure_add_cookie(self, cookie: dict[str, str], override_domain: Optional[str] = None) -> None: + """ + Ensures a cookie gets added to the driver + + Selenium needs the driver to be currently at the domain of the cookie + before allowing you to add it, so we need to get through this limitation. + + The cookie parameter is a dict which must contain the keys (name, value, domain) and + may contain the keys (path, expiry). + + We first check that we aren't currently in the cookie's domain, if we aren't, we GET + the cookie's domain and then add the cookies to the driver. + + We can override the cookie's domain using 'override_domain'. The use for this + parameter is that sometimes GETting the cookie's domain redirects you to a different + sub domain, and therefore adding the cookie fails. So sometimes the user may + need to override the cookie's domain to a less strict one, Eg.: 'site.com' instead of + 'home.site.com', in this way even if the site redirects us to a subdomain, the cookie will + stick. If you set the domain to '', the cookie gets added with whatever domain the browser + is currently at (at least in chrome it does), so this ensures the cookie gets added. + + It also retries adding the cookie with a more permissive domain if it fails in the first + try, and raises an exception if that fails. The standard selenium behaviour in this case + was to not do anything, which was very hard to debug. 
+ """ + if override_domain: + cookie["domain"] = override_domain + + cookie_domain = cookie["domain"] if cookie["domain"][0] != "." else cookie["domain"][1:] + try: + browser_domain = tldextract.extract(self.current_url).fqdn + except AttributeError: + browser_domain = "" + if cookie_domain not in browser_domain: + # TODO Check if hardcoding 'http' causes trouble + # TODO Consider using a new proxy for this next request to not cause an anomalous + # TODO request. This way their server sees our ip address as continuously having the + # TODO same cookies and not have a request mid-session with no cookies + self.get("http://" + cookie_domain) + + cookie_added = self.try_add_cookie(cookie) + + # If we fail adding the cookie, retry with a more permissive domain + if not cookie_added: + cookie["domain"] = tldextract.extract(cookie["domain"]).registered_domain + cookie_added = self.try_add_cookie(cookie) + if not cookie_added: + raise WebDriverException(f"Couldn't add the following cookie to the webdriver: {str(cookie)}") + + def is_cookie_in_driver(self, cookie: dict[str, str]) -> bool: + """ + We check that the cookie is correctly added to the driver + + We only compare name, value and domain, as the rest can produce false negatives. + We are a bit lenient when comparing domains. + """ + for driver_cookie in self.get_cookies(): + name_matches = cookie["name"] == driver_cookie["name"] + value_matches = cookie["value"] == driver_cookie["value"] + domain_matches = driver_cookie["domain"] in ( + cookie["domain"], + "." + cookie["domain"], + ) + if name_matches and value_matches and domain_matches: + return True + return False + + def ensure_element_by_id(self, selector: str, state: str = "present", timeout: Union[float, int] = 1) -> Optional[WebElement]: + return self.ensure_element(By.ID, selector, state, timeout) + + def ensure_element_by_name(self, selector: str, state: str = "present", timeout: Union[float, int] = 1) -> Optional[WebElement]: + return self.ensure_element(By.NAME, selector, state, timeout) + + def ensure_element_by_xpath(self, selector: str, state: str = "present", timeout: Union[float, int] = 1) -> Optional[WebElement]: + return self.ensure_element(By.XPATH, selector, state, timeout) + + def ensure_element_by_link_text(self, selector: str, state: str = "present", timeout: Union[float, int] = 1) -> Optional[WebElement]: + return self.ensure_element(By.LINK_TEXT, selector, state, timeout) + + def ensure_element_by_partial_link_text(self, selector: str, state: str = "present", timeout: Union[float, int] = 1) -> Optional[WebElement]: + return self.ensure_element(By.PARTIAL_LINK_TEXT, selector, state, timeout) + + def ensure_element_by_tag_name(self, selector: str, state: str = "present", timeout: Union[float, int] = 1) -> Optional[WebElement]: + return self.ensure_element(By.TAG_NAME, selector, state, timeout) + + def ensure_element_by_class_name(self, selector: str, state: str = "present", timeout: Union[float, int] = 1) -> Optional[WebElement]: + return self.ensure_element(By.CLASS_NAME, selector, state, timeout) + + def ensure_element_by_css_selector(self, selector: str, state: str = "present", timeout: Union[float, int] = 1) -> Optional[WebElement]: + return self.ensure_element(By.CSS_SELECTOR, selector, state, timeout) + + def ensure_element(self, locator: str, selector: str, state: str = "present", timeout: Union[float, int] = 1) -> Optional[WebElement]: + """ + This method allows us to wait till an element appears or disappears in the browser + + The webdriver runs in 
parallel with our scripts, so we must wait for it every time it + runs javascript. Selenium automatically waits till a page loads when GETing it, + but it doesn't do this when it runs javascript and makes AJAX requests. + So we must explicitly wait in that case. + + The 'locator' argument defines what strategy we use to search for the element. + It expects standard names from the By class in selenium.webdriver.common.by. + https://www.selenium.dev/selenium/docs/api/py/webdriver/selenium.webdriver.common.by.html + + The 'state' argument allows us to choose between waiting for the element to be visible, + clickable, present, or invisible. Presence is more inclusive, but sometimes we want to + know if the element is visible. Careful, it's not always intuitive what Selenium considers + to be a visible element. We can also wait for it to be clickable, although this method + is a bit buggy in selenium: an element can be 'clickable' according to selenium and + still fail when we try to click it. + + More info at: http://selenium-python.readthedocs.io/waits.html + """ + locators_compatibility = { + "link_text": By.LINK_TEXT, + "partial_link_text": By.PARTIAL_LINK_TEXT, + "tag_name": By.TAG_NAME, + "class_name": By.CLASS_NAME, + "css_selector": By.CSS_SELECTOR, + } + if locator in locators_compatibility: + warnings.warn( + """ + Support for locator strategy names with underscores is deprecated. + Use strategies from Selenium's By class (importable from selenium.webdriver.common.by). + """, + DeprecationWarning, + stacklevel=2, + ) + locator = locators_compatibility[locator] + + if not timeout: + timeout = self.default_timeout + + if state == "visible": + element = WebDriverWait(self, timeout).until(expected_conditions.visibility_of_element_located((locator, selector))) + elif state == "clickable": + element = WebDriverWait(self, timeout).until(expected_conditions.element_to_be_clickable((locator, selector))) + elif state == "present": + element = WebDriverWait(self, timeout).until(expected_conditions.presence_of_element_located((locator, selector))) + elif state == "invisible": + WebDriverWait(self, timeout).until(expected_conditions.invisibility_of_element_located((locator, selector))) + element = None + else: + raise ValueError(f"The 'state' argument must be 'visible', 'clickable', 'present' or 'invisible', not '{state}'") + + # We add this method to our element to provide a more robust click. Chromedriver + # sometimes needs some time before it can click an item, especially if it needs to + # scroll into it first. This method ensures clicks don't fail because of this. 
+ if element: + element.ensure_click = functools.partial(_ensure_click, element) + return element + + @property + def selector(self) -> Selector: + """ + Returns the current state of the browser in a Selector + + We re-parse the site on each xpath, css, re call because we are running a web browser + and the site may change between calls + """ + return Selector(text=self.page_source) + + def xpath(self, *args, **kwargs) -> SelectorList: + return self.selector.xpath(*args, **kwargs) + + def css(self, *args, **kwargs) -> SelectorList: + return self.selector.css(*args, **kwargs) + + def re(self, *args, **kwargs) -> list[str]: + return self.selector.re(*args, **kwargs) + + def re_first(self, *args, **kwargs) -> Union[str, None]: + return self.selector.re_first(*args, **kwargs) + + +def _ensure_click(self) -> None: + """ + Ensures a click gets made, because Selenium can be a bit buggy about clicks + + This method gets added to the selenium element returned in 'ensure_element'. + We should probably add it to more selenium methods, such as all the 'find**' methods though. + + I wrote this method out of frustration with chromedriver and its problems with clicking + items that need to be scrolled to in order to be clickable. In 'ensure_element' we + scroll to the item before returning it, but chrome has some problems if it doesn't get some + time to scroll to the item. This method ensures chrome gets enough time to scroll to the item + before clicking it. I tried SEVERAL more 'correct' methods to get around this, but none of them + worked 100% of the time (waiting for the element to be 'clickable' does not work). + """ + + # We ensure the element is scrolled into the middle of the viewport to ensure that + # it is clickable. There are two main ways an element may not be clickable: + # - It is outside of the viewport + # - It is under a banner or toolbar + # This script solves both cases + script = ( + "var viewPortHeight = Math.max(" + "document.documentElement.clientHeight, window.innerHeight || 0);" + "var elementTop = arguments[0].getBoundingClientRect().top;" + "window.scrollBy(0, elementTop-(viewPortHeight/2));" + ) + self.parent.execute_script(script, self) # parent = the webdriver + + exception_message = "" + for _ in range(10): + try: + self.click() + return + except WebDriverException as e: + exception_message = str(e) + time.sleep(0.2) + raise WebDriverException(f"Couldn't click item after trying 10 times, got error message: {exception_message}") diff --git a/requestium/requestium_response.py b/requestium/requestium_response.py new file mode 100644 index 0000000..ae117d0 --- /dev/null +++ b/requestium/requestium_response.py @@ -0,0 +1,35 @@ +import requests +from parsel.selector import Selector + + +class RequestiumResponse(requests.Response): + """Adds xpath, css, and regex methods to a normal requests response object""" + + def __init__(self, response): + super().__init__() + self.__class__ = type( + response.__class__.__name__, + (self.__class__, response.__class__), + response.__dict__, + ) + + @property + def selector(self): + """ + Returns the response text in a Selector + + We re-parse the text on each xpath, css, re call in case the encoding has changed. 
+ """ + return Selector(text=self.text) + + def xpath(self, *args, **kwargs): + return self.selector.xpath(*args, **kwargs) + + def css(self, *args, **kwargs): + return self.selector.css(*args, **kwargs) + + def re(self, *args, **kwargs): + return self.selector.re(*args, **kwargs) + + def re_first(self, *args, **kwargs): + return self.selector.re_first(*args, **kwargs) diff --git a/requestium/requestium_session.py b/requestium/requestium_session.py new file mode 100644 index 0000000..c3daa93 --- /dev/null +++ b/requestium/requestium_session.py @@ -0,0 +1,162 @@ +import functools +import types +from typing import Optional, Union + +import requests +import tldextract +from selenium import webdriver +from selenium.webdriver import ChromeService +from selenium.webdriver.remote.webdriver import WebDriver + +from .requestium_chrome import RequestiumChrome +from .requestium_response import RequestiumResponse + + +class Session(requests.Session): + """ + Class that adds a Selenium Webdriver and helper methods to a Requests Session + + This session class is a normal Requests Session that has the ability to switch back + and forth between this session and a webdriver, allowing us to run js when needed. + + Cookie transfer is done with the 'transfer' methods. + + Header and proxy transfer is done only one time when the driver process starts. + + Some useful helper methods and object wrappings have been added. + """ + + def __init__( + self, + webdriver_path: Optional[str] = None, + headless: bool = False, + default_timeout: Union[float, int] = 5, + webdriver_options: Optional[dict] = None, + driver: Optional[WebDriver] = None, + ) -> None: + if webdriver_options is None: + webdriver_options = {} + super(Session, self).__init__() + self.webdriver_path = webdriver_path + self.default_timeout = default_timeout + self.webdriver_options = webdriver_options + self._driver = driver + self._last_requests_url = None + + if self._driver is None: + self._driver_initializer = functools.partial(self._start_chrome_browser, headless=headless) + else: + for name in RequestiumChrome.__dict__: + name_private = name.startswith("__") and name.endswith("__") + name_function = isinstance(RequestiumChrome.__dict__[name], types.FunctionType) + name_in_driver = name in dir(self._driver) + if name_private or not name_function or name_in_driver: + continue + self._driver.__dict__[name] = RequestiumChrome.__dict__[name].__get__(self._driver) + self._driver.default_timeout = self.default_timeout + + @property + def driver(self): + if self._driver is None: + self._driver = self._driver_initializer() + return self._driver + + def _start_chrome_browser(self, headless: bool = False) -> RequestiumChrome: + # TODO transfer of proxies and headers: Not supported by chromedriver atm. + # Choosing not to use plug-ins for this as I don't want to worry about the + # extra dependencies and plug-ins don't work in headless mode. :-( + chrome_options = webdriver.ChromeOptions() + + if headless: + chrome_options.add_argument("headless=new") + + if "binary_location" in self.webdriver_options: + chrome_options.binary_location = self.webdriver_options["binary_location"] + + args = self.webdriver_options.get("arguments") + if isinstance(args, list): + for arg in args: + chrome_options.add_argument(arg) + elif args: + raise Exception(f"A list is needed to use 'arguments' option. 
Found {type(args)}") + + extensions = self.webdriver_options.get("extensions") + if isinstance(extensions, list): + for arg in extensions: + chrome_options.add_extension(arg) + + if "prefs" in self.webdriver_options: + prefs = self.webdriver_options["prefs"] + chrome_options.add_experimental_option("prefs", prefs) + + experimental_options = self.webdriver_options.get("experimental_options") + if isinstance(experimental_options, dict): + for name, value in experimental_options.items(): + chrome_options.add_experimental_option(name, value) + + # Selenium updated webdriver.Chrome's arg and kwargs, to accept options, service, keep_alive + # since ChromeService is the only object where webdriver_path is mapped to executable_path, it must be + # initialized and passed in as a kwarg to RequestiumChrome so it can be passed in as a kwarg + # when passed into webdriver.Chrome in super(DriverMixin, self).__init__(*args, **kwargs) + service = ChromeService(executable_path=self.webdriver_path) + return RequestiumChrome( + service=service, + options=chrome_options, + default_timeout=self.default_timeout, + ) + + def transfer_session_cookies_to_driver(self, domain: Optional[str] = None) -> None: + """ + Copies the Session's cookies into the webdriver + + Using the 'domain' parameter we choose the cookies we wish to transfer, we only + transfer the cookies which belong to that domain. The domain defaults to our last visited + site if not provided. + """ + if not domain and self._last_requests_url: + domain = tldextract.extract(self._last_requests_url).registered_domain + elif not domain and not self._last_requests_url: + raise Exception("Trying to transfer cookies to selenium without specifying a domain " "and without having visited any page in the current session") + + # Transfer cookies + for c in [c for c in self.cookies if domain in c.domain]: + cookie = { + "name": c.name, + "value": c.value, + "path": c.path, + "expiry": c.expires, + "domain": c.domain, + } + + self.driver.ensure_add_cookie({k: v for k, v in cookie.items() if v is not None}) + + def transfer_driver_cookies_to_session(self, copy_user_agent: bool = True) -> None: + if copy_user_agent: + self.copy_user_agent_from_driver() + + for cookie in self.driver.get_cookies(): + self.cookies.set(cookie["name"], cookie["value"], domain=cookie["domain"]) + + def get(self, *args, **kwargs) -> RequestiumResponse: + resp = super(Session, self).get(*args, **kwargs) + self._last_requests_url = resp.url + return RequestiumResponse(resp) + + def post(self, *args, **kwargs) -> RequestiumResponse: + resp = super(Session, self).post(*args, **kwargs) + self._last_requests_url = resp.url + return RequestiumResponse(resp) + + def put(self, *args, **kwargs) -> RequestiumResponse: + resp = super(Session, self).put(*args, **kwargs) + self._last_requests_url = resp.url + return RequestiumResponse(resp) + + def copy_user_agent_from_driver(self) -> None: + """ + Updates requests' session user-agent with the driver's user agent + + This method will start the browser process if its not already running. 
+ """ + selenium_user_agent = self.driver.execute_script("return navigator.userAgent;") + self.headers.update({"user-agent": selenium_user_agent}) diff --git a/setup.py b/setup.py index 107cf77..38dd3a0 100644 --- a/setup.py +++ b/setup.py @@ -2,37 +2,40 @@ from setuptools import setup # Get the long description from the README file -with open('README.md') as file: +with open("README.md") as file: long_description = file.read() setup( - name='requestium', - version='0.3.0', + name="requestium", + version="0.4.0", description=( "Adds a Selenium webdriver and parsel's parser to a request's Session " "object, and makes switching between them seamless. Handles cookie, " "proxy and header transfer." ), long_description=long_description, - long_description_content_type='text/markdown', - author='Joaquin Alori', - author_email='joaquin@tryolabs.com', - url='https://github.com/tryolabs/requestium', - packages=('requestium',), + long_description_content_type="text/markdown", + author="Joaquin Alori", + author_email="joaquin@tryolabs.com", + url="https://github.com/tryolabs/requestium", + packages=("requestium",), install_requires=( - 'parsel>=1.7.0', - 'requests>=2.28.1', - 'selenium>=4.6.0', - 'tldextract>=3.4.0', + "parsel>=1.8.1", + "requests>=2.31.0", + "selenium>=4.15.2", + "tldextract>=5.1.1", ), - license='MIT', + license="MIT", zip_safe=False, classifiers=[ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Developers', - 'Natural Language :: English', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 3', + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Natural Language :: English", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ], ) diff --git a/tests/test_ensure_elements_deprecation.py b/tests/test_ensure_elements_deprecation.py index 96ce208..bffbcfd 100644 --- a/tests/test_ensure_elements_deprecation.py +++ b/tests/test_ensure_elements_deprecation.py @@ -1,21 +1,20 @@ import shutil import pytest -import selenium -from selenium.webdriver.common.by import By +from selenium import webdriver import requestium -chrome_webdriver_path = shutil.which('chromedriver') +chrome_webdriver_path = shutil.which("chromedriver") -chrome_webdriver = selenium.webdriver.chrome.webdriver.WebDriver() -firefox_webdriver = selenium.webdriver.firefox.webdriver.WebDriver() +chrome_webdriver = webdriver.Chrome() +firefox_webdriver = webdriver.Firefox() session_parameters = [ - {'webdriver_path': chrome_webdriver_path}, - {'webdriver_path': chrome_webdriver_path, 'headless': True}, - {'driver': chrome_webdriver}, - {'driver': firefox_webdriver}, + {"webdriver_path": chrome_webdriver_path}, + {"webdriver_path": chrome_webdriver_path, "headless": True}, + {"driver": chrome_webdriver}, + {"driver": firefox_webdriver}, ] @@ -27,6 +26,6 @@ def session(request): def test_deprecation_warning_for_ensure_element_locators_with_underscores(session): - session.driver.get('http://the-internet.herokuapp.com') + session.driver.get("http://the-internet.herokuapp.com") with pytest.warns(DeprecationWarning): - session.driver.ensure_element("class_name", 'no-js') + session.driver.ensure_element("class_name", "no-js") diff --git a/tests/test_requestium.py b/tests/test_requestium.py index a18caf3..6147825 100644 --- 
a/tests/test_requestium.py +++ b/tests/test_requestium.py @@ -1,21 +1,21 @@ import shutil import pytest -import selenium +from selenium import webdriver from selenium.webdriver.common.by import By import requestium -chrome_webdriver_path = shutil.which('chromedriver') +chrome_webdriver_path = shutil.which("chromedriver") -chrome_webdriver = selenium.webdriver.chrome.webdriver.WebDriver() -firefox_webdriver = selenium.webdriver.firefox.webdriver.WebDriver() +chrome_webdriver = webdriver.Chrome() +firefox_webdriver = webdriver.Firefox() session_parameters = [ - {'webdriver_path': chrome_webdriver_path}, - {'webdriver_path': chrome_webdriver_path, 'headless': True}, - {'driver': chrome_webdriver}, - {'driver': firefox_webdriver}, + {"webdriver_path": chrome_webdriver_path}, + {"webdriver_path": chrome_webdriver_path, "headless": True}, + {"driver": chrome_webdriver}, + {"driver": firefox_webdriver}, ] @@ -27,9 +27,9 @@ def session(request): def test_simple_page_load(session): - session.driver.get('http://the-internet.herokuapp.com') - session.driver.ensure_element(By.ID, 'content') + session.driver.get("http://the-internet.herokuapp.com") + session.driver.ensure_element(By.ID, "content") title = session.driver.title heading = session.driver.find_element(By.XPATH, '//*[@id="content"]/h1') - assert title == 'The Internet' - assert heading.text == 'Welcome to the-internet' + assert title == "The Internet" + assert heading.text == "Welcome to the-internet" diff --git a/tox.ini b/tox.ini index e926590..c3f144e 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py{27,34,35,36} +envlist = py{39,310,311,312} [testenv] passenv = TOXENV CI TRAVIS TRAVIS_*
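As a closing note on the locator migration these tests cover, here is a small, hypothetical before/after sketch. The demo site and the "no-js"/"content" selectors come from the test suite above; everything else is illustrative, not part of this changeset:

```python
# Hypothetical migration example for ensure_element's locator argument.
# Underscore strategy names still work but emit a DeprecationWarning;
# By constants are the supported spelling going forward.
from selenium.webdriver.common.by import By

from requestium import Session

session = Session(webdriver_path="chromedriver")  # assumes chromedriver on PATH

session.driver.get("http://the-internet.herokuapp.com")

# Deprecated spelling, kept for backwards compatibility:
session.driver.ensure_element("class_name", "no-js")

# Preferred spelling, with an explicit state and timeout:
element = session.driver.ensure_element(By.ID, "content", state="visible", timeout=5)
if element is not None:
    element.ensure_click()  # retry-wrapped click attached by ensure_element

session.driver.quit()
```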