In [1]:
from typing import List, Tuple
import os
from bs4 import BeautifulSoup, Tag
import requests
from urllib.parse import urlparse
import re
from page_loader.utils import filter_name, join_urls

In [19]:
with open("./fixtures/before.html", "r") as f:
    before = f.read()

with open("./fixtures/after.html", "r") as f:
    after = f.read()

bsoup = BeautifulSoup(before)
asoup = BeautifulSoup(after)

url = 'https://ru.hexlet.io/courses'

In [20]:
def get_link(tag: Tag) -> Tuple[str]:
    for attr in ["href", "src"]:
        link = tag.attrs.get(attr)
        if link:
            return link, attr
    return None, None


def get_file_ext(link):
    file_ext = re.findall(r"\.[\w]*$", link)
    if not file_ext:
        return ".html"
    else:
        return file_ext[0]

In [21]:
def replace_links(
    soup: BeautifulSoup, base_url: str, save_dir: str = ""
) -> Tuple[BeautifulSoup, List[Tuple[str]]]:
    """Replace all links, images, css styles, scripts, etc.
    to local copies in beatiful soup html document.

    Example with url = "https://ru.hexlet.io/courses"
    before:
        <link href="/assets/application.css" media="all" rel="stylesheet"/>
        <img alt="Иконка профессии Python-программист"
         src="/assets/professions/python.png"/>
    after:
        <link href="ru-hexlet-io-courses_files/ru-hexlet-io-assets-application.css"
        media="all" rel="stylesheet"/>
        <img alt="Иконка профессии Python-программист"
        src="ru-hexlet-io-courses_files/ru-hexlet-io-assets-professions-python.png"/>

    Args:
        soup (BeautifulSoup): The original document in which
        you need to replace the links
        base_url (str): Origin url for soup.
        save_dir (str): Where to place replaced links.
    Returns:
        Tuple[BeautifulSoup, List[str]]: Soup with replaced links,
        list with Tuple, where first element is link and second one is file name
    """
    links = []

    for tag in soup.find_all(["link", "script", "img"]):
        link, attr = get_link(tag)
        if link:

            file_ext = get_file_ext(link)
            link_netloc = link.replace(file_ext, "")
            url_netloc = urlparse(base_url).netloc

            if url_netloc == urlparse(link).netloc:
                new_link = filter_name(link_netloc) + file_ext
                links.append((link_netloc + file_ext, new_link))

            elif not link.startswith("http"):

                new_link = filter_name(join_urls(url_netloc, link_netloc)) + file_ext

                link_to_save = join_urls(urlparse(base_url).scheme, url_netloc, link_netloc)
                print(link_to_save, link_netloc)

                links.append((link_to_save, new_link))

            else:
                continue  # we dont need to change or save other types of links
            save_dir = filter_name(base_url) + "_files"
            tag[attr] = join_urls(save_dir, new_link)

    return soup, links

In [None]:
replace_links(bsoup, url)

In [23]:
s, links = _

In [24]:
links

[('ru.hexlet.io/assets/application', 'ru-hexlet-io-assets-application.css'),
 ('ru.hexlet.io/courses', 'ru-hexlet-io-courses.html'),
 ('ru.hexlet.io/assets/professions/python',
  'ru-hexlet-io-assets-professions-python.png'),
 ('https://ru.hexlet.io/packs/js/runtime.js',
  'ru-hexlet-io-packs-js-runtime.js')]

In [25]:
urlparse('https://ru.hexlet.io/packs/js/runtime.js')

ParseResult(scheme='https', netloc='ru.hexlet.io', path='/packs/js/runtime.js', params='', query='', fragment='')

In [108]:
pairs = [
    (
        "https://ru.hexlet.io/courses/assets/application.css",
        "ru-hexlet-io-assets-application.css",
    ),
    ("https://ru.hexlet.io/courses/courses", "ru-hexlet-io-courses.html"),
    (
        "https://ru.hexlet.io/courses/assets/professions/python.png",
        "ru-hexlet-io-assets-professions-python.png",
    ),
    ("https://ru.hexlet.io/packs/js/runtime.js", "ru-hexlet-io-packs-js-runtime.js"),
]

In [None]:
replace_links(bsoup, 'https://ru.hexlet.io/courses', '.')

In [71]:
tag = bsoup.find_all(["link", "script", "img"])[1]
link, attr = get_link(tag)
file_ext = get_file_ext(link)
link_netloc = link.replace(file_ext, "")

In [136]:
with requests_mock.Mocker() as mock:
    matcher = re.compile(urlparse(url).netloc)
    mock.register_uri("GET", matcher, content=b'content')
    for link in pairs:
        res = requests.get(link[0])
        print(res.content)

b'content'
b'content'
b'content'
b'content'


In [3]:
from page_loader.utils import get_soup
from page_loader.scrap import replace_links, download_links
import os
url = 'https://books.toscrape.com/'

In [4]:
soup = get_soup(url)

In [6]:
soup, pairs = replace_links(soup, url, os.getcwd())

In [7]:
pairs

[]