In [81]:
from typing import List, Tuple
import os
from bs4 import BeautifulSoup, Tag
import requests
from urllib.parse import urlparse
import re
from page_loader.utils import filter_name, join_urls

In [2]:
with open("./fixtures/before.html", "r") as f:
    before = f.read()

with open("./fixtures/after.html", "r") as f:
    after = f.read()

In [67]:
bsoup = BeautifulSoup(before)
asoup = BeautifulSoup(after)

In [94]:
def replace_links(
    soup: BeautifulSoup, base_url: str, save_dir: str
) -> Tuple[BeautifulSoup, List[str]]:
    """Replace all links, images, css styles, scripts, etc.
    to local copies in beatiful soup html document.

    Example with url = "https://ru.hexlet.io/courses"
    before:
        <link href="/assets/application.css" media="all" rel="stylesheet"/>
        <img alt="Иконка профессии Python-программист" src="/assets/professions/python.png"/>
    after:
        <link href="ru-hexlet-io-courses_files/ru-hexlet-io-assets-application.css" media="all" rel="stylesheet"/>
        <img alt="Иконка профессии Python-программист"
        src="ru-hexlet-io-courses_files/ru-hexlet-io-assets-professions-python.png"/>

    Args:
        soup (BeautifulSoup): The original document in which you need to replace the links
        base_url (str): Origin url for soup.
        save_dir (str): Where to place replaced links.
    Returns:
        Tuple[BeautifulSoup, List[str]]: Soup with replaced links, list of all replaced links
    """
    links = []

    for tag in soup.find_all(["link", "script", "img"]):
        link, attr = get_link(tag)
        if link:

            file_ext = get_file_ext(link)
            link_netloc = link.replace(file_ext, "")

            if link.startswith("/"):

                new_link = (
                    join_urls(filter_name(base_url), filter_name(link_netloc))
                    + file_ext
                )

                link_to_save = join_urls(base_url, link)
                links.append(link_to_save)

            elif urlparse(base_url).netloc == urlparse(link).netloc:
                new_link = filter_name(link_netloc) + file_ext
                links.append(link_netloc + file_ext)
            else:
                continue  # we dont need to change or save other types of links

            tag[attr] = join_urls(save_dir, new_link)

    return soup, links

In [95]:
replace_links(bsoup, 'https://ru.hexlet.io/courses', '.')

(<!-- Используйте этот код в своём проекте -->
 <!DOCTYPE html>
 
 <html lang="ru">
 <head>
 <meta charset="utf-8"/>
 <title>
    Курсы по программированию Хекслет
   </title>
 <link href="https://cdn2.hexlet.io/assets/menu.css" media="all" rel="stylesheet"/>
 <link href="./ru-hexlet-io-courses/assets-application.css" media="all" rel="stylesheet"/>
 <link href="./ru-hexlet-io-courses/courses.html" rel="canonical"/>
 </head>
 <body>
 <img alt="Иконка профессии Python-программист" src="./ru-hexlet-io-courses/assets-professions-python.png"/>
 <img alt="Иконка профессии Python-ананист" src="https://s7d2.scene7.com/is/image/Caterpillar/CM20180911-37279-17325"/>
 <h3>
 <a href="/professions/python">
     Python-программист
    </a>
 </h3>
 <script src="https://js.stripe.com/v3/">
 </script>
 <script src="./ru-hexlet-io-packs-js-runtime.js">
 </script>
 </body>
 </html>,
 ['https://ru.hexlet.io/courses/assets/application.css',
  'https://ru.hexlet.io/courses/courses',
  'https://ru.hexlet.io/

In [71]:
tag = bsoup.find_all(["link", "script", "img"])[1]
link, attr = get_link(tag)

In [77]:
file_ext = get_file_ext(link)
link_netloc = link.replace(file_ext, "")

In [80]:
link_netloc

'/assets/application'

In [91]:
filter_name(link_netloc)

'assets-application'

In [92]:
filter_name(url)

'ru-hexlet-io-courses'

In [93]:
join_urls(filter_name(url), filter_name(link_netloc))

'ru-hexlet-io-courses/assets-application'