# core

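> URL helpers: filter out hrefs that should not be followed, resolve links to absolute same-domain URLs, and derive file names from URLs.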

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import re
from fastcore.all import *
from urllib.parse import urlparse, urlencode, quote_plus, unquote
import requests

In [None]:
#| export
# File extensions whose targets are never followed (compared against the
# lower-cased path component of the href).
IGNORE_EXT = [
    ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp",  # Image Files
    ".mp3", ".wav", ".aac", ".flac", ".mp4", ".avi", ".mov", ".mkv", ".webm",  # Audio and Video Files
    ".zip", ".rar", ".tar", ".gz", ".7z",  # Archive Files
    ".html", ".css", ".js", ".json", ".xml", ".yaml", ".yml", ".md", ".php",  # Web and Code Files
    ".sql", ".sqlite", ".db", ".bak",  # Database and Backup Files
    ".iso", ".epub", ".chm", ".dmg", ".apk", ".exe", ".bin",  # Miscellaneous Files
    ".py", ".java", ".cpp", ".go", ".rb", ".sh",  # Code and Script Files
]

def valid_href(href:str):
    """
    Decide whether `href` is a link worth following.

    Returns False when `href` is None or matches any ignore condition, and
    True otherwise. The ignore conditions, checked case-insensitively after
    stripping surrounding whitespace, are:

    * empty or fragment-only links (`""`, `"#"`),
    * non-web schemes (`ftp:`, `irc:`, `mailto:`, `tel:`, `javascript:`, `app://`),
    * hrefs containing `private`, `subscribe`, `paywall` or `login`,
    * hrefs whose *path* ends with an extension listed in `IGNORE_EXT`.
    """
    if href is None:
        return False

    href = href.strip().lower()

    # NOTE(review): the literal href "x" has always been rejected here (the
    # original test was `x in "x"`); kept for backward compatibility.
    if href in ("", "#", "x"):
        return False
    if href.startswith(("ftp:", "irc:", "mailto:", "tel:", "javascript:", "app://")):
        return False
    if any(word in href for word in ("private", "subscribe", "paywall", "login")):
        return False

    # Compare the extension against the path only, so a query string or
    # fragment cannot hide it ("file.mp4?dl=1") and a host name cannot be
    # mistaken for one ("https://example.sh").
    try:
        path = urlparse(href).path
    except ValueError:
        # Malformed URL (e.g. unbalanced IPv6 brackets): fall back to the raw text.
        path = href
    if path.endswith(tuple(IGNORE_EXT)):
        return False

    return True

In [None]:
#|echo: True
hrefs = [
    "mailto:someone@example.com",
    "tel:+1234567890",
    "javascript:void(0)",
    "https://example.com/file.mp4",
    "https://example.com/image.JPG",
    "https://example.com/private-area",
    "https://example.com/login",
    "https://example.com/subscribe",
    "https://example.com/api/data.json",
    "app://some-app",
]
# Every one of these hrefs must be rejected.
assert not any(map(valid_href, hrefs))

In [None]:
#| export

# Matches absolute http/https URLs (used case-insensitively below).
HTTP_URL_PATTERN = r'^https?://.+'
def hydrate_links(local_domain, url):
    """
    Converts relative URLs to absolute; returns None for external links.

    `url` is percent-decoded first. Absolute http(s) URLs are returned
    unchanged when their host equals `local_domain` (compared
    case-insensitively) and dropped (None) otherwise. Protocol-relative URLs
    (`//host/path`) are treated as absolute https URLs. Anything else is
    treated as a path on `local_domain` and prefixed with
    `https://<local_domain>/`. A trailing `/#`, `/` or `#` is removed from
    the result.
    """
    url = unquote(url)

    # "//host/path" names a host, not a local path: give it a scheme so it
    # goes through the same-domain check instead of being glued onto
    # `local_domain`.
    if re.match(r'//[^/]', url):
        url = "https:" + url

    if re.search(HTTP_URL_PATTERN, url, flags=re.IGNORECASE):
        # Host names are case-insensitive, so compare them lower-cased.
        same_host = urlparse(url).netloc.lower() == local_domain.lower()
        clean_link = url if same_host else None
    else:
        if url.startswith("/"):
            url = url[1:]
        clean_link = "https://" + local_domain + "/" + url

    if clean_link is not None:
        if clean_link.endswith("/#"):
            clean_link = clean_link[:-2]
        if clean_link.endswith("/") or clean_link.endswith("#"):
            clean_link = clean_link[:-1]

    return clean_link

In [None]:
# A percent-encoded same-domain URL comes back with its path decoded.
assert 'https://or.wikipedia.org/wiki/ଆଇଜାକ_ସାନ୍ତ୍ରା' == hydrate_links(
    "or.wikipedia.org",
    "https://or.wikipedia.org/wiki/%E0%AC%86%E0%AC%87%E0%AC%9C%E0%AC%BE%E0%AC%95_%E0%AC%B8%E0%AC%BE%E0%AC%A8%E0%AD%8D%E0%AC%A4%E0%AD%8D%E0%AC%B0%E0%AC%BE",
)

In [None]:
#|echo: True
local_domain = "example.com"

# Links pointing at another domain are dropped.
assert hydrate_links(local_domain, "https://otherdomain.com/path") is None

# Input link -> expected absolute link on `local_domain`.
_expected_links = {
    "https://example.com/path": "https://example.com/path",
    "/path": "https://example.com/path",
    "path": "https://example.com/path",
    "https://example.com/path/": "https://example.com/path",
    "/path/": "https://example.com/path",
    "/": "https://example.com",
    "": "https://example.com",
    "http://example.com/path": "http://example.com/path",
    "http://example.com/path/#": "http://example.com/path",
    "http://example.com/path#": "http://example.com/path",
}
for _link, _absolute in _expected_links.items():
    assert hydrate_links(local_domain, _link) == _absolute

In [None]:
#| export
# Extensions that are kept as-is on generated file names, mapped to a content type.
ALLOWED_EXT_CONTENT_TYPS = {
    ".pdf": "application/pdf",
    ".doc": "application/msword",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".odt": "application/vnd.oasis.opendocument.text",
    ".xls": "application/vnd.ms-excel",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".ppt": "application/vnd.ms-powerpoint",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ".txt": "text/plain",
    ".csv": "text/csv",
    ".html": "text/html",
}
def get_fn_from_url(url: str):
    """
    Build a flat file name from `url`.

    The URL is lower-cased and percent-decoded, then its path, params and
    query (scheme, host and fragment are not used) are joined with "/" and
    every "/", "=", "?" or whitespace character is replaced by "_".
    If the result already ends with a key of `ALLOWED_EXT_CONTENT_TYPS`
    it is returned unchanged; otherwise ".html" is appended.
    """
    parts = urlparse(unquote(url.lower()))

    # Path always comes first; params and query are appended only when present.
    pieces = [parts.path]
    if parts.params:
        pieces.append(parts.params)
    if parts.query:
        pieces.append(parts.query)

    fname = re.sub(r"[\/=\?\s]", "_", "/".join(pieces))

    if fname.endswith(tuple(ALLOWED_EXT_CONTENT_TYPS)):
        return fname

    # No recognised extension: treat the target as an HTML page.
    return fname + '.html'

In [None]:
# URL -> expected generated file name.
_expected_names = {
    "https://example.com/somepath/with/query?name=value": "_somepath_with_query_name_value.html",
    "https://example.com/somepath/report.pdf": "_somepath_report.pdf",
    "https://example.com/somepath/page.html": "_somepath_page.html",
    "https://example.com/somepath/with/space%20in%20path": "_somepath_with_space_in_path.html",
    "https://example.com/somepath/with/equals=sign": "_somepath_with_equals_sign.html",
    "https://example.com/": "_.html",
    "https://example.com/somepath/file.csv": "_somepath_file.csv",
    "https://example.com/somepath/long_query?param=value&another=more": '_somepath_long_query_param_value&another_more.html',
    "https://example.com/somepath/.docx": "_somepath_.docx",
}
for _url, _name in _expected_names.items():
    assert get_fn_from_url(_url) == _name

In [None]:
#| hide
# Run nbdev's export step for this project.
import nbdev
nbdev.nbdev_export()