In [None]:
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

def get_all_valid_links(url, domain=None, visited=None):
    if visited is None:
        visited = set()

    if domain is None:
        domain = urlparse(url).netloc

    if url in visited:
        return visited

    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            print(f"Skipping {url} - HTTP {response.status_code}")
            return visited

        if "nothing found" in response.text.lower():
            print(f"Skipping {url} - 'Nothing Found' in content")
            return visited

        visited.add(url)
        print(f"Valid URL: {url}")

        soup = BeautifulSoup(response.content, "html.parser")
        for tag in soup.find_all("a", href=True):
            href = tag['href']
            full_url = urljoin(url, href)
            parsed_href = urlparse(full_url)

            if parsed_href.netloc == domain:
                normalized_url = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
                if normalized_url not in visited:
                    get_all_valid_links(normalized_url, domain, visited)

    except Exception as e:
        print(f"Error visiting {url}: {e}")

    return visited


In [13]:
start_url = "https://lambdalogics.com/"
all_links = get_all_valid_links(start_url)
print("\nAll internal links found:")
for link in all_links:
    print(link)


Valid URL: https://lambdalogics.com/
Valid URL: https://lambdalogics.com/about
Valid URL: https://lambdalogics.com/expertise/
Error visiting https://lambdalogics.com/expertise/cloud-services-and-devops/: HTTPSConnectionPool(host='lambdalogics.com', port=443): Read timed out. (read timeout=5)
Error visiting https://lambdalogics.com/expertise/custom-development/: HTTPSConnectionPool(host='lambdalogics.com', port=443): Read timed out.
Error visiting https://lambdalogics.com/expertise/data-engineering-services/: HTTPSConnectionPool(host='lambdalogics.com', port=443): Read timed out.
Valid URL: https://lambdalogics.com/expertise/information-security-services/
Error visiting https://lambdalogics.com/expertise/cloud-services-and-devops/: HTTPSConnectionPool(host='lambdalogics.com', port=443): Read timed out. (read timeout=5)
Error visiting https://lambdalogics.com/expertise/custom-development/: HTTPSConnectionPool(host='lambdalogics.com', port=443): Read timed out. (read timeout=5)
Error visi

In [14]:
list(all_links)

['https://lambdalogics.com/',
 'https://lambdalogics.com/expertise/information-security-services/',
 'https://lambdalogics.com/expertise/',
 'https://lambdalogics.com/careers/',
 'https://lambdalogics.com/expertise/custom-development/',
 'https://lambdalogics.com/expertise/staff-augmentation-services/',
 'https://lambdalogics.com/contact-us/',
 'https://lambdalogics.com/expertise/mobile-app-development/',
 'https://lambdalogics.com/about/',
 'https://lambdalogics.com/expertise/data-engineering-services/',
 'https://lambdalogics.com/about',
 'https://lambdalogics.com/contact',
 'https://lambdalogics.com/expertise/cloud-services-and-devops/']