In [4]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import xml.etree.ElementTree as ET
from datetime import datetime
import time
from collections import deque

In [5]:
class SitemapGenerator:
    def __init__(self, base_url, max_pages=500, delay=1):
        self.base_url = base_url.rstrip('/')
        self.domain = urlparse(base_url).netloc
        self.visited_urls = set()
        self.to_visit = deque([base_url])
        self.max_pages = max_pages
        self.delay = delay  # Delay between requests to be respectful
        
    def is_valid_url(self, url):
        """Check if URL belongs to the same domain and is valid"""
        parsed = urlparse(url)
        return (
            parsed.netloc == self.domain and
            parsed.scheme in ['http', 'https'] and
            not any(ext in url.lower() for ext in ['.pdf', '.jpg', '.png', '.gif', '.css', '.js', '.ico'])
        )
    
    def get_links_from_page(self, url):
        """Extract all links from a webpage"""
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            links = set()
            
            # Find all anchor tags with href attributes
            for link in soup.find_all('a', href=True):
                href = link['href'].strip()
                if href:
                    # Convert relative URLs to absolute URLs
                    absolute_url = urljoin(url, href)
                    # Remove fragments (anchors)
                    absolute_url = absolute_url.split('#')[0]
                    if self.is_valid_url(absolute_url):
                        links.add(absolute_url)
            
            return links
            
        except Exception as e:
            print(f"Error crawling {url}: {str(e)}")
            return set()
    
    def crawl_website(self):
        """Crawl the website and collect all URLs"""
        print(f"Starting to crawl {self.base_url}")
        print(f"Max pages: {self.max_pages}, Delay: {self.delay}s")
        
        while self.to_visit and len(self.visited_urls) < self.max_pages:
            current_url = self.to_visit.popleft()
            
            if current_url in self.visited_urls:
                continue
                
            print(f"Crawling ({len(self.visited_urls) + 1}/{self.max_pages}): {current_url}")
            
            # Add current URL to visited
            self.visited_urls.add(current_url)
            
            # Get links from current page
            links = self.get_links_from_page(current_url)
            
            # Add new links to queue
            for link in links:
                if link not in self.visited_urls and link not in self.to_visit:
                    self.to_visit.append(link)
            
            # Be respectful with delays
            time.sleep(self.delay)
        
        print(f"Crawling completed. Found {len(self.visited_urls)} pages.")
        return sorted(list(self.visited_urls))
    
    def generate_xml_sitemap(self, urls, filename='sitemap.xml'):
        """Generate XML sitemap from URLs"""
        # Create root element
        urlset = ET.Element('urlset')
        urlset.set('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9')
        
        # Add each URL
        for url in urls:
            url_elem = ET.SubElement(urlset, 'url')
            
            # Add location
            loc_elem = ET.SubElement(url_elem, 'loc')
            loc_elem.text = url
            
            # Add last modified date (current date)
            lastmod_elem = ET.SubElement(url_elem, 'lastmod')
            lastmod_elem.text = datetime.now().strftime('%Y-%m-%d')
            
            # Add change frequency (optional)
            changefreq_elem = ET.SubElement(url_elem, 'changefreq')
            changefreq_elem.text = 'weekly'
            
            # Add priority (optional)
            priority_elem = ET.SubElement(url_elem, 'priority')
            # Give homepage higher priority
            if url == self.base_url or url == self.base_url + '/':
                priority_elem.text = '1.0'
            else:
                priority_elem.text = '0.8'
        
        # Create tree and write to file
        tree = ET.ElementTree(urlset)
        ET.indent(tree, space="  ", level=0)  # Pretty print
        tree.write(filename, encoding='utf-8', xml_declaration=True)
        print(f"XML sitemap saved to {filename}")
    
    def generate_txt_sitemap(self, urls, filename='sitemap.txt'):
        """Generate text sitemap from URLs"""
        with open(filename, 'w', encoding='utf-8') as f:
            for url in urls:
                f.write(url + '\n')
        print(f"Text sitemap saved to {filename}")

def main():
    # Configuration
    base_url = "https://docs.chaicode.com/"
    max_pages = 100  # Adjust based on site size
    delay = 1  # Seconds between requests
    
    # Create generator instance
    generator = SitemapGenerator(base_url, max_pages, delay)
    
    # Crawl the website
    urls = generator.crawl_website()
    
    if urls:
        # Generate both XML and text sitemaps
        generator.generate_xml_sitemap(urls, 'sitemap.xml')
        generator.generate_txt_sitemap(urls, 'sitemap.txt')
        
        print(f"\nSitemap generation complete!")
        print(f"Total URLs found: {len(urls)}")
        print(f"Files generated: sitemap.xml, sitemap.txt")
    else:
        print("No URLs found to generate sitemap.")

In [6]:
main()

Starting to crawl https://docs.chaicode.com
Max pages: 100, Delay: 1s
Crawling (1/100): https://docs.chaicode.com/
Crawling (2/100): https://docs.chaicode.com/contribute/guide
Crawling (3/100): https://docs.chaicode.com/youtube/getting-started/
Crawling (4/100): https://docs.chaicode.com/contribute/starter-kit/managing-assets/
Crawling (5/100): https://docs.chaicode.com/contribute/starter-kit/authoring-content/
Crawling (6/100): https://docs.chaicode.com/contribute/starter-kit/mdx-crash-course/
Crawling (7/100): https://docs.chaicode.com/contribute/starter-kit/components-library/
Crawling (8/100): https://docs.chaicode.com
Crawling (9/100): https://docs.chaicode.com/contribute/guide/
Crawling (10/100): https://docs.chaicode.com/contribute/starter-kit/adding-contnet/
Crawling (11/100): https://docs.chaicode.com/contribute/starter-kit/contributing-workflow/
Crawling (12/100): https://docs.chaicode.com/contribute/starter-kit/next-step/
Crawling (13/100): https://docs.chaicode.com/contribu