In [4]:
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin
import time

# Page whose links seed the crawl
base_url = 'https://developers.tiktok.com/doc/overview'

# Folder that receives the generated files; created up front so that later
# writes cannot fail on a missing directory
output_dir = '/Users/tristangardner/Documents/Programming/01_Apps/_TikTokDev/API-Docs'
os.makedirs(output_dir, exist_ok=True)

def get_all_links(url, timeout=30):
    """Collect the unique site-relative links found on a page.

    Downloads ``url``, parses it as HTML and gathers every anchor whose
    ``href`` starts with a single ``/``. Each href is resolved to an absolute
    URL against ``url`` itself.

    Args:
        url: Address of the page to scan for links.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A list of absolute URLs without duplicates, in order of first
        appearance on the page. An empty list is returned (and the error is
        printed) when the page cannot be fetched or parsed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        links = []
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            # A leading '//' is a protocol-relative URL that points at another
            # host, so only a single leading '/' counts as a link into this site.
            if href.startswith('/') and not href.startswith('//'):
                links.append(urljoin(url, href))

        # dict.fromkeys removes duplicates while keeping page order, which
        # makes the crawl order reproducible (a set would shuffle it).
        return list(dict.fromkeys(links))

    # Deliberately broad: link discovery is best-effort and callers rely on
    # getting a list back rather than an exception.
    except Exception as e:
        print(f"Error fetching links from {url}: {e}")
        return []

def _filename_from_url(url):
    """Return the Markdown file name for ``url`` (last path segment + ``.md``)."""
    # Imported locally because the module only imports urljoin at the top.
    from urllib.parse import urlsplit

    # Working on the parsed path keeps query strings and fragments out of the
    # name, even when they contain '/' themselves. Trailing slashes are dropped
    # so such URLs do not all collapse onto the same 'index.md'.
    segment = urlsplit(url).path.rstrip('/').split('/')[-1]
    return f"{segment or 'index'}.md"


def _clean_text(element):
    """Return the element's text with whitespace runs collapsed to one space."""
    # get_text(strip=True) strips every text node on its own and joins the
    # pieces with nothing in between, which fuses words around inline tags
    # (links, bold, inline code). Normalising the joined text avoids that.
    return ' '.join(element.get_text().split())


def _is_nested(element, root, tag_names):
    """Tell whether ``element`` lies inside another handled tag below ``root``."""
    for parent in element.parents:
        if parent is root:
            break
        if parent.name in tag_names:
            return True
    return False


def _list_lines(list_tag, depth=0):
    """Return Markdown bullet lines for a list tag, indenting nested lists."""
    lines = []
    for item in list_tag.find_all('li', recursive=False):
        own_text = []
        sublists = []
        for child in item.children:
            name = getattr(child, 'name', None)
            if name in ('ul', 'ol'):
                # Rendered separately below so its text is not repeated here.
                sublists.append(child)
            elif name:
                own_text.append(child.get_text())
            else:
                own_text.append(str(child))

        text = ' '.join(''.join(own_text).split())
        if text:
            lines.append(f"{'  ' * depth}- {text}\n")
        for sublist in sublists:
            lines.extend(_list_lines(sublist, depth + 1))
    return lines


def save_page_content(url, timeout=30):
    """Download a page and store its main content as a Markdown file.

    The main content is the first ``main`` element, else the first ``article``
    element, else the first ``div`` with class ``content``. Headings (h1-h4),
    paragraphs, code blocks and lists inside it are converted to Markdown and
    written to ``<output_dir>/<last URL path segment>.md``, where
    ``output_dir`` is the module-level output folder. Elements nested inside
    an already handled element are not emitted a second time.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        None. Progress and errors are reported with ``print``; failures are
        caught so that one bad page does not abort a batch run.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    heading_tags = ('h1', 'h2', 'h3', 'h4')
    handled_tags = ['h1', 'h2', 'h3', 'h4', 'p', 'pre', 'code', 'ul', 'ol']

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract main content
        content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
        if content is None:
            print(f"No main content found for {url}")
            return

        filename = _filename_from_url(url)
        filepath = os.path.join(output_dir, filename)

        # Page title: first h1, else the document title, else the file name
        title_tag = soup.find('h1') or soup.find('title')
        title = (_clean_text(title_tag) if title_tag else '') or filename

        parts = [f"# {title}\n\n", f"Source: {url}\n\n"]

        for element in content.find_all(handled_tags):
            # The title heading is already in the header, and nested elements
            # are covered by the text of their enclosing element.
            if element is title_tag or _is_nested(element, content, handled_tags):
                continue

            if element.name in heading_tags:
                text = _clean_text(element)
                if text:
                    level = int(element.name[1])
                    parts.append(f"\n{'#' * level} {text}\n\n")

            elif element.name == 'p':
                text = _clean_text(element)
                if text:  # Only write non-empty paragraphs
                    parts.append(f"{text}\n\n")

            elif element.name in ('pre', 'code'):
                # Keep inner whitespace and line breaks: they are significant
                # in code, only surrounding blank lines are trimmed.
                code = element.get_text().strip('\n')
                if code.strip():
                    parts.append(f"```\n{code}\n```\n\n")

            else:  # 'ul' or 'ol'
                parts.extend(_list_lines(element))
                parts.append("\n")

        # Write only after rendering succeeded, so an error while converting
        # the page cannot leave a half-written file behind.
        with open(filepath, 'w', encoding='utf-8') as f:
            f.writelines(parts)

        print(f"Saved content from {url} to {filepath}")

    # Deliberately broad: this is the per-page boundary of a batch crawl and
    # callers expect a printed error rather than an exception.
    except Exception as e:
        print(f"Error saving content from {url}: {e}")

# Discover the links on the start page, then store each linked page
links = get_all_links(base_url)
link_total = len(links)
print(f"Found {link_total} unique links")

for link in links:
    save_page_content(link)
    # Pause for one second after every page to keep the request rate low
    time.sleep(1)

print("Scraping completed!")

Found 131 unique links
Saved content from https://developers.tiktok.com/doc/webhooks-events?enter_method=left_navigation to /Users/tristangardner/Documents/Programming/01_Apps/_TikTokDev/API-Docs/webhooks-events.md
Saved content from https://developers.tiktok.com/doc/mobile-sdk-android-quickstart?enter_method=left_navigation to /Users/tristangardner/Documents/Programming/01_Apps/_TikTokDev/API-Docs/mobile-sdk-android-quickstart.md
Saved content from https://developers.tiktok.com/doc/commercial-content-api-query-ads?enter_method=left_navigation to /Users/tristangardner/Documents/Programming/01_Apps/_TikTokDev/API-Docs/commercial-content-api-query-ads.md
Saved content from https://developers.tiktok.com/doc/vce-query-profiles?enter_method=left_navigation to /Users/tristangardner/Documents/Programming/01_Apps/_TikTokDev/API-Docs/vce-query-profiles.md
Saved content from https://developers.tiktok.com/doc/vce-query-liked-videos?enter_method=left_navigation to /Users/tristangardner/Documents/P