#   Extract page text from a URL using Selenium

In [4]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import json
import re

- Selenium WebDriver Setup and Dynamic Content Loading

In [6]:
def setup_driver():
    """Create and return a Chrome WebDriver configured for headless use.

    The driver binary is obtained through ``ChromeDriverManager().install()``
    and the browser is started with a fixed set of command-line flags,
    including a custom user-agent string.
    """
    chrome_flags = (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    )

    options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)

    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)

def wait_for_notion_content(driver, wait):
    """Wait for the page body to appear, then scroll until the page stops growing.

    Args:
        driver: WebDriver used to run the scroll / height scripts.
        wait: WebDriverWait-like object whose ``until`` blocks for the
            ``.notion-page-content`` element.
    """
    page_body_locator = (By.CSS_SELECTOR, ".notion-page-content")
    wait.until(EC.presence_of_element_located(page_body_locator))
    # Fixed pause after the container is present, before measuring heights.
    time.sleep(5)

    read_height = "return document.body.scrollHeight"
    previous_height = driver.execute_script(read_height)

    # Keep jumping to the bottom until a scroll no longer changes the height.
    settled = False
    while not settled:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        current_height = driver.execute_script(read_height)
        settled = current_height == previous_height
        previous_height = current_height

- Text processing

In [7]:
# A leading list number (1-99) followed by a dot that does NOT start a decimal
# or version number, i.e. the dot is not immediately followed by a digit.
# Group 1 is the number, group 2 is everything after the dot.
_NUMBERED_ITEM_RE = re.compile(r"([1-9][0-9]?)\.(?![0-9])(.*)")


def clean_text(text):
    """Normalise whitespace line by line while keeping list markers.

    Each line is stripped and its internal whitespace runs are collapsed to a
    single space; lines that end up empty are dropped.  A leading ``•`` is
    re-emitted as ``"• "`` and a leading ``N.`` (N in 1..99) as ``"N. "``.

    A number is only treated as a list marker when the dot is not directly
    followed by another digit, so text such as ``"3.14 is pi"`` or
    ``"1.2.3 release"`` is left intact instead of being rewritten to
    ``"3. 14 is pi"``.

    Args:
        text: Raw text (may be ``None`` or empty).

    Returns:
        The cleaned text, lines joined with ``"\\n"``; ``""`` for falsy input.
    """
    if not text:
        return ""

    cleaned_lines = []
    for line in text.split('\n'):
        list_marker = ''
        stripped = line.lstrip()

        if stripped.startswith('•'):
            list_marker = '• '
            line = line.replace('•', '', 1)
        else:
            numbered = _NUMBERED_ITEM_RE.match(stripped)
            if numbered:
                list_marker = f"{numbered.group(1)}. "
                line = numbered.group(2)

        # split() with no arguments both strips and collapses whitespace runs.
        cleaned_line = ' '.join(line.split())
        if cleaned_line:
            cleaned_lines.append(f"{list_marker}{cleaned_line}")

    return '\n'.join(cleaned_lines)

- Link extraction and embedding in text

In [8]:
def get_element_links(element):
    """Collect ``(visible text, href)`` pairs for every ``a[href]`` under *element*.

    Any exception raised while querying the element is printed and an empty
    list is returned instead.
    """
    try:
        anchors = element.find_elements(By.CSS_SELECTOR, "a[href]")
        pairs = []
        for anchor in anchors:
            pairs.append((anchor.text, anchor.get_attribute('href')))
        return pairs
    except Exception as e:
        print(f"Error extracting links: {str(e)}")
        return []

def embed_links_in_text(text, links):
    """Rewrite occurrences of each link's text as a markdown link.

    The substitution is done in a single pass over *text*, which fixes three
    problems of naive sequential ``str.replace`` calls:

    * a link with empty text no longer injects markup between every character
      (``"abc".replace("", x)`` matches at every position);
    * a later link's text is never replaced inside the markup or href of a
      link that was already embedded;
    * markdown links already present in *text* are left untouched, so calling
      the function twice with the same links is a no-op the second time.

    Args:
        text: The text to decorate.
        links: Iterable of ``(link_text, href)`` pairs; may be empty or None.
            Pairs with an empty text or href are ignored.  When the same link
            text appears more than once, the first href wins.

    Returns:
        *text* with every matching link text replaced by ``[text](href)``.
    """
    if not text or not links:
        return text

    href_by_label = {}
    for link_text, href in links:
        if link_text and href and link_text not in href_by_label:
            href_by_label[link_text] = href
    if not href_by_label:
        return text

    # Longest labels first so "API docs" is preferred over "API".
    labels = sorted(href_by_label, key=len, reverse=True)
    # The first alternative consumes an existing markdown link so nothing
    # inside it is touched (hrefs containing ")" are not recognised here).
    pattern = re.compile(
        r"(?P<existing>\[[^\]]*\]\([^)]*\))|(?P<label>"
        + "|".join(re.escape(label) for label in labels)
        + ")"
    )

    def _to_markdown(match):
        existing = match.group("existing")
        if existing is not None:
            return existing
        label = match.group("label")
        return f"[{label}]({href_by_label[label]})"

    return pattern.sub(_to_markdown, text)

- Extract Tables

In [9]:
def extract_table_data(table_element):
    """Turn a table element into ``{'type': 'table', 'content': [row dicts]}``.

    Every ``tr`` is read as a row and every ``td``/``th`` inside it as a cell;
    cell text is cleaned and has its links embedded.  The first row supplies
    the column names.  Returns ``None`` when no rows are found or when any
    step raises (the error is printed).
    """
    try:
        def _render_cell(cell):
            plain = clean_text(cell.text)
            return embed_links_in_text(plain, get_element_links(cell))

        grid = [
            [_render_cell(cell) for cell in row.find_elements(By.CSS_SELECTOR, "td, th")]
            for row in table_element.find_elements(By.CSS_SELECTOR, "tr")
        ]

        if not grid:
            return None

        # First row as headers, the remainder as records.
        header, *body = grid
        frame = pd.DataFrame(body, columns=header)
        return {
            'type': 'table',
            'content': frame.to_dict('records')
        }

    except Exception as e:
        print(f"Error extracting table data: {str(e)}")
        return None

- Extract element content: Toggles, headers, links, tables, code blocks

In [10]:
# Tag names that are reported as headers directly, using the tag as the type.
_HEADER_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

# Notion header CSS classes mapped to the header type they are reported as.
_HEADER_CLASS_TO_TYPE = {
    'notion-header-block': 'h1',
    'notion-heading-1-block': 'h1',
    'notion-heading-2-block': 'h2',
    'notion-heading-3-block': 'h3',
    'notion-heading-4-block': 'h4',
    'notion-heading-5-block': 'h5',
    'notion-heading-6-block': 'h6',
}

# Notion block CSS classes whose cleaned text (with links embedded) is the
# content, mapped to the reported type.  Checked in this order.
_TEXT_CLASS_TO_TYPE = {
    'notion-bulleted_list-block': 'bullet',
    'notion-numbered_list-block': 'number',
    'notion-text-block': 'text',
    'notion-callout-block': 'callout',
    'notion-quote-block': 'quote',
}


def _header_content(element, element_id, header_type, element_links):
    """Build the result dict for a header element of the given type."""
    spans = element.find_elements(By.CSS_SELECTOR, "span[data-token-index]")
    # Prefer the first token span's text; fall back to the whole element.
    raw_text = spans[0].text if spans else element.text
    return {
        'id': element_id,
        'type': header_type,
        'content': embed_links_in_text(clean_text(raw_text), element_links),
    }


def get_element_content(element):
    """Convert one page element into a content dict.

    The element is classified, in order, as a table, a header (by tag name or
    Notion header class), a code block, or one of the plain text-like blocks
    (bullet, number, text, callout, quote).

    Links are embedded exactly once for text-like blocks and headers, and
    never for code blocks (including the code fallback used when the
    language or ``pre code`` element cannot be found).

    Args:
        element: A Selenium WebElement.

    Returns:
        A dict with at least ``id``, ``type`` and ``content`` (code blocks may
        also carry ``language``), or ``None`` when the element is of no known
        kind or an error occurs (the error is printed).
    """
    try:
        # get_attribute may return None when the attribute is absent.
        class_name = element.get_attribute('class') or ''
        element_id = element.get_attribute('data-block-id')

        element_links = get_element_links(element)

        # Tables first; fall through to the other kinds if extraction fails.
        if 'notion-table' in class_name:
            table_data = extract_table_data(element)
            if table_data:
                table_data['id'] = element_id
                return table_data

        # Headers: real h1-h6 tags take precedence over Notion header classes.
        tag_name = element.tag_name.lower()
        if tag_name in _HEADER_TAGS:
            return _header_content(element, element_id, tag_name, element_links)
        for header_class, header_type in _HEADER_CLASS_TO_TYPE.items():
            if header_class in class_name:
                return _header_content(element, element_id, header_type, element_links)

        if 'notion-code-block' in class_name:
            try:
                return {
                    'id': element_id,
                    'type': 'code',
                    'language': element.find_element(
                        By.CSS_SELECTOR, ".notion-code-block-language"
                    ).text,
                    'content': element.find_element(By.CSS_SELECTOR, "pre code").text,
                }
            except Exception:
                # Expected sub-elements missing: fall back to the block's
                # cleaned text, still without embedding links into code.
                return {
                    'id': element_id,
                    'type': 'code',
                    'content': clean_text(element.text),
                }

        for block_class, content_type in _TEXT_CLASS_TO_TYPE.items():
            if block_class in class_name:
                return {
                    'id': element_id,
                    'type': content_type,
                    'content': embed_links_in_text(clean_text(element.text), element_links),
                }

        return None
    except Exception as e:
        print(f"Error getting element content: {str(e)}")
        return None

- Expand and extract toggle content

In [11]:
def expand_toggle(driver, toggle):
    """Open *toggle* if it is collapsed, then do the same for toggles inside it.

    The toggle is scrolled into view and its ``div[role='button']`` is clicked
    via JavaScript unless ``aria-expanded`` is already ``'true'``.  Every
    descendant toggle block is then expanded recursively.  Any exception is
    printed and swallowed.
    """
    try:
        driver.execute_script("arguments[0].scrollIntoView();", toggle)
        button = toggle.find_element(By.CSS_SELECTOR, "div[role='button']")

        already_open = button.get_attribute('aria-expanded') == 'true'
        if not already_open:
            driver.execute_script("arguments[0].click();", button)
            # Short pause after the click before looking for nested toggles.
            time.sleep(0.5)

        descendants = toggle.find_elements(
            By.CSS_SELECTOR, "div[class*='notion-toggle-block']"
        )
        for child in descendants:
            if child != toggle:
                expand_toggle(driver, child)

    except Exception as e:
        print(f"Error expanding toggle: {str(e)}")

def process_toggle_content(driver, toggle, depth, processed_blocks):
    """Expand a toggle block and collect its title and nested content.

    Args:
        driver: WebDriver used to expand the toggle.
        toggle: WebElement of the toggle block.
        depth: Nesting depth recorded in the result (callers pass 0 for
            top-level toggles; nested toggles get ``depth + 1``).
        processed_blocks: Set of ``data-block-id`` values already handled.
            It is updated in place with this toggle's id and the ids of the
            content elements collected here.

    Returns:
        A dict with keys ``id``, ``type`` (``"toggle"``), ``title``,
        ``depth`` and ``content`` (list of content dicts), or ``None`` if the
        toggle was already processed or an error occurred (error is printed).
    """
    toggle_id = toggle.get_attribute('data-block-id')
    if toggle_id in processed_blocks:
        return None

    # Only real ids are recorded: adding None would make every later element
    # without a data-block-id look "already processed" and be skipped.
    if toggle_id:
        processed_blocks.add(toggle_id)

    try:
        expand_toggle(driver, toggle)
        title_element = toggle.find_element(By.CSS_SELECTOR, "[data-content-editable-leaf='true']")
        toggle_title = clean_text(title_element.text)

        # Take links from the title element only; searching the whole toggle
        # would let links from the body be embedded into the title.
        title_links = get_element_links(title_element)
        toggle_title = embed_links_in_text(toggle_title, title_links)

        content_selectors = (
            "div[class*='notion-text-block'], "
            "div[class*='notion-bulleted_list-block'], "
            "div[class*='notion-numbered_list-block'], "
            "div[class*='notion-code-block'], "
            "div[class*='notion-header-block'], "
            "div[class*='notion-heading-'], "
            "div[class*='notion-callout-block'], "
            "div[class*='notion-quote-block'], "
            "div[class*='notion-toggle-block'], "
            "div[class*='notion-table'], "
            "table, "
            "h1, h2, h3, h4, h5, h6"
        )

        elements = toggle.find_elements(By.CSS_SELECTOR, content_selectors)
        toggle_content = []

        for element in elements:
            element_id = element.get_attribute('data-block-id')
            if element_id in processed_blocks:
                continue

            # get_attribute may return None when the attribute is absent.
            element_class = element.get_attribute('class') or ''
            if 'notion-toggle-block' in element_class:
                nested_toggle = process_toggle_content(driver, element, depth + 1, processed_blocks)
                if nested_toggle:
                    toggle_content.append(nested_toggle)
            else:
                content_data = get_element_content(element)
                if content_data:
                    if element_id:
                        processed_blocks.add(element_id)
                    toggle_content.append(content_data)

        return {
            "id": toggle_id,
            "type": "toggle",
            "title": toggle_title,
            "depth": depth,
            "content": toggle_content
        }

    except Exception as e:
        print(f"Error processing toggle: {str(e)}")
        return None


In [14]:
def extract_all_content(url):
    """Load a Notion page in a fresh browser and extract its content blocks.

    A failure on a single element (for example a stale element reference) is
    printed and that element is skipped, so content gathered from the other
    elements is still returned.  A failure while loading the page or locating
    the content container is printed and yields an empty list.

    Args:
        url: Address of the page to load.

    Returns:
        List of content dicts in document order (toggles carry their nested
        content); ``[]`` if the page could not be loaded.
    """
    driver = setup_driver()
    wait = WebDriverWait(driver, 20)
    all_content = []
    processed_blocks = set()

    try:
        driver.get(url)
        wait_for_notion_content(driver, wait)

        content_container = driver.find_element(By.CSS_SELECTOR, ".notion-page-content")

        content_selectors = (
            "div[class*='notion-text-block'], "
            "div[class*='notion-bulleted_list-block'], "
            "div[class*='notion-numbered_list-block'], "
            "div[class*='notion-code-block'], "
            "div[class*='notion-header-block'], "
            "div[class*='notion-heading-'], "
            "div[class*='notion-callout-block'], "
            "div[class*='notion-quote-block'], "
            "div[class*='notion-toggle-block'], "
            "div[class*='notion-table'], "
            "table, "
            "h1, h2, h3, h4, h5, h6"
        )

        all_elements = content_container.find_elements(By.CSS_SELECTOR, content_selectors)

        for element in all_elements:
            try:
                element_id = element.get_attribute('data-block-id')
                if element_id in processed_blocks:
                    continue

                # get_attribute may return None when the attribute is absent.
                element_class = element.get_attribute('class') or ''
                if 'notion-toggle-block' in element_class:
                    toggle_data = process_toggle_content(driver, element, 0, processed_blocks)
                    if toggle_data:
                        all_content.append(toggle_data)
                else:
                    content_data = get_element_content(element)
                    if content_data:
                        if element_id:
                            processed_blocks.add(element_id)
                        all_content.append(content_data)
            except Exception as e:
                # Isolate per-element failures so one bad element does not
                # throw away everything collected so far.
                print(f"Error processing element, skipping it: {str(e)}")

        return all_content

    except Exception as e:
        print(f"Error during content extraction: {str(e)}")
        return []

    finally:
        # Always release the browser, whether extraction succeeded or not.
        driver.quit()

- Print and save extracted content

In [15]:
def print_content(content_data, indent_level=0):
    """Print extracted content items to stdout in a markdown-like layout.

    Toggles are printed as a ``▼`` title followed by their children one
    indent level (two spaces) deeper; tables are rendered through a pandas
    DataFrame; code is fenced; headers get one ``#`` per level; bullet and
    number items get a ``- `` prefix; callouts and quotes get ``> ``; any
    other type is printed as plain content.
    """
    pad = "  " * indent_level

    for entry in content_data:
        kind = entry['type']

        if kind == 'toggle':
            print(f"\n{pad}▼ {entry['title']}")
            print_content(entry['content'], indent_level + 1)
            continue

        if kind == 'table':
            print(f"\n{pad}Table:")
            frame = pd.DataFrame(entry['content'])
            print(frame.to_string(index=False))
            continue

        if kind == 'code':
            language = entry.get('language', '')
            fence = f"{pad}```"
            print(f"\n{fence}{language}\n{entry['content']}\n{fence}")
            continue

        if kind.startswith('h'):
            # The character after 'h' is the header level.
            hashes = '#' * int(kind[1])
            print(f"\n{pad}{hashes} {entry['content']}")
            continue

        if kind in ('bullet', 'number'):
            print(f"{pad}- {entry['content']}")
        elif kind in ('callout', 'quote'):
            print(f"\n{pad}> {entry['content']}")
        else:
            print(f"{pad}{entry['content']}")

def save_content_to_file(content_data, file, indent_level=0):
    """Write extracted content items to *file* in a markdown-like layout.

    *file* only needs a ``write`` method.  Toggles are written as a ``▼``
    title followed by their children one indent level (two spaces) deeper;
    tables are rendered through a pandas DataFrame; code is fenced; headers
    get one ``#`` per level; bullet and number items get a ``- `` prefix;
    callouts and quotes get ``> ``; any other type is written as plain
    content.
    """
    pad = "  " * indent_level
    emit = file.write

    for entry in content_data:
        kind = entry['type']

        if kind == 'toggle':
            emit(f"\n{pad}▼ {entry['title']}\n")
            save_content_to_file(entry['content'], file, indent_level + 1)
            continue

        if kind == 'table':
            emit(f"\n{pad}Table:\n")
            frame = pd.DataFrame(entry['content'])
            emit(frame.to_string(index=False))
            emit('\n')
            continue

        if kind == 'code':
            language = entry.get('language', '')
            fence = f"{pad}```"
            emit(f"\n{fence}{language}\n{entry['content']}\n{fence}\n")
            continue

        if kind.startswith('h'):
            # The character after 'h' is the header level.
            hashes = '#' * int(kind[1])
            emit(f"\n{pad}{hashes} {entry['content']}\n")
            continue

        if kind in ('bullet', 'number'):
            emit(f"{pad}- {entry['content']}\n")
        elif kind in ('callout', 'quote'):
            emit(f"\n{pad}> {entry['content']}\n")
        else:
            emit(f"{pad}{entry['content']}\n")

In [None]:
# Page to scrape and the text file that receives the export.
url = "https://crustdata.notion.site/Crustdata-Dataset-API-Detailed-Examples-b83bd0f1ec09452bb0c2cac811bba88c#aa49c8b2a8ba4a05a49ca380fed4b95b"
output_file = "exported_content_v1.txt"

# Run the scraper against the page.
content = extract_all_content(url)

# Write the formatted content as UTF-8 text.
with open(output_file, mode="w", encoding="utf-8") as file:
    save_content_to_file(content, file)

print(f"\nContent saved to {output_file}")