In [None]:
import json
import time
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Entry point of the crawl; relative links are resolved against this URL.
base_url: str = "https://handbook.gitlab.com"

# One dict per crawled page, in crawl order; dumped to JSON at the end.
all_data: list = []

# URLs that have already been handed to the page extractor.
visited_links: set = set()

def extract_and_store_page(url):
    """Fetch a single page and append its title, URL and text to ``all_data``.

    The title is the text of the first ``<h1>`` or ``<h2>`` element (or
    ``"No Title"`` when neither exists) and the content is the text of all
    ``<p>`` elements joined with single spaces.

    A page that cannot be fetched (network error, timeout, or HTTP error
    status) is reported and skipped, so one bad link does not abort the crawl.

    Args:
        url: Absolute URL of the page to fetch.
    """
    print(f"Crawling: {url}")

    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=30)
        # Treat 4xx/5xx as failures instead of storing the error page's text.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {url}: {exc}")
        # Still pause so a run of failing links does not hammer the server.
        time.sleep(1)
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Look the heading up once; fall back to a placeholder when there is none.
    heading = soup.find(['h1', 'h2'])
    title = heading.get_text() if heading is not None else "No Title"

    # Content is the text of every paragraph, separated by single spaces.
    content = " ".join(p.get_text() for p in soup.find_all('p'))

    all_data.append({
        "title": title,
        "url": url,
        "content": content,
    })

    # Pause between requests to avoid rate limiting (adjust as needed).
    time.sleep(1)

def crawl_main_page():
    """Crawl the main page, then fetch and store every internal page it links to.

    Downloads ``base_url``, resolves each ``<a href>`` against it, and calls
    ``extract_and_store_page`` once for every distinct URL that belongs to the
    same site as ``base_url``. Whatever has been collected in ``all_data`` is
    then written to ``gitlab_handbook_data.json``, even if the crawl is
    interrupted part-way through.

    Raises:
        requests.RequestException: If the main page itself cannot be fetched.
    """
    print(f"Crawling main page: {base_url}")

    # Fail early (before touching the output file) if the main page is
    # unreachable; the timeout keeps a stalled connection from hanging forever.
    response = requests.get(base_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    root = urlparse(base_url)

    try:
        for link in soup.find_all('a', href=True):
            # Resolve relative URLs and drop any #fragment so that anchors
            # pointing into the same page are not crawled as separate pages.
            full_url, _fragment = urldefrag(urljoin(base_url, link['href']))
            target = urlparse(full_url)

            # Compare parsed components instead of a substring test: an
            # external URL that merely contains base_url (e.g. a share link
            # carrying it in the query string) must not count as internal.
            is_internal = (
                target.scheme == root.scheme
                and target.netloc == root.netloc
                and target.path.startswith(root.path)
            )
            if not is_internal or full_url in visited_links:
                continue

            visited_links.add(full_url)
            extract_and_store_page(full_url)
    finally:
        # Save whatever was collected, even when a page raised or the user
        # interrupted the run, so a long crawl is not lost to a late failure.
        with open('gitlab_handbook_data.json', 'w', encoding='utf-8') as json_file:
            json.dump(all_data, json_file, indent=4)

# Start the crawling process only when this file is executed directly (as a
# script or a notebook cell), so importing it does not trigger network requests.
if __name__ == "__main__":
    crawl_main_page()
