In [None]:
# 1.1) Function to get and parse HTML content from a Wikipedia page:
import requests
from bs4 import BeautifulSoup

def get_wikipedia_page_content(url, timeout=10):
    """Download a page and return it parsed as a Beautiful Soup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. It is
            forwarded to ``requests.get``; without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        The response body parsed by ``BeautifulSoup`` with ``'html.parser'``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.

    Note:
        The HTTP status code is not inspected, so an error page is parsed
        like any other response.
    """
    response = requests.get(url, timeout=timeout)

    # Parse the raw bytes so Beautiful Soup can detect the encoding itself.
    return BeautifulSoup(response.content, 'html.parser')


In [None]:
# 2. Function to extract the article title from a Wikipedia page:
import requests
from bs4 import BeautifulSoup

def get_wikipedia_article_title(url, timeout=10):
    """Return the text of the ``<title>`` element of the page at ``url``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get`` so a stalled connection cannot hang forever.

    Returns:
        str: The text of the first ``<title>`` element, or an empty string
        when the document has no ``<title>`` element.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    title_element = soup.find('title')
    # find() returns None when nothing matches; guard it instead of
    # failing with AttributeError on a page without a <title>.
    if title_element is None:
        return ''

    return title_element.get_text()


In [None]:
# 3. Function to extract the article text as a dictionary that maps each heading to the paragraph that follows it:
import requests
from bs4 import BeautifulSoup

def get_wikipedia_article_paragraphs(url, timeout=10):
    """Map each heading of a page to the first paragraph that follows it.

    Only the element with ``id="content"`` is searched. Headings and
    paragraphs are visited in document order, so a paragraph is matched to
    its heading even when either of them is nested inside a wrapper
    element. A heading that is followed by another heading before any
    paragraph gets no entry.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get`` so a stalled connection cannot hang forever.

    Returns:
        dict: Stripped heading text -> stripped text of the first ``<p>``
        found after that heading. Empty when the page has no element with
        ``id="content"``. If two headings share the same text, the later
        one overwrites the earlier one.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    content_div = soup.find(id='content')
    if content_div is None:
        # No main content container: nothing to extract.
        return {}

    article_text = {}
    pending_heading = None  # heading still waiting for its first paragraph
    for element in content_div.find_all(heading_tags + ['p']):
        if element.name == 'p':
            if pending_heading is not None:
                article_text[pending_heading] = element.get_text().strip()
                # Only the first paragraph belongs to the heading.
                pending_heading = None
        else:
            # A new heading replaces any heading that never got a paragraph,
            # so a paragraph is never attributed to the wrong section.
            pending_heading = element.get_text().strip()

    return article_text


In [None]:
# 4. Function to collect every link that points to another Wikipedia page:
import requests
from bs4 import BeautifulSoup

def get_wikipedia_internal_links(url, timeout=10):
    """Collect the absolute URLs of all ``/wiki/`` links on a page.

    Args:
        url: Address of the page to fetch. It also serves as the base
            against which the relative ``/wiki/...`` hrefs are resolved, so
            links keep the scheme and host of the page they came from
            instead of always being forced onto ``en.wikipedia.org``.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get`` so a stalled connection cannot hang forever.

    Returns:
        list: One absolute URL per ``<a>`` element whose ``href`` starts
        with ``/wiki/``, in document order. Duplicates are kept.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    from urllib.parse import urljoin

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    internal_links = []
    for link in soup.find_all('a'):
        href = link.get('href')  # None for anchors without an href
        if href is not None and href.startswith('/wiki/'):
            internal_links.append(urljoin(url, href))

    return internal_links


In [1]:
# 5. Function to wrap all the previous functions into a single function that takes as parameters a Wikipedia link:
def scrape_wikipedia_page(url):
    """Run every scraping helper on one Wikipedia URL and bundle the results.

    Each helper takes the URL and performs its own download, so calling
    this function issues one HTTP request per helper.

    Args:
        url: Address of the Wikipedia page to scrape.

    Returns:
        dict: The helpers' results under these keys:
            ``'soup'``       - result of ``get_wikipedia_page_content(url)``
            ``'title'``      - result of ``get_wikipedia_article_title(url)``
            ``'paragraphs'`` - result of ``get_wikipedia_article_paragraphs(url)``
            ``'links'``      - result of ``get_wikipedia_internal_links(url)``
    """
    # Get and parse HTML content from the Wikipedia page
    soup = get_wikipedia_page_content(url)

    # Extract the article title
    article_title = get_wikipedia_article_title(url)

    # Extract the heading -> paragraph mapping
    article_paragraphs = get_wikipedia_article_paragraphs(url)

    # Collect the links to other Wikipedia pages
    internal_links = get_wikipedia_internal_links(url)

    return {
        'soup': soup,
        'title': article_title,
        'paragraphs': article_paragraphs,
        'links': internal_links,
    }


In [5]:
# Single function that calls all the other functions in the correct order.

import requests
from bs4 import BeautifulSoup

def get_html(url, timeout=10):
    """Download a page and return its raw body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. It is
            forwarded to ``requests.get``; without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        bytes: The undecoded response body (``response.content``).

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.

    Note:
        The HTTP status code is not inspected, so the body of an error
        page is returned like any other response.
    """
    response = requests.get(url, timeout=timeout)
    return response.content

def extract_title(html):
    """Return the text of the ``<title>`` element of an HTML document.

    Args:
        html: HTML markup (``str`` or ``bytes``) to parse.

    Returns:
        str: The text of the first ``<title>`` element, or an empty string
        when the document has no ``<title>`` element.
    """
    soup = BeautifulSoup(html, 'html.parser')
    title_element = soup.find('title')
    # find() returns None when nothing matches; guard it instead of
    # failing with AttributeError on markup without a <title>.
    if title_element is None:
        return ''
    return title_element.text

def extract_content(html):
    """Group the paragraphs of an HTML document under their ``<h2>`` headings.

    ``<h2>`` and ``<p>`` elements are visited in document order; every
    paragraph is appended to the list of the most recent ``<h2>``.

    Args:
        html: HTML markup (``str`` or ``bytes``) to parse.

    Returns:
        dict: Stripped heading text -> list of stripped paragraph texts.
        Paragraphs that appear before the first ``<h2>`` are collected
        under the empty-string key ``''``. When the same heading text
        occurs more than once, its paragraphs are merged into one list.
    """
    soup = BeautifulSoup(html, 'html.parser')
    content = {}
    section_name = ''  # key for paragraphs that precede the first <h2>
    for element in soup.find_all(['h2', 'p']):
        if element.name == 'h2':
            section_name = element.text.strip()
            # setdefault keeps paragraphs already stored under a repeated
            # heading instead of resetting the list.
            content.setdefault(section_name, [])
        elif element.name == 'p':
            # setdefault also creates the '' bucket on demand, so a leading
            # paragraph no longer raises KeyError.
            content.setdefault(section_name, []).append(element.text.strip())
    return content

def extract_links(html):
    """List the absolute URLs of all ``/wiki/`` links in an HTML document.

    Args:
        html: HTML markup (``str`` or ``bytes``) to parse.

    Returns:
        list: ``'https://en.wikipedia.org'`` + href for every ``<a>``
        element whose ``href`` is non-empty and starts with ``/wiki/``,
        in document order. Duplicates are kept.
    """
    anchors = BeautifulSoup(html, 'html.parser').find_all('a')
    # Anchors without an href yield None here and are filtered out below.
    targets = (anchor.get('href') for anchor in anchors)
    return [
        'https://en.wikipedia.org' + target
        for target in targets
        if target and target.startswith('/wiki/')
    ]

def scrape_wikipedia(url):
    """Download a page and render its title, sections and links as text.

    The page is fetched once with ``get_html``; the title, the sectioned
    paragraphs and the links are then extracted from that same markup.

    Args:
        url: Address of the page to scrape.

    Returns:
        str: A report with a ``Title:`` line, a ``Content:`` block listing
        each section name followed by its paragraphs as ``- `` bullets,
        and a ``Links:`` block with one ``- `` bullet per link.
    """
    page_html = get_html(url)
    page_title = extract_title(page_html)
    sections = extract_content(page_html)
    page_links = extract_links(page_html)

    # Collect the pieces and join once instead of growing a string with +=.
    pieces = [f"Title: {page_title}\n\nContent:\n"]
    for heading, body in sections.items():
        pieces.append(f"{heading}\n")
        pieces.extend(f"- {text}\n" for text in body)
    pieces.append("\nLinks:\n")
    pieces.extend(f"- {target}\n" for target in page_links)
    return "".join(pieces)


# Example usage: build the text report for the "Web scraping" article and print it.
# scrape_wikipedia downloads the page through get_html, so running this cell
# needs network access.
result = scrape_wikipedia('https://en.wikipedia.org/wiki/Web_scraping')
print(result)


Title: Web scraping - Wikipedia

Content:
Contents
- Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites.[1] Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis.
- Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet 