In [1]:
from bs4 import BeautifulSoup

# Static sample page used as the parsing input for the cells that follow.
# Structure: a <header> with one intro paragraph, a <nav> with three anchor
# links, three <section> blocks (football, basketball, tennis) that each hold
# an <article> with an <h3>, a <p> and an embedded <iframe>, and a <footer>
# containing a contact form.
html_content = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sports World</title>
    <style>
        body { font-family: Arial, sans-serif; }
        header, nav, section, article, footer { margin: 20px; padding: 15px; }
        nav { background-color: #333; }
        nav a { color: white; padding: 14px 20px; text-decoration: none; display: inline-block; }
        nav a:hover { background-color: #ddd; color: black; }
        .video { text-align: center; margin: 20px 0; }
    </style>
</head>
<body>
    <header>
        <h1>Welcome to Sports World</h1>
        <p>Your one-stop destination for the latest sports news and videos.</p>
    </header>
    <nav>
        <a href="#football">Football</a>
        <a href="#basketball">Basketball</a>
        <a href="#tennis">Tennis</a>
    </nav>
    <section id="football">
        <h2>Football</h2>
        <article>
            <h3>Latest Football News</h3>
            <p>Read about the latest football matches and player news.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/football-video-id" frameborder="0" allowfullscreen>
                </iframe>
            </div>
        </article>
    </section>
    <section id="basketball">
        <h2>Basketball</h2>
        <article>
            <h3>NBA Highlights</h3>
            <p>Watch highlights from the latest NBA games.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/basketball-video-id" frameborder="0" allowfullscreen>
                </iframe>
            </div>
        </article>
    </section>
    <section id="tennis">
        <h2>Tennis</h2>
        <article>
            <h3>Grand Slam Updates</h3>
            <p>Get the latest updates from the world of Grand Slam tennis.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/tennis-video-id" frameborder="0" allowfullscreen></iframe>
            </div>
        </article>
    </section>
    <footer>
        <form action="mailto:contact@sportsworld.com" method="post" enctype="text/plain">
            <label for="name">Name:</label><br>
            <input type="text" id="name" name="name"><br>
            <label for="email">Email:</label><br>
            <input type="email" id="email" name="email"><br>
            <label for="message">Message:</label><br>
            <textarea id="message" name="message" rows="4" cols="50"></textarea><br><br>
            <input type="submit" value="Send">
        </form>
    </footer>
</body>
</html>
"""

# Parse the markup once with the standard-library-backed 'html.parser' backend;
# later cells query this notebook-level `soup` object.
soup = BeautifulSoup(html_content, 'html.parser')

In [2]:
# Locate the first <title> element of the parsed document, then read the
# single string it contains.
title = soup.find('title')
title_text = title.string

print(f"Title of the webpage: {title_text}")

Title of the webpage: Sports World


In [5]:
# Gather every <p> element in document order.
paragraphs = soup.find_all(name='p')

# Print the text content of each paragraph on its own line.
for paragraph in paragraphs:
    print(f"Paragraph: {paragraph.text}")

Paragraph: Your one-stop destination for the latest sports news and videos.
Paragraph: Read about the latest football matches and player news.
Paragraph: Watch highlights from the latest NBA games.
Paragraph: Get the latest updates from the world of Grand Slam tennis.


In [8]:
# The keyword filter src=True selects every tag that carries a `src`
# attribute, whatever its value.
elements_with_src = soup.find_all(src=True)

# Print the attribute value of each matching element.
for element in elements_with_src:
    src = element['src']
    print(f"Source: {src}")

Source: https://www.youtube.com/embed/football-video-id
Source: https://www.youtube.com/embed/basketball-video-id
Source: https://www.youtube.com/embed/tennis-video-id


In [10]:
import requests

def get_robots_txt(url, timeout=10):
    """Fetch the robots.txt file served at the root of ``url``.

    This function never raises: every failure is reported through the
    returned string so the caller can print it directly.

    Args:
        url: Base URL of the site, e.g. ``"https://en.wikipedia.org"``.
            A trailing slash is tolerated.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The robots.txt body as text, with any leading byte-order mark
        removed, when the server answers with status 200. Otherwise a string
        starting with ``"Error:"`` that describes the HTTP status code or
        the exception that occurred.
    """
    try:
        # Drop trailing slashes so "https://host/" does not turn into
        # "https://host//robots.txt".
        base_url = str(url).rstrip("/")
        robots_txt_url = f"{base_url}/robots.txt"

        response = requests.get(robots_txt_url, timeout=timeout)

        if response.status_code == 200:
            # Some servers prefix the file with a UTF-8 BOM; it survives
            # decoding as U+FEFF and would pollute the first line.
            return response.text.lstrip("\ufeff")
        return f"Error: Unable to retrieve robots.txt (Status Code: {response.status_code})"
    except Exception as e:
        # Deliberately broad: the contract is to report failures as text.
        return f"Error: {e}"

def main(url="https://en.wikipedia.org"):
    """Print the robots.txt of a site, preceded by a one-line heading.

    Args:
        url: Base URL of the site to query. Defaults to English Wikipedia,
            which is the site the notebook demonstrates.

    Returns:
        None. The heading and the text returned by ``get_robots_txt`` (the
        file body or an ``"Error: ..."`` message) are written to stdout.
    """
    robots_txt_content = get_robots_txt(url)

    # The trailing "\n" in the heading leaves a blank line before the body.
    print(f"Robots.txt for {url}:\n")
    print(robots_txt_content)

# Run the demo when this code is executed directly (a notebook cell or a
# script) but not when it is imported as a module.
if __name__ == "__main__":
    main()

Robots.txt for https://en.wikipedia.org:

﻿# robots.txt for http://www.wikipedia.org/ and friends
#
# Please note: There are a lot of pages on this site, and there are
# some misbehaved spiders out there that go _way_ too fast. If you're
# irresponsible, your access to the site may be blocked.
#

# Observed spamming large amounts of https://en.wikipedia.org/?curid=NNNNNN
# and ignoring 429 ratelimit responses, claims to respect robots:
# http://mj12bot.com/
User-agent: MJ12bot
Disallow: /

# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /

# Wikipedia work bots:
User-agent: IsraBot
Disallow:

User-agent: Orthogaffe
Disallow:

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.

In [12]:
import requests
from bs4 import BeautifulSoup

def extract_headers(url, timeout=10):
    """Fetch a page and print the text of every heading (h1-h6) it contains.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        None. Each heading is printed on its own line with surrounding
        whitespace stripped; a non-200 response prints an error line with
        the status code instead.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged when the
            request itself fails (connection error, timeout, invalid URL).
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: Unable to fetch content. Status Code: {response.status_code}")
        return

    # A local name distinct from the notebook-level `soup` avoids confusion
    # with the sample document parsed in the first cell.
    page_soup = BeautifulSoup(response.text, 'html.parser')

    # Passing a list matches any of the listed tag names, in document order.
    headers = page_soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

    for header in headers:
        print(header.text.strip())

# Page whose headings are listed by the call below.
wiki_main_page_url = 'https://en.wikipedia.org/wiki/Main_Page'

# extract_headers prints its findings itself, so nothing is captured here.
extract_headers(wiki_main_page_url)

Main Page
Welcome to Wikipedia
From today's featured article
Did you know ...
In the news
On this day
Today's featured picture
Other areas of Wikipedia
Wikipedia's sister projects
Wikipedia languages


In [13]:
import requests
from bs4 import BeautifulSoup

def check_page_title(url, timeout=10):
    """Download a page and report whether it contains a <title> element.

    Args:
        url: Address of the page to check.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        None. One line is printed: the title text when a <title> element
        exists, a notice when it does not, or an error message when the
        request fails or the server answers with an HTTP error status.
    """
    try:
        # Fetch the HTML content of the page
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad responses

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the title tag
        title_tag = soup.find('title')

        # find() returns None when no <title> exists; compare explicitly so
        # the check does not rely on the truthiness of a tag object.
        if title_tag is not None:
            print(f"The page has a title: {title_tag.text}")
        else:
            print("The page does not have a title.")

    except requests.exceptions.RequestException as e:
        # Covers connection failures, timeouts, malformed URLs and the
        # HTTP errors raised by raise_for_status().
        print(f"Error fetching the page: {e}")

# Run the interactive demo only when executed directly (notebook cell or
# script), not when imported as a module.
if __name__ == "__main__":
    # Example usage
    # input() blocks until a URL is typed, so this cell needs a person at the
    # keyboard and cannot complete in an unattended "Run All".
    url_to_check = input("Enter the URL of the page to check: ")
    check_page_title(url_to_check)


Enter the URL of the page to check: https://en.wikipedia.org/wiki/Main_Page
The page has a title: Wikipedia, the free encyclopedia
