In [None]:
# %pip (not !pip) installs into the environment of the running kernel.
%pip install requests beautifulsoup4 pandas



# Exercise 1: Parsing HTML with BeautifulSoup

In [1]:
from bs4 import BeautifulSoup
import json


# Static sample page used as parser input for this exercise (no network access needed).
# It contains a <title>, four <p> elements, three in-page <a href="#..."> links in the
# <nav>, three <section> blocks with embedded <iframe> videos, and a contact <form>.
html_content = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sports World</title>
    <style>
        body { font-family: Arial, sans-serif; }
        header, nav, section, article, footer { margin: 20px; padding: 15px; }
        nav { background-color: #333; }
        nav a { color: white; padding: 14px 20px; text-decoration: none; display: inline-block; }
        nav a:hover { background-color: #ddd; color: black; }
        .video { text-align: center; margin: 20px 0; }
    </style>
</head>
<body>

    <header>
        <h1>Welcome to Sports World</h1>
        <p>Your one-stop destination for the latest sports news and videos.</p>
    </header>

    <nav>
        <a href="#football">Football</a>
        <a href="#basketball">Basketball</a>
        <a href="#tennis">Tennis</a>
    </nav>

    <section id="football">
        <h2>Football</h2>
        <article>
            <h3>Latest Football News</h3>
            <p>Read about the latest football matches and player news.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/football-video-id" frameborder="0" allowfullscreen>
                </iframe>
            </div>
        </article>
    </section>

    <section id="basketball">
        <h2>Basketball</h2>
        <article>
            <h3>NBA Highlights</h3>
            <p>Watch highlights from the latest NBA games.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/basketball-video-id" frameborder="0" allowfullscreen>
                </iframe>
            </div>
        </article>
    </section>

    <section id="tennis">
        <h2>Tennis</h2>
        <article>
            <h3>Grand Slam Updates</h3>
            <p>Get the latest updates from the world of Grand Slam tennis.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/tennis-video-id" frameborder="0" allowfullscreen></iframe>
            </div>
        </article>
    </section>

    <footer>
        <form action="mailto:contact@sportsworld.com" method="post" enctype="text/plain">
            <label for="name">Name:</label><br>
            <input type="text" id="name" name="name"><br>
            <label for="email">Email:</label><br>
            <input type="email" id="email" name="email"><br>
            <label for="message">Message:</label><br>
            <textarea id="message" name="message" rows="4" cols="50"></textarea><br><br>
            <input type="submit" value="Send">
        </form>
    </footer>

</body>
</html>
'''

# Build the parse tree with Python's built-in HTML parser backend.
soup = BeautifulSoup(html_content, 'html.parser')

# Page title, with a placeholder when the document has no <title> element.
if soup.title:
    title = soup.title.text
else:
    title = "No title found"

# Text of every <p>, and the target of every <a> that actually carries an href.
paragraphs = [tag.text for tag in soup.find_all('p')]
links = [anchor['href'] for anchor in soup.find_all('a', href=True)]

# Bundle the three findings, keeping the key order title -> paragraphs -> links.
result = dict(title=title, paragraphs=paragraphs, links=links)

# Emit the bundle as pretty-printed JSON.
print(json.dumps(result, indent=4))


{
    "title": "Sports World",
    "paragraphs": [
        "Your one-stop destination for the latest sports news and videos.",
        "Read about the latest football matches and player news.",
        "Watch highlights from the latest NBA games.",
        "Get the latest updates from the world of Grand Slam tennis."
    ],
    "links": [
        "#football",
        "#basketball",
        "#tennis"
    ]
}


# Exercise 2 : Scraping robots.txt from Wikipedia

In [3]:
import requests

wikipedia_robots_url = "https://en.wikipedia.org/robots.txt"

try:
    # Fetch robots.txt. The timeout keeps the cell from hanging forever on a
    # stalled connection; a timeout is reported through the handler below.
    response = requests.get(wikipedia_robots_url, timeout=10)
    # Fail on 4xx/5xx instead of printing an error page as if it were robots.txt.
    response.raise_for_status()

    print("Content of robots.txt:")
    # The recorded output starts with a byte-order mark (U+FEFF); strip it so
    # the first line of the file prints cleanly.
    print(response.text.lstrip("\ufeff"))

except requests.exceptions.RequestException as e:
    # Covers connection errors, timeouts, and the HTTPError raised above.
    print(f"An error occurred: {e}")


Content of robots.txt:
﻿# robots.txt for http://www.wikipedia.org/ and friends
#
# Please note: There are a lot of pages on this site, and there are
# some misbehaved spiders out there that go _way_ too fast. If you're
# irresponsible, your access to the site may be blocked.
#

# Observed spamming large amounts of https://en.wikipedia.org/?curid=NNNNNN
# and ignoring 429 ratelimit responses, claims to respect robots:
# http://mj12bot.com/
User-agent: MJ12bot
Disallow: /

# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /

# Wikipedia work bots:
User-agent: IsraBot
Disallow:

User-agent: Orthogaffe
Disallow:

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Di

#  Exercise 3 : Extracting Headers from Wikipedia’s Main Page

In [4]:
import requests
from bs4 import BeautifulSoup

# Define the URL for Wikipedia's main page
url = "https://en.wikipedia.org/wiki/Main_Page"

try:
    # The timeout keeps the cell from hanging forever on a stalled connection;
    # a timeout is a RequestException and is reported by the handler below.
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # Check for HTTP errors

    soup = BeautifulSoup(response.text, 'html.parser')
    # Passing a list of names makes find_all match any heading level h1-h6.
    header_tags = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    print("Header Tags from Wikipedia's Main Page:")
    for header in header_tags:
        # One line per heading: tag name, then its text with outer whitespace removed.
        print(f"{header.name}: {header.text.strip()}")

except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")


Header Tags from Wikipedia's Main Page:
h1: Main Page
h1: Welcome to Wikipedia
h2: From today's featured article
h2: Did you know ...
h2: In the news
h2: On this day
h2: From today's featured list
h2: Today's featured picture
h2: Other areas of Wikipedia
h2: Wikipedia's sister projects
h2: Wikipedia languages


#  Exercise 4 : Checking for Page Title

In [5]:
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Main_Page"

try:
    # The timeout keeps the cell from hanging forever on a stalled connection;
    # a timeout is a RequestException and is reported by the handler below.
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # Turn 4xx/5xx responses into an exception
    soup = BeautifulSoup(response.text, 'html.parser')

    # soup.title is None when the document has no <title> element.
    if soup.title:
        print(soup.title.text)
    else:
        print("The page does not contain a title.")

except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")


Wikipedia, the free encyclopedia


In [None]:
# %pip (not !pip) installs into the environment of the running kernel.
%pip install selenium

Collecting selenium
  Downloading selenium-4.27.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.27.1-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.27.0-py3-none-any.whl (481 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.7/481.7 kB[0m [31m35.1 MB/s

In [None]:
# %pip (not !pip) installs into the environment of the running kernel.
%pip install webdriver-manager


Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-dotenv (from webdriver-manager)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, webdriver-manager
Successfully installed python-dotenv-1.0.1 webdriver-manager-4.0.2


# Exercise 5 : Analyzing US-CERT Security Alerts

In [None]:
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# First results page of the advisories listing. The query string carries two
# filters (URL-encoded): advisory_type:93 and release_date_year:2024 -- i.e. the
# listing restricted to alerts released in 2024.
base_url = (
    "https://www.cisa.gov/news-events/cybersecurity-advisories"
    "?f%5B0%5D=advisory_type%3A93"
    "&f%5B1%5D=release_date_year%3A2024"
)

# Running count of alerts across all scraped pages (updated by scrape_page).
total_alerts = 0

# Function to extract the "Next" page URL
def extract_next_page(soup, base_url):
    """Return the absolute URL of the next results page, or None if there is none.

    Args:
        soup: Parsed page; searched for a <span aria-hidden="true"> whose text
            is exactly "Next" and that sits inside an <a> element.
        base_url: URL the page's links are resolved against.

    Returns:
        The href of the enclosing <a>, resolved against ``base_url``, or None
        when the span, its parent <a>, or a non-empty href is missing.
    """
    # Locate the "Next" button label.
    next_button = soup.find('span', attrs={"aria-hidden": "true"}, string="Next")
    if next_button is None:
        return None

    # The link target lives on the <a> that wraps the label.
    parent_a_tag = next_button.find_parent('a')
    if parent_a_tag is None:
        return None

    next_href = parent_a_tag.get('href')
    if not next_href:
        return None

    # urljoin resolves every href form correctly: a query-only href ("?...&page=1")
    # replaces the base URL's query, while a path-absolute or fully absolute href
    # is used as-is instead of being glued onto the base path.
    return urljoin(base_url, next_href)

# Function to scrape a single page and count alerts
def scrape_page(url, timeout=10):
    """Fetch one listing page, add its alert count to ``total_alerts``, and return the parse tree.

    Args:
        url: Page to download.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook.

    Returns:
        The parsed page, or None when the download failed (connection error,
        timeout, or HTTP error status); the failure is printed.
    """
    global total_alerts
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Only download failures are reported as fetch errors; anything else
        # is a programming error and should surface normally.
        print(f"Error fetching page {url}: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all alert containers (<article> elements with the relevant class).
    # NOTE(review): the class string contains a space ('c -teaser'); a value with
    # whitespace is matched against the whole class attribute, so confirm this is
    # the intended selector and not a typo for 'c-teaser'.
    alert_containers = soup.find_all('article', class_='c -teaser')  # Adjust class if necessary
    count_on_page = len(alert_containers)
    total_alerts += count_on_page

    print(f"Scraping: {url} - Found {count_on_page} alerts on this page.")
    return soup

# Function to scrape all pages and count total alerts
def scrape_all_pages():
    """Walk the paginated listing from ``base_url`` and accumulate ``total_alerts``.

    The counter is reset first, so calling this function again recounts from
    zero instead of adding to the previous run. The walk stops when a page
    cannot be scraped, when there is no "Next" link, or when the "Next" link
    points at a page that was already visited.
    """
    global total_alerts
    total_alerts = 0
    next_page = base_url
    page_number = 0
    visited = set()  # URLs already scraped; guards against a cyclic "Next" link

    while next_page and next_page not in visited:
        visited.add(next_page)
        page_number += 1
        print(f"\nScraping page {page_number}...")

        # Scrape the current page
        soup = scrape_page(next_page)
        if soup is None:
            break  # Stop if the page couldn't be scraped

        # Extract the next page URL
        next_page = extract_next_page(soup, base_url)
        if next_page:
            print(f"Next page URL: {next_page}")
            # Wait 2 seconds between requests; no wait is needed after the last page.
            time.sleep(2)

# Run the scraper: starts at base_url and follows the listing page by page;
# scrape_page adds each page's count to the module-level total_alerts.
scrape_all_pages()

# Print the total number of alerts for 2024 (the year comes from the filter in base_url)
print(f"\nTotal Alerts for 2024: {total_alerts}")










Scraping page 1...
Scraping: https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A2024 - Found 10 alerts on this page.
Next page URL: https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A2024&page=1

Scraping page 2...
Scraping: https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A2024&page=1 - Found 10 alerts on this page.
Next page URL: https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A2024&page=2

Scraping page 3...
Scraping: https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A2024&page=2 - Found 10 alerts on this page.
Next page URL: https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A2024&page=3

Scraping page

# Exercise 6 : Scraping Hockey teams details

In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


url = "https://www.scrapethissite.com/pages/forms/"
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=10)
if response.status_code != 200:
    # Stop here: parsing an error page would produce an empty table and
    # overwrite hockey_teams.csv with it.
    raise RuntimeError(f"Failed to retrieve the page (HTTP {response.status_code})")
print("Page retrieved successfully")

soup = BeautifulSoup(response.text, 'html.parser')


def _column_text(soup, css_class, tag=None, default=None):
    """Return the stripped text of every element carrying ``css_class``.

    Args:
        soup: Parsed page to search.
        css_class: CSS class identifying the column's cells.
        tag: Optional tag name to restrict the match to (None matches any tag).
        default: Optional replacement for cells whose text is empty.

    Returns:
        A list of strings, one per matching element.
    """
    texts = [cell.get_text().strip() for cell in soup.find_all(tag, class_=css_class)]
    if default is not None:
        texts = [text or default for text in texts]
    return texts


# Extracting data for each column. Numeric columns substitute "0" for empty
# cells so the integer/float conversion below cannot fail on a blank string.
team_names = _column_text(soup, "name")
years = _column_text(soup, "year")
wins = _column_text(soup, "wins", default="0")
losses = _column_text(soup, "losses", default="0")
ot_losses = _column_text(soup, "ot-losses", default="0")
goals_for = _column_text(soup, "gf", default="0")
goals_against = _column_text(soup, "ga", default="0")
goal_diff = _column_text(soup, "diff", default="0")
win_percent = _column_text(soup, "pct", tag="td", default="0")  # restricted to <td> cells

columns = {
    "Team Name": team_names,
    "Year": years,
    "Wins": wins,
    "Losses": losses,
    "OT Losses": ot_losses,
    "Win Percent": win_percent,
    "Goals For": goals_for,
    "Goals Against": goals_against,
    "Goal Difference": goal_diff,
}

# The columns are collected independently, so one unmatched cell would shift a
# column against the others. Report which column is off before building the frame.
column_lengths = {name: len(values) for name, values in columns.items()}
if len(set(column_lengths.values())) != 1:
    raise ValueError(f"Scraped columns have different lengths: {column_lengths}")

# Consolidating into a DataFrame and converting numeric columns to appropriate types
data = pd.DataFrame(columns).astype({
    "Wins": int,
    "Losses": int,
    "OT Losses": int,
    "Win Percent": float,
    "Goals For": int,
    "Goals Against": int,
    "Goal Difference": int,
})

print(soup.title.text)


data.to_csv("hockey_teams.csv", index=False)
print("Data saved to hockey_teams.csv")

# Print the first few rows of the DataFrame
print(data.head())







Page retrieved successfully
Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping
Data saved to hockey_teams.csv
            Team Name  Year  Wins  Losses  OT Losses  Win Percent  Goals For  \
0       Boston Bruins  1990    44      24          0        0.550        299   
1      Buffalo Sabres  1990    31      30          0        0.388        292   
2      Calgary Flames  1990    46      26          0        0.575        344   
3  Chicago Blackhawks  1990    49      23          0        0.613        284   
4   Detroit Red Wings  1990    34      38          0        0.425        273   

   Goals Against  Goal Difference  
0            264               35  
1            278               14  
2            263               81  
3            211               73  
4            298              -25  


# New Section