# Beautiful Soup Practice Questions

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Question 1: Write a python program to display all the header tags from wikipedia.org.

In [5]:
#Solution

import requests
from bs4 import BeautifulSoup

# Make a GET request to the Wikipedia homepage
response = requests.get("https://www.wikipedia.org/")

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Find all header tags (h1, h2, h3, h4, h5, h6)
header_tags = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

# Display the header tags with their text content
for tag in header_tags:
    print(f"{tag.name}: {tag.text.strip()}")

h1: Wikipedia

The Free Encyclopedia
h2: 1,000,000+


articles
h2: 100,000+


articles
h2: 10,000+


articles
h2: 1,000+


articles
h2: 100+


articles


In [7]:
#Question 2: Write a python program to display IMDB’s Top rated 100 movies’ data (i.e. name, rating, year of release) and make data frame.

In [11]:
# Solution
import requests
import pandas as pd
from bs4 import BeautifulSoup

# URL for IMDB's Top 250 movies
url = "https://www.imdb.com/search/title/?groups=top_100&sort=user_rating,desc"

# Retrieve the page content
response = requests.get(url)

# Check if the request was successful
if response.status_code != 200:
    print("Failed to retrieve the page. Please check the URL or your connection.")
else:
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the table containing the top 100 movies
    movie_table = soup.find("tbody", class_="lister-list")

    if movie_table is None:
        print("Failed to find the movie table. The page structure may have changed.")
    else:
        # Lists to store the extracted data
        movies = []
        ratings = []
        years = []

        # Extract movie details from the table rows
        for row in movie_table.find_all("tr")[:100]:  # limiting to top 100
            # Get the title of the movie
            title = row.find("td", class_="titleColumn").a.text
            # Get the rating
            rating = row.find("td", class_="ratingColumn").strong.text
            # Get the year of release (strip parentheses)
            year = row.find("span", class_="secondaryInfo").text.strip("()")

            # Append the details to the respective lists
            movies.append(title)
            ratings.append(rating)
            years.append(year)

        # Create a DataFrame with the collected data
        imdb_top_100 = pd.DataFrame({
            "Movie Name": movies,
            "Rating": ratings,
            "Year of Release": years
        })

        # Display the DataFrame
        print(imdb_top_100)

Failed to retrieve the page. Please check the URL or your connection.


In [12]:
#Question 3: Write a python program to scrape mentioned details from dineout.co.in : i) Restaurant name ii) Cuisine iii) Location iv) Ratings v) Image URL.

In [13]:
# Solution

import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the website to scrape
url = "https://www.dineout.co.in/delhi-restaurants"  # Modify the URL if needed

# Send a GET request to the website
response = requests.get(url)

# Check if the request was successful
if response.status_code != 200:
    print("Failed to retrieve the page. Please check the URL or your connection.")
else:
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all the restaurant containers
    restaurant_cards = soup.find_all("div", class_="restnt-card restaurant")

    # Lists to store the extracted data
    names = []
    cuisines = []
    locations = []
    ratings = []
    image_urls = []

    # Loop through each restaurant card and extract details
    for card in restaurant_cards:
        # Restaurant name
        name = card.find("div", class_="restnt-info").a.text.strip()
        names.append(name)

        # Cuisine
        cuisine = card.find("span", class_="double-line-ellipsis").text.split('|')[1].strip()
        cuisines.append(cuisine)

        # Location
        location = card.find("div", class_="restnt-loc ellipsis").text.strip()
        locations.append(location)

        # Rating
        rating_tag = card.find("div", class_="restnt-rating rating-4")
        rating = rating_tag.text.strip() if rating_tag else "N/A"  # Some might not have ratings
        ratings.append(rating)

        # Image URL
        image_url = card.find("img", class_="no-img")['data-src']
        image_urls.append(image_url)

    # Create a DataFrame with the extracted data
    restaurants_df = pd.DataFrame({
        "Restaurant Name": names,
        "Cuisine": cuisines,
        "Location": locations,
        "Ratings": ratings,
        "Image URL": image_urls
    })

    # Display the DataFrame
    print(restaurants_df)

                             Restaurant Name  \
0                  Delia My Bar Headquarters   
1                        My Bar Headquarters   
2                                    Berco's   
3                              The G.T. Road   
4                                 Article 21   
5                           Minar Restaurant   
6                        Unplugged Courtyard   
7                               Punjab Grill   
8                                      Slice   
9                                    Nando's   
10                            Hard Rock Cafe   
11                        Kwality Restaurant   
12                                    Subway   
13                                   Fa Yian   
14                                  Cafe MRP   
15                                 Oddly Pub   
16                                     Local   
17                             Kill The Bill   
18                               The Embassy   
19  Desi Villagio - Village Theme Restro

In [14]:
#Question 4: Write s python program to display list of respected former finance minister of India(i.e. Name , Term of office) from https://presidentofindia.nic.in/former-presidents.htm and make data frame.


In [15]:
# Solution
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL for the list of former Presidents of India
url = "https://presidentofindia.nic.in/former-presidents.htm"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code != 200:
    print("Failed to retrieve the page. Please check the URL or your connection.")
else:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the list of former Presidents
    presidents_section = soup.find("div", class_="presidentListing")

    # Lists to store the extracted data
    names = []
    terms = []

    # Find all presidents in the section
    if presidents_section:
        president_entries = presidents_section.find_all("div", class_="presidentListing")

        for entry in president_entries:
            # Extract the name
            name = entry.find("div", class_="presidentName").text.strip()
            names.append(name)

            # Extract the term of office
            term = entry.find("div", class_="presidentTerm").text.strip()
            terms.append(term)

        # Create a DataFrame with the collected data
        presidents_df = pd.DataFrame({
            "Name": names,
            "Term of Office": terms
        })

        # Display the DataFrame
        print(presidents_df)
    else:
        print("Could not find the list of former Presidents on the page.")

Failed to retrieve the page. Please check the URL or your connection.
