### Activity 7.01: Extracting the Top 100 e-books from Gutenberg

In [48]:
# Import the necessary libraries
import requests
from bs4 import BeautifulSoup
import re

# Ignore all Warnings
import warnings
warnings.filterwarnings('ignore')

In [49]:
# Function for Read the HTML from the URL and check status
def get_html(url):
    response = requests.get(url)
    if response.status_code == 200:
        print("Successfully fetched the page.")
        return response.text 
    else:
        print(f"Failed to fetch page. Status code: {response.status_code}")
        return None

In [50]:
# Function for Extract href links
def extract_links(html):
    soup = BeautifulSoup(html, 'html.parser')
    links = []
    for tag in soup.find_all('a', href=True):
        links.append(tag['href'])
    return soup,links

In [51]:
#Display first 30 href links
url = "https://www.gutenberg.org/browse/scores/top"
html = get_html(url)
if html:
    soup, href_links = extract_links(html)
    print("First 30 href links:")
    for link in href_links[:30]:
        print(link)

Successfully fetched the page.
First 30 href links:
/
/about/
/about/
/policy/collection_development.html
/about/contact_information.html
/about/background/
/policy/permission.html
/policy/privacy_policy.html
/policy/terms_of_use.html
/ebooks/
/ebooks/
/ebooks/categories
/ebooks/bookshelf/
/browse/scores/top
/ebooks/offline_catalogs.html
/help/
/help/
/help/copyright.html
/help/errata.html
/help/file_formats.html
/help/faq.html
/policy/
/help/public_domain_ebook_submission.html
/help/submitting_your_own_work.html
/help/mobile.html
/attic/
/donate/
/donate/
pretty-pictures
#books-last1


In [52]:
# Use regex to extract numeric digits (file numbers)
def extract_file_numbers(links):
    file_numbers = []
    for link in links:
        matches = re.findall(r'/ebooks/(\d+)', link)
        file_numbers.extend(matches)
    return list(set(file_numbers))

In [53]:
# Print first 2000 characters of soup.text
def print_soup_text(soup):
    print("\nFirst 2000 characters of page text:\n")
    print(soup.text[:2000])

In [54]:
if html:
    soup, links = extract_links(html)
    file_numbers = extract_file_numbers(links)
    print(f"\nFound {len(file_numbers)} unique file numbers. Sample:")
    print(file_numbers[:10])
    print_soup_text(soup)


Found 116 unique file numbers. Sample:
['25344', '4363', '110', '26184', '6130', '37106', '7370', '1497', '64317', '1080']

First 2000 characters of page text:





Top 100 | Project Gutenberg



























Menu▾



About
          ▾

▾


About Project Gutenberg
Collection Development
Contact Us
History & Philosophy
Permissions & License
Privacy Policy
Terms of Use



Search and Browse
      	  ▾

▾


Book Search
Main Categories
Bookshelves
Frequently Downloaded
Offline Catalogs



Help
          ▾

▾


All help topics →
Copyright How-To
Errata, Fixes and Bug Reports
File Formats
Frequently Asked Questions
Policies →
Public Domain eBook Submission
Submitting Your Own Work
Tablets, Phones and eReaders
The Attic →


Donate










Ways to donate







To determine the ranking we count the times each file gets downloaded.
Both HTTP and FTP transfers are counted.
Only transfers from ibiblio.org are counted as we have no access to our mirrors log files.
Multiple downloads f

In [None]:
# Locate the 'Top 100 EBooks yesterday' section
header = soup.find('h2', id='books-last1', string="Top 100 EBooks yesterday")

# Get the ordered list <ol> containing book entries
book_list = header.find_next_sibling('ol') if header else None

# Extract and clean titles using regex
titles = []
if book_list:
    for li in book_list.find_all('li'):
        text = li.get_text(strip=True)
        # Use regex to extract text before ()
        match = re.match(r'^(.+?)(?:\s*\([^)]*\))?$', text)
        if match:
            start, end = match.span(1)
            titles.append(text[start:end])
        else:
            titles.append(text)  # fallback if no match

# Print the cleaned titles
for i, title in enumerate(titles, 1):
    print(f"{i}. {title}")


1. Frankenstein; Or, The Modern Prometheus by Mary Wollstonecraft Shelley
2. Moby Dick; Or, The Whale by Herman Melville
3. Pride and Prejudice by Jane Austen
4. A Doll's House : a play by Henrik Ibsen
5. The Great Gatsby by F. Scott  Fitzgerald
6. Romeo and Juliet by William Shakespeare
7. Alice's Adventures in Wonderland by Lewis Carroll
8. The Importance of Being Earnest: A Trivial Comedy for Serious People by Oscar Wilde
9. The Strange Case of Dr. Jekyll and Mr. Hyde by Robert Louis Stevenson
10. The Picture of Dorian Gray by Oscar Wilde
11. Middlemarch by George Eliot
12. Dracula by Bram Stoker
13. The Complete Works of William Shakespeare by William Shakespeare
14. A Room with a View by E. M.  Forster
15. Little Women; Or, Meg, Jo, Beth, and Amy by Louisa May Alcott
16. Crime and Punishment by Fyodor Dostoyevsky
17. The Blue Castle: a novel by L. M.  Montgomery
18. Simple Sabotage Field Manual by United States. Office of Strategic Services
19. The Enchanted April by Elizabeth Von