CSC221 Final Project

In [27]:
#World..redacted
# CSC 221 Final Project
# Web Scraping and Data Visualization
# 11/21/2024

# IMPORT LIBRARIES AND MODULES FOR PROJECT:

import csv
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# TIME DELAY FOR HTML REQUEST TO AVOID TIMEOUT ERRORS
# NOTE: this call runs a single time, when this cell is executed. It is not
# inside any loop, so it does not insert a pause between individual requests.
time.sleep(2)  # Pause for 2 seconds (one time only)


In [28]:
# THIS IS THE SITE I WILL BE SCRAPING:
# NOTE: the value ends with '/', so joining it to a path with another '/'
# (or to a path that itself starts with '/') yields a double slash.
BASE_URL = "https://nostarch.com/"


In [29]:
# BUILDING A WEB SCRAPER:

def _listing_field_text(element, tag, class_pattern):
    """Return the stripped text of one field inside a listing element.

    Looks under ``element`` for the first ``tag`` whose class matches
    ``class_pattern``, then descends ``div.field-items`` ->
    ``div.field-item even``. Returns '' when any level is missing, so absent
    or differently structured markup never raises AttributeError.
    """
    wrapper = element.find(tag, class_=re.compile(class_pattern))
    if wrapper is None:
        return ''
    items = wrapper.find('div', class_='field-items')
    if items is None:
        return ''
    item = items.find('div', class_='field-item even')
    if item is None:
        return ''
    return item.text.strip()


def _listing_page_count(release_text):
    """Return the part of ``release_text`` after the first ', ', cut at the first '.'.

    Assumes the text looks like '<release date>, <pages> pp.' -- TODO confirm
    against the live markup. Returns '' when there is no ', ' separator
    instead of raising IndexError.
    """
    parts = release_text.split(', ')
    if len(parts) < 2:
        return ''
    return parts[1].split('.')[0].strip()


def scrape_book_page(url, timeout=10):
    """Scrape a catalog listing page and return one dict per book found.

    Args:
        url: Path of the listing page relative to BASE_URL
            (for example 'catalog/python').
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with the keys 'Title', 'URL', 'Author', 'ISBN' and
        'Number of Pages'. 'URL' is absolute. Fields that are not present in
        the listing markup are left as ''.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # urljoin copes with BASE_URL ending in '/' and with paths starting with
    # '/', so the result never contains a doubled slash.
    full_url = urljoin(BASE_URL, url)
    response = requests.get(full_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into zero books.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    books = []
    print(f"Full URL: {full_url}")

    book_elements = soup.find_all(['article', 'div'], class_=re.compile(r'node|product|teaser|clearfix'))
    print(f"Number of book elements found: {len(book_elements)}")

    for index, element in enumerate(book_elements, 1):
        print(f"\nProcessing book {index}:")

        # FIND THE H2 TAGS FOR BOOK LINK:
        h2_element = element.find('h2')
        if h2_element is None or h2_element.a is None:
            print(f"No h2 element found for book {index}")
            continue

        print(f"Found h2 element for book {index}")
        book_info = {
            'Title': h2_element.a.text.strip(),
            'URL': urljoin(BASE_URL, h2_element.a['href']),
            'Author': '',
            'ISBN': '',
            'Number of Pages': ''
        }
        print(f"Created book info dict for book {index}")

        # SCRAPE OUT AUTHOR INFO:
        author = _listing_field_text(element, 'div', r'field-name-field-author|author')
        if author:
            book_info['Author'] = author
            print(f"Added author to book {index}")

        # SCRAPE OUT ISBN INFO:
        isbn = _listing_field_text(element, 'span', r'field-name-field-isbn13|isbn')
        if isbn:
            book_info['ISBN'] = isbn
            print(f"Added ISBN to book {index}")

        # SCRAPE OUT NUMBER OF PAGES:
        release_text = _listing_field_text(element, 'div', r'field-name-released-date|released-date')
        num_pages = _listing_page_count(release_text)
        if num_pages:
            book_info['Number of Pages'] = num_pages
            print(f"Added number of pages to book {index}")

        # ADD ITEMS TO BOOK LIST
        books.append(book_info)
        print(f"Added book info to list for book {index}")

    print(f"Total books added: {len(books)}")
    return books


In [30]:
# DEFINE WHAT NEEDS TO HAPPEN WHEN BOOK LINK IS CLICKED:

def _detail_field_text(soup, tag, class_pattern):
    """Return the stripped text of one field on a book's detail page.

    Looks for the first ``tag`` whose class matches ``class_pattern``, then
    descends into a div matching 'field-items|field-container' and a div
    matching 'field-item|item'. Returns '' when any level is missing, so one
    absent field does not abort the remaining fields of the same book.
    """
    wrapper = soup.find(tag, class_=re.compile(class_pattern))
    if wrapper is None:
        return ''
    items = wrapper.find('div', class_=re.compile(r'field-items|field-container'))
    if items is None:
        return ''
    item = items.find('div', class_=re.compile(r'field-item|item'))
    if item is None:
        return ''
    return item.text.strip()


def _detail_page_count(release_text):
    """Return the part of ``release_text`` after the first ',', cut at the first '.'.

    Assumes the text looks like '<release date>, <pages> pp.' -- TODO confirm
    against the live markup. Surrounding whitespace is stripped. Returns ''
    when there is no ',' instead of raising IndexError.
    """
    parts = release_text.split(',')
    if len(parts) < 2:
        return ''
    return parts[1].split('.')[0].strip()


def scrape_book_details(books, timeout=10):
    """Fill in author, page count and ISBN from each book's own page.

    Args:
        books: List of dicts with at least 'Title' and 'URL'. 'URL' may be
            absolute or relative to the site root. The dicts are updated in
            place.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        The same ``books`` list. A field is overwritten only when a non-empty
        value was found on the detail page. A failure for one book is printed
        and the loop moves on to the next book.
    """
    site_root = 'https://nostarch.com'

    for book in books:
        try:
            # urljoin returns an already-absolute URL unchanged, so the site
            # root is never prefixed a second time.
            page_info_url = urljoin(site_root, book['URL'])
            page_response = requests.get(page_info_url, timeout=timeout)
            # Report HTTP errors for this book rather than parsing an error page.
            page_response.raise_for_status()
            page_soup = BeautifulSoup(page_response.content, 'html.parser')

            # CODE TO GET AUTHOR DETAILS:
            author = _detail_field_text(page_soup, 'span', r'field-name-field-author|author')
            if author:
                book['Author'] = author

            # CODE TO GET PAGE AMOUNT:
            release_text = _detail_field_text(page_soup, 'div', r'field-name-released-date|released-date')
            num_pages = _detail_page_count(release_text)
            if num_pages:
                book['Number of Pages'] = num_pages

            # CODE TO GET ISBN-13:
            isbn = _detail_field_text(page_soup, 'div', r'field-name-field-isbn13|isbn')
            if isbn:
                book['ISBN'] = isbn

        except Exception as e:  # e for error
            # Best effort: keep whatever the listing page provided for this book.
            print(f"Error scraping details for {book.get('Title')}: {e}")
    return books
            

            

In [31]:
# FORMAT FOR CSV FILE:

def save_to_csv(books, filename='python_books.csv'):
    """Write the scraped books to a UTF-8 CSV file with a header row.

    Args:
        books: Iterable of dicts. Only the keys 'Title', 'Author', 'ISBN' and
            'Number of Pages' are written; any other keys (such as 'URL') are
            ignored.
        filename: Path of the CSV file to create or overwrite. Defaults to
            'python_books.csv'.

    A key that is missing, or whose value is None or empty, is written as an
    empty cell rather than raising KeyError.
    """
    fieldnames = ['Title', 'Author', 'ISBN', 'Number of Pages']
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(
            {name: book.get(name) or '' for name in fieldnames}
            for book in books
        )

In [32]:
# DEFINE MAIN:

def main():
    """Run the whole pipeline: scrape the listing, enrich each book, save the CSV.

    The 'catalog/python' listing is scraped first, every book found there is
    then completed from its own page, and the result is written with
    save_to_csv before a completion message is printed.
    """
    catalog_path = 'catalog/python'

    listed_books = scrape_book_page(catalog_path)
    detailed_books = scrape_book_details(listed_books)
    save_to_csv(detailed_books)

    print('Scraping complete! Go check your new csv file, WORLD!!!')

# Run the scraper when this code is executed directly. Any exception raised by
# main() is caught here and reported as a single printed line, so no traceback
# is shown.
if __name__ == '__main__':
    try:
        main()
    except Exception as e: # e for error
        print(f'An error occured: {str(e)}')

Full URL: https://nostarch.com//catalog/python
Number of book elements found: 40

Processing book 1:
Found h2 element for book 1
Created book info dict for book 1
Added author to book 1
Added book info to list for book 1

Processing book 2:
Found h2 element for book 2
Created book info dict for book 2
Added author to book 2
Added book info to list for book 2

Processing book 3:
Found h2 element for book 3
Created book info dict for book 3
Added author to book 3
Added book info to list for book 3

Processing book 4:
Found h2 element for book 4
Created book info dict for book 4
Added author to book 4
Added book info to list for book 4

Processing book 5:
Found h2 element for book 5
Created book info dict for book 5
Added author to book 5
Added book info to list for book 5

Processing book 6:
Found h2 element for book 6
Created book info dict for book 6
Added author to book 6
Added book info to list for book 6

Processing book 7:
Found h2 element for book 7
Created book info dict for book