In [15]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Function to extract product information from a single page
def extract_products_from_page(url):
    # Send a GET request to the page
    response = requests.get(url)
    
    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Empty list to hold product details
    products = []
    
    # Find all product containers (each book is in an article tag with class 'product_pod')
    product_containers = soup.find_all('article', class_='product_pod')
    
    for container in product_containers:
        # Extract product name (book title is inside 'h3' tag)
        name = container.find('h3').find('a')['title']
        
        # Extract product price (price is in a 'p' tag with class 'price_color')
        price = container.find('p', class_='price_color').text.strip()
        
        # Extract product link (link to the book details page is inside the 'a' tag within 'h3')
        link = container.find('h3').find('a')['href']
        
        # Construct full link if needed
        full_link = f'http://books.toscrape.com/catalogue/{link}' if link.startswith('catalogue') else f'http://books.toscrape.com/{link}'
        
        # Append the product details to the products list
        products.append({
            'Name': name,
            'Price': price,
            'Link': full_link
        })
    
    return products

# Main function to crawl multiple pages
def crawl_ecommerce_website(start_url, pages_to_crawl=5):
    all_products = []
    
    for page in range(1, pages_to_crawl + 1):
        # Construct the URL for the specific page
        url = f'{start_url}catalogue/page-{page}.html'
        print(f'Crawling page {page}: {url}')
        
        # Extract product data from the page
        products = extract_products_from_page(url)
        
        # Add the products to the master list
        all_products.extend(products)
    
    return all_products

# Example usage
def main():
    # Start URL of the e-commerce website's product listing page
    start_url = 'http://books.toscrape.com/'
    
    # Crawl 5 pages of the website
    products = crawl_ecommerce_website(start_url, pages_to_crawl=5)
    
    # Convert the list of products into a pandas DataFrame
    df = pd.DataFrame(products)
    
    # Output the DataFrame to a CSV file
    df.to_csv('lab_5/products.csv', index=False)
    
    print("Web crawling completed. Data saved to 'products.csv'.")

# Run the main function
if __name__ == '__main__':
    main()


Crawling page 1: http://books.toscrape.com/catalogue/page-1.html
Crawling page 2: http://books.toscrape.com/catalogue/page-2.html
Crawling page 3: http://books.toscrape.com/catalogue/page-3.html
Crawling page 4: http://books.toscrape.com/catalogue/page-4.html
Crawling page 5: http://books.toscrape.com/catalogue/page-5.html
Web crawling completed. Data saved to 'products.csv'.


In [16]:
import pandas as pd

# Path to your CSV file
file_path = 'C:/Users/ameyp/IR in AI Lab/lab_5/products.csv'

try:
    # Try to read the CSV file
    data = pd.read_csv(file_path)

    # Check if the file is empty
    if data.empty:
        print("The CSV file is empty.")
    else:
        # Display the first five rows
        print(data.head())

except pd.errors.EmptyDataError:
    print("Error: The CSV file is empty or does not contain any valid data.")
except FileNotFoundError:
    print(f"Error: The file at path '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")


                                    Name    Price  \
0                   A Light in the Attic  Â£51.77   
1                     Tipping the Velvet  Â£53.74   
2                             Soumission  Â£50.10   
3                          Sharp Objects  Â£47.82   
4  Sapiens: A Brief History of Humankind  Â£54.23   

                                                Link  
0  http://books.toscrape.com/a-light-in-the-attic...  
1  http://books.toscrape.com/tipping-the-velvet_9...  
2  http://books.toscrape.com/soumission_998/index...  
3  http://books.toscrape.com/sharp-objects_997/in...  
4  http://books.toscrape.com/sapiens-a-brief-hist...  
