In [2]:
import time
import os
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [6]:
def extract_text(detail, tag, class_name, default=""):
    """
    :detail: BeautifulSoup object of the page
    :tag: HTML tag to search for
    :class_name: Class name to search for
    :default: Default value to return if element is not found
    :return: Extracted text or default value
    """
    try:
        element = detail.find(tag, attrs={"class": class_name})
        return element.text.strip() if element else default
    except Exception as e:
        return default

def get_title(detail):
    """
    Extracts the book title from the page.
    """
    return extract_text(detail, "span", "VU-ZEz")

def get_price(detail):
    """
    Extracts the book price from the page.
    """
    return extract_text(detail, "div", "Nx9bqj CxhGGd")

def get_author(detail):
    """
    Extracts the book author from the page.
    """
    return extract_text(detail, "a", "yN+eNk w9jEaj azBkHf")

def get_desc(detail):
    """
    Extracts the book description from the page.
    """
    return extract_text(detail, "div", "yN+eNk w9jEaj")

def get_highlight(detail, highlight):
    """
    :detail: BeautifulSoup object of the page
    :highlight: Keyword to search for (e.g., "Publisher", "Pages")
    :return: Extracted text of the highlight or empty string if not found
    """
    try:
        for item in detail.find_all("li", attrs={"class": "_7eSDEz"}):
            if highlight.lower() in item.text.strip().lower():
                return item.text.strip()
    except Exception as e:
        return ""
    return ""

def get_services(detail, service_name):
    """
    Extracts service-related information such as payment or return policy.
    
    :param detail: BeautifulSoup object of the page
    :param service_name: Name of the service (e.g., "Cash", "Policy")
    :return: Extracted service text or empty string if not found
    """
    try:
        for item in detail.find_all("div", attrs={"class": "YhUgfO"}):
            if service_name.lower() in item.text.strip().lower():
                return item.text.strip()
    except Exception as e:
        return ""
    return ""

def get_rating(detail):
    """
    Extracts the book rating from the page.
    """
    return extract_text(detail, "div", "ipqd2A")

In [5]:
# Check if the link is valid and accessible
def check_link(new_link):
    try:
        new_webpage = requests.get(new_link, user_agent)
        if new_webpage.status_code == 200:
            return True
        return False
    except:
        return False

# Define a function to scrape data for a single book
def get_data(book_link):
    dic = {"title": [], "price": [], "author": [], 
           "description": [], "Publisher": [], "Pages": [], 
           "Edition": [], "Genre": [], "Language": [], "Binding": [], 
           "Return Policy": [], "Payment Policy": [], "Rating": []}
    
    try:
        response = requests.get(book_link, user_agent)
        soup = BeautifulSoup(response.content, "html.parser")
        
        dic["title"].append(get_title(soup))
        dic["price"].append(get_price(soup))
        dic["author"].append(get_author(soup))
        dic["description"].append(get_desc(soup))
        dic["Publisher"].append(get_highlights(soup, "Publisher"))
        dic["Pages"].append(get_highlights(soup, "Pages"))
        dic["Edition"].append(get_highlights(soup, "Edition"))
        dic["Genre"].append(get_highlights(soup, "Genre"))
        dic["Language"].append(get_highlights(soup, "Language"))
        dic["Binding"].append(get_highlights(soup, "Binding"))
        dic["Return Policy"].append(get_services(soup, "Policy"))
        dic["Payment Policy"].append(get_services(soup, "Cash"))
        dic["Rating"].append(get_rating(soup))
        
    except Exception as e:
        print(f"Error fetching data from {book_link}: {e}")
    
    return dic

def read_user_agent(file_path):
    try:
        with open(file_path, "r") as file:
            user_agent = file.read().strip() 
        return user_agent
    except Exception as e:
        print(f"Error reading user agent from file: {e}")
        return None
        
if __name__ == '__main__':
    base_url = "https://www.flipkart.com/search?q=books&page="
    
    user_agent = read_user_agent("my_agent.txt")    
    
    # Number of pages to scrape
    total_pages = 500 
    all_books_data = []
    
    for page in range(1, total_pages + 1):
        print(f"Scraping page {page}...")
        url = base_url + str(page)
        
        # Check if the page is accessible before scraping
        if not check_link(url):
            print(f"Page {page} not available, skipping...")
            continue
        
        response = requests.get(url, user_agent)
        soup = BeautifulSoup(response.content, "html.parser")
        
        # Extract all book links from the page
        links = soup.find_all("a", attrs={"class": "wjcEIp"}) 
        links_list = ["https://www.flipkart.com" + link.get("href") for link in links if link.get("href")]
        
        # Scrape data for each book link
        for book_link in links_list:
            book_data = get_data(book_link)
            all_books_data.append(book_data)
        
        # Delay to avoid overwhelming the server
        time.sleep(2) 

    df = pd.DataFrame(all_books_data)
    df.to_csv("books_data.csv", index=False, encoding='utf-8')
    print("Data saved to books_data.csv")


Scraping page 1...
Scraping page 2...
Scraping page 3...


KeyboardInterrupt: 

NameError: name 'df' is not defined