***Generators in Python:***<br>
->A generator is a special type of function that gives one value at a time instead of giving all values at once.<br>
Think like this:<br>
A normal function returns everything → heavy<br>
A generator returns values one by one → light and fast<br>

It uses the yield keyword instead of return.<br>

Why use Generators?<br>
i.Saves memory<br>
ii.Fast for big data<br>
iii.You can loop through the values easily<br>

***Lazy data loading*:**<br>
->Lazy data loading means:<br>
i.Don’t load all data at once<br>
ii.Load only when needed (one by one or small chunks)<br>

This saves memory and makes your program faster, especially with big files or big datasets.<br>

Example in real life:<br>
Imagine you have a 1000-page book.<br>
Normal loading = bring whole book at once → heavy<br>
Lazy loading = bring only 1 page when you need → light<br>

In [54]:
#Create a fetch_page(url) generator that sends a GET request and yields the raw HTML of each page until no next page exists.

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def fetch_page(url):
    """Generator that fetches each page and moves to next page."""
    
    while url:   # continue until url becomes None
      
        #  Get the HTML
        response = requests.get(url)
        html = response.text
        
        # Yield (give) the HTML of current page
        yield html
        
        # Parse HTML to find next page URL
        soup = BeautifulSoup(html, "html.parser")
        
        next_link = soup.find("li", class_="next")  # find next page button
        
        if next_link and next_link.a:
            # Build next page URL
            next_href = next_link.a.get("href")
              # Turn relative href into absolute URL
            url = urljoin(start_url, next_href)
        else:
            # No next page stop loop
            url = None
start_url ="https://quotes.toscrape.com/page/1/"

for page in fetch_page(start_url):
    print("Got a page!")
    print(page[:50])
    print("-"*50) 

Got a page!
<!DOCTYPE html>
<html lang="en">
<head>
	<meta cha
--------------------------------------------------
Got a page!
<!DOCTYPE html>
<html lang="en">
<head>
	<meta cha
--------------------------------------------------
Got a page!
<!DOCTYPE html>
<html lang="en">
<head>
	<meta cha
--------------------------------------------------
Got a page!
<!DOCTYPE html>
<html lang="en">
<head>
	<meta cha
--------------------------------------------------
Got a page!
<!DOCTYPE html>
<html lang="en">
<head>
	<meta cha
--------------------------------------------------
Got a page!
<!DOCTYPE html>
<html lang="en">
<head>
	<meta cha
--------------------------------------------------
Got a page!
<!DOCTYPE html>
<html lang="en">
<head>
	<meta cha
--------------------------------------------------
Got a page!
<!DOCTYPE html>
<html lang="en">
<head>
	<meta cha
--------------------------------------------------
Got a page!
<!DOCTYPE html>
<html lang="en">
<head>
	<meta cha
-------------------------

In [56]:
#Create a parse_items(html) generator that extracts all items from a single page and yields each item one-by-one instead of returning a list.
import requests
from bs4 import BeautifulSoup

def parse_items(html):
    """Generator that extracts each book item from a single page."""
    soup = BeautifulSoup(html, "html.parser")
    
    # Find all book containers
    books = soup.find_all("article", class_="product_pod")
    
    for book in books:
        # Extract book title
        title = book.h3.a["title"]
        # Extract book price
        price = book.find("p", class_="price_color").text
        # Extract availability
        availability = book.find("p", class_="instock availability").text.strip()
        
        # Yield a dictionary with book info
        yield {
            "title": title,
            "price": price,
            "availability": availability
        }

# Example usage
url = "http://books.toscrape.com/catalogue/page-1.html"
html = requests.get(url).text

for book in parse_items(html):
    print(book)


{'title': 'A Light in the Attic', 'price': 'Â£51.77', 'availability': 'In stock'}
{'title': 'Tipping the Velvet', 'price': 'Â£53.74', 'availability': 'In stock'}
{'title': 'Soumission', 'price': 'Â£50.10', 'availability': 'In stock'}
{'title': 'Sharp Objects', 'price': 'Â£47.82', 'availability': 'In stock'}
{'title': 'Sapiens: A Brief History of Humankind', 'price': 'Â£54.23', 'availability': 'In stock'}
{'title': 'The Requiem Red', 'price': 'Â£22.65', 'availability': 'In stock'}
{'title': 'The Dirty Little Secrets of Getting Your Dream Job', 'price': 'Â£33.34', 'availability': 'In stock'}
{'title': 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull', 'price': 'Â£17.93', 'availability': 'In stock'}
{'title': 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics', 'price': 'Â£22.60', 'availability': 'In stock'}
{'title': 'The Black Maria', 'price': 'Â£52.15', 'availability': 'In stock'}
{'title': 'Starving H

In [23]:
#Build a scrape_all() generator that connects fetch_page() and parse_items() to yield scraped items lazily across all pages.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Generator to fetch pages lazily
def fetch_page(url):

    while url:
        response = requests.get(url)
        html = response.text
        yield html

        # Find next page
        soup = BeautifulSoup(html, "html.parser")
        next_button = soup.find("li", class_="next")
        if next_button and next_button.a:
            next_href = next_button.a.get("href")
            url = urljoin(start_url, next_href)
        else:
            url = None

# Generator to parse items from a single page
def parse_items(html):
    soup = BeautifulSoup(html, "html.parser")
    books = soup.find_all("article", class_="product_pod")
    for book in books:
        title = book.h3.a["title"]
        price = book.find("p", class_="price_color").text
        availability = book.find("p", class_="instock availability").text.strip()
        yield {
            "title": title,
            "price": price,
            "availability": availability
        }

# Generator to scrape all items across all pages lazily
def scrape_all(start_url):
    for page_html in fetch_page(start_url):          # fetch one page at a time
        for item in parse_items(page_html):         # parse items lazily from that page
            yield item                              # yield one item at a time

# Usage example
start_url = "http://books.toscrape.com/catalogue/page-1.html"
for book in scrape_all(start_url):
    print(book)


{'title': 'A Light in the Attic', 'price': 'Â£51.77', 'availability': 'In stock'}
{'title': 'Tipping the Velvet', 'price': 'Â£53.74', 'availability': 'In stock'}
{'title': 'Soumission', 'price': 'Â£50.10', 'availability': 'In stock'}
{'title': 'Sharp Objects', 'price': 'Â£47.82', 'availability': 'In stock'}
{'title': 'Sapiens: A Brief History of Humankind', 'price': 'Â£54.23', 'availability': 'In stock'}
{'title': 'The Requiem Red', 'price': 'Â£22.65', 'availability': 'In stock'}
{'title': 'The Dirty Little Secrets of Getting Your Dream Job', 'price': 'Â£33.34', 'availability': 'In stock'}
{'title': 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull', 'price': 'Â£17.93', 'availability': 'In stock'}
{'title': 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics', 'price': 'Â£22.60', 'availability': 'In stock'}
{'title': 'The Black Maria', 'price': 'Â£52.15', 'availability': 'In stock'}
{'title': 'Starving H

In [24]:
#Add lazy loading: ensure scrape_all() does not store any full page or full item list in memory.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Generator to fetch pages lazily
def fetch_page(url):

    while url:
        response = requests.get(url)
        html = response.text
        yield html

        # Find next page
        soup = BeautifulSoup(html, "html.parser")
        next_button = soup.find("li", class_="next")
        if next_button and next_button.a:
            next_href = next_button.a.get("href")
            url = urljoin(start_url, next_href)
        else:
            url = None

# Generator to parse items from a single page
def parse_items(html):
    soup = BeautifulSoup(html, "html.parser")
    books = soup.find_all("article", class_="product_pod")
    for book in books:
        title = book.h3.a["title"]
        price = book.find("p", class_="price_color").text
        availability = book.find("p", class_="instock availability").text.strip()
        yield {
            "title": title,
            "price": price,
            "availability": availability
        }

# Generator to scrape all items across all pages lazily
def scrape_all(start_url):
    for page_html in fetch_page(start_url):          # fetch one page at a time
        for item in parse_items(page_html):         # parse items lazily from that page
            yield item                              # yield one item at a time

# Usage example
start_url = "http://books.toscrape.com/catalogue/page-1.html"
for book in scrape_all(start_url):
    print(book)

{'title': 'A Light in the Attic', 'price': 'Â£51.77', 'availability': 'In stock'}
{'title': 'Tipping the Velvet', 'price': 'Â£53.74', 'availability': 'In stock'}
{'title': 'Soumission', 'price': 'Â£50.10', 'availability': 'In stock'}
{'title': 'Sharp Objects', 'price': 'Â£47.82', 'availability': 'In stock'}
{'title': 'Sapiens: A Brief History of Humankind', 'price': 'Â£54.23', 'availability': 'In stock'}
{'title': 'The Requiem Red', 'price': 'Â£22.65', 'availability': 'In stock'}
{'title': 'The Dirty Little Secrets of Getting Your Dream Job', 'price': 'Â£33.34', 'availability': 'In stock'}
{'title': 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull', 'price': 'Â£17.93', 'availability': 'In stock'}
{'title': 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics', 'price': 'Â£22.60', 'availability': 'In stock'}
{'title': 'The Black Maria', 'price': 'Â£52.15', 'availability': 'In stock'}
{'title': 'Starving H

In [30]:
#Iterate over scrape_all() using a for loop and print the first 5 items to verify that your generator pipeline works.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Generator to fetch pages lazily
def fetch_page(url):

    while url:
        response = requests.get(url)
        html = response.text
        yield html

        # Find next page
        soup = BeautifulSoup(html, "html.parser")
        next_button = soup.find("li", class_="next")
        if next_button and next_button.a:
            next_href = next_button.a.get("href")
            url = urljoin(start_url, next_href)
        else:
            url = None

# Generator to parse items from a single page
def parse_items(html):
    soup = BeautifulSoup(html, "html.parser")
    books = soup.find_all("article", class_="product_pod")
    for book in books:
        title = book.h3.a["title"]
        price = book.find("p", class_="price_color").text
        availability = book.find("p", class_="instock availability").text.strip()
        yield {
            "title": title,
            "price": price,
            "availability": availability
        }

# Generator to scrape all items across all pages lazily
def scrape_all(start_url):
    for page_html in fetch_page(start_url):          # fetch one page at a time
        for item in parse_items(page_html):         # parse items lazily from that page
            yield item                              # yield one item at a time

# Usage example
start_url = "http://books.toscrape.com/catalogue/page-1.html"
for i, book in enumerate(scrape_all(start_url), start=1):
    print(book)          # print the book dictionary
    if i >= 5:           # stop after 5 books
        break

{'title': 'A Light in the Attic', 'price': 'Â£51.77', 'availability': 'In stock'}
{'title': 'Tipping the Velvet', 'price': 'Â£53.74', 'availability': 'In stock'}
{'title': 'Soumission', 'price': 'Â£50.10', 'availability': 'In stock'}
{'title': 'Sharp Objects', 'price': 'Â£47.82', 'availability': 'In stock'}
{'title': 'Sapiens: A Brief History of Humankind', 'price': 'Â£54.23', 'availability': 'In stock'}


In [57]:
#Add a page counter inside fetch_page() to track how many pages were visited and print the total pages at the end.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def fetch_page(start_url):
    url = start_url
    page_number = 0
    while url:
        page_number += 1
        html = requests.get(url).text
        yield html  # yield one page HTML

        # Find next page
        soup = BeautifulSoup(html, "html.parser")
        next_button = soup.find("li", class_="next")
        if next_button and next_button.a:
            url = urljoin(start_url, next_button.a["href"])
        else:
            url = None

    print(f"Total pages visited: {page_number}")

# Usage: iterate over pages
start_url = "http://books.toscrape.com/catalogue/page-1.html"
for page in fetch_page(start_url):
    print("Got a page!")


Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Got a page!
Total pages visited: 50


In [64]:
#Validate generator behavior by confirming the script processes items only when needed (no preloading, no storing all data at once).
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Generator: fetch pages lazily
def fetch_page(start_url):
    url = start_url
    page_number = 0
    while url:
        page_number += 1
        print(f"[fetch_page] Fetching page {page_number}: {url}")
        html = requests.get(url).text
        yield html  # yield page HTML only when needed

        # Find next page
        soup = BeautifulSoup(html, "html.parser")
        next_button = soup.find("li", class_="next")
        if next_button and next_button.a:
            url = urljoin(start_url, next_button.a["href"])
        else:
            url = None
    print(f"[fetch_page] Total pages visited: {page_number}")

# Generator: parse items lazily from a page
def parse_items(html):
    soup = BeautifulSoup(html, "html.parser")
    books = soup.find_all("article", class_="product_pod")
    for book in books:
        print(f"[parse_items] Processing book: {book.h3.a['title']}")
        yield {
            "title": book.h3.a["title"],
            "price": book.find("p", class_="price_color").text,
            "availability": book.find("p", class_="instock availability").text.strip()
        }

# Generator: combine fetch and parse
def scrape_all(start_url):
    for page_html in fetch_page(start_url):
        for item in parse_items(page_html):
            yield item  # yield one item at a time

# Usage: process only first 5 books
start_url = "http://books.toscrape.com/catalogue/page-1.html"

for i, book in enumerate(scrape_all(start_url), start=1):
    print(f"[main] Got book {i}: {book['title']}")
    if i >= 5:
        break


[fetch_page] Fetching page 1: http://books.toscrape.com/catalogue/page-1.html
[parse_items] Processing book: A Light in the Attic
[main] Got book 1: A Light in the Attic
[parse_items] Processing book: Tipping the Velvet
[main] Got book 2: Tipping the Velvet
[parse_items] Processing book: Soumission
[main] Got book 3: Soumission
[parse_items] Processing book: Sharp Objects
[main] Got book 4: Sharp Objects
[parse_items] Processing book: Sapiens: A Brief History of Humankind
[main] Got book 5: Sapiens: A Brief History of Humankind
