***Logging***<br>
->Logging is like keeping a diary or a record of what your program is doing.<br>
It helps you understand your program, especially when something goes wrong (errors).<br>
Why we use it:<br>
-To track the flow of a program.<br>
-To debug errors.<br>
-To keep a history of important events.<br>

***Automation***<br>
->Automation is when you make your computer do repetitive tasks by itself, without you doing it manually.<br>
Why we use it:<br>
-Save time.<br>
-Avoid mistakes from manual work.<br>
-Run tasks regularly.<br>

***Error-proof scraping design***<br>
When we scrape data from websites, many things can go wrong:<br>
-Website structure may change.<br>
-Some pages may be missing or return errors.<br>
-Internet connection may fail.<br>
-Some data may be empty or unexpected.<br>

Error-proof scraping means designing your scraper so it handles these errors gracefully without crashing and still collects as much data as possible.<br>


In [1]:
#Add try–except blocks around all network requests to handle common errors such as ConnectionError, Timeout, and unexpected exceptions.
import requests


urls = [
    "https://httpbin.org/get",         # should succeed (returns JSON)
    "https://httpstat.us/404",         # will return a 404 error
    "https://httpbin.org/delay/10"     # will timeout (delay > timeout)
]

# Fetch every URL independently: each request has its own try/except, so one
# failure is reported and the loop moves on to the next URL.
for url in urls:
    try:
        response = requests.get(url, timeout=5)  # timeout after 5 seconds
        # requests does not raise for 4xx/5xx replies on its own;
        # raise_for_status() turns them into HTTPError so an error page is
        # never reported as a success.
        response.raise_for_status()
        print(f"Success! Status code: {response.status_code} - {url}")

    except requests.exceptions.HTTPError:
        print(f"HTTP error: {url} returned status code {response.status_code}")

    # Timeout is handled before ConnectionError because ConnectTimeout
    # inherits from both and should be reported as a timeout.
    except requests.exceptions.Timeout:
        print(f"Timeout error: {url} took too long to respond")

    except requests.exceptions.ConnectionError:
        print(f"Connection error: Could not reach {url}")

    # Base class of all requests errors: catches anything not handled above.
    except requests.exceptions.RequestException:
        print(f"Some other error occurred with {url}")


Success! Status code: 200 - https://httpbin.org/get
Connection error: Could not reach https://httpstat.us/404
Timeout error: https://httpbin.org/delay/10 took too long to respond


In [5]:
'''Configure Python logging:
• Create a logger that writes logs to a file (scraper.log)
• Log INFO messages for normal flow (page fetched, items parsed)
• Log ERROR messages when exceptions occur'''

import requests
import logging

# Configure logger
logging.basicConfig(
    filename="scraper.log",      # log file name
    level=logging.INFO,           # capture INFO and higher level messages
    format="%(asctime)s - %(levelname)s - %(message)s"  # log format
)

# List of real URLs
urls = [
    "https://httpbin.org/get",
    "https://httpstat.us/404",
    "https://httpbin.org/delay/10"
]

# Loop through URLs; each URL is handled on its own so a failure is logged
# and the loop continues with the next one.
for url in urls:
    try:
        logging.info(f"Fetching URL: {url}")  # INFO message
        response = requests.get(url, timeout=5)
        # requests does not raise for 4xx/5xx replies on its own;
        # raise_for_status() turns them into HTTPError so an error page is
        # logged as an ERROR instead of a success and is never "parsed".
        response.raise_for_status()
        logging.info(f"Success! Status code: {response.status_code} - {url}") # INFO message
        # Example: parsing items (we just simulate here)
        items = ["item1", "item2"]
        logging.info(f"Parsed items: {items}")  # INFO message

    except requests.exceptions.HTTPError:
        logging.error(f"HTTP error: {url} returned status code {response.status_code}")  # ERROR message
    # Timeout is handled before ConnectionError because ConnectTimeout
    # inherits from both and should be reported as a timeout.
    except requests.exceptions.Timeout:
        logging.error(f"Timeout error: {url} took too long to respond")  # ERROR message
    except requests.exceptions.ConnectionError:
        logging.error(f"Connection error: Could not reach {url}")  # ERROR message
    # Base class of all requests errors: catches anything not handled above.
    except requests.exceptions.RequestException as e:
        logging.error(f"Some other error occurred with {url}: {e}")  # ERROR message


In [6]:
'''Implement a retry mechanism for failed requests:
• Retry a request up to N times (for example, 3 retries)
• Add a small delay between retries
• Stop retrying after max attempts and log the failure'''

import requests
import time
import logging

# Configure logging
logging.basicConfig(
    filename="scraper.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

url = "https://httpbin.org/delay/10"  # slow URL to force failure
max_retries = 3
delay_seconds = 2

# Try the request up to max_retries times, pausing between attempts.
for attempt in range(1, max_retries + 1):
    try:
        logging.info(f"Attempt {attempt}: Fetching {url}")
        response = requests.get(url, timeout=3)
        response.raise_for_status()  # treat 4xx/5xx replies as failures too
        logging.info("Page fetched successfully")
        break   # stop retrying if success

    except requests.exceptions.RequestException as e:
        logging.error(f"Attempt {attempt} failed: {e}")
        # Delay only BETWEEN retries: after the final attempt there is nothing
        # left to wait for, so sleeping would just postpone the failure report.
        if attempt < max_retries:
            time.sleep(delay_seconds)

else:
    # for/else: runs only when the loop finished without hitting `break`,
    # i.e. every attempt failed.
    logging.error(f"Failed to fetch {url} after {max_retries} attempts")


In [13]:
''' Modify fetch_page() to:
• Log the page number being scraped
• Log success when status code is 200
• Log warning or error when status code is not 200'''

import requests
import logging

# Root logger setup: INFO and above go to scraper.log, one line per record
# in the form "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    filename="scraper.log",
)

def fetch_page(page_number):
    """Fetch the demo page for ``page_number`` and log what happened.

    Returns the response body text when the status code is 200, otherwise
    ``None`` (non-200 status is logged as a warning, request errors as an
    error).
    """
    # For demo: even pages = 200, odd pages = 404
    url = f"https://httpbin.org/status/{200 if page_number % 2 == 0 else 404}"

    logging.info(f"Scraping page number: {page_number}")  # Log page number

    # Network call: any requests-level failure ends this page with an error log.
    try:
        response = requests.get(url, timeout=5)
    except requests.exceptions.RequestException as e:
        logging.error(f"Error fetching page {page_number}: {e}")
        return None

    # Guard clause: anything other than 200 is reported and yields no content.
    if response.status_code != 200:
        logging.warning(f"Page {page_number} returned status code: {response.status_code}")
        return None

    logging.info(f"Page {page_number} fetched successfully!")
    return response.text

# Demo run: request pages 1 through 5, one after another.
for page_number in range(1, 6):
    fetch_page(page_number)


In [15]:
'''Ensure the scraper does not crash on a single failure:
• If one page fails, handle it gracefully
• Continue or stop based on your retry logic'''

import requests
import time
import logging

# Root logger setup: INFO and above go to scraper.log, one line per record
# in the form "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    filename="scraper.log",
)

def fetch_page(page_number, max_retries=3, delay_seconds=2):
    """Fetch the demo page for ``page_number``, retrying on request errors.

    Args:
        page_number: Page to scrape; also decides which demo status is requested.
        max_retries: Maximum number of request attempts.
        delay_seconds: Pause between two consecutive attempts.

    Returns:
        The response body text when the status code is 200. ``None`` when the
        server answers with any other status (no retry in that case) or when
        every attempt raised a requests exception.
    """
    url = f"https://httpbin.org/status/{200 if page_number % 2 == 0 else 404}"
    # For demo: even pages = 200, odd pages = 404

    logging.info(f"Scraping page number: {page_number}")

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, timeout=3)
            if response.status_code == 200:
                logging.info(f"Page {page_number} fetched successfully!")
                return response.text
            else:
                # The server answered, so this is not retried.
                logging.warning(f"Page {page_number} returned status code: {response.status_code}")
                return None
        except requests.exceptions.RequestException as e:
            logging.error(f"Attempt {attempt} failed for page {page_number}: {e}")
            # Wait only between attempts; sleeping after the last one would
            # just delay the failure report and the next page.
            if attempt < max_retries:
                time.sleep(delay_seconds)
    # After all retries fail
    logging.error(f"Failed to fetch page {page_number} after {max_retries} attempts")
    return None

# Run pages 1 through 5; fetch_page() handles its own failures, so a bad
# page never stops the loop.
for page_number in range(1, 6):
    fetch_page(page_number)


In [17]:
'''Add a final summary log at the end of execution:
• Total pages attempted
• Total pages successfully scraped
• Total failures'''

import requests
import time
import logging

# Root logger setup: INFO and above go to scraper.log, one line per record
# in the form "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    filename="scraper.log",
)

# Running totals for the end-of-run summary; all start at zero.
total_attempted = total_success = total_failed = 0

def fetch_page(page_number, max_retries=3, delay_seconds=2):
    """Fetch the demo page for ``page_number`` and report success as a bool.

    Args:
        page_number: Page to scrape; even numbers request status 200, odd
            numbers request status 404 (demo behaviour).
        max_retries: Maximum number of request attempts.
        delay_seconds: Pause between two consecutive attempts.

    Returns:
        ``True`` when the page answers with status 200. ``False`` when it
        answers with any other status (no retry in that case) or when every
        attempt raised a requests exception.
    """
    url = f"https://httpbin.org/status/{200 if page_number % 2 == 0 else 404}"
    logging.info(f"Scraping page number: {page_number}")

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, timeout=3)
            if response.status_code == 200:
                logging.info(f"Page {page_number} fetched successfully!")
                return True  # success
            else:
                logging.warning(f"Page {page_number} returned status code: {response.status_code}")
                return False  # failed due to status code
        except requests.exceptions.RequestException as e:
            logging.error(f"Attempt {attempt} failed for page {page_number}: {e}")
            # Wait only between attempts; sleeping after the last one would
            # just delay the failure report and the next page.
            if attempt < max_retries:
                time.sleep(delay_seconds)

    logging.error(f"Failed to fetch page {page_number} after {max_retries} attempts")
    return False  # failed after all retries

# Visit pages 1 through 5 and keep the three running totals up to date.
for i in range(1, 6):
    total_attempted += 1
    success = fetch_page(i)
    if not success:
        total_failed += 1
        continue
    total_success += 1

# End-of-run report: attempted / succeeded / failed.
logging.info("--------- Scraping Summary-------- ")
logging.info(f"Total pages attempted: {total_attempted}")
logging.info(f"Total pages successfully scraped: {total_success}")
logging.info(f"Total failures: {total_failed}")


In [19]:
'''7) Automation check
• Wrap the scraper execution inside a main() function
• Make sure the script can be scheduled or run automatically without manual intervention'''

import requests
import time
import logging

# Root logger setup: INFO and above go to scraper.log, one line per record
# in the form "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    filename="scraper.log",
)

def fetch_page(page_number, max_retries=3, delay_seconds=2):
    """Fetch the demo page for ``page_number`` and report success as a bool.

    Args:
        page_number: Page to scrape; even numbers request status 200, odd
            numbers request status 404 (demo behaviour).
        max_retries: Maximum number of request attempts.
        delay_seconds: Pause between two consecutive attempts.

    Returns:
        ``True`` when the page answers with status 200. ``False`` when it
        answers with any other status (no retry in that case) or when every
        attempt raised a requests exception.
    """
    url = f"https://httpbin.org/status/{200 if page_number % 2 == 0 else 404}"
    logging.info(f"Scraping page number: {page_number}")

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, timeout=3)
            if response.status_code == 200:
                logging.info(f"Page {page_number} fetched successfully!")
                return True
            else:
                # The server answered, so this is not retried.
                logging.warning(f"Page {page_number} returned status code: {response.status_code}")
                return False
        except requests.exceptions.RequestException as e:
            logging.error(f"Attempt {attempt} failed for page {page_number}: {e}")
            # Wait only between attempts; sleeping after the last one would
            # just delay the failure report and the next page.
            if attempt < max_retries:
                time.sleep(delay_seconds)

    logging.error(f"Failed to fetch page {page_number} after {max_retries} attempts")
    return False

def main():
    """Scrape pages 1 through 5 and log a summary of the outcomes."""
    # fetch_page() returns a truthy value on success; keep the results in order.
    outcomes = [fetch_page(page) for page in range(1, 6)]

    # Derive the summary counters from the collected outcomes.
    total_attempted = len(outcomes)
    total_success = sum(1 for ok in outcomes if ok)
    total_failed = total_attempted - total_success

    # Final summary
    logging.info("---------Scraping Summary --------")
    logging.info(f"Total pages attempted: {total_attempted}")
    logging.info(f"Total pages successfully scraped: {total_success}")
    logging.info(f"Total failures: {total_failed}")


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
