# Web Crawler Project
This project demonstrates a basic web crawler implemented in Python. The crawler starts from a base URL, extracts all the links, and saves them to text files. The script handles HTTP errors, connection issues, and respects the politeness policy by adding delays between requests.


Importing necessary libraries

In [16]:
import os
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
from collections import deque
import logging
import time

In [18]:
!pip install requests beautifulsoup4



Configure logging to display info messages

In [20]:
logging.basicConfig(level=logging.INFO)

The WebCrawler class, which includes methods for creating project directories, validating URLs, fetching links from pages, saving output, and performing the crawl operation.


In [22]:
class WebCrawler:
    def __init__(self, base_url, project_name, max_pages=100):
        self.base_url = base_url
        self.project_name = project_name
        self.max_pages = max_pages
        self.visited = set()
        self.queue = deque([base_url])
        self.create_project_dir()

    def create_project_dir(self):
        if not os.path.exists(self.project_name):
            logging.info(f"Creating directory: {self.project_name}")
            os.makedirs(self.project_name)

    def is_valid_url(self, url):
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

    def get_all_links(self, url, retries=3):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
        }

        session = requests.Session()
        session.headers.update(headers)

        for attempt in range(retries):
            try:
                response = session.get(url)
                response.raise_for_status()

                if 'text/html' not in response.headers.get('Content-Type', ''):
                    logging.info(f"Skipping non-HTML content at {url}")
                    return set()

                soup = BeautifulSoup(response.text, "html.parser")
                urls = set()
                for a_tag in soup.findAll("a"):
                    href = a_tag.attrs.get("href")
                    if href == "" or href is None:
                        continue
                    href = urljoin(url, href)
                    parsed_href = urlparse(href)
                    href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
                    if self.is_valid_url(href):
                        urls.add(href)
                return urls

            except requests.exceptions.HTTPError as http_err:
                logging.error(f"HTTP error occurred: {http_err}")
                if response.status_code == 403:
                    logging.error("Access forbidden. The server might be blocking the request.")
                elif response.status_code == 500:
                    logging.error("Server error. The server encountered an internal error.")
                break

            except requests.exceptions.ConnectionError as conn_err:
                logging.error(f"Connection error occurred: {conn_err}")
                if attempt < retries - 1:
                    logging.info(f"Retrying... ({attempt + 1}/{retries})")
                    time.sleep(5)
                else:
                    logging.error("Max retries reached. Failed to retrieve the page.")
                    break

            except Exception as err:
                logging.error(f"An error occurred: {err}")
                break

        return set()

    def save_output(self, url, links):
        filename = os.path.join(self.project_name, f"{urlparse(url).netloc.replace('.', '_')}.txt")
        with open(filename, "w") as f:
            for link in links:
                f.write(link + "\n")
        logging.info(f"Saved links for {url} to {filename}")

    def crawl(self):
        while self.queue and len(self.visited) < self.max_pages:
            url = self.queue.popleft()
            if url in self.visited:
                continue
            logging.info(f"Crawling: {url}")
            self.visited.add(url)
            links = self.get_all_links(url)
            self.queue.extend(links - self.visited)
            logging.info(f"Queue: {self.queue}")
            logging.info(f"Crawled: {self.visited}")
            self.save_output(url, links)
            time.sleep(1) 


The main function that initializes the WebCrawler with the base URL and project name, and starts the crawling process. The base URL and project name are taken as input from the user.


In [24]:
def main():
    base_url = input("Enter the base URL to start crawling: ")
    project_name = input("Enter the project name for saving results: ")
    crawler = WebCrawler(base_url, project_name, max_pages=5)
    crawler.crawl()
if __name__ == '__main__':
    main()

Enter the base URL to start crawling:  http://quotes.toscrape.com/
Enter the project name for saving results:  quotes


INFO:root:Creating directory: quotes
INFO:root:Crawling: http://quotes.toscrape.com/
INFO:root:Queue: deque(['http://quotes.toscrape.com/author/Andre-Gide', 'http://quotes.toscrape.com/tag/love/', 'http://quotes.toscrape.com/tag/adulthood/page/1/', 'http://quotes.toscrape.com/author/J-K-Rowling', 'http://quotes.toscrape.com/tag/friendship/', 'http://quotes.toscrape.com/tag/aliteracy/page/1/', 'http://quotes.toscrape.com/author/Albert-Einstein', 'http://quotes.toscrape.com/tag/books/', 'http://quotes.toscrape.com/tag/be-yourself/page/1/', 'http://quotes.toscrape.com/author/Thomas-A-Edison', 'http://quotes.toscrape.com/tag/deep-thoughts/page/1/', 'http://quotes.toscrape.com/tag/simile/', 'http://quotes.toscrape.com/tag/truth/', 'http://quotes.toscrape.com/tag/change/page/1/', 'http://quotes.toscrape.com/author/Jane-Austen', 'http://quotes.toscrape.com/tag/obvious/page/1/', 'http://quotes.toscrape.com/tag/live/page/1/', 'http://quotes.toscrape.com/tag/miracle/page/1/', 'http://quotes.tosc