<a href="https://colab.research.google.com/github/tauqueerdanish/Web_Scraping/blob/main/Cubic_Web_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import requests
import hashlib
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque

# Function to download files and prettify HTML content
def download_file(url, folder_path, page_name, timeout=30):
    """Download *url* and save it inside *folder_path*.

    HTML responses are prettified with BeautifulSoup and stored as
    ``<page_name>.html``.  Every other content type (images, CSS, JS,
    fonts, ...) is written byte-for-byte under ``page_name`` itself, because
    running such payloads through an HTML parser and re-encoding them as
    text corrupts them.

    Args:
        url: Absolute URL to fetch.
        folder_path: Directory to save into; created when missing.
        page_name: Base file name.  ``.html`` is appended for HTML content.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Create the folder structure if it doesn't exist
    create_directory(folder_path)

    # A timeout keeps one unresponsive server from stalling the whole run.
    response = requests.get(url, timeout=timeout)

    # A missing header is treated as HTML, which matches the historical
    # behaviour of saving everything as a prettified page.
    content_type = response.headers.get("Content-Type", "text/html")
    is_html = "html" in content_type.lower()

    if is_html:
        file_name = page_name + ".html"
    else:
        file_name = page_name or "index"
    full_path = os.path.join(folder_path, file_name)

    # If the target path is an existing directory, add a number to the
    # filename so the file can be written next to it.
    counter = 1
    while os.path.exists(full_path) and os.path.isdir(full_path):
        name, ext = os.path.splitext(file_name)
        file_name = f"{name}_{counter}{ext}"
        full_path = os.path.join(folder_path, file_name)
        counter += 1

    if is_html:
        soup = BeautifulSoup(response.content, "html.parser")
        with open(full_path, "w", encoding="utf-8") as file:
            file.write(soup.prettify())
    else:
        # Binary-safe write: keep the payload exactly as the server sent it.
        with open(full_path, "wb") as file:
            file.write(response.content)

# Function to create directories for folder structure
def create_directory(path):
    """Create *path*, including missing parent folders.

    Nothing happens when something already exists at *path*.
    """
    already_present = os.path.exists(path)
    if already_present:
        return
    os.makedirs(path)
# Function to scrape the website with a depth limit
def scrape_website(website_url, output_folder, max_depth=7, timeout=30):
    """Crawl *website_url* breadth-first, saving pages and their assets.

    Each page is saved through ``download_file`` into *output_folder* under
    the last segment of its URL path (``index`` when that segment is empty).
    For pages below the depth limit, same-host ``<a href>`` links are queued
    and ``<link>``/``<script>``/``<img>`` targets are downloaded into
    sub-folders that mirror their URL path.

    Args:
        website_url: Start URL; only pages on its host are followed.
        output_folder: Root directory for everything that is saved.
        max_depth: Depth limit; the start page counts as depth 1.
        timeout: Seconds to wait for each page request made here.

    NOTE: pages are all stored directly in *output_folder*, so two pages
    whose URL paths end in the same segment overwrite each other.
    """

    def _without_fragment(url):
        # "#section" anchors point at the same document; drop them so the
        # page is not crawled once per anchor.
        return urlparse(url)._replace(fragment="").geturl()

    start_url = _without_fragment(website_url)
    site_host = urlparse(start_url).netloc

    # URLs are recorded when they are queued (not when they are processed);
    # otherwise a link that appears on many pages is queued and downloaded
    # many times before its first visit marks it as seen.
    seen_pages = {start_url}
    seen_assets = set()
    queue = deque([(start_url, output_folder, 1)])

    while queue:
        current_url, current_folder, depth = queue.popleft()

        try:
            response = requests.get(current_url, timeout=timeout)
        except requests.RequestException as error:
            # One unreachable page should not abort the whole crawl.
            print(f"Skipping page {current_url}: {error}")
            continue
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract the page name from the URL
        page_name = os.path.basename(urlparse(current_url).path) or "index"

        # Save the prettified HTML page with the page name as filename
        try:
            download_file(current_url, current_folder, page_name)
        except requests.RequestException as error:
            print(f"Could not save page {current_url}: {error}")

        if depth >= max_depth:
            continue

        # Find and enqueue links to other pages on the same host.  Links are
        # resolved against the current page so relative hrefs work, and the
        # host is compared exactly instead of by substring so that share
        # links or protocol-relative URLs do not lead off-site.
        for link in soup.find_all("a", href=True):
            page_url = _without_fragment(urljoin(current_url, link["href"]))
            target = urlparse(page_url)
            if target.scheme not in ("http", "https"):
                continue
            if target.netloc != site_host or page_url in seen_pages:
                continue
            seen_pages.add(page_url)
            queue.append((page_url, current_folder, depth + 1))

        # Find and download other files (CSS, JS, images)
        for tag in soup.find_all(["link", "script", "img"]):
            attribute = "href" if tag.name == "link" else "src"
            if not tag.has_attr(attribute):
                continue

            file_url = urljoin(current_url, tag[attribute])
            parsed_asset = urlparse(file_url)
            # data:, javascript: and similar URLs cannot be fetched over HTTP.
            if parsed_asset.scheme not in ("http", "https"):
                continue
            # Shared assets appear on every page; fetch each one only once.
            if file_url in seen_assets:
                continue
            seen_assets.add(file_url)

            # Save the files in the same folder structure as the website.
            # Leading slashes are stripped so the join can never produce an
            # absolute path that escapes current_folder.
            relative_path = parsed_asset.path
            file_folder_path = os.path.join(
                current_folder, os.path.dirname(relative_path).lstrip("/")
            )
            asset_name = os.path.basename(relative_path) or "index"

            try:
                download_file(file_url, file_folder_path, asset_name)
            except requests.RequestException as error:
                print(f"Could not download {file_url}: {error}")

# Main code
if __name__ == "__main__":
    # Destination folder for everything the crawl saves.
    output_folder = "/content/drive/MyDrive/website_files_17"
    # Site to crawl; replace with the website URL you want to scrape.
    website_url = "https://www.cubix.co"
    scrape_website(website_url=website_url, output_folder=output_folder)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


KeyboardInterrupt: ignored