In [67]:
import os
import xml.etree.ElementTree as ET
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import requests
import fitz
import hashlib

In [68]:
def parse_sitemap(sitemap_path):
    try:
        # Parsing
        tree = ET.parse(sitemap_path)
        root = tree.getroot()
        # Define the XML namespace
        ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
        # Find all URL elements in the sitemap
        url_elements = root.findall('.//ns:url/ns:loc', namespaces=ns)
        # Extract URLs from the elements
        url_list = [url_element.text for url_element in url_elements]
        return url_list
    except Exception as e:
        print(f"Error parsing sitemap: {e}")
        return []

In [69]:
def clean_html_and_extract_text(html_text):
    try:
        soup = BeautifulSoup(html_text, 'html.parser')
        text_content = soup.get_text(separator=' ', strip=True)
        # Replace multiple spaces with a single space
        cleaned_text = re.sub(r'\s+', ' ', text_content)  
        # Remove non-alphanumeric characters
        cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)  
        # Normalize the text (convert to lowercase)
        cleaned_text = cleaned_text.lower()
        return cleaned_text
    except Exception as e:
        print(f"Failed with parser. Error: {e}")
        return None

In [70]:
def fetch_content(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        response.raise_for_status()

        # Check if the content is PDF
        if 'application/pdf' in response.headers.get('Content-Type', ''):
            return response.content, 'pdf'
        else:
            return response.text, 'html'

    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch content from {url}. Error: {e}")
        return None, None

In [71]:
def save_to_file(content, folder_path, filename,):
    try:
        if content is not None:
            # Create the folder if it doesn't exist
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)
            file_path = os.path.join(folder_path, filename)
            # Write the content to the file
            with open(file_path,'w', encoding='utf-8') as file:
                file.write(content)
    except Exception as e:
        print(f"Error saving to file. Error: {e}")

In [72]:
def save_pdf_to_file(pdf_content, folder, filename):
    try:
        pdf_folder = os.path.join(folder, 'pdf_files')
        os.makedirs(pdf_folder, exist_ok=True)  # Ensure the folder exists
        file_path = os.path.join(pdf_folder, filename + "_pdf.pdf")
        with open(file_path, 'wb') as file:
            file.write(pdf_content)

        # Extract text from the downloaded PDF file
        pdf_text = extract_text_from_pdf(file_path)

        pdf_text_folder = os.path.join(folder, 'pdf_text_files')
        os.makedirs(pdf_text_folder, exist_ok=True)  # Ensure the folder exists
        text_file_path = os.path.join(pdf_text_folder, filename + "_pdf_extracted_text.txt")
        if pdf_text:
            with open(text_file_path, 'w', encoding='utf-8') as text_file:
                text_file.write(pdf_text)
    except Exception as e:
        print(f"Error saving PDF to file. Error: {e}")

In [73]:
def extract_text_from_pdf(pdf_path):
    text = ''
    try:
        with fitz.open(pdf_path) as pdf_document:
            for page_number in range(pdf_document.page_count):
                page = pdf_document[page_number]
                text += page.get_text()
    except Exception as e:
        print(f"Error extracting text from PDF. Error: {e}")
    return text

In [74]:
def crawl_url(url, folder_path):
    try:
        content, content_type = fetch_content(url)

        if content is not None and content_type:
            hashed_url = hashlib.md5(url.encode()).hexdigest()
            filename = "crawled_" + hashed_url + urls[i].replace("/", "_") + "_"

            if content_type == 'pdf':
                save_pdf_to_file(content, folder_path, filename + "_pdf.pdf")
            else:
                # Handle HTML content as before
                html_content = content
                extracted_text = clean_html_and_extract_text(html_content)
                html_folder = os.path.join(folder_path, 'html_files')
                save_to_file(html_content, html_folder, filename + "_html.html")
                text_folder = os.path.join(folder_path, 'html_text_files')
                save_to_file(extracted_text, text_folder, filename + "_extracted_text.txt")
        else:
            hashed_url = hashlib.md5(url.encode()).hexdigest()
            filename = "crawled_" + hashed_url + urls[i].replace("/", "_") + "_"
            error_folder=os.path.join(folder_path,'error_files')
            save_to_file(url, error_folder, filename)
            
    except Exception as e:
        print(f"Error crawling URL {url}. Error: {e}")

In [79]:
def main():
    try:
        sitemap_path = 'drax_sitemap.xml'
        urls = parse_sitemap(sitemap_path)
        urls=set(urls)
        urls=list(urls)
        for i in range(1200,1350):
            hashed_url = hashlib.md5(urls[i].encode()).hexdigest()
            filename = "crawled_" + hashed_url + urls[i].replace("/", "_") + "_"
            folder_to_store = "crawled"
            crawl_url(urls[i], folder_to_store)
    except Exception as e:
        print(f"An unexpected error occurred. Error: {e}")

In [None]:
main()

Error crawling URL https://www.drax.com/uk/press_release/greenest-christmas-fossil-fuels-fall-to-all-time-low-on-britains-power-grid/. Error: 'str' object has no attribute 'netloc'
Error crawling URL https://www.drax.com/carbon-capture/the-policy-needed-to-save-the-future/. Error: 'str' object has no attribute 'netloc'
Error crawling URL https://www.drax.com/uk/carbon-capture/transporting-carbon-how-to-safely-move-co2-from-the-atmosphere-to-permanent-storage/. Error: 'str' object has no attribute 'netloc'
Error crawling URL https://www.drax.com/us/wp-content/uploads/sites/10/2021/05/8044_Drax_Opus_Broker_Privacy_Notice.pdf. Error: 'str' object has no attribute 'netloc'
Error crawling URL https://www.drax.com/us/wp-content/uploads/sites/10/2019/05/Drax_Fuel_Cells_v6-2.mp4. Error: 'str' object has no attribute 'netloc'
Error crawling URL https://www.drax.com/wp-content/uploads/2019/11/Fig-12.2-Local-Highway-Network.pdf. Error: 'str' object has no attribute 'netloc'
Error crawling URL htt

In [57]:
url='https://www.drax.com/wp-content/uploads/2022/07/DR1500_Climate-policy_AM_V003.pdf'
parsed_url = urlparse(url)
filename = "crawled_" + parsed_url.netloc + parsed_url.path.replace("/", "_") + "_"
folder_to_store = "crawled_data"
if is_file_present(folder_to_store, filename):
    None
else:
    crawl_url(url, folder_to_store)