In [8]:
import logging
import traceback

def setup_logging(log_file: str = "app.log") -> logging.Logger:
    """Return the shared "LinkExtractor" logger, attaching a file handler once.

    The logger level is set to INFO on every call. A ``FileHandler`` writing to
    ``log_file`` is attached only when the logger has no handlers of its own,
    so calling this function repeatedly (e.g. re-running a notebook cell) does
    not duplicate log lines.

    Args:
        log_file: Path of the file the handler appends to. Only used on the
            call that actually attaches the handler; later calls with a
            different path do not add a second handler.

    Returns:
        The configured ``logging.Logger`` named "LinkExtractor".
    """
    logger = logging.getLogger("LinkExtractor")
    logger.setLevel(logging.INFO)  # INFO, WARNING, ERROR, CRITICAL

    # Inspect only this logger's own handlers. ``hasHandlers()`` also reports
    # handlers attached to ancestor loggers (for example the root logger after
    # a ``logging.basicConfig`` call), which would make this function skip the
    # file handler entirely and never write ``log_file``.
    if not logger.handlers:
        # Explicit UTF-8 so non-ASCII log text is written correctly regardless
        # of the platform's default encoding.
        file_handler = logging.FileHandler(log_file, encoding="utf-8")
        file_handler.setLevel(logging.INFO)

        # Include the calling function's name in every record.
        log_format = logging.Formatter('%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s')
        file_handler.setFormatter(log_format)

        logger.addHandler(file_handler)

    return logger

# Create the notebook-wide logger; its file handler writes to notebook_logs.log.
logger = setup_logging("notebook_logs.log")

# Quick check of the level filter: the logger is set to INFO, so the INFO and
# WARNING records are emitted while the DEBUG record is discarded.
logger.log(logging.INFO, "This is an INFO log message.")
logger.log(logging.DEBUG, "This DEBUG message won't appear in INFO mode.")
logger.log(logging.WARNING, "This is a WARNING log message.")

In [2]:
def remove_newlines(text):
    """Return ``text`` with every line-feed character stripped out.

    Only the line feed is removed; other whitespace, including carriage
    returns, is left untouched.
    """
    # Cut the string at each line feed, then glue the pieces back together
    # with nothing in between.
    pieces = text.split('\n')
    return ''.join(pieces)

In [9]:
from typing import List, Generator, Optional
from lxml.html import HtmlElement, fromstring, tostring
from lxml.html.clean import Cleaner
import traceback
from html2text import HTML2Text
import requests
import lxml
from lxml.html import HtmlElement, tostring, fromstring
def html_repr(self):
    """Serialize this element to a pretty-printed unicode HTML string."""
    markup = lxml.html.tostring(self, pretty_print=True, encoding='unicode')
    return markup


# Route both repr() and str() of HtmlElement through html_repr so that
# elements show up as their serialized markup when displayed or printed.
HtmlElement.__repr__ = HtmlElement.__str__ = html_repr

# Disabled logging configuration: the block below is wrapped in a bare string
# literal, so it is evaluated as a no-op expression and never executed. No
# `logger` is created here; the `logger` name used by WebScraper must already
# exist in the notebook namespace.
# NOTE(review): presumably that is the logger returned by setup_logging() in
# the earlier cell — confirm, and consider deleting this dead block.
"""
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
"""
class WebScraper:
    """Fetch a web page, isolate one content subtree, prune it, export markdown.

    On construction the page at ``url`` is downloaded, stripped of scripts and
    styles, and the first element matching ``base_xpath`` is kept as
    ``self.body``. Every failure is logged and leaves ``self.body`` as
    ``None``; the other methods then log an error and do nothing.

    Attributes:
        url: Address of the page that was requested.
        base_xpath: XPath selecting the content root inside the cleaned page.
        title: Text of the page's ``span.mw-page-title-main`` element, or an
            empty string if the page could not be processed.
        body: The selected content element, or ``None`` on failure.

    The class uses the module-level ``logger`` and ``remove_newlines``; both
    must be defined before the class is used.
    """

    def __init__(self, url: str, base_xpath:str):
        """Initialize scraper with URL and extract body."""
        self.url = url
        self.base_xpath = base_xpath
        # Define ``title`` up front so the attribute exists even when the
        # fetch below fails before the title is extracted.
        self.title = ""
        self.body = self._get_body()

    def _get_body(self) -> Optional[HtmlElement]:
        """Fetch the page and return the cleaned content element.

        Returns:
            The first element matching ``self.base_xpath`` in the cleaned
            document, or ``None`` if the request, the title lookup, the
            cleaning step or the XPath lookup failed (the error is logged).
        """
        try:
            logger.debug(f"Fetching URL: {self.url}")
            response = requests.get(self.url, timeout=10)
            # Treat HTTP error statuses as failures instead of parsing the
            # error page as if it were the article.
            response.raise_for_status()
            html_tree = fromstring(response.text)

            # A page without this span raises IndexError here and is treated
            # as a failed fetch by the handler below.
            self.title = html_tree.xpath('//span[@class="mw-page-title-main"]')[0].text_content()

            body = html_tree.xpath('//body')[0]
            assert isinstance(body, HtmlElement)

            logger.debug("Cleaning HTML body")
            cleaner = Cleaner(javascript=True, style=True)
            cleaned_body = cleaner.clean_html(body)

            # Re-parse the cleaned children as a fresh document and select the
            # content root from it.
            reconstructed_body = fromstring(''.join(
                tostring(c, encoding='unicode') for c in cleaned_body
            )).xpath(self.base_xpath)[0]

            logger.info(f"Successfully processed URL: {self.url}")
            return reconstructed_body

        except Exception as e:
            logger.error(f"Error processing URL {self.url}: {str(e)}")
            logger.error(traceback.format_exc())
            return None

    def find_elements_by_xpaths(self, xpaths: List[str]) -> Generator[List[HtmlElement], None, None]:
        """Evaluate each XPath against the body, yielding one result list per XPath.

        Exactly one item is yielded for every entry of ``xpaths``, in order;
        an XPath that raises yields an empty list after the error is logged.
        Nothing is yielded when the body is ``None``.
        """
        if self.body is None:
            logger.error("Body is None, cannot find elements")
            return

        for xpath in xpaths:
            try:
                logger.debug(f"Searching for xpath: {xpath}")
                elements = self.body.xpath(xpath)
                logger.info(f"Found {len(elements)} elements for xpath: {xpath}")
                yield elements
            except Exception as e:
                logger.error(f"Error finding elements with xpath {xpath}: {str(e)}")
                logger.error(traceback.format_exc())
                yield []

    def delete_elements_by_xpath(self, xpaths: List[str]) -> None:
        """Remove every element matched by any of ``xpaths`` from the tree.

        Text that directly follows a removed element (its tail) is kept, so
        deleting inline nodes such as ``<sup>`` does not swallow the
        surrounding prose. Elements without a parent are skipped. Errors on
        individual elements are logged and do not stop the loop.
        """
        if self.body is None:
            logger.error("Body is None, cannot delete elements")
            return

        # find_elements_by_xpaths yields exactly one list per xpath, so the
        # two sequences can be paired directly.
        for xpath, elements in zip(xpaths, self.find_elements_by_xpaths(xpaths)):
            logger.info(f"{xpath} --> {len(elements)} elements found")
            for element in elements:
                try:
                    if element.getparent() is None:
                        continue
                    # text_content() walks the whole subtree; only pay for it
                    # when the debug message will actually be emitted.
                    if logger.isEnabledFor(logging.DEBUG):
                        logger.debug(f"Removing element: {remove_newlines(element.text_content())[:50]}...")
                    # drop_tree() removes the element and its children but,
                    # unlike parent.remove(element), re-attaches the element's
                    # tail text to the preceding sibling or the parent.
                    element.drop_tree()
                except Exception as e:
                    logger.error(f"Error removing element: {str(e)}")
                    logger.error(traceback.format_exc())

    def get_following_siblings(self, element: HtmlElement) -> Generator[HtmlElement, None, None]:
        """Yield each sibling that comes after ``element``, in document order."""
        try:
            next_sibling = element.getnext()
            while next_sibling is not None:
                logger.debug(f"Found sibling: {next_sibling.tag}")
                yield next_sibling
                next_sibling = next_sibling.getnext()
        except Exception as e:
            logger.error(f"Error getting following siblings: {str(e)}")
            logger.error(traceback.format_exc())

    def delete_all_following_elements_by_xpath(self, xpath: str) -> int:
        """Remove all siblings that follow the first element matching ``xpath``.

        The matched element itself is kept.

        Returns:
            The number of siblings removed; 0 when the body is ``None``, the
            XPath matches nothing, or an error occurs.
        """
        if self.body is None:
            logger.error("Body is None, cannot delete following elements")
            return 0

        try:
            matches = self.body.xpath(xpath)
            if not matches:
                # Report the real cause instead of a generic IndexError.
                logger.warning(f"No element found for xpath: {xpath}")
                return 0
            element = matches[0]

            # Materialize the siblings once, before any removal, and reuse the
            # list for both the log message and the deletion loop.
            sibling_following = list(self.get_following_siblings(element))
            logger.info(f"Found target element: {element.text_content()[:50]}... following count: {len(sibling_following)}")

            removed_count = 0
            for sibling in sibling_following:
                try:
                    sibling.getparent().remove(sibling)
                    removed_count += 1
                    logger.debug(f"Removed sibling: {sibling.text_content()[:20]}...")
                except Exception as e:
                    logger.error(f"Failed to remove sibling: {str(e)}")
                    logger.error(traceback.format_exc())

            logger.info(f"Removed {removed_count} following elements")
            return removed_count
        except Exception as e:
            logger.error(f"Error in delete_all_following_elements_by_xpath: {str(e)}")
            logger.error(traceback.format_exc())
            return 0

    def to_markdown(self, body_width: int = 500000) -> Optional[str]:
        """Convert the processed HTML body to markdown.

        Args:
            body_width: Passed to ``HTML2Text`` as ``bodywidth``.

        Returns:
            The markdown text prefixed with a ``# <title>`` heading, or
            ``None`` when the body is ``None`` or the conversion fails.
        """
        if self.body is None:
            logger.error("Body is None, cannot convert to markdown")
            return None

        try:
            logger.debug("Converting HTML to markdown")
            h2t = HTML2Text(bodywidth=body_width)
            h2t.ignore_links = True
            h2t.mark_code = True
            h2t.ignore_images = True

            html_string = tostring(self.body, encoding='unicode')
            markdown_text = h2t.handle(html_string)

            logger.info("Successfully converted HTML to markdown")
            return f"# {self.title} \n{markdown_text}"
        except Exception as e:
            logger.error(f"Error converting to markdown: {str(e)}")
            logger.error(traceback.format_exc())
            return None

    def get_processed_body(self) -> Optional[HtmlElement]:
        """Return the processed body."""
        return self.body


In [18]:
from urllib.parse import unquote

# Decode the percent-encoded article address into its readable Unicode form
# and echo it for reference.
url = unquote(
    "https://or.wikipedia.org/wiki/%E0%AC%AA%E0%AC%A6%E0%AD%8D%E0%AC%AE_%E0%AC%AC%E0%AC%BF%E0%AC%AD%E0%AD%82%E0%AC%B7%E0%AC%A3"
)
print(f"{url=}")

# Build the scraper; the constructor fetches the page and keeps the element
# matched by the given XPath as its working body.
scraper = WebScraper(url=url, base_xpath='//div[@id="mw-content-text"]')

url='https://or.wikipedia.org/wiki/ପଦ୍ମ_ବିଭୂଷଣ'


In [19]:
# XPath selectors for page chrome and non-article content that should be
# stripped from the scraped body. Entries are processed in list order.
xpaths_to_remove = [
    '//div[@class="navbox"]',
    # This used to be two adjacent literals ('//div' followed by '//a[...]')
    # with no comma between them, which Python silently joins into the single
    # selector below. It is now spelled out so the intent is visible and a
    # stray comma cannot turn it into a bare '//div' that matches every div.
    '//div//a[@class="mw-jump-link"]',
    '//div[@class="vector-header-container"]',
    '//span[@class="plainlinks"]',
    '//nav',
    '//div[@class="vector-page-toolbar"]',
    '//div[@class="title-shortlink-container"]',
    '//div[@id="p-lang-btn"]',
    '//div[@id="siteSub"]',
    '//table[contains(@class, "infobox")]',
    '//span[@class="mw-editsection"]',
    '//div[h2[@id="ଆଧାର"]]',
    '//div[@class="reflist"]',
    '//div[@class="printfooter"]',
    # The lists located via this heading wrapper must be selected before the
    # wrapper itself is removed by the next entry.
    '//div[h2[@id="ବାହାର_ଆଧାର"]]//following::ul',
    '//div[h2[@id="ବାହାର_ଆଧାର"]]',
    '//div[@id="catlinks"]',
    '//div[@class="mw-footer-container"]',
    '//sup',
]

# Anchor element whose following siblings are all discarded. The same selector
# also appears in xpaths_to_remove, so the sibling purge has to run before the
# list-based removal deletes the anchor.
following_xpath = '//div[h2[@id="ଆଧାର"]]'

In [20]:
# Prune the scraped tree. Order matters: the sibling purge looks up the anchor
# selected by following_xpath, and xpaths_to_remove contains that same
# selector, so the list-based removal must run second or the anchor would
# already be gone.
# NOTE: both calls modify scraper's tree, so re-running this cell operates on
# an already-pruned body; rebuild `scraper` first to start from the full page.
scraper.delete_all_following_elements_by_xpath(following_xpath)
scraper.delete_elements_by_xpath(xpaths_to_remove)

In [21]:
markdown_content = scraper.to_markdown()

# to_markdown() returns None when the page body is missing or the conversion
# failed. Stop with an explicit message instead of the opaque TypeError that
# file.write(None) would raise.
if markdown_content is None:
    raise RuntimeError("Markdown conversion failed; check the log file for details.")

# Write as UTF-8 explicitly: the article text is non-ASCII, and a platform
# default encoding such as cp1252 cannot encode it.
with open('temp.md', 'w', encoding='utf-8') as file:
    file.write(markdown_content)