In [1]:
from bs4 import BeautifulSoup
import requests
from lxml import etree
from fastcore.xtras import Path
from urllib.parse import urljoin, unquote
from lxml import etree
import pandas as pd
import os
from fastcore.parallel import parallel
import copy

def get_all_a_tags(page_url: str, html: etree._Element):
    """
    Collects the hyperlinks found in an already-parsed HTML element,
    resolving both absolute and relative URLs, and handling encoded URLs
    and non-English anchor text.

    Nothing is downloaded here: `page_url` is only used to resolve relative
    links when the document declares no usable <base> tag.

    Args:
    - page_url (str): The URL the HTML was fetched from.
    - html (etree._Element): The lxml element to search. Only <a> tags at or
      below this element are returned.

    Returns:
    List of tuples (decoded_full_url, anchor_text), where:
    - decoded_full_url (str): The percent-decoded full URL (resolved for relative links).
    - anchor_text (str): The stripped text inside the anchor tag (in any language).
    If anything goes wrong, the error is printed and an empty list is returned.
    """
    # Link targets that are not crawlable pages. Compared case-insensitively
    # because URL schemes are case-insensitive ("JavaScript:" == "javascript:").
    skipped_prefixes = ('javascript:', 'mailto:', 'tel:', 'ftp:', '#')

    try:
        # Honour a <base> tag if the document has one
        base_url = get_base_url(html, page_url)

        result = []

        # A relative axis keeps the search inside `html`. An absolute '//a'
        # is evaluated from the document root, so on a sub-element it would
        # return every link of the whole page.
        for a_tag in html.xpath('descendant-or-self::a'):
            # Attribute values may carry stray whitespace around the URL
            href = (a_tag.get('href') or '').strip()
            if not href or href.lower().startswith(skipped_prefixes):
                continue

            # Resolve relative links, then decode %XX escapes (e.g. non-ASCII paths)
            full_url = urljoin(base_url, href)
            decoded_full_url = unquote(full_url)

            # itertext() also yields text of nested tags (e.g. <a><span>..</span></a>)
            anchor_text = ''.join(a_tag.itertext()).strip()

            result.append((decoded_full_url, anchor_text))

        return result

    except Exception as e:  # Deliberately broad: one bad page must not stop a crawl
        print(f"Error processing the page: {e}")
        return []

def get_base_url(html: "etree._Element", page_url: str):
    """
    Determines the URL that relative links in the document resolve against.

    Uses the first <base> tag that carries a non-empty href; a relative href
    is resolved against the page URL so the result is always usable as the
    base argument of urljoin. Falls back to the page URL otherwise.

    Args:
    - html (etree._Element): The lxml element representing the HTML content.
    - page_url (str): The original page URL.

    Returns:
    str: The base URL to resolve relative links.
    """
    # Absolute query on purpose: <base> lives in <head>, so look through the
    # whole document that `html` belongs to, not just below `html`.
    for base_tag in html.xpath('//base'):
        base_href = (base_tag.get('href') or '').strip()
        if base_href:
            # <base href="/sub/"> is legal; make it absolute before returning
            return urljoin(page_url, base_href)
    return page_url  # Fallback to the page URL if no usable <base> tag is found


import logging
def get_index_page_extract_link(url, timeout=30):
    """
    Downloads one category index page and returns the article links on it.

    Links are taken from the <a> tags inside every
    <figure class="post-thumbnail"> element of the page.

    Args:
    - url (str): URL of the index page to download.
    - timeout (float | tuple): Seconds to wait for the server, passed to
      requests.get. Without it a stalled connection would block a worker forever.

    Returns:
    List of (decoded_full_url, anchor_text) tuples as produced by
    get_all_a_tags. Any failure (network error, HTTP error status, page
    without thumbnails) is logged with its traceback and yields [].
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Fail early on 4xx/5xx instead of parsing an error page
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "lxml")
        html = etree.HTML(str(soup))

        # deepcopy gives each figure its own detached tree, so document-wide
        # XPath queries run on a figure cannot see the rest of the page.
        thumbnails = copy.deepcopy(html.xpath('//figure[@class="post-thumbnail"]'))
        if not thumbnails:
            # Every real index page lists a few articles, so none means a bad page.
            # Explicit raise rather than assert: asserts vanish under `python -O`.
            raise ValueError("No content found")

        return [link for figure in thumbnails for link in get_all_a_tags(url, figure)]
    except Exception as e:
        # Best effort: report which page failed and let the crawl continue
        logging.exception("An error occurred while scraping %s: %s", url, e)
        return []

# Category index pages to crawl, paired with the number of listing pages each
# one is expected to have ('page_limit').
# NOTE(review): state-news and national-news carry the same limit (4768) -
# confirm this is not a copy-paste leftover.
lis = [
    {'url': category_url, 'page_limit': last_page}
    for category_url, last_page in (
        ('https://www.dharitri.com/category/state-news/', 4768),
        ('https://www.dharitri.com/category/national-news/', 4768),
        ('https://www.dharitri.com/category/metro-news/', 2297),
        ('https://www.dharitri.com/category/international-news/', 1627),
        ('https://www.dharitri.com/category/sports-news/', 2115),
        ('https://www.dharitri.com/category/business/', 896),
        ('https://www.dharitri.com/category/editorial/', 649),
        ('https://www.dharitri.com/category/entertainment/', 1253),
        ('https://www.dharitri.com/category/district-news/', 5903),
        ('https://www.dharitri.com/category/fursat/', 1082),
    )
]
import re
def get_fname(url):
    """
    Derives the CSV file name for a category URL.

    The name is whatever follows '/category/' with trailing slashes removed;
    a nested category path is flattened with '-' so the result never
    contains a directory separator.

    Example:
        get_fname('https://www.dharitri.com/category/state-news/') -> 'state-news.csv'

    Args:
    - url (str): A category URL containing '/category/<slug>'.

    Returns:
    str: '<slug>.csv'.

    Raises:
    ValueError: If the URL has no '/category/<slug>' part.
    """
    # Lazy group + '/*$' makes the trailing slash optional
    match = re.search(r'/category/([^/].*?)/*$', url)
    if match is None:
        raise ValueError(f"Not a category URL: {url!r}")
    return match.group(1).replace('/', '-') + '.csv'
# Same entries as `lis`, each extended with the CSV file name ('fname')
# derived from its category URL.
final_lis = [dict(entry, fname=get_fname(entry['url'])) for entry in lis]
#
# final_lis







In [5]:
# Show how many logical CPUs the OS reports (presumably to pick a worker
# count for the parallel crawl - confirm).
os.cpu_count()

12

In [4]:
# Preview the last two category entries of `lis`.
lis[-2:]

[{'url': 'https://www.dharitri.com/category/district-news/',
  'page_limit': 5903},
 {'url': 'https://www.dharitri.com/category/fursat/', 'page_limit': 1082}]

In [5]:
def state_new(base_url, i):
    """Return the URL of listing page `i` of the category at `base_url`."""
    # Category URLs end with '/', so strip it to avoid '.../state-news//page/1/'
    return f"{base_url.rstrip('/')}/page/{i}/"


# Crawl the last two categories and write one CSV of unique article links each.
for category in final_lis[-2:]:
    # Access by key rather than unpacking .values(), which depends on key order
    url = category['url']
    page_limit = category['page_limit']
    fname = category['fname']
    print(url, page_limit, fname)

    # Defined inside the loop so it reads the `url` of the current category
    def process_page(page_no):
        return get_index_page_extract_link(state_new(url, page_no))

    # Pages are numbered 1..page_limit inclusive, hence the +1.
    # n_workers=12 matches the os.cpu_count() value shown earlier in the notebook.
    content_links = parallel(process_page, range(1, page_limit + 1), n_workers=12, progress=True)

    # Each page yields (url, anchor_text) tuples; keep unique URLs only.
    # Sorting makes the CSV identical between runs (set order is arbitrary).
    article_links = sorted({link[0] for page_links in content_links for link in page_links})

    df = pd.DataFrame(article_links, columns=['article_link'])

    df.to_csv(fname, index=False)


https://www.dharitri.com/category/district-news/ 5903 district-news.csv


https://www.dharitri.com/category/fursat/ 1082 fursat.csv


In [12]:
# Count the CSV files in the working directory. The glob only matches names
# ending in ".csv"; the old grep ".csv" also matched any name containing "csv"
# preceded by any character.
!ls -d *.csv | wc -l

      10
