In [1]:
import requests
from bs4 import BeautifulSoup
import os
import datetime # Import datetime for unique filenames

# Request headers passed to requests.get(); the User-Agent string mimics
# desktop Chrome on Windows. Split across lines for readability only - the
# adjacent literals are concatenated into the same single string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    )
}

def scrape_article(url, timeout=30):
    """
    Scrapes the title and main article content from a given URL.

    Args:
        url (str): The URL of the article to scrape.
        timeout (float or tuple): Seconds to wait for the server, passed
            straight to ``requests.get``. Defaults to 30.

    Returns:
        tuple: ``(title_text, article_text)`` when the page was fetched.
               ``title_text`` is "No title found" if the page has no <h1>;
               ``article_text`` is "" when none of the known containers
               matched or the container held no non-empty <p> text.
               ``(None, None)`` when the request itself failed (the error
               is printed, not raised).
    """
    print(f"\nAttempting to fetch content from: {url}")
    try:
        # An explicit timeout is required: without one requests waits
        # indefinitely, so a single stalled server would hang the whole batch.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        print("Successfully fetched the initial HTML content.")
        print("--- Page Title ---")
        # The first <h1> on the page is treated as the article title
        article_title = soup.find('h1')
        title_text = article_title.get_text(strip=True) if article_title else "No title found"
        print(title_text)
        print("------------------")

        # --- Extracting Article Content ---
        print("--- Attempting to extract Marathi News Content ---")

        # Candidate containers, tried in order; the first match wins.
        candidate_selectors = (
            {"class_": "article-content"},
            {"class_": "story-content"},
            {"id": "article-body"},
        )
        article_body = None
        for selector in candidate_selectors:
            article_body = soup.find('div', **selector)
            if article_body:
                break

        article_text = ""
        if article_body:
            # Collect the stripped text of every <p> once, dropping empty ones
            paragraph_texts = (p.get_text(strip=True) for p in article_body.find_all('p'))
            article_text = "\n\n".join(text for text in paragraph_texts if text)

            if article_text:
                print("\nArticle Content Found (first 500 characters):")
                print(article_text[:500] + "..." if len(article_text) > 500 else article_text)
                print("\n--- End of Article Content Snippet ---")
            else:
                print("No readable text paragraphs found within the identified article content area.")
        else:
            print("Could not find a common article content div (e.g., 'article-content', 'story-content', 'article-body').")
            print("Please inspect the website's HTML structure to find the correct selector for this URL.")

        return title_text, article_text

    except requests.exceptions.HTTPError as errh:
        print(f"HTTP Error for {url}: {errh}")
    # Timeout is handled before ConnectionError because requests'
    # ConnectTimeout derives from both; this order reports it as a timeout.
    except requests.exceptions.Timeout as errt:
        print(f"Timeout Error for {url}: {errt}")
    except requests.exceptions.ConnectionError as errc:
        print(f"Error Connecting to {url}: {errc}")
    except requests.exceptions.RequestException as err:
        print(f"An unexpected error occurred for {url}: {err}")
    return None, None


In [2]:
if __name__ == "__main__":
    # Input: text file with one URL per line (blank lines are ignored).
    urls_file_name = "/content/links_data_scrapping_24jul.txt"
    # Output: one shared file, opened in append mode, so re-running this cell
    # adds to whatever an earlier run already wrote.
    output_file_name = "links_file2_data_scrapped_24jul.txt"

    if not os.path.exists(urls_file_name):
        print(f"Error: File '{urls_file_name}' not found. Please create a text file with one URL per line.")
    else:
        with open(urls_file_name, 'r', encoding='utf-8') as f:
            urls = [line.strip() for line in f if line.strip()] # Read non-empty lines

        if not urls:
            print(f"The file '{urls_file_name}' is empty or contains no valid URLs.")
        else:
            print(f"\nFound {len(urls)} URLs in '{urls_file_name}'. Starting scraping...\n")

            # Same delimiter after every entry (success or failure) so that
            # consecutive articles never run together in the output file.
            entry_separator = "\n\n" + "=" * 80 + "\n\n"

            try:
                with open(output_file_name, "a", encoding="utf-8") as out_f:
                    for i, url_to_scrape in enumerate(urls):
                        print(f"\n--- Processing URL {i+1}/{len(urls)} ---")
                        title, content = scrape_article(url_to_scrape)
                        if title and content:
                            print(f"\nSuccessfully scraped: {title}")
                            # Title line first, then the article body
                            out_f.write(f"{title} \n")
                            out_f.write(content)
                            # Previously nothing was written after the body,
                            # so the next title was glued onto the last
                            # paragraph of this article.
                            out_f.write(entry_separator)
                        else:
                            print(f"\nFailed to scrape content from: {url_to_scrape}")
                            out_f.write(f"--- Failed to scrape from: {url_to_scrape} ---\n")
                            out_f.write("No content extracted." + entry_separator)
                print(f"\nAll scraped content saved to '{output_file_name}'")
            except IOError as e:
                print(f"Error opening or writing to {output_file_name}: {e}")



Found 85 URLs in '/content/links_data_scrapping_24jul.txt'. Starting scraping...


--- Processing URL 1/85 ---

Attempting to fetch content from: https://www.loksatta.com/nashik/jalgoan-after-steady-rise-gold-and-silver-prices-dropped-on-thursday-giving-relief-to-buyers-sud-02-5251657/
Successfully fetched the initial HTML content.
--- Page Title ---
जळगावमध्ये सोने व चांदी इतके स्वस्त…
------------------
--- Attempting to extract Marathi News Content ---
Could not find a common article content div (e.g., 'article-content', 'story-content', 'article-body').
Please inspect the website's HTML structure to find the correct selector for this URL.

Failed to scrape content from: https://www.loksatta.com/nashik/jalgoan-after-steady-rise-gold-and-silver-prices-dropped-on-thursday-giving-relief-to-buyers-sud-02-5251657/

--- Processing URL 2/85 ---

Attempting to fetch content from: https://www.loksatta.com/nashik/nashik-divisional-commissioner-dr-praveen-gedam-decided-to-cancel-non-creamy-la

Scraper for Loksatta

In [12]:
import requests
from bs4 import BeautifulSoup
import os
import datetime

# Request headers passed to requests.get(); the User-Agent string mimics
# desktop Chrome on Windows. Split across lines for readability only - the
# adjacent literals are concatenated into the same single string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    )
}

def scrape_article(url, timeout=30, min_paragraph_length=20):
    """
    Scrapes the title and main article content from a given URL.

    Args:
        url (str): The URL of the article to scrape.
        timeout (float or tuple): Seconds to wait for the server, passed
            straight to ``requests.get``. Defaults to 30.
        min_paragraph_length (int): Paragraphs whose stripped text is not
            longer than this many characters are dropped (meant to filter
            out empty strings, captions and ads). Defaults to 20.

    Returns:
        tuple: ``(title_text, article_text)`` when the page was fetched.
               ``title_text`` is "No title found" if the page has no <h1>;
               ``article_text`` is "" when no container strategy matched or
               no paragraph passed the length filter.
               ``(None, None)`` when the request itself failed (the error
               is printed, not raised).
    """
    print(f"\nAttempting to fetch content from: {url}")
    try:
        # An explicit timeout is required: without one requests waits
        # indefinitely, so a single stalled server would hang the whole batch.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)

        soup = BeautifulSoup(response.text, 'html.parser')

        print("Successfully fetched the initial HTML content.")

        # --- Extracting Title ---
        # The first <h1> on the page is treated as the article title
        article_title_element = soup.find('h1')
        title_text = article_title_element.get_text(strip=True) if article_title_element else "No title found"
        print(f"--- Page Title ---\n{title_text}\n------------------")

        # --- Extracting Article Content ---
        print("--- Attempting to extract Marathi News Content ---")

        article_content_container = _find_article_container(soup)

        article_text = ""
        if article_content_container:
            stripped_paragraphs = (
                p.get_text(strip=True) for p in article_content_container.find_all('p')
            )
            # Keep only paragraphs strictly longer than the threshold
            article_text = "\n\n".join(
                text for text in stripped_paragraphs if len(text) > min_paragraph_length
            )

            if article_text:
                print("\nArticle Content Found (first 500 characters):")
                print(article_text[:500] + "..." if len(article_text) > 500 else article_text)
                print("\n--- End of Article Content Snippet ---")
            else:
                print("No readable text paragraphs found within the identified content area.")
        else:
            print("Could not find any suitable article content div using defined strategies.")
            print("Manual inspection of the website's HTML structure is recommended for this URL.")

        return title_text, article_text

    except requests.exceptions.HTTPError as errh:
        print(f"HTTP Error for {url}: {errh}")
    # Timeout is handled before ConnectionError because requests'
    # ConnectTimeout derives from both; this order reports it as a timeout.
    except requests.exceptions.Timeout as errt:
        print(f"Timeout Error for {url}: {errt}")
    except requests.exceptions.ConnectionError as errc:
        print(f"Error Connecting to {url}: {errc}")
    except requests.exceptions.RequestException as err:
        print(f"An unexpected error occurred for {url}: {err}")
    return None, None


def _find_article_container(soup):
    """Return the first <div> matched by the known layout strategies, or None.

    The strategies are tried in priority order and the one that matched is
    printed so the log shows which page layout was encountered.
    """
    # Strategy 1: div with id="pcl-full-content"
    container = soup.find('div', id='pcl-full-content')
    if container:
        print("Strategy 1: Found content using ID 'pcl-full-content'.")
        return container

    # Strategy 2: div with class="post-content"
    container = soup.find('div', class_='post-content')
    if container:
        print("Strategy 2: Found content using class 'post-content'.")
        return container

    # Strategy 3: div with id="dv-full-story-content"
    container = soup.find('div', id='dv-full-story-content')
    if container:
        print("Strategy 3: Found content using ID 'dv-full-story-content'.")
        return container

    # Strategy 4: div with class="df-content", preferring a nested
    # 'full-story-content' div when one exists inside it
    df_content_div = soup.find('div', class_='df-content')
    if df_content_div:
        nested_full_story = df_content_div.find('div', class_='full-story-content')
        if nested_full_story:
            print("Strategy 4: Found content using class 'full-story-content' within 'df-content'.")
            return nested_full_story
        print("Strategy 4: Found content using class 'df-content' (no specific nested div).")
        return df_content_div

    return None

if __name__ == "__main__":
    # Input file containing the URLs to scrape (one URL per line)
    urls_input_file = "/content/links_data_scrapping_24jul.txt" # <--- IMPORTANT: Create this file!

    # Single output file; opened with 'w', so it is overwritten on every run
    output_file_name = "links_scraped_articles_output24jul.txt"

    # --- Read URLs from the input file ---
    # No exit() calls here: in a Jupyter/IPython kernel exit() requests a
    # kernel shutdown instead of just ending the cell. A missing or empty
    # input file simply leaves urls_to_scrape empty and scraping is skipped.
    urls_to_scrape = []
    if os.path.exists(urls_input_file):
        with open(urls_input_file, 'r', encoding='utf-8') as f:
            # Strip surrounding whitespace/newlines and drop blank lines
            urls_to_scrape = [line.strip() for line in f if line.strip()]
        print(f"Found {len(urls_to_scrape)} URLs in '{urls_input_file}'.")
        if not urls_to_scrape:
            print(f"The input file '{urls_input_file}' is empty or contains no valid URLs. Exiting.")
    else:
        print(f"Error: Input file '{urls_input_file}' not found. Please create it with one URL per line.")

    # --- Scrape and Save to a single output file ---
    if urls_to_scrape:
        try:
            with open(output_file_name, "w", encoding="utf-8") as out_f: # 'w' to overwrite, 'a' to append
                for i, url in enumerate(urls_to_scrape):
                    print(f"\n--- Processing URL {i+1}/{len(urls_to_scrape)}: {url} ---")
                    title, content = scrape_article(url)

                    # Every URL gets an entry, even when scraping failed
                    out_f.write(f"--- Article {i+1} ---\n")
                    out_f.write(f"URL: {url}\n") # Include the URL for reference

                    if title:
                        out_f.write(f"Title: {title}\n")
                    else:
                        out_f.write("Title: Not found\n")

                    if content:
                        out_f.write(f"Content:\n{content}\n\n")
                    else:
                        out_f.write("Content: Failed to extract content.\n\n")

                    out_f.write("=" * 80 + "\n\n") # Use a clear separator between articles

            print(f"\nAll scraped content saved to '{output_file_name}'")

        except IOError as e:
            print(f"Error writing to file '{output_file_name}': {e}")
        except Exception as e:
            print(f"An unexpected error occurred: {e}")

Found 110 URLs in '/content/links_data_scrapping_24jul.txt'.

--- Processing URL 1/110: https://www.loksatta.com/nashik/jalgoan-after-steady-rise-gold-and-silver-prices-dropped-on-thursday-giving-relief-to-buyers-sud-02-5251657/ ---

Attempting to fetch content from: https://www.loksatta.com/nashik/jalgoan-after-steady-rise-gold-and-silver-prices-dropped-on-thursday-giving-relief-to-buyers-sud-02-5251657/
Successfully fetched the initial HTML content.
--- Page Title ---
जळगावमध्ये सोने व चांदी इतके स्वस्त…
------------------
--- Attempting to extract Marathi News Content ---
Strategy 1: Found content using ID 'pcl-full-content'.

Article Content Found (first 500 characters):
जळगाव: शहरातील सुवर्ण बाजारात सोने व चांदीच्या दरात सातत्याने वाढ होत असताना, बुधवारी सोन्याच्या दरात प्रति १० ग्रॅम ११३३ रूपये आणि चांदीच्या दरात प्रति किलो २०६० रूपयांची वाढ नोंदविण्यात आली होती. दरवाढीनंतर विशेषतः चांदीने एक लाख २० हजार ५१० रूपयांचा नवीन उच्चांकही गाठला होता. मात्र, गुरूवारी सकाळी दोन्ही धातुंच्