In [5]:
pip install selenium bs4 beautifulsoup4 pandas webdriver_manager

In [2]:
import os
import time
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Function to get topic from the user
def get_topic_url(topic):
    base_url = "https://economictimes.indiatimes.com/topic/"
    topic_url = base_url + topic.replace(" ", "-") + "/news"
    return topic_url

# Get topic from the user
topic = input("Enter the topic to scrape: ")
url = get_topic_url(topic)

# Set up conditions
max_articles = input("Enter the maximum number of articles to scrape (leave blank for no limit): ")
max_articles = int(max_articles) if max_articles else None

max_date_str = input("Enter the latest date to scrape articles (in format YYYY-MM-DD, leave blank for no limit): ")
max_date = datetime.strptime(max_date_str, "%Y-%m-%d") if max_date_str else None

# Set up Selenium WebDriver
options = Options()
options.binary_location = "chromedriver.exe"  # Provide the correct path to your Chrome binary

# service = Service(ChromeDriverManager().install())
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)
driver.get(url)

SessionNotCreatedException: Message: session not created: Chrome failed to start: was killed.
  (session not created: DevToolsActivePort file doesn't exist)
  (The process started from chrome location chromedriver.exe is no longer running, so ChromeDriver is assuming that Chrome has crashed.)
Stacktrace:
	GetHandleVerifier [0x01188923+23283]
	(No symbol) [0x0114E934]
	(No symbol) [0x01080733]
	(No symbol) [0x010B1075]
	(No symbol) [0x010ABAE9]
	(No symbol) [0x010ED9ED]
	(No symbol) [0x010ED21A]
	(No symbol) [0x010E41B6]
	(No symbol) [0x010B8017]
	(No symbol) [0x010B890D]
	GetHandleVerifier [0x0127A5E3+1013683]
	GetHandleVerifier [0x01283E3C+1052684]
	GetHandleVerifier [0x0127D4A4+1025652]
	GetHandleVerifier [0x011AEA2B+179195]
	(No symbol) [0x01156833]
	(No symbol) [0x01153198]
	(No symbol) [0x01153337]
	(No symbol) [0x0114B4BE]
	BaseThreadInitThunk [0x7712FCC9+25]
	RtlGetAppContainerNamedObjectPath [0x774D80CE+286]
	RtlGetAppContainerNamedObjectPath [0x774D809E+238]


In [10]:
# Initialize a list to store the scraped data
data = []
scraped_links = set()  # To keep track of scraped links

def extract_articles(articles):
    for article in articles:
        try:
            # Find the title and link
            title_tag = article.find("h2")
            if title_tag:
                title = title_tag.get_text(strip=True)
                link_tag = article.find("a")
                link = link_tag["href"] if link_tag else None
            else:
                # Handle articles in the container
                link_tag = article.find("a", itemprop="url")
                if not link_tag:
                    continue
                title = link_tag.get("title", link_tag.find("meta", itemprop="name")["content"])
                link = link_tag["href"]
            
            if not link.startswith("http"):
                link = "https://economictimes.indiatimes.com" + link

            # Skip if the article has already been scraped
            if link in scraped_links:
                continue
            scraped_links.add(link)

            # Find the description
            description_tag = article.find("p", class_="wrapLines l3") or article.find("div", class_="wrapLines l4")
            description = description_tag.get_text(strip=True) if description_tag else "No description"

            # Find the date of publication
            time_tag = article.find("time")
            time_value = time_tag.get_text(strip=True) if time_tag else "No time"
            date_parts = time_value.split(",")[:2]  # Get the date part only (day, month, year)
            date_value = ",".join(date_parts).strip()

            article_date = datetime.strptime(date_value, "%d %b, %Y") if time_tag else None
            formatted_date = article_date.strftime("%Y-%m-%d") if article_date else "No date"

            # Append the extracted data to the list
            data.append({
                "Title": title,
                "Link": link,
                "Description": description,
                "Date": formatted_date
            })

            # Check the conditions
            if max_articles and len(data) >= max_articles:
                return True
            if max_date and article_date and article_date < max_date:
                return True
        except Exception as e:
            print(f"Error extracting article: {e}")
            continue
    return False

try:
    # Initial parse to get the first set of articles
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "html.parser")
    initial_articles = soup.find_all("div", class_="clr flt topicstry story_list")
    print(f"Found {len(initial_articles)} initial articles.")
    if extract_articles(initial_articles):
        raise StopIteration

    # Scroll and click "Load More" until no more content is loaded
    while True:
        # Scroll all the way to the bottom of the page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # wait for the page to load new content

        # Try to click the "Load More" button using JavaScript
        try:
            load_more_button = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'jsTopicLoadMore'))
            )
            driver.execute_script("arguments[0].click();", load_more_button)
            time.sleep(3)  # wait for new content to load
        except:
            print("No more 'Load More' button found or it could not be clicked. Stopping the scroll.")
            break  # no more "Load More" button, exit loop

        # Parse the new content
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "html.parser")

        # Find all new article containers
        more_stories = soup.find_all("div", class_="moreStories")
        print(f"Found {len(more_stories)} 'moreStories' containers on the current page.")

        # Extract data from each article element within new containers
        for container in more_stories:
            articles = container.find_all("div", class_="clr flt topicstry")
            print(f"Found {len(articles)} articles in the current 'moreStories' container.")
            if extract_articles(articles):
                raise StopIteration

except StopIteration:
    print("Stopping the scraping process as conditions were met.")
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # Close the driver
    driver.quit()

    # Convert the data into a DataFrame
    df = pd.DataFrame(data)

    # Display the DataFrame
    print(df.head())

    # Construct the filename
    filename = f"{topic.replace(' ', '_')}_articles"
    if max_articles:
        filename += f"_max{max_articles}"
    if max_date:
        filename += f"_end{max_date.strftime('%Y%m%d')}"
    filename += ".csv"

    # Save the DataFrame to a CSV file
    save_path = os.path.join("datasets", "scrapped_news_data", filename)
    df.to_csv(save_path, index=False)

An error occurred: name 'driver' is not defined


NameError: name 'driver' is not defined