In [None]:
import time
import random
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

# Function to check if URL already exists in CSV
def is_url_duplicate(csv_filename, url):
    """Return True if any row of the CSV has a 'Link' value equal to *url*.

    The file is read with ``csv.DictReader``, so its first line is treated
    as the header row and every following row is looked up by the 'Link'
    column name.
    """
    with open(csv_filename, 'r', newline='', encoding='utf-8') as handle:
        # any() stops at the first matching row, like an early return would.
        return any(record['Link'] == url for record in csv.DictReader(handle))

# Function to check if no results message is present
def no_results_present(driver):
    """Return True if the page shows the "no results" message element.

    Args:
        driver: A WebDriver-like object exposing ``find_elements(by, value)``.

    Returns:
        True when at least one element with the class
        ``search__no-results__message`` is present, otherwise False.
    """
    # The old ``find_element_by_class_name`` helper no longer exists in
    # current Selenium releases; combined with a bare ``except`` that made
    # this check silently return False every time. ``find_elements`` returns
    # an empty list when nothing matches, so no exception handling is needed.
    # "class name" is the locator strategy string behind ``By.CLASS_NAME``.
    matches = driver.find_elements("class name", "search__no-results__message")
    return bool(matches)

# Function to check if row contains NaN
def contains_nan(row):
    """Return True if any value in the mapping *row* equals the string "N/A"."""
    return any(field == "N/A" for field in row.values())

# Maximum number of result pages to request per search term
num_pages = 25

# Stop scraping once this many unique records have been written
max_records = 2500

# Specify the path to chromedriver.exe (change it as per your directory)
webdriver_service = Service(r"C:\chromedriver-win64\chromedriver.exe")

# Initialize the Chrome driver
driver = webdriver.Chrome(service=webdriver_service)

# Output CSV file and its columns
csv_filename = 'output.csv'
fieldnames = ['Headline', 'Description', 'Date', 'Link']

# Search terms related to business
search_terms = ['business_pakistan', 'business_asia', 'business_india', 'business_china', 'business_economy',
                'business_technology', 'business_startup', 'business_trade', 'business_finance', 'business_investment',
                'business_marketing', 'business_management', 'business_entrepreneurship', 'business_leadership',
                'business_strategy', 'business_global', 'business_corporate', 'business_government', 'business_environment',
                'business_sustainability', 'business_social', 'business_culture', 'business_trends', 'business_opportunity']

# Number of records written to the CSV so far
record_counter = 0

# Article links already written. Tracked in memory instead of re-reading the
# output file: rows written through the still-open, buffered CSV handle are
# not guaranteed to be visible to a second reader of the same file, and
# re-reading the whole file for every link costs O(n) per check.
seen_urls = set()

try:
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        # Write headers to CSV
        writer.writeheader()

        # Iterate through search terms
        for search_term in search_terms:
            for page_num in range(1, num_pages + 1):
                # Search results page to scrape (100 results per page)
                url = f"https://edition.cnn.com/search?q={search_term}&from={100 * (page_num - 1)}&size=100&page={page_num}&sort=newest&types=all&section="

                try:
                    # Load the page
                    driver.get(url)

                    # Wait for the content to load
                    wait = WebDriverWait(driver, 10)
                    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "container__link")))

                except TimeoutException:
                    print(f"TimeoutException occurred. Changing search term from '{search_term}' to the next one.")
                    break

                # If the page reports no results, move on to the next search term
                if no_results_present(driver):
                    print(f"No results found for '{search_term}'. Changing search term to the next one.")
                    break

                # Add a random delay between 2 to 5 seconds to simulate human-like behavior
                time.sleep(random.uniform(2, 5))

                # Parse the rendered HTML after waiting
                soup = BeautifulSoup(driver.page_source, "html.parser")

                # Find all the article link elements
                links = soup.select(".container__link.container__link--type-NewsArticle.container_list-images-with-description__link")

                for link in links:
                    # Get the headline
                    headline_element = link.find("span", class_="container__headline-text")
                    headline = headline_element.get_text() if headline_element else "N/A"

                    # Get the description
                    description_element = link.find("div", class_="container__description")
                    description = description_element.get_text() if description_element else "N/A"

                    # Get the date
                    date_element = link.find("div", class_="container__date")
                    date = date_element.get_text() if date_element else "N/A"

                    # Get the link; a missing or empty href is treated like any
                    # other missing field so the row is skipped below instead
                    # of being written with an empty Link.
                    article_url = link.get('href') or "N/A"

                    # Skip this row if any field is missing
                    if "N/A" in [headline, description, date, article_url]:
                        continue

                    # Skip links that have already been written
                    if article_url in seen_urls:
                        continue
                    seen_urls.add(article_url)

                    # Write row to CSV
                    writer.writerow({'Headline': headline.strip(), 'Description': description.strip(), 'Date': date.strip(), 'Link': article_url})

                    # Increment the record counter
                    record_counter += 1

                    # Stop once the record limit is reached
                    if record_counter >= max_records:
                        print(f"Reached {max_records} records. Exiting.")
                        break

                # Propagate the stop out of the page loop
                if record_counter >= max_records:
                    break

                # Add an additional random delay between 3 to 7 seconds between each page request to avoid being detected
                time.sleep(random.uniform(3, 7))

            # Propagate the stop out of the search-term loop
            if record_counter >= max_records:
                break
finally:
    # Always close the WebDriver, even if scraping fails part-way through,
    # so no browser process is left running.
    driver.quit()
