In [8]:
import os
import pandas as pd
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta
import re


# --- Target page ------------------------------------------------------------
# The tag page lists every article the site has filed under the given country.
BASE_URL = "https://indianexpress.com/"
country = "bangladesh"
# BASE_URL already ends with "/", so strip it before joining; otherwise the
# request goes to a malformed "https://indianexpress.com//about/<country>" path.
initial_url = f"{BASE_URL.rstrip('/')}/about/{country}"

# --- Browser ----------------------------------------------------------------
# A real browser is used because the article list is extended by clicking a
# "Load More Articles" button. To run without a visible window, use instead:
# chrome_options = Options()
# chrome_options.add_argument("--headless")
# driver = webdriver.Chrome(options=chrome_options)
driver = webdriver.Chrome()

driver.get(initial_url)
driver.maximize_window()

# --- Pagination settings ------------------------------------------------------
MAX_PAGES = 1               # number of times to press "Load More Articles"
clicks = 0                  # successful presses so far
MAX_RETRIES = 3             # consecutive failures tolerated by the loop condition
consecutive_failures = 0
# Hard stop checked in the generic error branch. With the current values the
# loop condition (MAX_RETRIES) is reached first, so this acts as a safety net.
ERROR_LIMIT = 5

# Absolute XPath of the label inside the "Load More Articles" button.
LOAD_MORE_XPATH = "/html/body/div[4]/div[5]/div[1]/div/div[1]/div[6]/div[2]/button/span"

news_url_links = []

try:
    while clicks < MAX_PAGES and consecutive_failures < MAX_RETRIES:
        try:
            # Waiting for the "Load More Articles" button to be clickable
            show_more_button = WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((By.XPATH, LOAD_MORE_XPATH))
            )

            # Scrolling to ensure the button is fully in view (using JavaScript)
            driver.execute_script("arguments[0].scrollIntoView();", show_more_button)

            # Using JavaScript to click the button, which sidesteps overlays
            # that would intercept a native click
            driver.execute_script("arguments[0].click();", show_more_button)

            # Using sleep to ensure the content has loaded after clicking
            time.sleep(5)
            clicks += 1

            # If successful, resetting the consecutive failures counter
            consecutive_failures = 0

        except (NoSuchElementException, ElementClickInterceptedException):
            consecutive_failures += 1
            print(f"Attempt {consecutive_failures} failed. Retrying...")
            # Waiting before retrying
            time.sleep(10)

        except Exception as e:
            consecutive_failures += 1
            print(f"Encountered an unexpected error: {str(e)}")
            if consecutive_failures >= ERROR_LIMIT:
                print("Too many errors encountered. Stopping.")
                break

    # Harvest whatever articles are on the page once paging has finished.
    # This runs even when some (or all) clicks failed, so a missing button
    # no longer throws away the articles that did load.
    try:
        main_div_tag = driver.find_element(By.ID, 'tag_article')
    except NoSuchElementException:
        print("Article container 'tag_article' not found; no links collected.")
    else:
        for news in main_div_tag.find_elements(By.CLASS_NAME, 'details'):
            try:
                news_h3 = news.find_element(By.CSS_SELECTOR, 'h3')
                link_url = news_h3.find_element(By.CSS_SELECTOR, 'a')
            except NoSuchElementException:
                # Entry without a headline link: nothing to scrape.
                continue

            href = link_url.get_attribute('href')
            # Skip missing hrefs and repeated entries so each article is
            # requested only once later on.
            if href and href not in news_url_links:
                news_url_links.append(href)
finally:
    # Always release the browser, even if the cell is interrupted or an
    # unexpected error escapes.
    driver.quit()

#SCRAPING ALL NEWS


# Scrape every collected article URL into a list of row dictionaries.
counter = 0       # 1-based position of the article currently being processed
data_list = []    # one dict per successfully scraped, non-duplicate article

country = 'bangladesh'

for link in news_url_links:
    counter += 1

    # A timeout keeps one unresponsive page from hanging the whole run, and a
    # failed download is skipped instead of aborting the loop.
    try:
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(counter)
        print(f"Skipped due to request error: {exc}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    title_tag = soup.find('h1', class_='native_story_title')
    title = title_tag.text if title_tag else 'Title Not Found'

    author_tag = soup.find('a', class_='bulletProj')
    author = author_tag.text if author_tag else 'Author not found'

    # --- Timestamp ------------------------------------------------------------
    # Reset per article so a page without a date can never reuse the previous
    # article's timestamps (or hit undefined names on the first iteration).
    source_localtime = None
    bangladesh_localtime = None
    date_data = 'Date Data Not Found'

    date_div = soup.find('div', {'id': 'section'})
    date_tag = date_div.find('span', {'itemprop': 'dateModified'}) if date_div else None
    raw_timestamp = date_tag.get('content') if date_tag else None

    if raw_timestamp:
        # Keep only "YYYY-MM-DD" and "HH:MM" from the "<date>T<time>" value.
        only_date, _, only_time = raw_timestamp.partition('T')
        hour_minute = ':'.join(only_time.split(':')[:2])
        try:
            source_localtime = datetime.strptime(f"{only_date},{hour_minute}", "%Y-%m-%d,%H:%M")
        except ValueError:
            # Unparseable timestamp: date_data keeps its "not found" sentinel,
            # so the article is reported and skipped below.
            source_localtime = None
        else:
            # Bangladesh time is taken to be 30 minutes ahead of the page's
            # timestamp (assumes the page reports Indian local time).
            bangladesh_localtime = source_localtime + timedelta(minutes=30)
            date_data = raw_timestamp

    # --- Body text ------------------------------------------------------------
    content_header = soup.find('div', {'id': 'pcl-full-content'})
    if content_header:
        content = [paragraph.text for paragraph in content_header.find_all('p')]
    else:
        content = ['Content Not Found']
    full_content = ' '.join(content)

    content_summary_tag = soup.find('h2', {'itemprop': 'description'})
    content_summary = content_summary_tag.text if content_summary_tag else 'Content summary not found'

    # Placeholders: no translation is produced in this notebook.
    title_translation = 'None'
    summary_translation = 'None'
    content_translation = 'None'

    # Date, body and summary are mandatory; title and author may be missing.
    if (date_data == "Date Data Not Found"
            or full_content == "Content Not Found"
            or content_summary == "Content summary not found"):
        print(counter)
        print('Skipped due to missing info.')
        continue

    data_dict = {
        "url": link,
        "title": title,
        "content": full_content,
        "content_summary": content_summary,
        "title_translation": title_translation,
        "content_translation": content_translation,
        "summary translation": summary_translation,
        "author": author,
        "country": country,
        'source_localtime': source_localtime,
        'bangladesh_localtime': bangladesh_localtime,
    }

    if data_dict not in data_list:
        # Adding to data list
        data_list.append(data_dict)


# Persist the scraped articles, merging with any previously saved CSV.
country = 'India'
csv_filename = f"{country}_Indian_Express.csv"

# The scraped rows carry their timestamps in these columns; there is no
# "date" column, so sorting/conversion must use them instead.
DATE_COLUMNS = ["source_localtime", "bangladesh_localtime"]
SORT_COLUMN = "source_localtime"

df = pd.DataFrame(data_list)

if df.empty:
    # Nothing was scraped: an empty frame has no columns to sort on, and
    # there is nothing new to write.
    print("No articles scraped; CSV left unchanged.")
else:
    for column in DATE_COLUMNS:
        df[column] = pd.to_datetime(df[column])

    # Checking if the CSV file already exists
    if os.path.exists(csv_filename):
        existing_df = pd.read_csv(csv_filename)
        # Timestamps come back from CSV as text; convert them so the merged
        # column has a single datetime dtype.
        for column in DATE_COLUMNS:
            if column in existing_df.columns:
                existing_df[column] = pd.to_datetime(existing_df[column])
        # Merging new and existing dataframe; on a title clash the row that
        # was already saved is kept.
        df = pd.concat([existing_df, df], ignore_index=True)
        df = df.drop_duplicates(subset=["title"], keep="first")

    # Newest first; for a fresh file the most recent row wins a title clash.
    df = df.sort_values(by=SORT_COLUMN, ascending=False)
    df = df.drop_duplicates(subset=["title"], keep="first")
    df = df.reset_index(drop=True)
    df.to_csv(csv_filename, index=False)

In [9]:
# Rebuild the frame from this run's scraped rows only (not the merged CSV)
# and preview the first five records.
df = pd.DataFrame(data=data_list)
df.head(n=5)

Unnamed: 0,url,title,content,content_summary,title_translation,content_translation,summary translation,author,country,source_localtime,bangladesh_localtime
0,https://indianexpress.com/article/india/tripur...,6 insurgents from Bangladesh surrender in Tripura,Six insurgents surrendered to the Tripura Poli...,Deputy Inspector General (Intelligence) Krishn...,,,,Debraj Deb,bangladesh,2024-02-10 18:49:00,2024-02-10 19:19:00
1,https://indianexpress.com/article/india/indias...,India’s decision to fence border with Myanmar ...,Calling India’s decision to fence its borders ...,"Later, in another interaction with the media a...",,,,Author not found,bangladesh,2024-02-09 11:41:00,2024-02-09 12:11:00
2,https://indianexpress.com/article/world/mortar...,Two killed as mortar shells from Myanmar lands...,At least two people were killed when mortar sh...,This comes even as at least 95 Burmese paramil...,,,,Author not found,bangladesh,2024-02-06 09:02:00,2024-02-06 09:32:00
3,https://indianexpress.com/article/world/myanma...,Myanmar border guards flee to Bangladesh amid ...,"At least 95 Myanmar border guards, some of the...",Since mounting a coup against an elected gover...,,,,Author not found,bangladesh,2024-02-06 09:02:00,2024-02-06 09:32:00
4,https://indianexpress.com/article/india/bangla...,5 Bangladeshi nationals arrested in Tripura fo...,Five Bangladeshi nationals were arrested in Tr...,The Indian ‘tout’ was also held at Badharghat ...,,,,Debraj Deb,bangladesh,2024-01-28 08:20:00,2024-01-28 08:50:00
