In [8]:
import os
import pandas as pd
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta
import re


BASE_URL = "https://www.ndtv.com/"
country = "bangladesh"
initial_url = f"{BASE_URL}/search?searchtext={country}"
# url = 'https://www.ndtv.com/search?searchtext=bangladesh'


from selenium import webdriver

# chrome_options = Options()
# chrome_options.add_argument("--headless")
# driver = webdriver.Chrome(options=chrome_options)
driver = webdriver.Chrome()

driver.get(initial_url)


MAX_PAGES = 1
clicks = 0
MAX_RETRIES = 3
consecutive_failures = 0
ERROR_LIMIT = 5

news_url_links = []
while clicks < MAX_PAGES and consecutive_failures < MAX_RETRIES:
    try:
        # Waiting for the "Load More Articles" button to be clickable
        show_more_button = WebDriverWait(driver, 15).until(
            # EC.element_to_be_clickable((By.XPATH, "//button[contains(text(),'Load More Articles')]"))
            EC.element_to_be_clickable((By.XPATH, "/html/body/div[2]/div/div/section/div[2]/div[2]/article/div/div/div/div[1]/div[1]/div[1]/div[3]/a"))
        )

        # Scrolling a bit to ensure the button is fully in view (using JavaScript)
        driver.execute_script("arguments[0].scrollIntoView();", show_more_button)

        # Using JavaScript to click the button
        driver.execute_script("arguments[0].click();", show_more_button)
                
        # Using sleep to ensure the content has loaded after clicking
        time.sleep(5)
        clicks += 1

        # If successful, resetting the consecutive failures counter
        consecutive_failures = 0
        
        if clicks == MAX_PAGES:
            main_div_tag = driver.find_element(By.ID, 'tag_article')
            news_details = main_div_tag.find_elements(By.CLASS_NAME, 'details')
            
            for news in news_details:
            
                news_h3 = news.find_element(By.CSS_SELECTOR, 'h3')
                link_url = news_h3.find_element(By.CSS_SELECTOR,'a')
                
                news_url_links.append(link_url.get_attribute('href'))
            

    except (NoSuchElementException, ElementClickInterceptedException):
        consecutive_failures += 1
        print(f"Attempt {consecutive_failures} failed. Retrying...")
        # Waiting before retrying
        time.sleep(10)

    except Exception as e:
        consecutive_failures += 1
        print(f"Encountered an unexpected error: {str(e)}")
        if consecutive_failures >= ERROR_LIMIT:
            print("Too many errors encountered. Stopping.")
            break
            
soup = BeautifulSoup(driver.page_source, 'html.parser')
main_list = soup.find_all('ul', class_ = 'src_lst-ul')


time.sleep(5)
driver.quit()


#The main_list has a length of 3, that's because it's somehow scraping the same content of the first element for the third element as well. 
#So, we'll discard the last element of main_list. We pressed the 'Load More News' button only once, so we take the
#first two elements. 
#If we use only soup.find, it'll only scrape the news up until 'Load More News' button, and nothing after that. 

news_lists = []
for x in main_list[:-1]:
    temp = x.find_all('li', class_ = 'src_lst-li')
    
    if temp:        
        for all_news_lists in temp:
            news_lists.append(all_news_lists)

news_urls = []
for news in news_lists:
    news_div = news.find('div', class_='src_itm-ttl')
    link = news_div.find('a')
    news_urls.append(link.get('href'))

counter = 0
data_list = []

for link in news_urls:
    
    country = 'India'   
        
    response = requests.get(link)
   
    soup = BeautifulSoup(response.text, 'html.parser')

    title_tag = soup.find('h1', class_='sp-ttl')
    title_mod = title_tag.text if title_tag else 'No Title Found'
    title = re.sub('\r\n','',title_mod)   

    date_element = soup.find('meta', {'itemprop':'datePublished'})
    
    if date_element:
            date_data = date_element.get('content')
            only_date = date_data.split('T')[0]
            only_time = date_data.split('T')[1]
            time = f"{only_time.split(':')[0]}:{only_time.split(':')[1]}"
            cleaned_date = f"{only_date},{time}"

            source_localtime = datetime.strptime(cleaned_date, "%Y-%m-%d,%H:%M")
            bangladesh_localtime = source_localtime + timedelta(minutes=30)

    else:
        date_data = 'Date Data Not Found'
        


    content = []

    content_div = soup.find('div', class_ = 'story__content')
    alt_content_div = soup.find_all('div', class_ = 'sp-ttl-wrp')

    if content_div:
        paragraphs = content_div.find_all('p')
        
        author_tag = soup.find('span',{'itemprop':'name'})
        author = author_tag.text if author_tag else 'Author not found'
        
    elif alt_content_div:
        paragraphs = alt_content_div[1].find_all('p')
        
        author_tag = soup.find_all('span', {'itemprop':'name'})[-1]
        author = author_tag.text if author_tag else 'Author not found'
               
    else:
        full_content = 'Content Not Found'
        

    for p in paragraphs:
        content.append(p.text)
    
    full_content = ' '.join(content)

    full_content = re.sub('\r\n|\xa0|\n', ' ', full_content)
    full_content = re.sub("\'", "'",full_content)
    
    #This is to remove the additional topic names mentioned, which is not part of the news. 
    temp_cuts = full_content.split('Topics mentioned in this article', 1)

    full_content = temp_cuts[0].strip()

    #This is to remove the scoorecard in news that cover live games.
    temp_cuts = full_content.split('(Scorecard)', 1)

    full_content = temp_cuts[0].strip()

    
    
    content_summary_tag = soup.find('h2', class_ = 'sp-descp')
    content_summary = content_summary_tag.text if content_summary_tag else 'Content summary not found'
    

    title_translation = 'None'
    summary_translation = 'None'
    content_translation = 'None'
    
    
    data_dict = {
            "url": link,
            "title": title,
            "content": full_content,
            "content_summary": content_summary,
            "title_translation":title_translation,
            "content_translation":content_translation,
            "summary translation":summary_translation,
            "author": author,
            "country": country,
            'source_localtime': source_localtime,
            'bangladesh_localtime': bangladesh_localtime

        }

    counter+=1


    if (date_data != "Date Data Not Found" and full_content != "Content Not Found" and content_summary != "Content summary not found"):
        if data_dict not in data_list:
            # Adding to data list
            data_list.append(data_dict)
    else:
        print(counter)
        print('Skipped due to missing info.')

df = pd.DataFrame(data_list)
df.head()

csv_filename = f"{country}_NDTV_NEWS.csv"

# Checking if the CSV file already exists
if os.path.exists(csv_filename):
    existing_df = pd.read_csv(csv_filename)
    # Merging new and existing dataframe
    df = pd.concat([existing_df, pd.DataFrame(data_list)], ignore_index=True)
    df["date"] = pd.to_datetime(df["bangladesh_localtime"])  # Converting the "date" column to datetime
    df = df.drop_duplicates(subset=["title"], keep="first")
    df = df.sort_values(by="bangladesh_localtime", ascending=False)  # Sorting the date
    df = df.reset_index(drop=True)
    df.to_csv(csv_filename, index=False)  
else:
    # If csv file does not exist, then we create a new CSV file with the scraped data
    df = pd.DataFrame(data_list)
    df["date"] = pd.to_datetime(df["bangladesh_localtime"], format = "%d-%m-%Y")  # Converting the "date" column to datetime
    df = df.sort_values(by="bangladesh_localtime", ascending=False)
    df = df.drop_duplicates(subset=["title"], keep="first")
    df = df.reset_index(drop=True)
    df.to_csv(csv_filename, index=False)

Attempt 1 failed. Retrying...
8
Skipped due to missing info.


Unnamed: 0,url,title,content,content_summary,title_translation,content_translation,summary translation,author,country,source_localtime,bangladesh_localtime
0,https://www.ndtv.com/india-news/wont-collect-b...,"""Won't Collect Biometric Data Of Myanmar, Ban...",Lalduhoma said his government won't collect bi...,"The Chin people from Myanmar, the Bawm commun...",,,,Press Trust of India,India,2024-02-29 14:36:00,2024-02-29 15:06:00
1,https://www.ndtv.com/india-news/relationship-o...,Relationship Of Heart And Soul: President Mur...,President Murmu said that India and Bangladesh...,President Droupadi Murmu was addressing a you...,,,,Press Trust of India,India,2024-02-28 04:03:00,2024-02-28 04:33:00
2,https://www.ndtv.com/india-news/rules-to-fast-...,Amended Citizenship Rules Likely To Be Enforc...,"The contentious Citizenship Amendment Act, whi...",Sources said the CAA will help refugees from ...,,,,Anindita Sanyal,India,2024-02-27 17:34:00,2024-02-27 18:04:00
3,https://www.ndtv.com/india-news/following-indi...,"""Following Indian Supreme Court"": Top Banglad...",Appreciating the live streaming of court proce...,Chief Justice DY Chandrachud said both India ...,,,,Ashish Kumar Bhargava,India,2024-02-26 22:33:00,2024-02-26 23:03:00
4,https://sports.ndtv.com/cricket/bpl-is-like-a-...,"""BPL Is Like A Circus"": Bangladesh Cricket Tea...",Bangladesh cricket team head coach Chandika Ha...,Chandika Hathurusingha said that the country d...,,,,NDTV Sports Desk,India,2024-02-25 16:48:00,2024-02-25 17:18:00
