In [9]:
import os
import pandas as pd
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta
import re


# Site root. It keeps its trailing slash, so paths are appended without a leading "/".
BASE_URL = "https://www.dailyexcelsior.com/"
# Search keyword used in the result-page URLs; also stored in each record's "country" field.
country = "bangladesh"
# BASE_URL already ends with "/", so no extra slash goes before "about"
# (the previous f-string produced ".com//about/...").
initial_url = f"{BASE_URL}about/{country}"

#FINDING TOTAL PAGES AVAILABLE

#THE WEBSITE HAS AROUND 882 RESULT PAGES. EACH PAGE TAKES QUITE A WHILE TO LOAD AND RETURN OUTPUT, SO THE ENTIRE
#PROCESS WILL TAKE A LONG TIME.
#FOR THIS DEMO, THE PAGE COUNT STOPS AFTER THE 10TH PAGE.

# Walk the paginated search results, following the "next page" link, to find
# how many result pages exist. The walk is capped so the demo stays short:
# it stops once `page` has advanced past 10.
page = 1
while True:

    url = f'{BASE_URL}page/{page}/?s={country}'
    # A timeout keeps one unresponsive page from stalling the run forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    # soup.find returns None when the pagination container is absent (for
    # example a single-page result or an error page); treat that as "no next page".
    buttons_div = soup.find('div', class_='page-nav td-pb-padding-side')
    button_present = (
        buttons_div.find('a', {'aria-label': 'next-page'}) if buttons_div else None
    )

    if not button_present:
        break

    page += 1

    if page > 10:
        break


#SCRAPING ALL THE NEWS LINKS


# Article URLs gathered from the search-result pages, in page order.
news_url_links = []

# Only the first two result pages are scraped here.
for page in range(1, 3):

    url = f'{BASE_URL}page/{page}/?s={country}'
    # A timeout keeps one unresponsive page from stalling the run forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    main_div = soup.find('div', class_='td-main-content-wrap td-container-wrap')
    if not main_div:
        # No result container on this page: nothing to collect.
        continue

    result_cards = main_div.find_all(
        'div', class_='td_module_16 td_module_wrap td-animation-stack'
    )

    for div in result_cards:
        # Each lookup may return None if the card markup differs; skip such
        # cards instead of raising AttributeError.
        link_h3 = div.find('h3', class_='entry-title td-module-title')
        link_tag = link_h3.find('a') if link_h3 else None
        href = link_tag.get('href') if link_tag else None

        if href:
            news_url_links.append(href)

            
#SCRAPING ALL NEWS FROM LINKS

counter = 0
data_list = []

# Download every collected article and turn it into one record (dict) in
# data_list. Articles that cannot be fetched, have no parsable modification
# time, or have no content container are reported and skipped.
for counter, link in enumerate(news_url_links, start=1):

    try:
        # A timeout keeps one unresponsive article from stalling the whole run.
        response = requests.get(link, timeout=30)
    except requests.RequestException as exc:
        print(f'Link {counter}')
        print(f'Skipped due to request error: {exc}')
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    #TITLE
    title_tag = soup.find('h1', class_='entry-title')
    title = title_tag.text if title_tag else 'Title not found'

    #AUTHOR
    # Every lookup can return None, so each step is guarded before the next.
    author = 'Author not found'
    author_div = soup.find('div', class_='td-module-meta-info')
    author_tag = (
        author_div.find('div', class_='td-post-author-name') if author_div else None
    )
    author_link = author_tag.find('a') if author_tag else None
    if author_link:
        author = author_link.text

    #DATE
    # Reset per article so a missing date can never reuse the previous
    # article's timestamps.
    source_localtime = None
    bangladesh_localtime = None

    date_meta = soup.find('meta', {'property': 'article:modified_time'})
    date_data = date_meta.get('content') if date_meta else None

    if date_data:
        # Keep only "YYYY-MM-DD" and "HH:MM"; seconds and any UTC offset
        # that follow are dropped.
        only_date, _, only_time = date_data.partition('T')
        hour_minute = ':'.join(only_time.split(':')[:2])
        try:
            source_localtime = datetime.strptime(
                f"{only_date},{hour_minute}", "%Y-%m-%d,%H:%M"
            )
        except ValueError:
            # Unexpected timestamp layout: treat the date as missing.
            source_localtime = None

    if source_localtime is not None:
        # NOTE(review): the fixed +30 min presumably converts IST (UTC+5:30)
        # to Bangladesh time (UTC+6) - confirm the source timezone.
        bangladesh_localtime = source_localtime + timedelta(minutes=30)

    #CONTENT
    full_content = None  # None means the content container was not found

    content_tag = soup.find('div', class_='td-post-content tagdiv-type')

    if content_tag:
        full_content = ' '.join(para.text for para in content_tag.find_all('p'))
        full_content = re.sub('\n|\r|\xa0', '', full_content)
        full_content = full_content.replace('(PTI)', '')
        # Drop everything up to the first colon, if there is one
        # (presumably the dateline prefix, e.g. "CITY, Month DD:").
        full_content = full_content.split(':', 1)[-1]
        full_content = full_content.strip()

    if source_localtime is None or full_content is None:
        print(f'Link {counter}')
        print('Skipped due to missing info.')
        continue

    #No CONTENT SUMMARY Available for this News Website
    content_summary = 'None'

    title_translation = 'None'
    summary_translation = 'None'
    content_translation = 'None'

    data_dict = {
        "url": link,
        "title": title,
        "content": full_content,
        "content_summary": content_summary,
        "title_translation": title_translation,
        "content_translation": content_translation,
        "summary translation": summary_translation,
        "author": author,
        "country": country,
        'source_localtime': source_localtime,
        'bangladesh_localtime': bangladesh_localtime,
    }

    # Identical records (same URL and fields) are stored only once.
    if data_dict not in data_list:
        data_list.append(data_dict)
        print(f'Link {counter} added')


# Persist the scraped records: merge them with any previously saved CSV,
# de-duplicate by title, sort newest first and write the file back.
csv_filename = f"{country}_Daily_Excelsior.csv"

new_df = pd.DataFrame(data_list)

if os.path.exists(csv_filename):
    existing_df = pd.read_csv(csv_filename)
    # read_csv loads timestamps as strings; convert them so the merged column
    # is not a mix of str and Timestamp, which cannot be sorted together.
    if "bangladesh_localtime" in existing_df.columns:
        existing_df["bangladesh_localtime"] = pd.to_datetime(
            existing_df["bangladesh_localtime"]
        )
    # Existing rows come first, so keep="first" below prefers what was saved earlier.
    if new_df.empty:
        df = existing_df
    else:
        df = pd.concat([existing_df, new_df], ignore_index=True)
else:
    df = new_df

if "bangladesh_localtime" in df.columns:
    df["bangladesh_localtime"] = pd.to_datetime(df["bangladesh_localtime"])
    df["date"] = df["bangladesh_localtime"]  # datetime copy kept as the "date" column
    df = df.drop_duplicates(subset=["title"], keep="first")
    df = df.sort_values(by="bangladesh_localtime", ascending=False)  # newest first
    df = df.reset_index(drop=True)
    df.to_csv(csv_filename, index=False)
else:
    # Nothing was scraped and there is no earlier file: there are no columns to
    # sort on, so skip writing instead of failing with KeyError.
    print("No articles to save.")

Link 1 added
Link 2 added
Link 3 added
Link 4 added
Link 5 added
Link 6 added
Link 7 added
Link 8 added
Link 9 added
Link 10 added
Link 11 added
Link 12 added
Link 13 added
Link 14
Skipped due to missing info.
Link 15
Skipped due to missing info.
Link 16 added
Link 17
Skipped due to missing info.
Link 18 added
Link 19 added
Link 20 added


Unnamed: 0,url,title,content,content_summary,title_translation,content_translation,summary translation,author,country,source_localtime,bangladesh_localtime
0,https://www.dailyexcelsior.com/govt-allows-oni...,"Govt allows onion exports to Bangladesh, Mauri...",The government on Thursday permitted traders t...,,,,,Daily Excelsior,bangladesh,2024-02-22 17:04:00,2024-02-22 17:34:00
1,https://www.dailyexcelsior.com/bangladesh-repa...,Bangladesh repatriates 330 Myanmar soldiers,Bangladesh on Thursday repatriated 330 of Myan...,,,,,Daily Excelsior,bangladesh,2024-02-15 17:34:00,2024-02-15 18:04:00
2,https://www.dailyexcelsior.com/suspend-trans-s...,Suspend trans-shipment of Bangladesh export ca...,Apparel exporters body AEPC on Thursday urged ...,,,,,Daily Excelsior,bangladesh,2024-02-15 11:39:00,2024-02-15 12:09:00
3,https://www.dailyexcelsior.com/india-is-bangla...,India is Bangladesh’s largest export destinati...,India is today Bangladesh’s largest export des...,,,,,Daily Excelsior,bangladesh,2024-02-13 15:42:00,2024-02-13 16:12:00
4,https://www.dailyexcelsior.com/india-banglades...,India-Bangladesh ties role model for neighbour...,India’s decision to fence its border with Myan...,,,,,Daily Excelsior,bangladesh,2024-02-08 18:55:00,2024-02-08 19:25:00
