In [2]:
import os
import pandas as pd
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains 
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta
import re


# Search configuration: result pages are requested as
# {BASE_URL}page/<page>/?s=<country>.
page = 1
BASE_URL = 'https://thenewsglory.com/'
country = 'bangladesh'

initial_url = f'{BASE_URL}page/{page}/?s={country}'

#FINDING TOTAL PAGES

# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(initial_url, timeout=30)

soup = BeautifulSoup(response.text, 'html.parser')

total_pages_element = soup.find('div', class_ ='jeg_navigation jeg_pagination jeg_pagenav_1 jeg_aligncenter no_navtext no_pageinfo')

# The page count is whatever follows 'of ' in the pagination <span>
# (presumably text like "Page 1 of 7" -- confirm against the live site).
# When the pagination block, its <span>, or the 'of ' marker is missing,
# fall back to a single page instead of crashing.
pagination_span = total_pages_element.find('span') if total_pages_element else None

if pagination_span and 'of ' in pagination_span.text:
    # Kept as a string, exactly as scraped.
    total_pages = pagination_span.text.split('of ')[1]
else:
    total_pages = '1'

#COLLECTING ALL LINKS 

# Article URLs gathered from the search-result pages, in page order.
all_news_links = []

# Only the first three result pages are crawled.
for page in range(1, 4):

    initial_url = f'{BASE_URL}page/{page}/?s={country}'

    # A timeout keeps one stalled page from blocking the whole run.
    response = requests.get(initial_url, timeout=30)

    soup = BeautifulSoup(response.text, 'html.parser')

    all_link_divs = soup.find_all('article', class_='jeg_post jeg_pl_md_2 format-standard')

    for link_div in all_link_divs:

        # Take the first anchor that actually carries an href. Articles
        # without one are skipped so that no None ends up in the list
        # (it would break the later requests.get(url) call).
        anchor = link_div.find('a', href=True)

        if anchor:
            all_news_links.append(anchor['href'])


#SCRAPING ALL THE LINKS 

# Scrape every collected article URL into one dict per article.
# `counter` is the 1-based position of the article being processed and
# `data_list` accumulates the article dicts that will become DataFrame rows.
counter = 0
data_list = []

for counter, url in enumerate(all_news_links, start=1):

    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # One unreachable article must not discard everything scraped so far.
        print(f'Link {counter}')
        print(f'Skipped due to request error: {exc}')
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    #TITLE
    title_tag = soup.find('h1', class_='jeg_post_title')
    title = title_tag.text if title_tag else 'Title not found'
    title_translation = 'None'

    #AUTHOR
    author_div = soup.find('div', class_='jeg_meta_author')
    # The author block may exist without a link inside it.
    author_link = author_div.find('a') if author_div else None
    author = author_link.text if author_link else 'Author not found'

    #DATE
    # Reset for every article so a page without a usable timestamp never
    # inherits the previous article's values (or hits an unbound name on
    # the first iteration).
    source_localtime = None
    bangladesh_localtime = None

    date_info = soup.find('meta', {'property': 'article:published_time'})
    date_data = date_info.get('content') if date_info else None

    if date_data:
        try:
            # Keep only "YYYY-MM-DD" and "HH:MM"; seconds and any UTC offset
            # that follow are dropped.
            only_date, only_time = date_data.split('T')[:2]
            hours, minutes = only_time.split(':')[:2]
            source_localtime = datetime.strptime(
                f"{only_date},{hours}:{minutes}", "%Y-%m-%d,%H:%M"
            )
            # NOTE(review): the +30 min shift presumably converts the site's
            # local time (IST, UTC+5:30) to Bangladesh time (UTC+6) -- confirm.
            bangladesh_localtime = source_localtime + timedelta(minutes=30)
        except ValueError:
            # Unexpected timestamp layout: both time fields stay None.
            print(f'Link {counter}: unparseable publish time {date_data!r}')
    else:
        date_data = 'Date data not found'

    #NO CONTENT SUMMARIES IN THIS WEBSITE
    content_summary = 'None'
    summary_translation = 'None'

    #CONTENT
    content_div = soup.find('div', class_='content-inner')

    if content_div:
        # Paragraph texts are concatenated without a separator.
        content = [each_para.text for each_para in content_div.find_all('p')]
        full_content = ''.join(content)
    else:
        content = []
        full_content = 'Content not found'

    content_translation = 'None'

    data_dict = {
        "url": url,
        "title": title,
        "content": full_content,
        "content_summary": content_summary,
        "title_translation": title_translation,
        "content_translation": content_translation,
        "summary translation": summary_translation,
        "author": author,
        "country": country,
        'source_localtime': source_localtime,
        'bangladesh_localtime': bangladesh_localtime,
    }

    # An article is kept when at least one of title/content was found; it is
    # skipped only when both are missing.
    if (full_content != "Content not found" or title != 'Title not found'):
        if data_dict not in data_list:
            # Adding to data list
            data_list.append(data_dict)
    else:
        print(f'Link {counter}')
        print('Skipped due to missing info.')


# Build a DataFrame from the scraped rows and persist it to a per-country CSV,
# merging with any CSV left by an earlier run.
df = pd.DataFrame(data_list)
df.head()

csv_filename = f"{country}_The_News_Glory.csv"

if df.empty:
    # With no rows there are no columns to sort or de-duplicate on, so leave
    # whatever is on disk untouched instead of failing with a KeyError.
    print('No articles scraped; CSV left unchanged.')
else:
    # Checking if the CSV file already exists
    if os.path.exists(csv_filename):
        existing_df = pd.read_csv(csv_filename)
        # Merging new and existing dataframe
        df = pd.concat([existing_df, df], ignore_index=True)
        # Rows read back from the CSV hold strings; normalise the column to
        # datetimes so old and new rows sort together.
        df["bangladesh_localtime"] = pd.to_datetime(df["bangladesh_localtime"])
        # Existing rows come first in the concat, so keep="first" prefers the
        # already-saved copy of a title.
        df = df.drop_duplicates(subset=["title"], keep="first")
        # Sort on the column that actually exists (there is no "date" column).
        df = df.sort_values(by="bangladesh_localtime", ascending=False)
    else:
        # If csv file does not exist, then we create a new CSV file with the scraped data
        df = df.sort_values(by="bangladesh_localtime", ascending=False)
        df = df.drop_duplicates(subset=["title"], keep="first")

    df = df.reset_index(drop=True)
    df.to_csv(csv_filename, index=False)

Unnamed: 0,url,title,content,content_summary,title_translation,content_translation,summary translation,author,country,source_localtime,bangladesh_localtime
0,https://thenewsglory.com/bangladesh-mall-fire-...,Bangladesh mall fire: 43 dead; Many were seri...,Dhaka: 43 people were killed in a fire at a 6-...,,,,,The News Glory,bangladesh,2024-03-01 10:39:00,2024-03-01 11:09:00
1,https://thenewsglory.com/bangladesh-election-n...,Bangladesh election not free and fair: US,The US demanded an investigation into the viol...,,,,,The News Glory,bangladesh,2024-01-09 09:12:00,2024-01-09 09:42:00
2,https://thenewsglory.com/parliamentary-electio...,Parliamentary Elections in Bangladesh Tomorrow...,Dhaka: The 12th parliamentary election is goin...,,,,,The News Glory,bangladesh,2024-01-06 18:40:00,2024-01-06 19:10:00
3,https://thenewsglory.com/bangladesh-court-sent...,Bangladesh court sentences Nobel laureate Muha...,A Bangladesh court on Monday sentenced Nobel P...,,,,,The News Glory,bangladesh,2024-01-01 16:55:00,2024-01-01 17:25:00
4,https://thenewsglory.com/bangladesh-beat-new-z...,Bangladesh beat New Zealand for the first time...,Napier: New Zealand were bowled out for 98 run...,,,,,The News Glory,bangladesh,2023-12-24 09:05:00,2023-12-24 09:35:00
