In [1]:
from bs4 import BeautifulSoup
import requests
import re
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains 
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from datetime import datetime, timedelta
import time
import os
import pandas as pd


#COLLECTING THE LINKS

BASE_URL = 'https://www.firstpost.com/'
country= 'bangladesh'
initial_url = f'{BASE_URL}tag/{country}/'

driver = webdriver.Chrome()
driver.get(initial_url)
driver.maximize_window()

time.sleep(5)

show_more_button = driver.find_element(By.XPATH, '/html/body/div[1]/section/div[7]/div/div/div[4]/a')


driver.execute_script("arguments[0].scrollIntoView();", show_more_button)
time.sleep(5)

number_of_clicks = 1

while number_of_clicks > 0:

    ActionChains(driver)\
    .click(show_more_button)\
    .perform()
    
    number_of_clicks -= 1
    time.sleep(5)
    driver.execute_script("arguments[0].scrollIntoView();", show_more_button)
    time.sleep(5)

soup = BeautifulSoup(driver.page_source, 'html.parser')

all_link_divs = soup.find_all('li', class_ = 'jsx-94b310a290904418 str-lst')

all_news_links = []

for each_div in all_link_divs:
    
    all_news_links.append(each_div.find('a').get('href'))


driver.quit()
    
#SCRAPING ALL NEWS LINKS

counter = 0
data_list = []


for link in all_news_links:
    
    url = f'{BASE_URL}{link}'
    
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    #TITLE
    
    title_tag = soup.find('h1', class_ = 'art-sec-ttl literatafont')
    title = title_tag.text if title_tag else 'Title not found'
    title_translation = 'None'
    
    #AUTHOR
    
    author_div = soup.find('div', class_ = 'art-dtls-info')
    author = author_div.find('a').text
    
    
    #DATE
    
    date_info = soup.find('meta', {'itemprop':'dateModified'})
    
    if date_info:
        
        date_data = date_info.get('content')
        
        only_date = date_data.split('T')[0]
        only_time = date_data.split('T')[1]
        time = f"{only_time.split(':')[0]}:{only_time.split(':')[1]}"
        cleaned_date = f"{only_date},{time}"

        source_localtime = datetime.strptime(cleaned_date, "%Y-%m-%d,%H:%M")
        bangladesh_localtime = source_localtime + timedelta(minutes=30)
        
    else:
        
        date_data = 'Date data not found'
        
    #CONTENT SUMMARY
    
    content_summary_tag = soup.find('span', class_ = 'less-cont')
    content_summary = content_summary_tag.text if content_summary_tag else 'Content Summary not found'
    
    summary_translation = 'None'
    
    #CONTENT
    
    content = []

    main_content = soup.find('div', class_ = 'main-dtls-wrap max-dtls-width')

    if main_content:
    
        all_paras = main_content.find_all('p')

        all_paras.pop()

        all_paras.pop(0)

        for each_para in all_paras:

            content.append(each_para.text)

        full_content = ''.join(content)


        full_content = re.sub("\x80|\x9d|\x99|\x9c\x9a",'',full_content)
    
    else:
        
        full_content = 'Content not found'
        
    content_translation = 'None'
    
    data_dict = {
            "url": url,
            "title": title,
            "content": full_content,
            "content_summary": content_summary,
            "title_translation":title_translation,
            "content_translation":content_translation,
            "summary translation":summary_translation,
            "author": author,
            "country": country,
            'source_localtime': source_localtime,
            'bangladesh_localtime': bangladesh_localtime

        }

    counter+=1


    if (date_data != "Date Data Not Found" and full_content != "Content Not Found" and content_summary != "Content summary not found"):
        if data_dict not in data_list:
            # Adding to data list
            data_list.append(data_dict)
    else:
        print(counter)
        print('Skipped due to missing info.')

df = pd.DataFrame(data_list)
df.head()

    
    
csv_filename = f"{country}_First_Post.csv"

# Checking if the CSV file already exists
if os.path.exists(csv_filename):
    existing_df = pd.read_csv(csv_filename)
    # Merging new and existing dataframe
    df = pd.concat([existing_df, pd.DataFrame(data_list)], ignore_index=True)
    df["date"] = pd.to_datetime(df["bangladesh_localtime"])  # Converting the "date" column to datetime
    df = df.drop_duplicates(subset=["title"], keep="first")
    df = df.sort_values(by="bangladesh_localtime", ascending=False)  # Sorting the date
    df = df.reset_index(drop=True)
    df.to_csv(csv_filename, index=False)  
else:
    # If csv file does not exist, then we create a new CSV file with the scraped data
    df = pd.DataFrame(data_list)
    df["date"] = pd.to_datetime(df["bangladesh_localtime"], format = "%d-%m-%Y")  # Converting the "date" column to datetime
    df = df.sort_values(by="bangladesh_localtime", ascending=False)
    df = df.drop_duplicates(subset=["title"], keep="first")
    df = df.reset_index(drop=True)
    df.to_csv(csv_filename, index=False)        
        

Unnamed: 0,url,title,content,content_summary,title_translation,content_translation,summary translation,author,country,source_localtime,bangladesh_localtime
0,https://www.firstpost.com//explainers/explaine...,Explained: Why Bangladesh court halted adoptio...,Bangladesh has halted adoption of its wild ele...,The Bangladesh High Court on Sunday granted le...,,,,FP Explainers,bangladesh,2024-02-26 18:27:00,2024-02-26 18:57:00
1,https://www.firstpost.com//india/after-demand-...,"After demand from Assam's Barak Valley, Bangla...",A Bangladesh visa centre is set to come up in ...,A Bangladesh visa centre is set to come up in ...,,,,FP Staff,bangladesh,2024-02-19 14:10:00,2024-02-19 14:40:00
2,https://www.firstpost.com//world/bangladesh-re...,Bangladesh releases opposition leaders jailed ...,Two prominent opposition figures in Bangladesh...,The BNP and dozens of other parties boycotted ...,,,,FP Staff,bangladesh,2024-02-15 19:46:00,2024-02-15 20:16:00
3,https://www.firstpost.com//world/bangladesh-wo...,"Bangladesh won't let in any more Rohingya, the...",Bangladesh declared on Wednesday that it will ...,Since the Rohingya are considered foreign intr...,,,,Ajeyo Basu,bangladesh,2024-02-07 22:00:00,2024-02-07 22:30:00
4,https://www.firstpost.com//world/myanmar-borde...,Myanmar border guards flee to Bangladesh as fi...,Officials in Bangladesh on Monday said that ov...,Officials in Bangladesh on Monday said that ov...,,,,FP Staff,bangladesh,2024-02-05 18:16:00,2024-02-05 18:46:00
