In [6]:
import os
import re
import time
from datetime import datetime, timedelta

import pandas as pd
import requests
import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    NoSuchElementException,
    WebDriverException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


# Root URL of the site being scraped. It already ends with "/", so paths are
# appended to it without a leading slash.
BASE_URL = "https://www.sangritoday.com/"
# Search term interpolated into the search-result URLs built from BASE_URL.
country = "bangladesh"
# "about" page URL for the search term. Built without a leading "/" so the
# result does not contain a doubled "//" after the host name.
initial_url = f"{BASE_URL}about/{country}"


#FINDING TOTAL NUMBER OF PAGES AVAILABLE
# Walk the search-result pages one by one until the pagination bar shows that
# the current page is the last one; the result is stored in `total_pages`.
page = 1
while True:

    url = f'{BASE_URL}search?q={country}&page={page}'
    # A timeout keeps a stalled connection from hanging the run forever, and
    # raise_for_status() stops on an HTTP error instead of parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    page_buttons = soup.find('ul', class_ = 'pagination justify-content-end')
    # find() returns None when the page has no pagination bar at all.
    all_lists = page_buttons.find_all('li') if page_buttons else []

    # On the last page of a topic the 'active' (current page) button is the
    # final item of the button list. On every other page it is followed by
    # further buttons, such as the arrow buttons. That is how the number of
    # pages is detected on this website.
    # A page without any pagination buttons is treated as the last page, and
    # an <li> without a class attribute is treated as "not active", so neither
    # case raises.
    if not all_lists or 'active' in (all_lists[-1].get('class') or []):
        total_pages = page
        break

    page += 1
     
#SCRAPING THE LINKS
# Collect the article URLs listed on the search-result pages into
# `news_url_links`.

news_url_links = []

# One browser is shared by every page and is always closed in `finally`.
# Launching a browser per page and skipping a page with `continue` would jump
# over driver.quit() and leave a Chrome process running.
driver = webdriver.Chrome()

try:

    # Only result pages 1 and 2 are scraped.
    for page in range(1,3):

        url = f'{BASE_URL}search?q={country}&page={page}'
        driver.get(url)

        # Fixed pause before the page is queried.
        time.sleep(10)

        try:
            main_row = driver.find_element(By.XPATH, '/html/body/section/div/div/div[1]/div[2]')
            # find_elements returns an empty list (it does not raise) when
            # nothing matches, so an empty page simply contributes no links.
            news_divs = main_row.find_elements(By.CLASS_NAME, 'title')
        except WebDriverException:
            # The results container could not be located: skip this page.
            continue

        for div in news_divs:

            try:
                link = div.find_element(By.CSS_SELECTOR, 'a')
                href = link.get_attribute('href')
            except WebDriverException:
                # A 'title' element without a usable anchor is ignored.
                continue

            # get_attribute returns None for a missing href; such an entry
            # cannot be fetched later, so it is not recorded.
            if href:
                news_url_links.append(href)

finally:
    driver.quit()

#SCRAPING THE NEWS ARTICLES
# Download each collected article and extract title, author, modification
# time, body text and summary. Complete records are appended to `data_list`.

# Value stored in the "country" field of every record.
# NOTE: this rebinds the module-level `country`, which held the search term up
# to this point. It is assigned once, before the loop, so the value no longer
# depends on at least one link having been collected.
country = 'India'

counter = 0
data_list = []

#taking the first 10 links.
# `counter` is the position of the current link and is printed when an article
# is skipped.

for counter, link in enumerate(news_url_links[:10]):

    try:
        # The timeout keeps one stalled download from blocking the whole run.
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.RequestException as error:
        print(counter)
        print(f'Skipped due to request error: {error}')
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    main_section = soup.find('section', class_ = 'section section-page')

    if main_section is None:
        # Without the article section none of the fields below can be read.
        print(counter)
        print('Skipped due to missing info.')
        continue

    #TITLE

    title_tag = main_section.find('h1', class_ = 'post-title')
    title = title_tag.text if title_tag else 'Title not found'

    #AUTHOR
    # Both the wrapper div and the span inside it may be missing, so each is
    # checked before it is dereferenced.

    author_main_div = main_section.find('div', class_ = 'item-meta item-meta-author')
    author_span = author_main_div.find('span') if author_main_div else None
    author = author_span.text if author_span else 'Author not found'

    #DATE

    meta_date_element = soup.find('meta', {'property':'article:modified_time'})
    date_data = meta_date_element.get('content') if meta_date_element else None

    try:
        source_localtime = datetime.strptime(date_data, "%Y-%m-%d %H:%M:%S") if date_data else None
    except ValueError:
        # A timestamp in an unexpected format is treated like a missing date.
        source_localtime = None

    #CONTENT

    content_div = main_section.find('div', class_ = 'post-text mt-4')
    all_paras = content_div.find_all('p') if content_div else []

    if all_paras:
        full_content = ' '.join(para.text for para in all_paras)
        full_content = re.sub('\n|\r', '',full_content)
    else:
        full_content = 'Content Not Found'

    #CONTENT SUMMARY

    content_summary_tag = main_section.find('h2', class_= 'post-summary')
    content_summary = content_summary_tag.text if content_summary_tag else 'Content summary not found'
    content_summary = re.sub('\n|\r', '',content_summary)
    content_summary = content_summary.strip()

    # A record is kept only when date, content and summary were all found.
    # This is decided before the record is built, so a record can never pick
    # up time values left over from a previous article.
    if (source_localtime is None or full_content == "Content Not Found" or content_summary == "Content summary not found"):
        print(counter)
        print('Skipped due to missing info.')
        continue

    # NOTE(review): the +30 minutes is presumably the India -> Bangladesh
    # offset (UTC+5:30 -> UTC+6:00); this assumes the source timestamp is
    # Indian local time - confirm.
    bangladesh_localtime = source_localtime + timedelta(minutes=30)

    # Translation fields are placeholders; they are stored as the string 'None'.
    title_translation = 'None'
    summary_translation = 'None'
    content_translation = 'None'

    data_dict = {
            "url": link,
            "title": title,
            "content": full_content,
            "content_summary": content_summary,
            "title_translation":title_translation,
            "content_translation":content_translation,
            "summary translation":summary_translation,
            "author": author,
            "country": country,
            'source_localtime': source_localtime,
            'bangladesh_localtime': bangladesh_localtime

        }

    # Identical records are stored only once.
    if data_dict not in data_list:
        data_list.append(data_dict)



# Merge the freshly scraped records with any previously saved CSV, drop
# duplicate titles, sort newest first and write the result back to disk.
df = pd.DataFrame(data_list)

csv_filename = f"{country}_Sangri_Today.csv"

# Columns that hold timestamps in the scraped records.
time_columns = ["source_localtime", "bangladesh_localtime"]

# Checking if the CSV file already exists
if os.path.exists(csv_filename):
    existing_df = pd.read_csv(csv_filename)

    # Timestamps come back from the CSV as plain strings. They are converted
    # before merging, because sorting a column that mixes strings and
    # datetimes raises TypeError. Unparseable values become NaT.
    for column in time_columns:
        if column in existing_df.columns:
            existing_df[column] = pd.to_datetime(existing_df[column], errors="coerce")

    # Merging new and existing dataframe (existing rows first, so that an
    # already saved row wins over a newly scraped duplicate).
    frames = [existing_df] if df.empty else [existing_df, df]
    df = pd.concat(frames, ignore_index=True)

if df.empty or "bangladesh_localtime" not in df.columns:
    # Nothing was scraped and there is nothing usable on disk: writing would
    # fail on the missing time column, so the file is left untouched.
    print('No articles to save.')
else:
    # "date" is a datetime copy of the Bangladesh local time. The values are
    # already datetimes here, so no explicit format string is passed.
    df["date"] = pd.to_datetime(df["bangladesh_localtime"])
    # Both the "existing file" and the "new file" path now use the same order:
    # drop duplicate titles first (keeping the first occurrence), then sort.
    df = df.drop_duplicates(subset=["title"], keep="first")
    df = df.sort_values(by="bangladesh_localtime", ascending=False)  # Newest first
    df = df.reset_index(drop=True)
    df.to_csv(csv_filename, index=False)

Unnamed: 0,url,title,content,content_summary,title_translation,content_translation,summary translation,author,country,source_localtime,bangladesh_localtime
0,https://www.sangritoday.com/citizenship-amendm...,Citizenship Amendment Act will be implemented ...,The rules of 'Citizenship Amendment Act' (CAA)...,CAA: The Citizenship Amendment Bill was passed...,,,,Muskan Kumawat,India,2024-01-03 14:06:26,2024-01-03 14:36:26
1,https://www.sangritoday.com/rain-spoiled-mitch...,"Rain spoiled Mitchell Santner's hard work, Ban...",These days the New Zealand team is playing the...,NZ vs BAN 3rd T20I: These days New Zealand tea...,,,,Muskan Kumawat,India,2023-12-31 13:57:49,2023-12-31 14:27:49
2,https://www.sangritoday.com/shakib-al-hasan-su...,Shakib Al Hasan suffered a big loss due to wea...,Bangladesh team captain Shakib Al Hasan has re...,Shakib Al Hasan Weak Eye Sight: Bangladesh tea...,,,,Muskan Kumawat,India,2023-12-26 09:53:17,2023-12-26 10:23:17
3,https://www.sangritoday.com/boat-filled-with-1...,Boat filled with 142 Rohingya reached Andaman ...,A boat carrying 142 suspected Rohingya refugee...,Andaman Nicobar: A boat carrying 142 suspected...,,,,Muskan Kumawat,India,2023-12-25 13:03:39,2023-12-25 13:33:39
4,https://www.sangritoday.com/nz-vs-ban-banglade...,"NZ vs BAN: Bangladesh created history in ODI, ...",In the last match of the three-match ODI serie...,NZ vs BAN 3rd ODI: In the last match of the th...,,,,Muskan Kumawat,India,2023-12-23 15:59:08,2023-12-23 16:29:08
