In [46]:
import os
import pandas as pd
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains 
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta
import re

page = 1
BASE_URL = "https://assamtribune.com/"
country = "bangladesh"
initial_url = f'{BASE_URL}search?search={country}&search_type=all&page={page}'

#COLLECTING ALL NEWS LINKS

all_news_links = []

#THIS SCRIPT VISITS THE PAGE, CHECKS IF THE 'LINK DIV ELEMENT' EXISTS, AND STORES THE LINKS. 

#IF LINK DIV ELEMENT DOESN'T EXIST, THAT MEANS THE PAGE DOESN'T EXIST, AND THAT'S THE LAST PAGE IN THE WEBSITE REGARDING
#BANGLADESH.

#TAKING THE LINKS FROM THE FIRST 3 PAGES HERE

for page in range(1,4):
    
    initial_url = f'{BASE_URL}search?search={country}&search_type=all&page={page}'

    response = requests.get(initial_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    all_link_divs = soup.find_all('div', class_ = 'hocal-post-item')
    
    if len(all_link_divs) == 0:
        
        print(f'No links in page {page}')
        print(f'Total Number of Pages in the website regarding Bangladesh is {page - 1}')
        break

    for each_link_div in all_link_divs:
        all_news_links.append(each_link_div.find('a').get('href'))


#SCRAPING EACH NEWS LINK

counter = 0
data_list = []

#SCRAPING THE FIRST 11 LINKS

for link in all_news_links[:11]:

        url = f'{BASE_URL}{link}'

        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')


        #Title

        title_tag = soup.find('h1', class_ = 'hocal-title')
        title = title_tag.text if title_tag else 'Title not found'
        title_translation = 'None'

        #Author
        author_tag = soup.find('a', class_ = 'author')
        author = author_tag.text if author_tag else 'Author not found'
        author = author.split('By ')[1]

        #Date 

        date_info = soup.find('span', class_ = 'convert-to-localtime')

        if date_info:

            date_data = date_info.text.split(' GMT')[0]

            source_localtime = datetime.strptime(date_data, '%d %b %Y %H:%M %p') + timedelta(hours = 5, minutes = 30)
            bangladesh_localtime = source_localtime + timedelta(minutes = 30)

        else:
            date_data = 'Date data not found'

        #No CONTENT SUMMARIES IN THIS WEBSITE
        content_summary = 'None'
        summary_translation = 'None'

        #Content

        content = []

        main_div = soup.find('div', class_ = 'hb-entry-main-content dropcap wow fadeIn animated')

        if main_div:

                all_paras = main_div.find_all('p')

                for para in all_paras:
                    content.append(para.text)

                full_content = ''.join(content)

        else:
                full_content = 'Content not found'

        content_translation = 'None'
        
        
        data_dict = {
            "url": url,
            "title": title,
            "content": full_content,
            "content_summary": content_summary,
            "title_translation":title_translation,
            "content_translation":content_translation,
            "summary translation":summary_translation,
            "author": author,
            "country": country,
            'source_localtime': source_localtime,
            'bangladesh_localtime': bangladesh_localtime

        }

        counter+=1


        if (full_content != "Content Not Found" and title != 'Title not found'):

                if data_dict not in data_list:
                        # Adding to data list
                        data_list.append(data_dict)
                        print(f'Link {counter} added')
        else:
                print(f'Link {counter}')
                print('Skipped due to missing info.')
                

df = pd.DataFrame(data_list)
df.head()

csv_filename = f"{country}_The_Assam_Tribune.csv"

# Checking if the CSV file already exists
if os.path.exists(csv_filename):
    existing_df = pd.read_csv(csv_filename)
    # Merging new and existing dataframe
    df = pd.concat([existing_df, pd.DataFrame(data_list)], ignore_index=True)
    df["bangladesh_localtime"] = pd.to_datetime(df["bangladesh_localtime"])  # Converting the "date" column to datetime
    df = df.drop_duplicates(subset=["title"], keep="first")
    df = df.sort_values(by="date", ascending=False)  # Sorting the date
    df = df.reset_index(drop=True)
    df.to_csv(csv_filename, index=False)  
else:
    # If csv file does not exist, then we create a new CSV file with the scraped data
    df = pd.DataFrame(data_list)
    df = df.sort_values(by="bangladesh_localtime", ascending=False)
    df = df.drop_duplicates(subset=["title"], keep="first")
    df = df.reset_index(drop=True)
    df.to_csv(csv_filename, index=False)

Link 1 added
Link 2 added
Link 3 added
Link 4 added
Link 5 added
Link 6 added
Link 7 added
Link 8 added
Link 9 added
Link 10 added
Link 11 added


Unnamed: 0,url,title,content,content_summary,title_translation,content_translation,summary translation,author,country,source_localtime,bangladesh_localtime
0,https://assamtribune.com//assam/suspected-catt...,Suspected cattle smuggler killed in BSF firing...,"Dhubri, Feb 22: The Border Security Force (BSF...",,,,,Correspondent,bangladesh,2024-02-22 14:35:00,2024-02-22 15:05:00
1,https://assamtribune.com//assam/bangladesh-hig...,Bangladesh High Commission to open visa centre...,"Silchar, Feb 19: In a major development to fac...",,,,,Staff Correspondent,bangladesh,2024-02-19 13:16:00,2024-02-19 13:46:00
2,https://assamtribune.com//national/6-nlft-mili...,"6 NLFT militants flee their Bangladeshi camps,...","Agartala, Feb 10: Six hardcore cadres of the b...",,,,,IANS,bangladesh,2024-02-10 11:18:00,2024-02-10 11:48:00
3,https://assamtribune.com//assam/assam-banglade...,Assam: Bangladeshi national gets 3-year jail t...,"Dhubri, Feb 2: A Bangladeshi national was sent...",,,,,Correspondent,bangladesh,2024-02-02 15:57:00,2024-02-02 16:27:00
4,https://assamtribune.com//assam/bsf-seizes-64-...,BSF seizes 6.4 tonnes of sugar on Indo-Banglad...,"Dhubri, Jan 15: The Border Security Force (BSF...",,,,,Correspondent,bangladesh,2024-01-15 10:26:00,2024-01-15 10:56:00
