In [89]:
import os
import pandas as pd
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta
import re


BASE_URL = "https://www.telegraphindia.com/"
country = "bangladesh"
initial_url = f"{BASE_URL}search?search-term={country}&page=0"

#FINDING THE NUMBER OF PAGES

#THE PAGE COUNT STARTS FROM '0' IN THIS WEBSITE 

#WE WILL TAKE ONE PAGE MORE THAN WHAT IS SCRAPED. THAT'S BECAUSE THE PENULTIMATE PAGE DOESN'T HAVE THE 'NEXT PAGE' BUTTON.

#FOR DEMONSTRATION, TAKING ONLY FIRST 13 PAGES. AS OF THE LAST TIME I CHECKED, THE WEBSITE HAD 207 PAGES FOR BANGLADESH NEWS. 

pages = 0
for page in range(0,13):
    
    initial_url = f"{BASE_URL}search?search-term={country}&page={page}"

    response = requests.get(initial_url)
    soup = BeautifulSoup(response.text, 'html.parser')


    pagination_box = soup.find('div', class_ = 'paginationbox mt-80')

    next_page_button = pagination_box.find_all('a', {'class':'nxtpvr'})

    prev_page_button = pagination_box.find('a', {'class':'nxtpvr', 'aria-label':'Previous'})


    if pagination_box:
        
        if len(next_page_button) > 1:

            pages+=1

        elif len(next_page_button) == 1:

            if next_page_button[0] != prev_page_button:

                pages+=1    

    else:
        
        break


print(f'Total number of pages taken is {pages}')

#THIS LINE SHOULD ONLY BE EXECUTED IF THE AIM IS TO SCRAPE ALL THE PAGES OF THE WEBSITE AVAILABLE FOR BANGLADESH.

# pages+=1


#SCRAPING ALL THE NEWS LINKS 

#taking only first page links for testing

initial_url = f"{BASE_URL}search?search-term={country}&page=0"

response = requests.get(initial_url)
soup = BeautifulSoup(response.text, 'html.parser')

news_unordered_list = soup.find_all('ul','storylisting')
news_lists = []


#THERE ARE TWO UNORDERED LISTS THAT CONTAIN NEWS LINKS IN THIS WEBSITE.

#EACH UNORDERED LIST IS FIRST SCRAPED. THE NEWS INSIDE THE UNORDERED LISTS ARE PRESENT AS ORDERED LISTS.

#EACH ORDERED LIST IS THEN SCRAPED. AFTER THAT, LINKS FROM EACH LIST IS TAKEN.

#AN ISSUE -> FOR SOME REASON, WHEN THE UNORDERED LISTS ARE ITERATED TO APPEND NEWS LISTS, A '-1' IS 
#APPENDED FOR ALMOST EACH ELEMENT ADDED TO THE 'EACH LIST' VARIABLE. SINCE BeautifulSoup FUNCTIONS DON'T WORK ON INTEGERS, 
#THE TRY&EXCEPT METHOD IS USED TO REMOVE THE ERROR.

for each_list in news_unordered_list:
    
    each_list.find_all('li')
    
    for link in each_list:
        
        try:
            news_lists.append(link.find('a').get('href'))
            
        except AttributeError:
            pass
        
        

counter = 0
data_list = []


#IN THIS WEBSITE, THE WEBPAGES HAVE TWO FORMATS. ONE FORMAT IS THE NORMAL ONE, THE OTHER FORMAT IS PRESENT IN ALL WEBPAGES THAT 
#CONTAIN THE WORDS 'MY-KOLKATA' IN THE URL. SO, EACH FORMAT IS CONSIDERED FOR SCRAPING PURPOSES. 

for link in news_lists:
    
    initial_url = f"{BASE_URL}{link}"
#     print(initial_url)

    response = requests.get(initial_url)

    soup = BeautifulSoup(response.text, 'html.parser')
    country = 'Bangladesh'

    if 'my-kolkata' not in initial_url:

        title_tag = soup.find('div', class_ = 'articletsection pt-40')
        website_type = 'normal'

    elif 'my-kolkata' in initial_url:

        title_tag = soup.find('meta', {'property':'og:title'})
        website_type = 'my_kolkata'

    else:

        website_type = None 


    if website_type == 'normal':

        title = title_tag.find('h1').text 
        title_translation = 'None'

        content_summary_tag = title_tag.find('h2', class_= 'mt-24')
        content_summary = content_summary_tag.text if content_summary_tag else 'Content Summary not found'
        summary_translation = 'None'


        date_info = title_tag.find('div', class_ = 'publishdate mt-32')

        if date_info:

            date_data = date_info.text.split('Published')[1].strip()

            source_localtime = datetime.strptime(date_data,'%d.%m.%y, %I:%M %p')
            bangladesh_localtime = source_localtime + timedelta(minutes = 30)

            author = date_info.text.split('\n',2)[1]

        else:

            date_data = 'Date data not found'

        content = []
        content_div = soup.find('article', {'id':'contentbox'})

        if content_div:

            all_paras = content_div.find_all('p') 

            for para in all_paras:

                content.append(para.text)

            full_content = ''.join(content)

        else:
            content = 'Content not found'
        
        content_translation = 'None'

    elif website_type == 'my_kolkata':

        title = title_tag.get('content') 
        title_translation = 'None'
        
        content_summary_tag = soup.find('meta', {'property':'og:description'})
        content_summary = content_summary_tag.get('content') if content_summary_tag else 'Content Summary not found'
        summary_translation = 'None'

        author_date_tags = soup.find('div',class_ = 'enpublicdate mt24 dfjsb aic')

        if author_date_tags:

            author = author_date_tags.find('span').text

            date_data = author_date_tags.text.split('Published')[1].strip()
            source_localtime = datetime.strptime(date_data,'%d.%m.%y, %I:%M %p')

            bangladesh_localtime = source_localtime + timedelta(minutes = 30)

        else:
            date_data = 'Date data not found'

        content = []
        content_div = soup.find('article', class_ = 'articlecontentbox')

        if content_div:

            all_paras = content_div.find_all('p') 

            for para in all_paras:

                content.append(para.text)

            full_content = ''.join(content)

        else:
            content = 'Content not found'
        
        content_translation = 'None'

    elif website_type == None:

        title = 'Title not found'
        content_summary = 'Content Summary not found'
        content = 'Content not found'



    data_dict = {
        "url": initial_url,
        "title": title,
        "content": full_content,
        "content_summary": content_summary,
        "title_translation":title_translation,
        "content_translation":content_translation,
        "summary translation":summary_translation,
        "author": author,
        "country": country,
        'source_localtime': source_localtime,
        'bangladesh_localtime': bangladesh_localtime

    }

    counter+=1


    if (full_content != "Content Not Found" and content_summary != 'Content summary not found'):

            if data_dict not in data_list:
                    # Adding to data list
                    data_list.append(data_dict)
                    print(f'Link {counter} added')
    else:
            print(f'Link {counter}')
            print('Skipped due to missing info.')
            

df = pd.DataFrame(data_list)
df.head()

           

csv_filename = f"{country}_Telegraph_India.csv"

# Checking if the CSV file already exists
if os.path.exists(csv_filename):
    existing_df = pd.read_csv(csv_filename)
    # Merging new and existing dataframe
    df = pd.concat([existing_df, pd.DataFrame(data_list)], ignore_index=True)
    df["date"] = pd.to_datetime(df["bangladesh_localtime"])  # Converting the "date" column to datetime
    df = df.drop_duplicates(subset=["title"], keep="first")
    df = df.sort_values(by="bangladesh_localtime", ascending=False)  # Sorting the date
    df = df.reset_index(drop=True)
    df.to_csv(csv_filename, index=False)  
else:
    # If csv file does not exist, then we create a new CSV file with the scraped data
    df = pd.DataFrame(data_list)
    df = df.sort_values(by="bangladesh_localtime", ascending=False)
    df = df.drop_duplicates(subset=["title"], keep="first")
    df = df.reset_index(drop=True)
    df.to_csv(csv_filename, index=False)


Total number of pages taken is 13
Link 1 added
Link 2 added
Link 3 added
Link 4 added
Link 5 added
Link 6 added
Link 7 added
Link 8 added
Link 9 added
Link 10 added
Link 11 added
Link 12 added
Link 13 added
Link 14 added
Link 15 added
Link 16 added
Link 17 added
Link 18 added
Link 19 added
Link 20 added


Unnamed: 0,url,title,content,content_summary,title_translation,content_translation,summary translation,author,country,source_localtime,bangladesh_localtime
0,https://www.telegraphindia.com//north-east/ban...,Bangladesh visa centre in Assam's Silchar soon...,A Bangladesh visa centre will be opened here s...,'A proposal to start a border market at Harina...,,,,PTI,Bangladesh,2024-02-19 13:58:00,2024-02-19 14:28:00
1,https://www.telegraphindia.com//west-bengal/ea...,Eastern Himalaya Travel & Tour Operators’ Asso...,The Eastern Himalaya Travel & Tour Operators’ ...,Footfall of tourists from Bangladesh had incre...,,,,Avijit Sinha,Bangladesh,2024-02-19 11:04:00,2024-02-19 11:34:00
2,https://www.telegraphindia.com//gallery/news-o...,News of the day: Three Bengal ministers visit ...,The Eastern Himalaya Travel & Tour Operators’ ...,Here are the latest developments from India an...,,,,Our Web Desk,Bangladesh,2024-02-18 17:34:00,2024-02-18 18:04:00
3,https://www.telegraphindia.com//sports/cricket...,Bangladesh pacer Mustafizur Rahman hospitalise...,Experienced left-arm pacer Mustafizur Rahman w...,The accident occurred while they were particip...,,,,PTI,Bangladesh,2024-02-18 15:48:00,2024-02-18 16:18:00
4,https://www.telegraphindia.com//my-kolkata/eve...,10th Indo-Bangla Cross Border International Cy...,"On February 15, 13 cyclists began their journe...",Thirteen cyclists will travel 350km over seven...,,,,Debrup Chaudhuri,Bangladesh,2024-02-17 15:40:00,2024-02-17 16:10:00
