In [77]:
import os
import pandas as pd
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta
import re


# Search configuration: the site root and the keyword whose search results are scraped.
BASE_URL = "https://theprint.in/"
country = "bangladesh"
# URL of the first page of the site's search results for the keyword.
initial_url = "{}?s={}".format(BASE_URL, country)




# COUNT HOW MANY SEARCH-RESULT PAGES EXIST FOR THE KEYWORD.
# Very time consuming on a full run, but it works.
# There are 500 pages regarding Bangladesh, at the time of scraping.
#
# The probe is capped at MAX_PAGES_TO_PROBE while testing; raise the cap
# (or switch to an unbounded `while True` loop) for a full run.
MAX_PAGES_TO_PROBE = 3

# After the loop, `pages - 1` is the number of probed pages that showed a pagination bar.
pages = 1
for page_number in range(1, MAX_PAGES_TO_PROBE + 1):

    # BASE_URL already ends with '/', so strip it to avoid a '//page/...' path.
    url_to_find_page = f"{BASE_URL.rstrip('/')}/page/{page_number}/?s={country}"

    # Without a timeout, requests waits forever on a stalled connection.
    response = requests.get(url_to_find_page, timeout=30)

    soup = BeautifulSoup(response.text, 'html.parser')

    pagination_button = soup.find('div', class_ = 'page-nav td-pb-padding-side')

    # A page without the pagination bar is treated as the end of the results.
    if pagination_button is None:
        break

    # Track the count in its own variable: incrementing the `for` variable
    # itself has no effect because the loop overwrites it on the next pass.
    pages = page_number + 1

#print(f'There are {pages-1} pages regarding {country}')

#SCRAPING ALL NEWS LINKS
# Walks the first three search-result pages and collects the article URL
# found in each result card's title heading.

all_links = []
for pages in range(1,4):

    # BASE_URL already ends with '/', so strip it to avoid a '//page/...' path.
    url_to_find_page = f"{BASE_URL.rstrip('/')}/page/{pages}/?s={country}"
    # Without a timeout, requests waits forever on a stalled connection.
    response = requests.get(url_to_find_page, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    # BeautifulSoup's find() returns None when nothing matches, so each level
    # is checked before descending instead of raising AttributeError.
    main_link_div = soup.find('div', class_= 'td-pb-span8 td-main-content')
    secondary_div = (
        main_link_div.find('div', class_ = 'td-ss-main-content')
        if main_link_div is not None else None
    )

    if secondary_div is None:
        print(f'No results container found on page {pages}; skipping.')
        continue

    all_link_divs = secondary_div.find_all('div', class_ = 'td_module_16 td_module_wrap td-animation-stack')

    for each_link_div in all_link_divs:

        link_holder = each_link_div.find('h3', class_ = 'entry-title td-module-title')
        anchor = link_holder.find('a') if link_holder is not None else None
        link_text = anchor.get('href') if anchor is not None else None

        # Result cards without a usable href are ignored rather than stored as None.
        if link_text:
            all_links.append(link_text)

print(f'Number of links: {len(all_links)}')

      
#SCRAPING THE CONTENT
# For each collected link: download the article, pull out title, summary,
# author, body text and publication time, and append one record per article
# to `data_list`. Articles missing a title or body are reported and skipped.

# Placeholder strings stored when a page lacks the corresponding element.
# The skip check below compares against these exact values.
TITLE_NOT_FOUND = 'Title not found'
CONTENT_NOT_FOUND = 'Content not found'

counter = 0
data_list = []

#SCRAPING THE FIRST 20 LINKS AS A TEST
for link in all_links[:20]:

    counter += 1

    # A timeout keeps one stalled article from hanging the whole run, and a
    # failed download skips only that article instead of aborting the loop.
    try:
        response = requests.get(link, timeout=30)
    except requests.RequestException as error:
        print(f'Link {counter}')
        print(f'Skipped due to request error: {error}')
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    #TITLE

    title_tag = soup.find('h1',class_= 'tdb-title-text')
    title = title_tag.text if title_tag is not None else TITLE_NOT_FOUND
    title_translation = 'None'

    #Content Summary

    content_summary_tag = soup.find('div',class_ = 'td_block_wrap tdb_single_subtitle tdi_67 td-pb-border-top td_block_template_8')
    content_summary = content_summary_tag.text if content_summary_tag is not None else 'Content Summary not found'
    content_summary = content_summary.strip()
    summary_translation = 'None'

    #AUTHOR

    author_tag = soup.find('div',class_ = 'td_block_wrap tdb_single_author tdi_68 td-pb-border-top td_block_template_8 tdb-post-meta')
    author = author_tag.text if author_tag is not None else 'Author not found'
    author = author.strip()

    #Content
    # The body is split across two containers; paragraphs are taken from the
    # excerpt first, then from the main content.

    first_content_tag = soup.find('div',{'id':'postexcerpt'})
    second_content_tag = soup.find('div',{'id':'postcontent'})

    if first_content_tag is not None or second_content_tag is not None:

        content = []
        for content_tag in (first_content_tag, second_content_tag):
            if content_tag is not None:
                content.extend(para.text for para in content_tag.find_all('p'))

        full_content = ''.join(content)
        full_content = full_content.strip()
        full_content = re.sub('\n|\r|\t','', full_content)

        #TO REMOVE UNNECESSARY CONTENT
        # Keep only the text after the first '(PTI)' marker when it is present.
        temp_content = full_content.split('(PTI)',1)
        full_content = temp_content[1] if len(temp_content) > 1 else temp_content[0]

    else:

        full_content = CONTENT_NOT_FOUND

    content_translation = 'None'

    #DATE AND TIME
    # Reset per article so a page without a usable date never inherits the
    # previous article's timestamps (or hits an undefined name on the first one).
    source_localtime = None
    bangladesh_localtime = None

    date_meta = soup.find('meta', {'property':'article:published_time'})
    date_data = date_meta.get('content') if date_meta is not None else None

    if date_data:

        # Expected shape is '<YYYY-MM-DD>T<HH:MM:...>'; keep date plus hour:minute.
        only_date, _, only_time = date_data.partition('T')
        # Named `hour_minute` rather than `time` so the imported `time` module
        # is not overwritten by a string.
        hour_minute = ':'.join(only_time.split(':')[:2])

        try:
            source_localtime = datetime.strptime(f"{only_date},{hour_minute}", "%Y-%m-%d,%H:%M")
        except ValueError:
            # Unparseable value: leave both timestamps as None.
            source_localtime = None

        if source_localtime is not None:
            # NOTE(review): the +30 minutes presumably converts Indian time
            # (UTC+5:30) to Bangladesh time (UTC+6) — confirm the offset
            # carried by the meta tag.
            bangladesh_localtime = source_localtime + timedelta(minutes=30)

    # The key "summary translation" (with a space) is kept as-is because it
    # becomes a column name in the saved CSV.
    data_dict = {
        "url": link,
        "title": title,
        "content": full_content,
        "content_summary": content_summary,
        "title_translation":title_translation,
        "content_translation":content_translation,
        "summary translation":summary_translation,
        "author": author,
        "country": country,
        'source_localtime': source_localtime,
        'bangladesh_localtime': bangladesh_localtime

    }

    # Compare against the same sentinels assigned above; the earlier check
    # used a differently-cased string and therefore never skipped anything.
    if full_content != CONTENT_NOT_FOUND and title != TITLE_NOT_FOUND:

        if data_dict not in data_list:
            # Adding to data list
            data_list.append(data_dict)
            print(f'Link {counter} added')
    else:
        print(f'Link {counter}')
        print('Skipped due to missing info.')

            

# SAVE THE SCRAPED RECORDS
# Merge this run's records with any previously saved CSV, de-duplicate by
# title, sort newest first by Bangladesh local time, and write the file back.

csv_filename = f"{country}_The_Print.csv"

new_df = pd.DataFrame(data_list)

# Checking if the CSV file already exists
if os.path.exists(csv_filename):
    existing_df = pd.read_csv(csv_filename)
    # Merging new and existing dataframe; de-duplicating before the sort means
    # a previously saved row wins over a re-scraped one with the same title.
    df = pd.concat([existing_df, new_df], ignore_index=True)
    df = df.drop_duplicates(subset=["title"], keep="first")
else:
    # If csv file does not exist, the scraped data alone becomes the dataset
    df = new_df

if df.empty:
    # Nothing scraped and nothing saved before: there are no columns to sort
    # on, so skip the write instead of raising KeyError.
    print('No data to save.')
else:
    # Rows read back from CSV hold timestamps as strings; convert both time
    # columns so sorting compares real datetimes.
    for time_column in ("source_localtime", "bangladesh_localtime"):
        if time_column in df.columns:
            df[time_column] = pd.to_datetime(df[time_column])

    # Sort on the column that actually exists (there is no "date" column).
    df = df.sort_values(by="bangladesh_localtime", ascending=False)
    df = df.drop_duplicates(subset=["title"], keep="first")
    df = df.reset_index(drop=True)
    df.to_csv(csv_filename, index=False)
      

Number of links: 60
Link 1 added
Link 2 added
Link 3 added
Link 4 added
Link 5 added
Link 6 added
Link 7 added
Link 8 added
Link 9 added
Link 10 added
Link 11 added
Link 12 added
Link 13 added
Link 14 added
Link 15 added
Link 16 added
Link 17 added
Link 18 added
Link 19 added
Link 20 added


In [78]:
# Display the first rows of `df` as left by the previous cell.
df.head()

Unnamed: 0,url,title,content,content_summary,title_translation,content_translation,summary translation,author,country,source_localtime,bangladesh_localtime
0,https://theprint.in/india/india-hosts-milan-na...,India hosts Milan naval exercise; around 50 co...,India on Monday kick-started a nine-day mega ...,Content Summary not found,,,,PTI,bangladesh,2024-02-19 15:15:00,2024-02-19 15:45:00
1,https://theprint.in/economy/govt-invested-rs-5...,Govt invested Rs 5 lakh crore in 10 years to t...,Union minister B L Verma on Monday said the c...,Content Summary not found,,,,PTI,bangladesh,2024-02-19 13:16:00,2024-02-19 13:46:00
2,https://theprint.in/india/bsf-seizes-smuggled-...,"BSF seizes smuggled gold worth over Rs 6 cr, n...",The BSF apprehended an Indian man and alleged...,Content Summary not found,,,,PTI,bangladesh,2024-02-19 10:45:00,2024-02-19 11:15:00
3,https://theprint.in/india/bangladesh-visa-cent...,Bangladesh visa centre in Silchar soon: Envoy,A Bangladesh visa centre will be opened here ...,Content Summary not found,,,,PTI,bangladesh,2024-02-19 07:30:00,2024-02-19 08:00:00
4,https://theprint.in/india/bengal-guv-to-visit-...,Bengal Guv to visit Chopra on Tuesday; talk to...,West Bengal Governor C V Ananda Bose will vis...,Content Summary not found,,,,PTI,bangladesh,2024-02-19 07:00:00,2024-02-19 07:30:00
