In [1]:
"""
requests_html_tutorial.py
~~~~~~~~~~~~~~~~~~~~~~~~~

This module demonstrates the usage of the `requests-html` library in Python for web scraping.
It covers various real-world examples including rendering JavaScript, extracting data, and
working with HTML elements. This script is designed as an educational tool for understanding
web scraping using Python.

Author: [Your Name]
Version: 1.0
"""



import os
import mysql.connector
from mysql.connector import Error
from dotenv import load_dotenv
from database_connection import create_db_connection
from requests_html import HTMLSession
import pandas as pd 
import numpy as np

from news_insert import (execute_query,
                        insert_reporter, 
                        insert_category, 
                        insert_news,
                        insert_publisher,
                        insert_image,
                        insert_summary
                        )

news_dict = {}
news_dict['category'] = []
news_dict['title'] = []
news_dict['date_time'] = []
news_dict['reporter'] = []
news_dict['body'] = []
news_dict['image_link'] = []
news_dict['page_link'] = []

def process_and_insert_news_data(connection,category, title, body, image_link, page_link, author,
       time_date, category_description, reporter_mail, publisher_name,
       publisher_email, publisher_phone, head_office_address, website ):
    
    try:
        # Insert category if not exists
        category_id = insert_category(connection, category, category_description)
        
        # Insert reporter if not exists
        reporter_id = insert_reporter(connection, author, reporter_mail)
        
        # Insert publisher as a placeholder (assuming publisher is not provided)
        publisher_id = insert_publisher(connection, publisher_name, publisher_email,publisher_phone,head_office_address,website,
                                        "facebook.com/dailynayadigonto" , "twitter.com/dailynayadigonto" , 
                                        "linkedin.com/dailynayadigonto" , "instagram.com/dailynayadigonto")
        
        # Insert news article
        news_id = insert_news(connection, category_id, reporter_id, publisher_id, time_date, title, body, page_link)
        
        # Insert images
        image_id = insert_image(connection, news_id, image_link)
    
    except Error as e:
        print(f"Error while processing news data - {e}")

def render_javascript(url):
    """
    Demonstrates how to render JavaScript using the `requests-html` library.
    This function fetches the page content after JavaScript has been executed.

    Parameters:
    url : str
        The URL of the website to scrape.

    Returns:
    None
    """
    session = HTMLSession()
    try:
        response = session.get(url)
        # response.html.render()  # This will download Chromium if not found
        print("Rendered web page:", response.html.html)
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        session.close()

def extract_title_link(url): #use only once
    session = HTMLSession()
    try:
        link_lists = []
        response = session.get(url)
        title_link_lead = response.html.find("div.bg-bg2-light > h3 > a")

        for i in range (len(title_link_lead)):
            link_lists.append("https://thefinancialexpress.com.bd" + title_link_lead[i].attrs['href'])

        title_links = response.html.find("div.mt-4 > h3 > a")
        for i in range (len(title_links)):
            link_lists.append("https://thefinancialexpress.com.bd" + title_links[i].attrs['href'])
        
        return link_lists


    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        session.close()

def extract_information(url):
    """
    Extracts and prints specific information from a webpage using CSS selectors.

    Parameters:
    url : str
        The URL of the website to scrape.

    Returns:
    None
    """
    session = HTMLSession()
    try:
        response = session.get(url)
        news_dict['page_link'].append(url)

        # Example: Extracting category
        category = response.html.find("div.mb-2 > a")
        # for link in category:
        #     print(f"Category : {link.text} ")
        print(f"Category : {category[0].text}\n")
        news_dict['category'].append(category[0].text)

        #Extracting Title
        title = response.html.find("h1.font-semibold")
        print(f"Title : {title[0].text}\n")
        news_dict['title'].append(title[0].text)

        # reporter
        reporter = response.html.find("div.text-p-light > a")
        print(f"Reporter : {reporter[0].text}\n")
        news_dict['reporter'].append(reporter[0].text)

        #Extracting Time 
        times = response.html.find("section > time")
        # for i in times:
        #     # new_time = i.html.find('li')
        #     print(i.text)
        print(f"Date & Time : {times[0].text}\n")
        news_dict['date_time'].append(times[0].text)
        # print(len(times))

        #Extracting body
        news_body = response.html.find("article > p")
        body = ""
        print("Body : ")
        for i in news_body:
            print(i.text)
            body += i.text
        news_dict['body'].append(body)
        print("\n")


        #extracting Image source
        img = response.html.find("div.mb-2 > img.v-lazy-image")
        print(f"Image Link : {img[0].attrs['srcset']}\n")
        news_dict['image_link'].append(img[0].attrs['srcset'])

        
        

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        session.close()

def main():
    """
    Main function to execute the web scraping examples.
    """
    # print("Rendering JavaScript on a web page...")
    # render_javascript('https://www.dailynayadiganta.com/politics/857443/%E0%A6%B0%E0%A6%BE%E0%A6%9C%E0%A6%A7%E0%A6%BE%E0%A6%A8%E0%A7%80%E0%A6%A4%E0%A7%87-%E0%A6%B6%E0%A7%87%E0%A6%96-%E0%A6%B9%E0%A6%BE%E0%A6%B8%E0%A6%BF%E0%A6%A8%E0%A6%BE%E0%A6%B0-%E0%A6%AC%E0%A6%BF%E0%A6%B0%E0%A7%81%E0%A6%A6%E0%A7%8D%E0%A6%A7%E0%A7%87-%E0%A6%86%E0%A6%B0%E0%A7%8B-%E0%A7%AC-%E0%A6%B9%E0%A6%A4%E0%A7%8D%E0%A6%AF%E0%A6%BE-%E0%A6%AE%E0%A6%BE%E0%A6%AE%E0%A6%B2%E0%A6%BE')

    # print("\nExtracting information from a web page...")
    # extract_title_link('https://thefinancialexpress.com.bd/page/economy/bangladesh')
    pass

# extract_title_link('https://thefinancialexpress.com.bd/page/economy/bangladesh')
# extract_information('https://thefinancialexpress.com.bd/economy/bangladesh/bangladesh-bank-raises-cash-withdrawal-limit-to-tk-300000')

if __name__ == "__main__":
    conn = create_db_connection()
    link_lists = extract_title_link("https://thefinancialexpress.com.bd/page/economy/bangladesh")
    for i in link_lists:
        extract_information(i)
    df = pd.DataFrame.from_dict(news_dict)
    # Data preprocessing
    df['category_description'] = "This category contains all news regarding economy of " + df['category']
    df['reporter_mail'] = df['reporter']+"@yahoo.com"
    df['publisher_name'] = "The Financial Express"
    df['publisher_email'] = "editor@thefinancialexpress.com"
    df['publisher_phone'] = "+8801917231083"
    df['head_office_address'] = "Tropicana Tower (4th floor), 45, Topkhana Road, GPO Box : 2526 Dhaka- 1000"
    df['website'] = "https://thefinancialexpress.com.bd"

    #insert into database
    lenth = df.shape[0]
    for i in range (lenth):
        process_and_insert_news_data(conn,df['category'][i], df['title'][i],
                                  df['body'][i], df['image_link'][i], df['page_link'][i],df['reporter'][i],df['date_time'][i], 
                                  df['category_description'][i], df['reporter_mail'][i], df['publisher_name'][i],df['publisher_email'][i], 
                                  df['publisher_phone'][i], df['head_office_address'][i], df['website'][i])



MySQL Database connection successful
Category : Bangladesh

Title : Govt launches major banking reforms

Reporter : FE ONLINE DESK

Date & Time : Aug 28, 2024 09:07

Body : 
Bangladesh Bank and the interim government are undertaking significant efforts to restructure the country’s banking sector, focusing on recovering both local and foreign assets embezzled by corrupt individuals.
Officials have confirmed that these assets, laundered abroad, are being targeted for repatriation as part of a broader initiative to bring the financial sector up to international standards, reports UNB.
The government emphasized its commitment to reforming the financial sector, acknowledging that the process is complex and time-consuming.
A key element of this strategy will be the establishment of a banking commission tasked with investigating each implicated bank, uncovering the full extent of the corruption, and developing an action plan, according to the Chief Adviser’s press wing.
This plan, which can b

In [None]:
import os
import mysql.connector
from mysql.connector import Error
from dotenv import load_dotenv
from database_connection import create_db_connection
from requests_html import HTMLSession
import pandas as pd 
import numpy as np

from news_insert import (execute_query,
                        insert_reporter, 
                        insert_category, 
                        insert_news,
                        insert_publisher,
                        insert_image,
                        insert_summary
                        )
def process_and_insert_news_data(connection,category, title, body, image_link, page_link, author,
       time_date, category_description, reporter_mail, publisher_name,
       publisher_email, publisher_phone, head_office_address, website ):
    
    try:
        # Insert category if not exists
        category_id = insert_category(connection, category, category_description)
        
        # Insert reporter if not exists
        reporter_id = insert_reporter(connection, author, reporter_mail)
        
        # Insert publisher as a placeholder (assuming publisher is not provided)
        publisher_id = insert_publisher(connection, publisher_name, publisher_email,publisher_phone,head_office_address,website,
                                        "facebook.com/dailynayadigonto" , "twitter.com/dailynayadigonto" , 
                                        "linkedin.com/dailynayadigonto" , "instagram.com/dailynayadigonto")
        
        # Insert news article
        news_id = insert_news(connection, category_id, reporter_id, publisher_id, time_date, title, body, page_link)
        
        # Insert images
        image_id = insert_image(connection, news_id, image_link)
    
    except Error as e:
        print(f"Error while processing news data - {e}")

In [45]:
# import pandas as pd
news_dict

{'category': ['Bangladesh'],
 'title': ['Bangladesh Bank raises cash withdrawal limit to Tk 300,000'],
 'date_time': ['Aug 17, 2024 21:16'],
 'reporter': ['bdnews24.com'],
 'body': ['Starting from Sunday, the Bangladesh Bank has set a new cash withdrawal limit of Tk 300,000 via cheques, up from the previous limit of Tk 200,000.The decision was communicated to all commercial banks on Saturday, with the central bank directing strict adherence to the new limit for security reasons.This follows the interim government’s directive for banking and financial sector reforms after the fall of the previous government on Aug 5.Earlier, on Aug 8, the interim government imposed stricter controls, limiting cheque withdrawals to Tk 100,000.Last week’s directive had raised this limit to Tk 200,000.The central bank has also advised banks to halt transactions if there is any suspicion involved.Despite the cheque withdrawal limit, digital transactions remain unrestricted, allowing for any amount to be tra

In [46]:
list_of_links = extract_title_link('https://thefinancialexpress.com.bd/page/economy/bangladesh')

In [47]:
list_of_links

['https://thefinancialexpress.com.bd/economy/bangladesh/govt-launches-major-banking-reforms',
 'https://thefinancialexpress.com.bd/economy/bangladesh/political-stability-preconditions-for-economic-prosperity-says-fahmida-khatun',
 'https://thefinancialexpress.com.bd/economy/bangladesh/recovering-laundered-money-will-be-a-daunting-task-despite-interim-governments-commitment-experts-say',
 'https://thefinancialexpress.com.bd/economy/bangladesh/imf-assures-raising-budgetary-support',
 'https://thefinancialexpress.com.bd/economy/bangladesh/bb-earns-tk-151-billion-in-fy24',
 'https://thefinancialexpress.com.bd/economy/bangladesh/initiative-yet-to-gain-significant-traction',
 'https://thefinancialexpress.com.bd/economy/bangladesh/s-alams-assets-can-be-used-to-return-depositors-money-bb-governor',
 'https://thefinancialexpress.com.bd/economy/bangladesh/uk-keen-to-help-bangladesh-reform-banking-revenue-sectors',
 'https://thefinancialexpress.com.bd/economy/bangladesh/diaspora-bonds-lose-lustre

In [48]:
print('\t\t\t\t\tAll News')
for idx, i in enumerate(list_of_links, start=1):
    print(f'\nNews link {idx}: {i}\n')
    extract_information(i)

					All News

News link 1: https://thefinancialexpress.com.bd/economy/bangladesh/govt-launches-major-banking-reforms

Category : Bangladesh

Title : Govt launches major banking reforms

Reporter : FE ONLINE DESK

Date & Time : Aug 28, 2024 09:07

Body : 
Bangladesh Bank and the interim government are undertaking significant efforts to restructure the country’s banking sector, focusing on recovering both local and foreign assets embezzled by corrupt individuals.
Officials have confirmed that these assets, laundered abroad, are being targeted for repatriation as part of a broader initiative to bring the financial sector up to international standards, reports UNB.
The government emphasized its commitment to reforming the financial sector, acknowledging that the process is complex and time-consuming.
A key element of this strategy will be the establishment of a banking commission tasked with investigating each implicated bank, uncovering the full extent of the corruption, and developing a

In [50]:

print(len(news_dict['category']))
print(len(news_dict['title']))
print(len(news_dict['date_time']))
print(len(news_dict['reporter']))
print(len(news_dict['body']))
print(len(news_dict['page_link']))
print(len(news_dict['image_link']))

14
14
14
14
14
14
14


In [51]:
import pandas as pd
df = pd.DataFrame.from_dict(news_dict)
df

Unnamed: 0,category,title,date_time,reporter,body,image_link,page_link
0,Bangladesh,Bangladesh Bank raises cash withdrawal limit t...,"Aug 17, 2024 21:16",bdnews24.com,"Starting from Sunday, the Bangladesh Bank has ...",https://tfe-bd.sgp1.cdn.digitaloceanspaces.com...,https://thefinancialexpress.com.bd/economy/ban...
1,Bangladesh,Govt launches major banking reforms,"Aug 28, 2024 09:07",FE ONLINE DESK,Bangladesh Bank and the interim government are...,https://tfe-bd.sgp1.cdn.digitaloceanspaces.com...,https://thefinancialexpress.com.bd/economy/ban...
2,Bangladesh,Political stability precondition for economic ...,"Aug 31, 2024 09:28",UNB,Executive Director of the Centre for Policy Di...,https://tfe-bd.sgp1.cdn.digitaloceanspaces.com...,https://thefinancialexpress.com.bd/economy/ban...
3,Bangladesh,Recovering laundered money will be a daunting ...,"Aug 31, 2024 08:55",,Despite the interim government’s strong commit...,https://tfe-bd.sgp1.cdn.digitaloceanspaces.com...,https://thefinancialexpress.com.bd/economy/ban...
4,Bangladesh,IMF assures raising budgetary support,"Aug 30, 2024 03:40",FE REPORT,The International Monetary Fund (IMF) on Thurs...,https://tfe-bd.sgp1.cdn.digitaloceanspaces.com...,https://thefinancialexpress.com.bd/economy/ban...
5,Bangladesh,Bangladesh Bank earns Tk 151b in FY24,"Aug 29, 2024 08:18",FE ONLINE DESK,Bangladesh Bank has earned Tk 151 billion net ...,https://tfe-bd.sgp1.cdn.digitaloceanspaces.com...,https://thefinancialexpress.com.bd/economy/ban...
6,Bangladesh,Initiative yet to gain significant traction,"Aug 29, 2024 03:37",FE REPORT,Bangladesh needs to prioritise expanding its e...,https://tfe-bd.sgp1.cdn.digitaloceanspaces.com...,https://thefinancialexpress.com.bd/economy/ban...
7,Bangladesh,S Alam’s assets can be used to return deposito...,"Aug 28, 2024 09:42",FE ONLINE DESK,Bangladesh Bank (BB) Governor Ahsan H Mansur h...,https://tfe-bd.sgp1.cdn.digitaloceanspaces.com...,https://thefinancialexpress.com.bd/economy/ban...
8,Bangladesh,"UK keen to help Bangladesh reform banking, rev...","Aug 27, 2024 06:59",BSS,The United Kingdom (UK) has expressed their ke...,https://tfe-bd.sgp1.cdn.digitaloceanspaces.com...,https://thefinancialexpress.com.bd/economy/ban...
9,Bangladesh,Diaspora bonds lose lustre for low rates,"Aug 27, 2024 02:36",Arafat Ara,All three diaspora bonds of the government hav...,https://tfe-bd.sgp1.cdn.digitaloceanspaces.com...,https://thefinancialexpress.com.bd/economy/ban...


In [43]:
from requests_html import HTMLSession
session  = HTMLSession()
response = session.get('https://thefinancialexpress.com.bd/economy/bangladesh/bb-raises-cash-withdrawal-limit-to-tk-04m-daily')
 # Example: Extracting category
category = response.html.find("div.mb-2 > a")
# for link in category:
#     print(f"Category : {link.text} ")
# print(f"Category : {category[2].text}\n")
print(category[0].text)
title = response.html.find("h1.font-semibold")
# print(f"Title : {title[1].text}\n")
print(title[0].text)
times = response.html.find("section > time")
# for i in times:
#     # new_time = i.html.find('li')
#     print(i.text)
print(times[0].text)
body = response.html.find("article > p")
print("Body : ")
for i in body:
    print(i.text)
print("\n")
img = response.html.find("div.mb-2 > img.v-lazy-image")
print(img[0].attrs['srcset'])
reporter = response.html.find("div.text-p-light > a")
print(reporter[0].text)

Bangladesh
Bangladesh Bank raises cash withdrawal limit to Tk 0.4m daily
Aug 25, 2024 08:54
Body : 
Bangladesh Bank (BB) has ordered commercial banks not to allow cash withdrawals of more than Tk 0.40 million per account daily.
Earlier, the limit was Tk 0.30 million, reports BSS.
The central bank has hiked the limit once in each of the previous three weeks, which it imposed after the deterioration of the law and order and overall security situation in the country since the fall of the Awami League government on August 5.
Initially, the BB slapped the restriction on withdrawals of over Tk 0.10 million as banks were facing challenges in transferring cash from one place to another.
Earlier the central bank instructed commercial banks to monitor transactions through cheques and block any suspicious transfer of funds.


https://tfe-bd.sgp1.cdn.digitaloceanspaces.com/posts/55561/fe-bangladesh-bank-bb.jpg
FE ONLINE DESK


In [3]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
pip install pandas


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
"""
requests_html_tutorial.py
~~~~~~~~~~~~~~~~~~~~~~~~~

This module demonstrates the usage of the `requests-html` library in Python for web scraping.
It covers various real-world examples including rendering JavaScript, extracting data, and
working with HTML elements. This script is designed as an educational tool for understanding
web scraping using Python.

Author: [Your Name]
Version: 1.0
"""



import os
import mysql.connector
from mysql.connector import Error
from dotenv import load_dotenv
from database_connection import create_db_connection
from requests_html import HTMLSession
import pandas as pd 
import numpy as np

from news_insert import (execute_query,
                        insert_reporter, 
                        insert_category, 
                        insert_news,
                        insert_publisher,
                        insert_image,
                        insert_summary
                        )

news_dict = {}
news_dict['category'] = []
news_dict['title'] = []
news_dict['date_time'] = []
news_dict['reporter'] = []
news_dict['body'] = []
news_dict['image_link'] = []
news_dict['page_link'] = []

def process_and_insert_news_data(connection,category, title, body, image_link, page_link, author,
       time_date, category_description, reporter_mail, publisher_name,
       publisher_email, publisher_phone, head_office_address, website ):
    
    try:
        # Insert category if not exists
        category_id = insert_category(connection, category, category_description)
        
        # Insert reporter if not exists
        reporter_id = insert_reporter(connection, author, reporter_mail)
        
        # Insert publisher as a placeholder (assuming publisher is not provided)
        publisher_id = insert_publisher(connection, publisher_name, publisher_email,publisher_phone,head_office_address,website,
                                        "facebook.com/thefinancialexpress" , "twitter.com/thefinancialexpress" , 
                                        "linkedin.com/thefinancialexpress" , "instagram.com/thefinancialexpress")
        
        # Insert news article
        news_id = insert_news(connection, category_id, reporter_id, publisher_id, time_date, title, body, page_link)
        
        # Insert images
        image_id = insert_image(connection, news_id, image_link)
    
    except Error as e:
        print(f"Error while processing news data - {e}")

def render_javascript(url):
    """
    Demonstrates how to render JavaScript using the `requests-html` library.
    This function fetches the page content after JavaScript has been executed.

    Parameters:
    url : str
        The URL of the website to scrape.

    Returns:
    None
    """
    session = HTMLSession()
    try:
        response = session.get(url)
        # response.html.render()  # This will download Chromium if not found
        print("Rendered web page:", response.html.html)
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        session.close()

def extract_title_link(url): #use only once
    session = HTMLSession()
    try:
        link_lists = []
        response = session.get(url)
        title_link_lead = response.html.find("div.bg-bg2-light > h3 > a")

        for i in range (len(title_link_lead)):
            link_lists.append("https://thefinancialexpress.com.bd" + title_link_lead[i].attrs['href'])

        title_links = response.html.find("div.mt-4 > h3 > a")
        for i in range (len(title_links)):
            link_lists.append("https://thefinancialexpress.com.bd" + title_links[i].attrs['href'])
        
        return link_lists


    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        session.close()

def extract_information(url):
    """
    Extracts and prints specific information from a webpage using CSS selectors.

    Parameters:
    url : str
        The URL of the website to scrape.

    Returns:
    None
    """
    session = HTMLSession()
    try:
        response = session.get(url)
        news_dict['page_link'].append(url)

        # Example: Extracting category
        category = response.html.find("div.mb-2 > a")
        # for link in category:
        #     print(f"Category : {link.text} ")
        print(f"Category : {category[0].text}\n")
        news_dict['category'].append(category[0].text)

        #Extracting Title
        title = response.html.find("h1.font-semibold")
        print(f"Title : {title[0].text}\n")
        news_dict['title'].append(title[0].text)

        # reporter
        reporter = response.html.find("div.text-p-light > a")
        print(f"Reporter : {reporter[0].text}\n")
        news_dict['reporter'].append(reporter[0].text)

        #Extracting Time 
        times = response.html.find("section > time")
        # for i in times:
        #     # new_time = i.html.find('li')
        #     print(i.text)
        print(f"Date & Time : {times[0].text}\n")
        news_dict['date_time'].append(times[0].text)
        # print(len(times))

        #Extracting body
        news_body = response.html.find("article > p")
        body = ""
        print("Body : ")
        for i in news_body:
            print(i.text)
            body += i.text
        news_dict['body'].append(body)
        print("\n")


        #extracting Image source
        img = response.html.find("div.mb-2 > img.v-lazy-image")
        print(f"Image Link : {img[0].attrs['srcset']}\n")
        news_dict['image_link'].append(img[0].attrs['srcset'])

        
        

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        session.close()

def main():
    """
    Main function to execute the web scraping examples.
    """
    # print("Rendering JavaScript on a web page...")
    # render_javascript('https://www.dailynayadiganta.com/politics/857443/%E0%A6%B0%E0%A6%BE%E0%A6%9C%E0%A6%A7%E0%A6%BE%E0%A6%A8%E0%A7%80%E0%A6%A4%E0%A7%87-%E0%A6%B6%E0%A7%87%E0%A6%96-%E0%A6%B9%E0%A6%BE%E0%A6%B8%E0%A6%BF%E0%A6%A8%E0%A6%BE%E0%A6%B0-%E0%A6%AC%E0%A6%BF%E0%A6%B0%E0%A7%81%E0%A6%A6%E0%A7%8D%E0%A6%A7%E0%A7%87-%E0%A6%86%E0%A6%B0%E0%A7%8B-%E0%A7%AC-%E0%A6%B9%E0%A6%A4%E0%A7%8D%E0%A6%AF%E0%A6%BE-%E0%A6%AE%E0%A6%BE%E0%A6%AE%E0%A6%B2%E0%A6%BE')

    # print("\nExtracting information from a web page...")
    # extract_title_link('https://thefinancialexpress.com.bd/page/economy/bangladesh')
    pass

# extract_title_link('https://thefinancialexpress.com.bd/page/economy/bangladesh')
# extract_information('https://thefinancialexpress.com.bd/economy/bangladesh/bangladesh-bank-raises-cash-withdrawal-limit-to-tk-300000')

if __name__ == "__main__":
    conn = create_db_connection()
    link_lists = extract_title_link("https://thefinancialexpress.com.bd/page/economy/bangladesh")
    for i in link_lists:
        extract_information(i)
    df = pd.DataFrame.from_dict(news_dict)
    # Data preprocessing
    df['category_description'] = "This category contains all news regarding economy of " + df['category']
    df['reporter_mail'] = df['reporter']+"@yahoo.com"
    df['publisher_name'] = "The Financial Express"
    df['publisher_email'] = "editor@thefinancialexpress.com"
    df['publisher_phone'] = "+8801917231083"
    df['head_office_address'] = "Tropicana Tower (4th floor), 45, Topkhana Road, GPO Box : 2526 Dhaka- 1000"
    df['website'] = "https://thefinancialexpress.com.bd"

    #insert into database
    lenth = df.shape[0]
    for i in range (lenth):
        process_and_insert_news_data(conn,df['category'][i], df['title'][i],
                                  df['body'][i], df['image_link'][i], df['page_link'][i],df['reporter'][i],df['date_time'][i], 
                                  df['category_description'][i], df['reporter_mail'][i], df['publisher_name'][i],df['publisher_email'][i], 
                                  df['publisher_phone'][i], df['head_office_address'][i], df['website'][i])



MySQL Database connection successful
Category : Bangladesh

Title : Govt launches major banking reforms

Reporter : FE ONLINE DESK

Date & Time : Aug 28, 2024 09:07

Body : 
Bangladesh Bank and the interim government are undertaking significant efforts to restructure the country’s banking sector, focusing on recovering both local and foreign assets embezzled by corrupt individuals.
Officials have confirmed that these assets, laundered abroad, are being targeted for repatriation as part of a broader initiative to bring the financial sector up to international standards, reports UNB.
The government emphasized its commitment to reforming the financial sector, acknowledging that the process is complex and time-consuming.
A key element of this strategy will be the establishment of a banking commission tasked with investigating each implicated bank, uncovering the full extent of the corruption, and developing an action plan, according to the Chief Adviser’s press wing.
This plan, which can b

In [1]:
"""
requests_html_tutorial.py
~~~~~~~~~~~~~~~~~~~~~~~~~

This module demonstrates the usage of the `requests-html` library in Python for web scraping.
It covers various real-world examples including rendering JavaScript, extracting data, and
working with HTML elements. This script is designed as an educational tool for understanding
web scraping using Python.

Author: [Your Name]
Version: 1.0
"""



import os
import mysql.connector
from mysql.connector import Error
from dotenv import load_dotenv
from database_connection import create_db_connection
from requests_html import HTMLSession
import pandas as pd 
import numpy as np

from news_insert import (execute_query,
                        insert_reporter, 
                        insert_category, 
                        insert_news,
                        insert_publisher,
                        insert_image,
                        insert_summary
                        )

news_dict = {}
news_dict['category'] = []
news_dict['title'] = []
news_dict['date_time'] = []
news_dict['reporter'] = []
news_dict['body'] = []
news_dict['image_link'] = []
news_dict['page_link'] = []

def process_and_insert_news_data(connection,category, title, body, image_link, page_link, author,
       time_date, category_description, reporter_mail, publisher_name,
       publisher_email, publisher_phone, head_office_address, website ):
    
    try:
        # Insert category if not exists
        category_id = insert_category(connection, category, category_description)
        
        # Insert reporter if not exists
        reporter_id = insert_reporter(connection, author, reporter_mail)
        
        # Insert publisher as a placeholder (assuming publisher is not provided)
        publisher_id = insert_publisher(connection, publisher_name, publisher_email,publisher_phone,head_office_address,website,
                                        "facebook.com/thefinancialexpress" , "twitter.com/thefinancialexpress" , 
                                        "linkedin.com/thefinancialexpress" , "instagram.com/thefinancialexpress")
        
        # Insert news article
        news_id = insert_news(connection, category_id, reporter_id, publisher_id, time_date, title, body, page_link)
        
        # Insert images
        image_id = insert_image(connection, news_id, image_link)
    
    except Error as e:
        print(f"Error while processing news data - {e}")

def render_javascript(url):
    """
    Demonstrates how to render JavaScript using the `requests-html` library.
    This function fetches the page content after JavaScript has been executed.

    Parameters:
    url : str
        The URL of the website to scrape.

    Returns:
    None
    """
    session = HTMLSession()
    try:
        response = session.get(url)
        # response.html.render()  # This will download Chromium if not found
        print("Rendered web page:", response.html.html)
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        session.close()

def extract_title_link(url): #use only once
    session = HTMLSession()
    try:
        link_lists = []
        response = session.get(url)
        title_link_lead = response.html.find("div.bg-bg2-light > h3 > a")

        for i in range (len(title_link_lead)):
            link_lists.append("https://thefinancialexpress.com.bd" + title_link_lead[i].attrs['href'])

        title_links = response.html.find("div.mt-4 > h3 > a")
        for i in range (len(title_links)):
            link_lists.append("https://thefinancialexpress.com.bd" + title_links[i].attrs['href'])
        
        return link_lists


    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        session.close()

def extract_information(url):
    """
    Extracts and prints specific information from a webpage using CSS selectors.

    Parameters:
    url : str
        The URL of the website to scrape.

    Returns:
    None
    """
    session = HTMLSession()
    try:
        response = session.get(url)
        news_dict['page_link'].append(url)

        # Example: Extracting category
        category = response.html.find("div.mb-2 > a")
        # for link in category:
        #     print(f"Category : {link.text} ")
        print(f"Category : {category[0].text}\n")
        news_dict['category'].append(category[0].text)

        #Extracting Title
        title = response.html.find("h1.font-semibold")
        print(f"Title : {title[0].text}\n")
        news_dict['title'].append(title[0].text)

        # reporter
        reporter = response.html.find("div.text-p-light > a")
        print(f"Reporter : {reporter[0].text}\n")
        news_dict['reporter'].append(reporter[0].text)

        #Extracting Time 
        times = response.html.find("section > time")
        # for i in times:
        #     # new_time = i.html.find('li')
        #     print(i.text)
        print(f"Date & Time : {times[0].text}\n")
        news_dict['date_time'].append(times[0].text)
        # print(len(times))

        #Extracting body
        news_body = response.html.find("article#main-single-post p")
        body = ""
        print("Body : ")
        for i in news_body:
            print(i.text)
            body += i.text
        news_dict['body'].append(body)
        print("\n")


        #extracting Image source
        img = response.html.find("div.mb-2 > img.v-lazy-image")
        print(f"Image Link : {img[0].attrs['srcset']}\n")
        news_dict['image_link'].append(img[0].attrs['srcset'])

        
        

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        session.close()

def main():
    """
    Main function to execute the web scraping examples.
    """
    # print("Rendering JavaScript on a web page...")
    # render_javascript('https://www.dailynayadiganta.com/politics/857443/%E0%A6%B0%E0%A6%BE%E0%A6%9C%E0%A6%A7%E0%A6%BE%E0%A6%A8%E0%A7%80%E0%A6%A4%E0%A7%87-%E0%A6%B6%E0%A7%87%E0%A6%96-%E0%A6%B9%E0%A6%BE%E0%A6%B8%E0%A6%BF%E0%A6%A8%E0%A6%BE%E0%A6%B0-%E0%A6%AC%E0%A6%BF%E0%A6%B0%E0%A7%81%E0%A6%A6%E0%A7%8D%E0%A6%A7%E0%A7%87-%E0%A6%86%E0%A6%B0%E0%A7%8B-%E0%A7%AC-%E0%A6%B9%E0%A6%A4%E0%A7%8D%E0%A6%AF%E0%A6%BE-%E0%A6%AE%E0%A6%BE%E0%A6%AE%E0%A6%B2%E0%A6%BE')

    # print("\nExtracting information from a web page...")
    # extract_title_link('https://thefinancialexpress.com.bd/page/economy/bangladesh')
    pass

# extract_title_link('https://thefinancialexpress.com.bd/page/economy/bangladesh')
# extract_information('https://thefinancialexpress.com.bd/economy/bangladesh/bangladesh-bank-raises-cash-withdrawal-limit-to-tk-300000')

if __name__ == "__main__":
    conn = create_db_connection()
    link_lists = extract_title_link("https://thefinancialexpress.com.bd/page/economy/bangladesh")
    for i in link_lists:
        extract_information(i)
    df = pd.DataFrame.from_dict(news_dict)
    # Data preprocessing
    df['category_description'] = "This category contains all news regarding economy of " + df['category']
    df['reporter_mail'] = df['reporter']+"@yahoo.com"
    df['publisher_name'] = "The Financial Express"
    df['publisher_email'] = "editor@thefinancialexpress.com"
    df['publisher_phone'] = "+8801917231083"
    df['head_office_address'] = "Tropicana Tower (4th floor), 45, Topkhana Road, GPO Box : 2526 Dhaka- 1000"
    df['website'] = "https://thefinancialexpress.com.bd"

    #insert into database
    lenth = df.shape[0]
    for i in range (lenth):
        process_and_insert_news_data(conn,df['category'][i], df['title'][i],
                                  df['body'][i], df['image_link'][i], df['page_link'][i],df['reporter'][i],df['date_time'][i], 
                                  df['category_description'][i], df['reporter_mail'][i], df['publisher_name'][i],df['publisher_email'][i], 
                                  df['publisher_phone'][i], df['head_office_address'][i], df['website'][i])



MySQL Database connection successful
Category : Bangladesh

Title : Govt launches major banking reforms

Reporter : FE ONLINE DESK

Date & Time : Aug 28, 2024 09:07

Body : 
Bangladesh Bank and the interim government are undertaking significant efforts to restructure the country’s banking sector, focusing on recovering both local and foreign assets embezzled by corrupt individuals.
Officials have confirmed that these assets, laundered abroad, are being targeted for repatriation as part of a broader initiative to bring the financial sector up to international standards, reports UNB.
The government emphasized its commitment to reforming the financial sector, acknowledging that the process is complex and time-consuming.
A key element of this strategy will be the establishment of a banking commission tasked with investigating each implicated bank, uncovering the full extent of the corruption, and developing an action plan, according to the Chief Adviser’s press wing.
This plan, which can b