In [14]:
# pip install selenium
# pip install requests

## Step 1: Use NewsAPI to scrape headlines & articles URL

News API is a simple HTTP REST API for searching and retrieving live articles from all over the web.?

Using the FREE NewsAPI Developer Plan:\
✔ 100 requests per day\
✔ No extra request available\
✔ Articles have a 24 hour delay\
✔ Search articles up to a month old

In [15]:
# pip install newsapi-python

In [260]:
# unofficial Python client library to integrate News API into Python application w/o having to make HTTP requests directly

from newsapi import NewsApiClient # import newsapi client to set up NewsAPI object that will handle the req
from datetime import datetime, timedelta
import pandas as pd
from tqdm import tqdm
from time import sleep
import random
import os

In [2]:
# initialize with NewsAPI key
API_KEY = '37439fc0e11546dd9b81a6f698800573'
newsapi = NewsApiClient(api_key=API_KEY)

In [3]:
# scrap weekly till today
end_date = datetime.now()
start_date = end_date - timedelta(days=30) # since can only search articles up to 1 month old 
date_ranges = pd.date_range(start=start_date, end=end_date, freq = '7D')

In [4]:
# stores scraped URLs, sources and titles
all_sources = []
all_URLs = []
all_titles = []

In [24]:
# directory and file paths
save_dir = r"C:\Users\jiayi\OneDrive - National University of Singapore\Desktop\DSA3101"
csv_file = os.path.join(save_dir, "esg_articles.csv")

#### Using Endpoints listed on NewsAPI documentation
3 methods to fetch data with API object\
✔ get_top_headlines( )\
✔ get_everything( )\
✔ get_sources( )

In [5]:
print("🔍 Scraping URLs from NewsAPI...")

# loop through each week to scrape URLs and titles 
for i in tqdm(range(len(date_ranges) -1)):
    from_date = date_ranges[i].strftime("%Y-%m-%d")
    to_date = date_ranges[i + 1].strftime("%Y-%m-%d")

    try:
        data = newsapi.get_everything(q = 'esg', # phrase to search in article title/ body
                                      from_param = from_date,
                                      to = to_date, 
                                      language = 'en', # only articles in eng
                                      page_size = 100 # number of results to return per page
                                     )

        articles = data.get("articles", [])
        for article in articles:
            if article["url"] not in all_URLs:
                all_URLs.append(article["url"])
                all_titles.append(article["title"])
                all_sources.append(article["source"])

        sleep(random.uniform(3, 8))  # random sleep time
        
    except Exception as e:
        print(f"API failed on {from_date} to {to_date}: {e}")
        sleep(10)  # sleep before retrying

print(f" Total URLs Scraped: {len(all_URLs)}")      

🔍 Scraping URLs from NewsAPI...


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

API failed on 2025-02-04 to 2025-02-11: {'status': 'error', 'code': 'parameterInvalid', 'message': 'You are trying to request results too far in the past. Your plan permits you to request articles as far back as 2025-02-05, but you have requested 2025-02-04. You may need to upgrade to a paid plan.'}


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:30<00:00,  7.60s/it]

 Total URLs Scraped: 267





In [11]:
# first article in list
data['articles'][0]

{'source': {'id': None, 'name': 'Forbes'},
 'author': 'Rajeev Peshawaria, Contributor, \n Rajeev Peshawaria, Contributor\n https://www.forbes.com/sites/rajeevpeshawaria/',
 'title': 'Five Reasons Why ESG And Wokeism Are Taking A Beating',
 'description': 'My simple assertion, one we’ve been making for a long time now, is that we were trying to solve existential environmental and social challenges unnaturally.',
 'url': 'https://www.forbes.com/sites/rajeevpeshawaria/2025/03/04/five-reasons-why-esg-and-wokeism-are-taking-a-beating/',
 'urlToImage': 'https://imageio.forbes.com/specials-images/imageserve/67c6bcdb4412ddedfe99df4c/0x0.jpg?format=jpg&height=900&width=1600&fit=bounds',
 'publishedAt': '2025-03-04T08:36:04Z',
 'content': 'The dearth of ESG and DEI?\r\nImage credit: ChatGPT\r\nWere ESG, DEI and sustainability fads that have since died? It certainly feels like it in some circles, doesnt it? I dont believe they are dead at a… [+5040 chars]'}

## Step 2: Use BeautifulSoup to visit and scrape all contents from each URL

BeautifulSoup = python library to pull out data from HTML and XML files.\
Automatically visit each article URL to scrape full content

In [21]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import re
import os

In [22]:
print("\n🔍 Scraping Full News Content...")

# getting full text content from scraped articles URL

text = [] 
for url in tqdm(all_URLs):
    try:
        response = requests.get(url)  
        if response.status_code != 200: # req unsuccessful
            text.append("Unable to scrape text")  # fill missing text
            continue

        soup = BeautifulSoup(response.text, "html.parser")

        # try different HTML patterns
        content = soup.find("div", {"id": re.compile("^content-body-[0-9]+")})
        if not content:
            content = soup.find("article")
        if not content:
            content = soup.find("div", {"class": re.compile(".*content.*")})
        if not content:
            content = soup.find("p")  # fallback to paragraphs

        if content:
            text.append(content.get_text(strip=True))
        else:
            text.append("Content Not Found")  # HTML patterns failed

        sleep(random.uniform(2, 5))  # sleep to avoid detection

    except Exception as e:
        text.append("Failed to Scrape")  # for any unknown exception
        continue


🔍 Scraping Full News Content...


100%|████████████████████████████████████████████████████████████████████████████████| 267/267 [16:35<00:00,  3.73s/it]


In [23]:
# ensure that all 4 lists have the same length before saving to CSV
print(len(all_URLs))
print(len(all_titles))
print(len(all_sources))
print(len(text))

267
267
267
267


In [25]:
# create a df with the scraped and cleaned data
scraped_text_df = pd.DataFrame({
    "Title": all_titles,
    "Source": all_sources,
    "URL": all_URLs,
    "Content": text,
    })

# if the CSV file exists, append new data
if os.path.exists(csv_file):
    print("✅ Existing CSV Found. Appending Data...")
    existing_df = pd.read_csv(csv_file)
    scraped_text_df = pd.concat([existing_df, scraped_text_df], ignore_index=True)
    scraped_text_df = scraped_text_df.drop_duplicates(subset="URL", keep="first")
    print(f"{len(scraped_text_df) - len(existing_df)} New Articles Appended")
else:
    print("🚨 No Existing CSV Found... Creating New CSV")

# save scraped data into the CSV file
scraped_text_df.to_csv(csv_file, index=False, encoding="utf-8-sig")
print(f"✅ Data Saved to {csv_file}")

🚨 No Existing CSV Found... Creating New CSV
✅ Data Saved to C:\Users\jiayi\OneDrive - National University of Singapore\Desktop\DSA3101\esg_articles.csv


In [39]:
scraped_text_df

Unnamed: 0,Title,Source,URL,Content
0,How TIME and Statista Determined the World’s B...,"{'id': 'time', 'name': 'Time'}",https://time.com/7221214/worlds-best-companies...,"ByTIME StaffFebruary 12, 2025 7:48 AM ESTTIME ..."
1,How VCs are killing climate tech — and how the...,"{'id': 'the-next-web', 'name': 'The Next Web'}",https://thenextweb.com/news/how-vcs-are-killin...,Unable to scrape text
2,Costco's DEI clash has companies taking notes....,"{'id': 'business-insider', 'name': 'Business I...",https://www.businessinsider.com/dei-costco-dis...,RetailCostco's DEI clash has companies taking ...
3,The Great Wealth Transfer: Managing Inheritanc...,"{'id': None, 'name': 'Forbes'}",https://www.forbes.com/sites/matthewerskine/20...,MoneyWealth ManagementThe Great Wealth Transfe...
4,Juventus FC Looks To Become Serie A’s Most Sus...,"{'id': None, 'name': 'Forbes'}",https://www.forbes.com/sites/vitascarosella/20...,BusinessSportsMoneyJuventus FC Looks To Become...
...,...,...,...,...
262,Diginex Limited Announces Relocation of Headqu...,"{'id': None, 'name': 'GlobeNewswire'}",https://www.globenewswire.com/news-release/202...,Diginex Limited Announces Relocation of Headqu...
263,Can Microreactors Solve Data Centers' Unsustai...,"{'id': None, 'name': 'Storagereview.com'}",https://www.storagereview.com/news/can-microre...,AIEnterpriseCan Microreactors Solve Data Cente...
264,Government fund ‘comfortable’ with $32m harris...,"{'id': None, 'name': 'Crikey'}",http://www.crikey.com.au/2025/02/28/government...,"Share this articleIf you like this article, sh..."
265,Head to Head Review: Versus Systems (NASDAQ:VS...,"{'id': None, 'name': 'ETF Daily News'}",https://www.etfdailynews.com/2025/02/26/head-t...,Head to Head Review: Versus Systems (NASDAQ:VS...


## Step 3: Text Splitting for Manageable Chunks

In [35]:
# %pip install langchain-text-splitters

In [261]:
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document

In [262]:
# load CSV into df
df = pd.read_csv('esg_articles.csv')

# initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200, chunk_overlap=100, add_start_index=True
)

# create a list to store the documents objects
documents = []

# loop the rows of the df and convert them into Document format
for index, row in df.iterrows():
    
    # check if content is valid (not NaN or 'Unable to scrape text')
    if pd.notna(row['Content']) and row['Content'] != 'Unable to scrape text':
        document = Document(page_content=row['Content'], metadata={'title': row['Title'], 'source': row['Source'], 'url': row['URL']})
        documents.append(document) # append to documents list

all_splits = text_splitter.split_documents(documents) # all_splits now contains the chunked text content

In [263]:
#all_splits

## Step 4: Generating Embeddings and storing in Elasticsearch

In [264]:
#%pip install langchain-community

In [269]:
from elasticsearch import Elasticsearch, helpers
from elasticsearch.helpers import bulk
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
import pandas as pd
import os

In [270]:
ES_HOST = "http://localhost:9200"
index_name = "esg_articles"

In [271]:
# initialize sentence transformer for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')  # same model as ESG reports

In [272]:
# initialize local es
try:
    es = Elasticsearch([ES_HOST])
except Exception as e:
    raise Exception(
        status_code=500, detail=f"Failed to connect to Elasticsearch: {str(e)}"
    )

In [273]:
# ensure index in elasticsearch is create w correct mapping - stored as dense_vector
index_mapping = {
        "properties": {
            "title": {"type": "text", "analyzer": "standard"},
            "source": {"type": "keyword"},
            "url": {"type": "keyword"},
            "content": {"type": "text", "analyzer": "standard"},
            "embeddings": {
                "type": "dense_vector",
                "dims": 384,  # The dimension of your embeddings, make sure this matches
                "similarity" : "cosine"
            },
        }
    }

In [274]:
def get_embeddings(text):
    try:
        # Create embeddings and convert to list from as needed by Elasticsearch
        return embedding_model.encode(text).tolist()
    except Exception as e:
        print(f"Error fetching embeddings for text: {text}. Error: {str(e)}")
        return None

def create_index():
    try:
        # Delete index if it already exists
        if es.indices.exists(index=index_name):
            es.indices.delete(index=index_name)
        # Create index with mapping
        es.indices.create(index=index_name, mappings=index_mapping)
        print(f"Index '{index_name}' created successfully!")
    except Exception as e:
        print(f"Error creating index '{index_name}': {str(e)}")

In [275]:
# Indexing all_splits data into Elasticsearch
def generate_documents_with_embeddings(all_splits):
    for doc in all_splits:
        action = {
            "_index": index_name,
            "_source": {
                "title": doc.metadata['title'],
                "source": doc.metadata['source'],
                "url": doc.metadata['url'],
                "content": doc.page_content,
                "embeddings": embedding_model.encode(doc.page_content).tolist()
            }
        }
        yield action  # Yield the action instead of just defining it

def index_documents(all_splits):
    success, failed = bulk(es, generate_documents_with_embeddings(all_splits))
    print(f"Successfully indexed {success} documents. Failed to index {failed} documents.")

In [276]:
create_index()  # Ensure the index is created first
index_documents(all_splits)  # Index the documents with embeddings

Index 'esg_articles' created successfully!
Successfully indexed 1843 documents. Failed to index [] documents.


In [277]:
response = es.indices.get_mapping(index=index_name)
print(response)

{'esg_articles': {'mappings': {'properties': {'content': {'type': 'text', 'analyzer': 'standard'}, 'embeddings': {'type': 'dense_vector', 'dims': 384, 'index': True, 'similarity': 'cosine', 'index_options': {'type': 'int8_hnsw', 'm': 16, 'ef_construction': 100}}, 'source': {'type': 'keyword'}, 'title': {'type': 'text', 'analyzer': 'standard'}, 'url': {'type': 'keyword'}}}}}


In [278]:
# Get the documents (e.g., first 5 documents)
index_name = "esg_articles"
response = es.search(index=index_name, body={
    "size": 1,  # Number of documents to retrieve
    "_source": ["title", "content", "url", "embeddings"]  # Specify the fields to retrieve
})

# Print the full response to inspect all available fields
for hit in response['hits']['hits']:
    print(f"Title: {hit['_source']['title']}")
    print(f"URL: {hit['_source']['url']}")
    print(f"Content: {hit['_source']['content']}")
    #print(f"Embeddings: {hit['_source']['embeddings']}")
    print("\n")

Title: Beyond ESG: Why Innovation Is A Defining Factor For Board Leadership
URL: https://www.forbes.com/councils/forbestechcouncil/2025/01/31/beyond-esg-why-innovation-is-a-defining-factor-for-board-leadership/
Content: InnovationBeyond ESG: Why Innovation Is A Defining Factor For Board LeadershipByGreg Ombach, Forbes Councils Member.forForbes Technology CouncilCOUNCIL POSTExpertise from Forbes Councils members, operated under license. Opinions expressed are those of the author.| Membership (fee-based)Jan 31, 2025, 08:15am ESTSave ArticleGreg Ombach, Head of Disruptive Research & Technology, Senior Vice President atAirbus.gettyThroughout my career, I have worked in the automotive, consumer electronics, and aerospace industries and served on various boards and advisory committees ranging from startups to publicly listed enterprises. These experiences have shown me how corporate governance has evolved to address emerging challenges like AI and environmental, social and governance (ESG). 

## Step 5: Implementing the Retrieval System

Implement a KNN (nearest neighbor) search, retrieve the top K documents with similar document chunks based on embeddings from Elasticsearch.

In [287]:
def generate_query_embedding(query):
    return embedding_model.encode(query).tolist()

In [288]:
def knn_search(query, index_name, k=5):
    query_embedding = get_embeddings(query)

    search_query = {
        "size": k,
        "query": {
            "knn": {
                "field": "embeddings",  # ✅ Use 'field' in ES 8.x
                "query_vector": query_embedding,
                "k": k,
                "num_candidates": 100  # Pre-select top 100 documents
            }
        },
        "_source": ["title", "url", "content"]  # Only return these fields
    }

    response = es.search(index=index_name, body=search_query)
    return response

In [289]:
# take top K most similar document chunks from Elasticsearch
query = "FHS World 2025 mark 20 years in the UAE for the region’s most influential hospitality and tourism investment event."
response = knn_search(query, "esg_articles")

for hit in response["hits"]["hits"]:
    print(f"Title: {hit['_source']['title']}")
    print(f"URL: {hit['_source']['url']}")
    # print(f"Content:{hit['_source']['content']}")
    print(f"Similarity Score: {hit['_score']}\n") # score of how relevant/similar the content is

Title: Future Hospitality Summit – FHS World 2025 to mark 20 years in the UAE for the region’s most influential hospitality and tourism investment event
URL: https://www.hospitalitynet.org/news/4125661.html
Similarity Score: 0.9050162

Title: Future Hospitality Summit – FHS World 2025 to mark 20 years in the UAE for the region’s most influential hospitality and tourism investment event
URL: https://www.hospitalitynet.org/news/4125661.html
Similarity Score: 0.8496554

Title: Top takeaways from the FHS World Advisory Board
URL: https://www.hospitalitynet.org/news/4125791.html
Similarity Score: 0.84051085

Title: Top takeaways from the FHS World Advisory Board
URL: https://www.hospitalitynet.org/news/4125791.html
Similarity Score: 0.8343766

Title: Future Hospitality Summit – FHS World 2025 to mark 20 years in the UAE for the region’s most influential hospitality and tourism investment event
URL: https://www.hospitalitynet.org/news/4125661.html
Similarity Score: 0.81534505



In [282]:
# combine the most relevant content together
def combine_content(response):
    combined_content = ""
    for hit in response['hits']['hits']:
        content = hit["_source"]["content"]
        combined_content += content + "\n\n" # add line btw articles
    return combined_content 
    
combined_text = combine_content(response) 
print("### Combined Page Content ###")
print(combined_text)

### Combined Page Content ###
Press ReleaseEvents & ConferencesFuture Hospitality Summit – FHS World 2025 to mark 20 years in the UAE for the region’s most influential hospitality and tourism investment eventFHS World set to return to Madinat Jumeirah in Dubai from 27-29 October 2025 for landmark editionThe Bench4 February 2025Future Hospitality Summit – FHS World 2025 to mark 20 years in the UAE for the region’s most influential hospitality and tourism investment event — Source:The BenchFuture Hospitality Summit – FHS World 2025 to mark 20 years in the UAE for the region’s most influential hospitality and tourism investment event — Source:The BenchDubai, UAE. 4 February 2025. After another record event in 2024,Future Hospitality Summit– FHS World, will return to Dubai from 27-29 October 2025 for what will be a milestone event for organisersThe Bench, marking 20 years in the UAE for the region’s leading hospitality and tourism investment event, previously known as AHIC.FHS World 2024 s

## Step 6: Generate Responses with LLM

In [285]:
from langchain_ollama.llms import OllamaLLM

llm = OllamaLLM(model="llama3.1")

def fact_check(query, combined_text, confidence=False):
    if not combined_text.strip():
        return "Not Conclusive (No relevant context found)"

    prompt = f"""
    You are an expert fact-checking assistant.

    Fact: "{query}"
    Context: {combined_text}

    Compare the fact against the context.
    Classify the fact as one of the following:
    - "Yes": The context **confirms** the fact.
    - "No": The context **contradicts** the fact.
    - "Not Conclusive": The context does **not provide enough information** to confirm or contradict.

    Only return the label as output.
    """

    if confidence:
        prompt += "\nAdditionally, provide a confidence score from 0 to 100 in parentheses, e.g., Same (95%)."

    response = llm.invoke(prompt).strip()

    # Optional: Normalize the output
    response = response.split("\n")[0]  # In case LLM gives extra text
    response = response.capitalize()  # Force consistent casing

    return response

In [286]:
result = fact_check(query, combined_text, confidence=True)
print("Result:", result)

Result: "yes" (100%)
