In [None]:
import os
import random
import re
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from newsapi import NewsApiClient

def urls_given_by_newsapi_by_bankname(newsapi_key, start_date, end_date, bank_name):
    """Look up English-language articles that mention both a bank and "esg".

    A single ``get_everything`` request is issued (one page, up to 100 results).
    Articles are de-duplicated by URL; the first occurrence of each URL wins and
    the order in which the API returned them is preserved.

    Args:
        newsapi_key: API key passed to ``NewsApiClient``.
        start_date: Forwarded as the ``from_param`` argument of the query.
        end_date: Forwarded as the ``to`` argument of the query.
        bank_name: Term combined with ``esg`` to build the search query.

    Returns:
        A list of ``(url, title, source)`` tuples, one per distinct URL, where
        each element is taken verbatim from the corresponding article entry.
    """
    search_terms = f'{bank_name} AND esg'  # both terms are required in the article
    client = NewsApiClient(api_key=newsapi_key)
    response = client.get_everything(
        q=search_terms,
        from_param=start_date,
        to=end_date,
        language='en',   # English articles only
        page_size=100,   # number of results requested per page
    )

    # Dicts keep insertion order, so keying on the URL both removes duplicates
    # and retains the order of first appearance.
    first_seen = {}
    for entry in response.get("articles", []):
        link = entry["url"]
        if link in first_seen:
            continue
        first_seen[link] = (link, entry["title"], entry["source"])
    return list(first_seen.values())

def bs4_scrape_by_bankname_into_csv(newsapi_key, start_date, end_date, bank_name, request_timeout=30):
    """Scrape the text of every article found for a bank and save the result as CSV.

    Each URL returned by ``urls_given_by_newsapi_by_bankname`` yields exactly one
    row, so failed downloads remain visible in the output instead of being
    dropped. When no article text is available, the ``Content`` cell holds one
    of these placeholders:

    * ``"Unable to scrape text"`` - the server answered with a non-200 status.
    * ``"Content Not Found"`` - the page loaded but no HTML pattern matched.
    * ``"Failed to Scrape"`` - the request or the parsing raised an exception.

    Args:
        newsapi_key: API key forwarded to ``urls_given_by_newsapi_by_bankname``.
        start_date: Start of the search window; also used in the CSV file name.
        end_date: End of the search window; also used in the CSV file name.
        bank_name: Bank to search for; also used in the CSV file name.
        request_timeout: Seconds to wait for each page before giving up, so a
            single unresponsive host cannot stall the whole run.

    Returns:
        A DataFrame with columns ``Title``, ``Source``, ``URL`` and ``Content``.
        The same data is written (without the index) to
        ``scraped_data_{bank_name}_{start_date}_to_{end_date}.csv`` in the
        current working directory.
    """
    res = []
    for url, title, src in urls_given_by_newsapi_by_bankname(newsapi_key, start_date, end_date, bank_name):
        try:
            response = requests.get(url, timeout=request_timeout)
            if response.status_code != 200:  # request unsuccessful
                text = "Unable to scrape text"
            else:
                soup = BeautifulSoup(response.text, "html.parser")

                # Try progressively more generic HTML patterns; the first hit wins.
                content = soup.find("div", {"id": re.compile("^content-body-[0-9]+")})
                if not content:
                    content = soup.find("article")
                if not content:
                    content = soup.find("div", {"class": re.compile(".*content.*")})
                if not content:
                    content = soup.find("p")  # last resort: first paragraph

                text = content.get_text(strip=True) if content else "Content Not Found"
        except Exception:
            # Best-effort scraping: one bad page must not abort the whole batch,
            # but the row is still recorded so the failure shows up in the CSV.
            text = "Failed to Scrape"

        res.append([title, src, url, text])

        # Throttle after every attempt (including failed ones) to stay polite
        # to the remote sites and reduce the chance of being blocked.
        sleep(random.uniform(2, 5))

    df = pd.DataFrame(res, columns=['Title', 'Source', 'URL', 'Content'])
    df.to_csv(f'scraped_data_{bank_name}_{start_date}_to_{end_date}.csv', index=False)
    return df

# The NewsAPI key is read from the NEWSAPI_KEY environment variable so that the
# credential is never stored in (or shared with) the notebook. A missing
# variable raises KeyError immediately instead of failing later inside the API call.
test = bs4_scrape_by_bankname_into_csv(os.environ['NEWSAPI_KEY'], '2025-02-28', '2025-03-02', 'jpmorgan')

In [19]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document
# Split article text into 200-character chunks with a 20-character overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200, chunk_overlap=20
)

# Placeholder strings the scraper stores in 'Content' when it could not obtain
# any article text. None of them is real content, so none may be chunked.
scrape_failure_markers = {'Unable to scrape text', 'Content Not Found', 'Failed to Scrape'}

documents = []
for index, row in test.iterrows():
    content = row['Content']

    # Keep only real article text: skip NaN / non-string cells, blank strings,
    # and every scraper placeholder.
    if not isinstance(content, str) or not content.strip() or content in scrape_failure_markers:
        continue

    document = Document(page_content=content, metadata={'title': row['Title'], 'source': row['Source'], 'url': row['URL']})
    documents.append(document)

all_splits = text_splitter.split_documents(documents)  # chunked text of every kept document

In [22]:
# Spot-check: display the raw text of the first entry in `documents`.
documents[0].page_content

"Private Advisor Group LLC Has $1.36 Million Holdings in Nuveen ESG Emerging Markets Equity ETF (BATS:NUEM)Posted byMarketBeat NewsonFeb 28th, 2025Share on TwitterShare on FacebookShare on LinkedInShare on StocktwitsPrivate Advisor Group LLC lifted its position in  Nuveen ESG Emerging Markets Equity ETF (BATS:NUEM–Free Report) by 4.9% in the fourth quarter, according to its most recent Form 13F filing with the Securities and Exchange Commission. The institutional investor  owned 46,940 shares of the company’s stock after purchasing an additional 2,185 shares during the quarter. Private Advisor Group LLC owned 0.49% of Nuveen ESG Emerging Markets Equity ETF worth $1,362,000 at the end of the most recent reporting period.Several other hedge funds and other institutional investors have also recently bought and sold shares of the company. US Bancorp DE boosted its holdings in  Nuveen ESG Emerging Markets Equity ETF by 12.4% during the third quarter. US Bancorp DE now owns 6,013 shares of t