In [1]:
pip install newsapi-python

Collecting newsapi-python
  Using cached newsapi_python-0.2.7-py2.py3-none-any.whl.metadata (1.2 kB)
Using cached newsapi_python-0.2.7-py2.py3-none-any.whl (7.9 kB)
Installing collected packages: newsapi-python
Successfully installed newsapi-python-0.2.7
Note: you may need to restart the kernel to use updated packages.


In [2]:
from datetime import datetime
import pytz

# function to convert utc timestamp string into est tsring
def convert_utc_timestamp(utc_string):
    utc_time = datetime.strptime(utc_string, "%Y-%m-%dT%H:%M:%SZ")

    utc_time = pytz.utc.localize(utc_time)
    est_time = utc_time.astimezone(pytz.timezone('US/Eastern'))

    return est_time.strftime("%Y-%m-%d %H:%M:%S")
    

In [3]:
API_KEY="" # add news api, API key here

In [7]:
from newsapi import NewsApiClient
import pandas as pd


# using newsapi client to get data (ref: https://newsapi.org/docs/client-libraries/python)
newsapi = NewsApiClient(api_key=API_KEY)

# creating list to store data, later to be used to create our data frame
sources = list()
authors = list()
urls = list()
descriptions = list()
titles = list()
contents = list()
published_at_dates = list()

# loop to set date from 9th May to 30th May, 2025.
for i in range(9, 30):
    
    temp = i 
    if i < 10:
        temp = f"0{i}"
    start_date = f'2025-05-{temp}'
    
    # get articles for a single date
    all_articles = newsapi.get_everything(q='finance OR stocks OR markets&',
                                          from_param=start_date,
                                          to=start_date,
                                          language='en',
                                          sort_by='relevancy',
                                          page=1,
                                          page_size=100)
    
    # after getting the data we iterate over each article and append the data into their respective list
    for article in all_articles['articles']:
        is_source = article.get('source')
        if is_source:
            source = str({"id": is_source.get("id"), "name": is_source.get("name")})
        author = article.get('author')
        url = article.get('url')
        description = article.get('description')
        title = article.get('title')
        content = article.get('content')
        published_at = article.get('publishedAt')
        if published_at:
            published_at = convert_utc_timestamp(published_at)
        
        sources.append(source)
        authors.append(author)
        urls.append(url)
        descriptions.append(description)
        titles.append(title)
        contents.append(content)
        published_at_dates.append(published_at)
    
    
# creating a dictionary to covert the data into a data frame
data = {
    "source": sources,
    "author": authors,
    "url": urls,
    "description": descriptions,
    "title": titles,
    "content": contents,
    "published_at": published_at_dates
}
df = pd.DataFrame(data=data)

In [8]:
df.head()

Unnamed: 0,source,author,url,description,title,content,published_at
0,"{'id': 'business-insider', 'name': 'Business I...",Kristine Villarroel,https://www.businessinsider.com/vintage-photos...,The market crash of 1929 triggered the Great D...,20 vintage photos show the 1929 stock market c...,The widespread panic following the market cras...,2025-05-09 09:19:01
1,"{'id': None, 'name': 'Yahoo Entertainment'}",editorial-team@simplywallst.com (Simply Wall St),https://finance.yahoo.com/news/three-stocks-ma...,The United States market has shown positive mo...,Three Stocks That May Be Trading Below Their E...,The United States market has shown positive mo...,2025-05-09 07:37:55
2,"{'id': None, 'name': 'Fresno Bee'}",Joshua Tehee,https://www.fresnobee.com/news/local/article30...,"So far in 2025, 21 restaurants, markets and ot...",Event center in Fresno closed due to unsanitar...,Reality Check is a Fresno Bee series holding t...,2025-05-09 15:30:00
3,"{'id': 'business-insider', 'name': 'Business I...",nredmond@insider.com (Nora Redmond),https://www.businessinsider.com/hsbc-loan-to-h...,"Since Trump announced his ""Liberation Day"" tar...",HSBC launched a loan specifically to help busi...,Banking giant HSBC has launched a new financin...,2025-05-09 07:38:01
4,"{'id': 'business-insider', 'name': 'Business I...",Grace Eliza Goodwin,https://www.businessinsider.com/white-house-sa...,The White House says Trump's 10% baseline tari...,The White House says Trump's baseline 10% tari...,President Donald Trump wants to keep his basel...,2025-05-09 15:50:23


In [9]:
df.shape

(2086, 7)

In [10]:
df['content'].iloc[0]

'The widespread panic following the market crash laid the foundations for the economic downturn of the 1930s.Bettmann/Bettmann Archive\r\n<ul><li>The 1929 stock market crash marked the beginning of the … [+11039 chars]'

We retrived 2086 records successfully, from the above cell we can see that the data in the content column is incomplete. Thus our next step would be to scarpe the data using the url provided in every row and replace the content with the scraped content.

In [11]:
df.to_csv('financial_news_data.csv', index=False) 

In [12]:
df.head()

Unnamed: 0,source,author,url,description,title,content,published_at
0,"{'id': 'business-insider', 'name': 'Business I...",Kristine Villarroel,https://www.businessinsider.com/vintage-photos...,The market crash of 1929 triggered the Great D...,20 vintage photos show the 1929 stock market c...,The widespread panic following the market cras...,2025-05-09 09:19:01
1,"{'id': None, 'name': 'Yahoo Entertainment'}",editorial-team@simplywallst.com (Simply Wall St),https://finance.yahoo.com/news/three-stocks-ma...,The United States market has shown positive mo...,Three Stocks That May Be Trading Below Their E...,The United States market has shown positive mo...,2025-05-09 07:37:55
2,"{'id': None, 'name': 'Fresno Bee'}",Joshua Tehee,https://www.fresnobee.com/news/local/article30...,"So far in 2025, 21 restaurants, markets and ot...",Event center in Fresno closed due to unsanitar...,Reality Check is a Fresno Bee series holding t...,2025-05-09 15:30:00
3,"{'id': 'business-insider', 'name': 'Business I...",nredmond@insider.com (Nora Redmond),https://www.businessinsider.com/hsbc-loan-to-h...,"Since Trump announced his ""Liberation Day"" tar...",HSBC launched a loan specifically to help busi...,Banking giant HSBC has launched a new financin...,2025-05-09 07:38:01
4,"{'id': 'business-insider', 'name': 'Business I...",Grace Eliza Goodwin,https://www.businessinsider.com/white-house-sa...,The White House says Trump's 10% baseline tari...,The White House says Trump's baseline 10% tari...,President Donald Trump wants to keep his basel...,2025-05-09 15:50:23


In [15]:
df['source'].nunique()

194

In [14]:
df['source'].unique()

array(["{'id': 'business-insider', 'name': 'Business Insider'}",
       "{'id': None, 'name': 'Yahoo Entertainment'}",
       "{'id': None, 'name': 'Fresno Bee'}",
       "{'id': None, 'name': 'Quartz India'}",
       "{'id': None, 'name': 'Gizmodo.com'}",
       "{'id': None, 'name': 'Forbes'}",
       "{'id': None, 'name': 'MacRumors'}",
       "{'id': None, 'name': 'Deadline'}",
       "{'id': 'the-times-of-india', 'name': 'The Times of India'}",
       "{'id': 'abc-news', 'name': 'ABC News'}",
       "{'id': None, 'name': 'Comecruisewith.com'}",
       "{'id': None, 'name': 'Upenn.edu'}",
       "{'id': None, 'name': 'AppleInsider'}",
       "{'id': None, 'name': 'Calculatedriskblog.com'}",
       "{'id': None, 'name': 'TheStreet'}",
       "{'id': None, 'name': 'Substack.com'}",
       "{'id': None, 'name': 'Biztoc.com'}",
       "{'id': None, 'name': 'The Daily Caller'}",
       "{'id': None, 'name': 'CoinDesk'}",
       "{'id': None, 'name': 'Android Police'}",
       "{'id': No