In [2]:
import feedparser as fp
import pandas as pd
import json
import newspaper
import dateutil
from datetime import *
import nltk
from IPython.display import display
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# nltk.download('punkt')

pd.options.display.max_columns = 20    #Changes the number of columns diplayed (default is 20)
pd.options.display.max_rows = 60       #Changes the number of rows diplayed (default is 60)
pd.options.display.max_colwidth = 500   #Changes the number of characters in a cell (default is 50)

In [3]:
def load_sources(file):
    # Function that loads in the sources from the JSON database
    try:
        with open(file) as data:
            sources = json.load(data)
        print(f'INFO: Using custom "{file}" as source file.')
        return sources
    except:
        raise Exception(f'Error in "load_sources()"')

sources = "sources.json"
data = load_sources(sources)

INFO: Using custom "sources.json" as source file.


In [4]:
def analyze_sentiment(text):
    analyzer = SentimentIntensityAnalyzer()
    sentiment = analyzer.polarity_scores(text)
    return sentiment['compound']

In [5]:
for source, content in data.items():
    urls = content['rss']
    for url in urls:
        print(url)
        feed = fp.parse(url)
        display(feed)

https://rss.nytimes.com/services/xml/rss/nyt/Business.xml


{'bozo': False,
 'entries': [{'title': 'In a Shaky Oil Market, OPEC Has Bitter Decisions to Make',
   'title_detail': {'type': 'text/plain',
    'language': None,
    'base': 'https://rss.nytimes.com/services/xml/rss/nyt/Business.xml',
    'value': 'In a Shaky Oil Market, OPEC Has Bitter Decisions to Make'},
   'links': [{'rel': 'alternate',
     'type': 'text/html',
     'href': 'https://www.nytimes.com/2023/11/29/business/energy-environment/opec-oil-prices.html'},
    {'href': 'https://www.nytimes.com/2023/11/29/business/energy-environment/opec-oil-prices.html',
     'rel': 'standout',
     'type': 'text/html'}],
   'link': 'https://www.nytimes.com/2023/11/29/business/energy-environment/opec-oil-prices.html',
   'id': 'https://www.nytimes.com/2023/11/29/business/energy-environment/opec-oil-prices.html',
   'guidislink': False,
   'summary': 'Anticipating a drop in demand for 2024, major producers, led by Saudi Arabia, are trying to reduce supply.',
   'summary_detail': {'type': 'text

https://rss.nytimes.com/services/xml/rss/nyt/Economy.xml


{'bozo': False,
 'entries': [{'title': 'U.S. Debates How Much to Sever Electric Car Industry’s Ties to China',
   'title_detail': {'type': 'text/plain',
    'language': None,
    'base': 'https://rss.nytimes.com/services/xml/rss/nyt/Economy.xml',
    'value': 'U.S. Debates How Much to Sever Electric Car Industry’s Ties to China'},
   'links': [{'rel': 'alternate',
     'type': 'text/html',
     'href': 'https://www.nytimes.com/2023/11/29/business/economy/electric-car-china-supply-chain.html'},
    {'href': 'https://www.nytimes.com/2023/11/29/business/economy/electric-car-china-supply-chain.html',
     'rel': 'standout',
     'type': 'text/html'}],
   'link': 'https://www.nytimes.com/2023/11/29/business/economy/electric-car-china-supply-chain.html',
   'id': 'https://www.nytimes.com/2023/11/29/business/economy/electric-car-china-supply-chain.html',
   'guidislink': False,
   'summary': 'Some firms argue that a law aimed at popularizing electric vehicles risks turning the United States i

In [6]:
#print(data['Reuters']['rss'][0])
def print_scrape_status(count):
    print(f"Scraped {count} articles", end="\r")

def scrape(data,news_date):
    # Function that scrapes the content from the URLs in the source data
    try:
        articles_list = []
        for source, content in data.items():
            for url in content['rss']:
                d = fp.parse(url)
                for entry in d.entries:
                    article = {}
                    if hasattr(entry,'published'):
                        article_date = dateutil.parser.parse(getattr(entry,'published'))
                        if (article_date.strftime('%Y-%m-%d') >= str(news_date)):
                            try:
                                content = newspaper.Article(entry.link)
                                content.download()
                                content.parse()  
                                content.nlp()
                                try:
                                    article['source'] = source
                                    article['url'] = entry.link
                                    article['date'] = article_date.strftime('%Y-%m-%d')
                                    article['time'] = article_date.strftime('%H:%M:%S %Z') # hour, minute, timezone (converted)
                                    article['title'] = content.title
                                    article['title sentiment'] = analyze_sentiment(content.title)
                                    article['body'] = content.text
                                    article['summary'] = content.summary
                                    article['summary sentiment'] = analyze_sentiment(content.text)
                                    article['keywords'] = content.keywords
                                    article['image_url'] = content.top_image
                                    articles_list.append(article)
                                    print_scrape_status(len(articles_list))
                                except Exception as e:
                                    print(e)
                                    print('continuing...')
                            except Exception as e: 
                                print(e)
                                print('continuing...')
        return articles_list
    except:
        raise Exception(f'Error in "Scraper.scrape()"')

output = scrape(data,date(2023,11,29))

Scraped 17 articles

In [7]:
print(date.today())
print(date(2023,11,1))

2023-11-29
2023-11-01


In [8]:
df = pd.DataFrame(output)
display(df[['source','time','title','title sentiment','summary','summary sentiment']].sort_values('summary sentiment'))
display(df[['body']])

Unnamed: 0,source,time,title,title sentiment,summary,summary sentiment
1,New York Times,14:33:05 UTC,"Citi Is Sued Over Sex Abuse. Before 2022, It Would Have Been a Secret.",-0.6369,"Ardith Lindsey, a managing director at Citi, alleged that her 15-year career at the bank had increasingly become a “traumatizing” experience, especially after she ended a relationship with a former supervisor.\nThe supervisor, Mani Singh, then sent her dozens of threatening text messages, according to her lawsuit.\nMark Costiglio, a Citi spokesman, said the bank had opened an investigation shortly after Ms. Lindsey disclosed the relationship in November 2022 and complained about Mr. Singh’s ...",-0.9674
9,New York Times,13:13:17 UTC,What to Expect at Today’s DealBook Summit,0.0,"The lineup for DealBook Summit 2023On Wednesday, DealBook will be live and in person at our annual summit in New York.\nThe DealBook team and reporters from The Times will be reporting live from the conference.\nHere are the speakers:Vice President Kamala HarrisTsai Ing-wen , the president of TaiwanElon Musk , the chairman and C.E.O.\nof Tesla and the chairman and chief technology officer of XLina Khan , the chair of the Federal Trade CommissionJamie Dimon , the chairman and C.E.O.\nIs this ...",-0.9357
5,New York Times,18:59:25 UTC,London’s Black Cabs Can Soon Join Uber. But Will They?,0.1531,"Uber said it needed several hundred drivers to sign up in order to launch the service.\nBut many London cabdrivers had a scathing response.\n“We don’t need a partnership with Uber,” said the Licensed Taxi Drivers Association, a union that represents a majority of the city’s nearly 18,000 cabdrivers, in the headline of a release on Wednesday.\nThere was “no demand” for such a partnership from taxi drivers, the union’s general secretary, Steve McNamara, said in a statement, adding that their m...",-0.7959
6,New York Times,17:58:22 UTC,"Don’t Be Afraid of the iPhone’s NameDrop Feature, Experts Say",0.0,"“To the extent there’s panic here about nonconsensual taking of contact information, I’m not that worried,” he said.\nTo use the feature, Apple users need to have updated their devices to the latest version of the operating system — iOS 17.1 for the iPhone or watchOS 10.1 for the Apple Watch, both of which have the feature enabled as a default setting.\nThey can then choose to exchange contact details, or one may simply receive contact information from the other without reciprocating.\nNameD...",-0.6857
7,New York Times,15:54:39 UTC,"Here’s the Lineup for the 2023 DealBook Summit, and When to Watch",0.0,"The hype and hope driving artificial intelligence, the rise of antisemitism since Hamas’ Oct. 7 attack on Israel, inflation, dysfunction in Washington and the streaming wars — these topics and more will be addressed at the 2023 DealBook Summit.\nOver more than eight hours on Wednesday, Andrew Ross Sorkin will interview the biggest newsmakers in the worlds of business, politics and culture.\nThe first interview will begin shortly after 9 a.m. Eastern.\nInterviews that will be carried on nytim...",-0.5423
4,New York Times,19:08:56 UTC,"With Charlie Munger’s Death, Berkshire Loses a Custodian of Its Culture",-0.7351,"Shares of Berkshire Hathaway barely budged on Wednesday, a day after its vice chairman, Charlie Munger, died, reflecting the view among shareholders that Mr. Munger’s absence on the conglomerate’s day to day would have little impact on its future, even as they mourned the loss of the man who helped shape Berkshire’s culture.\nMr. Munger, who helped build Berkshire into a global investing powerhouse, died at a California hospital on Tuesday morning, according to an announcement from Berkshire...",-0.0772
13,New York Times,10:01:54 UTC,"Charles T. Munger, Much More Than Warren Buffett’s No. 2, Dies at 99",0.0,"Charles T. Munger, who quit a well-established law career to be Warren E. Buffett’s partner and maxim-spouting alter-ego as they transformed a struggling New England textile company into the spectacularly successful investment firm Berkshire Hathaway, died on Tuesday in Santa Barbara, Calif.\nHis death, at a hospital, was announced by Berkshire Hathaway.\nMr. Buffett has described him as the originator of Berkshire Hathaway’s investing approach.\n“The blueprint he gave me was simple: Forget ...",0.1779
8,New York Times,13:00:09 UTC,Jezebel to Be Resurrected by Paste Magazine,0.0,"Jezebel, the famed feminist website, is set to return less than a month after it was shuttered.\nPaste Magazine, a music and culture outlet, acquired Jezebel on Tuesday and planned to start publishing on the site again as soon as Wednesday, said Josh Jackson, a co-founder and the editor in chief of Paste.\n“The idea of there not being a Jezebel right now just didn’t seem to make sense,” Mr. Jackson said.\nJezebel, once part of the Gawker universe of websites, brought a brash new kind of inte...",0.3716
2,New York Times,18:54:46 UTC,G.M. to Cut Spending on Cruise Self-Driving Unit,-0.2732,General Motors is slowing the expansion of its Cruise automated driving division and significantly cutting spending at the unit after suspending operations in response to growing safety concerns about its driverless cars.\nThe company had been planning to roll out a ride service in San Francisco and three other cities and begin testing Cruise vehicles on the streets of several other markets.\nIt now plans to focus on only one city as it works to improve the operation of its fleet of driverle...,0.6712
0,New York Times,14:43:49 UTC,"In a Shaky Oil Market, OPEC Has Bitter Decisions to Make",-0.5719,"The agenda — whether to cut production further, and by how much — is likely to be unpalatable for many of the 23 members.\nIt has dropped even as producers in OPEC Plus, a bigger group that includes Russia, have cut production, but the coming months seem unlikely to give oil producers a respite from this squeeze.\nAfter three years of pandemic recovery and robust increases in demand for oil, appetite is expected to slow in 2024.\nOverall economic expansion is expected to be tepid while more ...",0.7883


Unnamed: 0,body
0,"These are tricky times for the world’s major oil producers: Prices are lower, the health of the global economy is uncertain, and, even as the Organization of the Petroleum Exporting Countries tries to cut output, supplies from other producers, notably the United States, are growing.\n\nNo wonder the group postponed its year-end meeting. Initially scheduled for last weekend in Vienna, the meeting is now planned for Thursday, barring another postponement. The agenda — whether to cut production..."
1,"A female employee of Citigroup recently sued the bank, accusing its leadership of tolerating a work culture in which she was sexually harassed and abused by male executives, in a high-profile application of a 2022 federal law that nullified forced arbitration for such cases.\n\nArdith Lindsey, a managing director at Citi, alleged that her 15-year career at the bank had increasingly become a “traumatizing” experience, especially after she ended a relationship with a former supervisor. The sup..."
2,General Motors is slowing the expansion of its Cruise automated driving division and significantly cutting spending at the unit after suspending operations in response to growing safety concerns about its driverless cars.\n\nThe company had been planning to roll out a ride service in San Francisco and three other cities and begin testing Cruise vehicles on the streets of several other markets. It now plans to focus on only one city as it works to improve the operation of its fleet of driverl...
3,"This year’s DealBook Summit will include conversations with global leaders and powerful figures from Wall Street, Silicon Valley and Hollywood. Each has been at the heart of the news this year and will be at the center of some of the biggest events in the months ahead.\n\nHere are the guests speaking with DealBook’s founder, Andrew Ross Sorkin:\n\nKamala Harris was elected vice president of the United States in 2020, after serving as a senator, the attorney general of California and the dist..."
4,"Shares of Berkshire Hathaway barely budged on Wednesday, a day after its vice chairman, Charlie Munger, died, reflecting the view among shareholders that Mr. Munger’s absence on the conglomerate’s day to day would have little impact on its future, even as they mourned the loss of the man who helped shape Berkshire’s culture.\n\nMr. Munger, who helped build Berkshire into a global investing powerhouse, died at a California hospital on Tuesday morning, according to an announcement from Berkshi..."
5,"The ride-hailing company framed the announcement as a partnership, sweetening the deal for new drivers by nixing the percentage of their fare that goes to Uber for the first six months. The first drivers, it said, had already begun signing up. Uber said it needed several hundred drivers to sign up in order to launch the service.\n\nBut many London cabdrivers had a scathing response.\n\n“We don’t need a partnership with Uber,” said the Licensed Taxi Drivers Association, a union that represent..."
6,"“To the extent there’s panic here about nonconsensual taking of contact information, I’m not that worried,” he said.\n\nHere’s what you need to know.\n\nHow does NameDrop work?\n\nTo use the feature, Apple users need to have updated their devices to the latest version of the operating system — iOS 17.1 for the iPhone or watchOS 10.1 for the Apple Watch, both of which have the feature enabled as a default setting.\n\nUsers hold one device over the other, within a few centimeters, until NameDr..."
7,"The hype and hope driving artificial intelligence, the rise of antisemitism since Hamas’ Oct. 7 attack on Israel, inflation, dysfunction in Washington and the streaming wars — these topics and more will be addressed at the 2023 DealBook Summit.\n\nOver more than eight hours on Wednesday, Andrew Ross Sorkin will interview the biggest newsmakers in the worlds of business, politics and culture. The first interview will begin shortly after 9 a.m. Eastern. Interviews that will be carried on nytim..."
8,"Jezebel, the famed feminist website, is set to return less than a month after it was shuttered.\n\nPaste Magazine, a music and culture outlet, acquired Jezebel on Tuesday and planned to start publishing on the site again as soon as Wednesday, said Josh Jackson, a co-founder and the editor in chief of Paste.\n\n“The idea of there not being a Jezebel right now just didn’t seem to make sense,” Mr. Jackson said.\n\nJezebel, once part of the Gawker universe of websites, brought a brash new kind o..."
9,"The lineup for DealBook Summit 2023\n\nOn Wednesday, DealBook will be live and in person at our annual summit in New York.\n\nAndrew takes the stage around 9 a.m. Eastern, and the first interview kicks off soon after. The DealBook team and reporters from The Times will be reporting live from the conference.\n\nEven if you are not with us, you can follow along here beginning at 8:30 a.m. Eastern.\n\nHere are the speakers:\n\nVice President Kamala Harris\n\nTsai Ing-wen , the president of Taiw..."


In [9]:
display(df[['summary sentiment','summary']].sort_values('summary sentiment'))

Unnamed: 0,summary sentiment,summary
1,-0.9674,"Ardith Lindsey, a managing director at Citi, alleged that her 15-year career at the bank had increasingly become a “traumatizing” experience, especially after she ended a relationship with a former supervisor.\nThe supervisor, Mani Singh, then sent her dozens of threatening text messages, according to her lawsuit.\nMark Costiglio, a Citi spokesman, said the bank had opened an investigation shortly after Ms. Lindsey disclosed the relationship in November 2022 and complained about Mr. Singh’s ..."
9,-0.9357,"The lineup for DealBook Summit 2023On Wednesday, DealBook will be live and in person at our annual summit in New York.\nThe DealBook team and reporters from The Times will be reporting live from the conference.\nHere are the speakers:Vice President Kamala HarrisTsai Ing-wen , the president of TaiwanElon Musk , the chairman and C.E.O.\nof Tesla and the chairman and chief technology officer of XLina Khan , the chair of the Federal Trade CommissionJamie Dimon , the chairman and C.E.O.\nIs this ..."
5,-0.7959,"Uber said it needed several hundred drivers to sign up in order to launch the service.\nBut many London cabdrivers had a scathing response.\n“We don’t need a partnership with Uber,” said the Licensed Taxi Drivers Association, a union that represents a majority of the city’s nearly 18,000 cabdrivers, in the headline of a release on Wednesday.\nThere was “no demand” for such a partnership from taxi drivers, the union’s general secretary, Steve McNamara, said in a statement, adding that their m..."
6,-0.6857,"“To the extent there’s panic here about nonconsensual taking of contact information, I’m not that worried,” he said.\nTo use the feature, Apple users need to have updated their devices to the latest version of the operating system — iOS 17.1 for the iPhone or watchOS 10.1 for the Apple Watch, both of which have the feature enabled as a default setting.\nThey can then choose to exchange contact details, or one may simply receive contact information from the other without reciprocating.\nNameD..."
7,-0.5423,"The hype and hope driving artificial intelligence, the rise of antisemitism since Hamas’ Oct. 7 attack on Israel, inflation, dysfunction in Washington and the streaming wars — these topics and more will be addressed at the 2023 DealBook Summit.\nOver more than eight hours on Wednesday, Andrew Ross Sorkin will interview the biggest newsmakers in the worlds of business, politics and culture.\nThe first interview will begin shortly after 9 a.m. Eastern.\nInterviews that will be carried on nytim..."
4,-0.0772,"Shares of Berkshire Hathaway barely budged on Wednesday, a day after its vice chairman, Charlie Munger, died, reflecting the view among shareholders that Mr. Munger’s absence on the conglomerate’s day to day would have little impact on its future, even as they mourned the loss of the man who helped shape Berkshire’s culture.\nMr. Munger, who helped build Berkshire into a global investing powerhouse, died at a California hospital on Tuesday morning, according to an announcement from Berkshire..."
13,0.1779,"Charles T. Munger, who quit a well-established law career to be Warren E. Buffett’s partner and maxim-spouting alter-ego as they transformed a struggling New England textile company into the spectacularly successful investment firm Berkshire Hathaway, died on Tuesday in Santa Barbara, Calif.\nHis death, at a hospital, was announced by Berkshire Hathaway.\nMr. Buffett has described him as the originator of Berkshire Hathaway’s investing approach.\n“The blueprint he gave me was simple: Forget ..."
8,0.3716,"Jezebel, the famed feminist website, is set to return less than a month after it was shuttered.\nPaste Magazine, a music and culture outlet, acquired Jezebel on Tuesday and planned to start publishing on the site again as soon as Wednesday, said Josh Jackson, a co-founder and the editor in chief of Paste.\n“The idea of there not being a Jezebel right now just didn’t seem to make sense,” Mr. Jackson said.\nJezebel, once part of the Gawker universe of websites, brought a brash new kind of inte..."
2,0.6712,General Motors is slowing the expansion of its Cruise automated driving division and significantly cutting spending at the unit after suspending operations in response to growing safety concerns about its driverless cars.\nThe company had been planning to roll out a ride service in San Francisco and three other cities and begin testing Cruise vehicles on the streets of several other markets.\nIt now plans to focus on only one city as it works to improve the operation of its fleet of driverle...
0,0.7883,"The agenda — whether to cut production further, and by how much — is likely to be unpalatable for many of the 23 members.\nIt has dropped even as producers in OPEC Plus, a bigger group that includes Russia, have cut production, but the coming months seem unlikely to give oil producers a respite from this squeeze.\nAfter three years of pandemic recovery and robust increases in demand for oil, appetite is expected to slow in 2024.\nOverall economic expansion is expected to be tepid while more ..."


In [13]:
# To .csv and save in same folder as notebook
df.to_csv('news.csv')