In [1]:
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import seaborn as sns
from datetime import date

import warnings
# Suppress every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Environment settings: show frames in full when they are displayed.
# Options are addressed by their complete names; pandas resolves shortened
# names by pattern matching, which turns into an error as soon as a second
# option happens to match the same pattern.
pd.set_option('display.max_columns', None)       # None = no cap on displayed columns
pd.set_option('display.max_rows', None)          # None = no cap on displayed rows
pd.set_option('display.max_seq_items', None)     # None = never elide long sequences
pd.set_option('display.max_colwidth', 500)       # keep up to 500 characters per cell
pd.set_option('display.expand_frame_repr', True) # wrap wide frames across lines

In [2]:
def get_page(url,
             driver_path=r"E:\Chromedriver\chromedriver_win32_chrome83\chromedriver.exe",
             max_scrolls=19,
             scroll_pause_time=1,
             load_wait=2):
    """Open an infinite-scrolling page in Chrome and return it as a BeautifulSoup doc.

    The page is scrolled down one screen height at a time so that lazily
    loaded content is added to the DOM before the HTML is captured.

    Parameters
    ----------
    url : str
        Address of the page to load.
    driver_path : str, optional
        Location of the chromedriver executable. Defaults to the path this
        notebook was written with; pass your own path on another machine.
    max_scrolls : int, optional
        Number of one-screen scroll steps to perform (default 19).
    scroll_pause_time : float, optional
        Seconds to wait after each scroll step so new content can load.
    load_wait : float, optional
        Seconds to wait after opening the page before scrolling starts.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed HTML of the page as it stands after the last scroll.
    """
    # NOTE(review): newer Selenium releases configure the driver location through
    # a Service object instead of `executable_path` - confirm against the
    # installed Selenium version.
    driver = webdriver.Chrome(executable_path=driver_path)
    try:
        driver.get(url)
        time.sleep(load_wait)  # give the page time to open

        # Height of the screen; each step scrolls one more screen further down.
        screen_height = driver.execute_script("return window.screen.height;")

        # Infinite-scroll pages keep growing, so the number of steps is capped
        # rather than waiting for the bottom of the document.
        for i in range(1, max_scrolls + 1):
            driver.execute_script(
                "window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i)
            )
            time.sleep(scroll_pause_time)

        # Parse while the browser is still open; the soup is independent of it afterwards.
        return BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # Always release the browser process, even if loading or scrolling failed.
        driver.quit()

In [3]:
def get_news_tags(doc):
    """Return every <div> in `doc` whose class attribute marks a news entry."""
    # Attribute filter: the class string carried by the div of each news item.
    news_filter = {'class': "Ov(h) Pend(44px) Pstart(25px)"}
    return doc.find_all('div', news_filter)

In [4]:
BASE_URL = 'https://finance.yahoo.com' #Global Variable: site root used to resolve relative links


def _text_or_empty(element):
    """Return the text of a parsed element, or '' when the element was not found."""
    return element.text if element is not None else ''


def parse_news(news_tag):
    """Extract the data points of one news item and return them as a dictionary.

    Parameters
    ----------
    news_tag : bs4.element.Tag
        The tag wrapping a single news item, as returned by `get_news_tags`.

    Returns
    -------
    dict
        Keys 'source', 'time', 'headline', 'url' and 'content'. A piece that
        is missing from the markup is returned as an empty string (for 'url',
        the bare BASE_URL) instead of aborting the whole scrape.
    """
    spans = news_tag.find_all('span')   # first span: source, second span: time
    link_tag = news_tag.find('a')       # anchor carries both headline text and link
    href = link_tag.get('href', '') if link_tag is not None else ''

    return { 'source' : _text_or_empty(spans[0] if len(spans) > 0 else None),
            'time' : _text_or_empty(spans[1] if len(spans) > 1 else None),
            'headline' : _text_or_empty(link_tag),
            # urljoin keeps absolute links untouched and resolves relative ones
            # against the site root; plain concatenation would corrupt the former.
            'url' : urljoin(BASE_URL, href),
            'content' : _text_or_empty(news_tag.find('p')),
           }

In [5]:
def scrape_yahoo_news(url, path=None):
    """Scrape the Yahoo Finance news listing at `url` into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the news listing page.
    path : str, optional
        When given, the scraped table is also written to this CSV file
        (without the index). When omitted, nothing is written to disk.

    Returns
    -------
    pandas.DataFrame
        One row per news item with the columns 'source', 'time', 'headline',
        'url' and 'content'. The columns are present even when no news item
        was found, so downstream column access does not fail.
    """
    # Download the page, locate the news containers and parse each of them.
    doc = get_page(url)
    news_list = get_news_tags(doc)
    news_data = [parse_news(news_tag) for news_tag in news_list]

    news_df = pd.DataFrame(news_data)
    if news_df.empty:
        # Nothing matched (e.g. the page markup changed): keep the expected schema.
        news_df = pd.DataFrame(columns=['source', 'time', 'headline', 'url', 'content'])

    # Persist only on request so a plain call has no side effect on disk.
    if path is not None:
        news_df.to_csv(path, index=False)

    return news_df

In [6]:
# url = 'https://finance.yahoo.com/topic/stock-market-news/'
# doc = get_page(url)
# news_list = get_news_tags(doc)
# news_list

## Entire Stock Market News

In [7]:
from datetime import datetime

# Listing page to scrape: the stock-market-news topic under the site root.
YAHOO_NEWS_URL = BASE_URL + '/topic/stock-market-news/'

# Stamp the run so the scraped snapshot can be dated.
now = datetime.now()
current_time = format(now, "%H:%M:%S")
print("Report Date = ", date.today())
print("Report Time =", current_time)

# Launches the browser-driven scrape; the result feeds the sentiment cells below.
news_df = scrape_yahoo_news(YAHOO_NEWS_URL)

Report Date =  2022-12-15
Report Time = 13:34:25


In [8]:
from transformers import pipeline

# Same warning suppression and display setting as applied in the first cell.
import warnings
# Suppress every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

In [9]:
# Build the sentiment classifier once and reuse it for every text below.
# The model is pinned by name so results do not depend on the library's default choice.
# NOTE(review): presumably the weights are downloaded on first use - confirm network access.
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

In [10]:
def add_sentiment_columns(frame, text_column, label_column, score_column,
                          classifier, max_chars=512):
    """Classify every text of one column and store label and score in `frame`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding the texts; it is modified in place.
    text_column : str
        Name of the column whose texts are classified.
    label_column, score_column : str
        Names of the columns that receive the predicted label and its score.
    classifier : callable
        Called with one string; must return a list whose first item is a
        dict with the keys 'label' and 'score'.
    max_chars : int, optional
        Each text is cut to this many characters before classification.

    Returns
    -------
    None
        Nothing is returned so the call does not echo the frame in the notebook.
    """
    labels, scores = [], []
    for text in frame[text_column]:
        # A missing value (e.g. NaN) cannot be sliced; classify it as empty text.
        snippet = text[:max_chars] if isinstance(text, str) else ""
        result = classifier(snippet)[0]
        labels.append(result['label'])
        scores.append(result['score'])
    frame[label_column] = labels
    frame[score_column] = scores


# Headline sentiment -> 'Sentiment' / 'Score'
add_sentiment_columns(news_df, 'headline', 'Sentiment', 'Score', sentiment_pipeline)

# Article summary sentiment -> 'Sentiment2' / 'Score2'
add_sentiment_columns(news_df, 'content', 'Sentiment2', 'Score2', sentiment_pipeline)

In [11]:
# Rows whose headline score is above 0.8; the filter does not look at the label,
# so both POSITIVE and NEGATIVE headlines can appear.
news_df.loc[news_df['Score'].gt(0.8)]

Unnamed: 0,source,time,headline,content,Sentiment,Score,Sentiment2,Score2
0,MarketWatch,13 minutes ago,‘The Fed is going to overdo it’: Financial markets react to U.S. central bank’s 2023 rate outlook and weak data,"Financial markets are still absorbing the Federal Reserve's stridently hawkish interest-rate outlook for 2023, plus fresh signs of a weakening U.S. economy",NEGATIVE,0.999301,POSITIVE,0.989069
1,MarketWatch,13 minutes ago,"Dow skids nearly 950 points, stocks hit session lows in afternoon trade","U.S. stocks tumble on Thursday early afternoon, adding to the previous day's losses, a day after the Federal Reserve raised rates and revived recession worries.",NEGATIVE,0.994169,NEGATIVE,0.998871
2,The Wall Street Journal,14 minutes ago,Dow Falls More Than 900 Points on Interest-Rate Expectations,U.S. stocks fell a day after the Federal Reserve signaled plans to lift interest rates through the spring.,NEGATIVE,0.999368,NEGATIVE,0.999638
3,Bloomberg,17 minutes ago,Stocks Push Lower as Traders Digest Rate Moves: Markets Wrap,"(Bloomberg) -- Stocks across global financial markets were pummeled after a wave of rate hikes from central banks, with the Federal Reserve and the European Central Bank warning of more pain to come. Most Read from BloombergIs Putin Finally Getting Smart About His Ukraine Disaster?Elon Musk’s Tesla Share Sales Approach the $40 Billion MarkPowell Says Fed Still Has a ‘Ways to Go’ After Half-Point HikeThis Is the World’s Biggest Stock Winner of 2022 With 1,600% GainUS equities hit session lows in",NEGATIVE,0.993602,NEGATIVE,0.995497
4,MarketWatch,25 minutes ago,"AMD stock named Morgan Stanley’s top chip pick, but not out of ‘enthusiasm’","AMD is now Morgan Stanley's top pick in the semiconductor space, taking that spot from Lam Research.",NEGATIVE,0.994043,POSITIVE,0.998617
5,Yahoo Finance,36 minutes ago,"Stock market news live updates: Stocks plunge as rate hikes rattle markets, retail sales miss",U.S. stocks descended Thursday morning as Wall Street reeled from another sizable rate hike by Federal Reserve officials and assessed similar moves by central bank officials across the Atlantic. A disappointing reading on consumer spending also weighed on sentiment.,NEGATIVE,0.998273,NEGATIVE,0.996583
6,Investor's Business Daily,36 minutes ago,Investors In China Stocks Operating Blind Under New Covid Policies,China stocks booked broad losses Thursday as Covid cases appeared to be increasing. Analysts advise caution ahead of China's Lunar New Year.,NEGATIVE,0.996488,NEGATIVE,0.98975
7,Reuters,38 minutes ago,Column-Don't fight the Fed? Someone better remind markets: McGeever,"ORLANDO, Fla. (Reuters) - ""Don't fight the Fed"" is one of the most hallowed commandments in financial markets, but for most of this year traders and investors have ignored it. It turns out that playing some Fed policy turn in 2023, rather than guessing any absolute peak rate itself, would have proven lucrative even in the face of constant Fed pushback about possible easing next year. That was when, against a backdrop of surging inflation, the Fed's original pivot to its most hawkish polic...",NEGATIVE,0.994439,NEGATIVE,0.993897
8,Yahoo Finance,38 minutes ago,"Retail stocks including Macy's, Target get smoked as markets tank after retail sales miss",An ugly day for retail stocks followed an ugly retail sales report early Thursday morning.,NEGATIVE,0.996973,NEGATIVE,0.999786
9,Investor's Business Daily,41 minutes ago,"IBD Stock Of The Day Vertex Eyes A Brighter 2023 On Moderna, Crispr Plans","Vertex is the IBD Stock Of The Day. The FDA signed off on its newest study in cystic fibrosis, but VRTX stock is trading sideways.",POSITIVE,0.993967,NEGATIVE,0.998644
