In [51]:
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [52]:
def get_page(url):
    """Download a webpage and return a beautiful soup doc"""
    ##### Web scrapper for infinite scrolling page #####
    driver = webdriver.Chrome(executable_path=r"E:\Chromedriver\chromedriver_win32_chrome83\chromedriver.exe")
    driver.get(url)
    time.sleep(2)  # Allow 2 seconds for the web page to open
    scroll_pause_time = 1 # You can set your own pause time. My laptop is a bit slow so I use 1 sec
    screen_height = driver.execute_script("return window.screen.height;")   # get the screen height of the web
    i = 1

    while True:
        # scroll one screen height each time
        driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))  
        i += 1
        time.sleep(scroll_pause_time)
        # update scroll height each time after scrolled, as the scroll height can change after we scrolled the page
        scroll_height = driver.execute_script("return document.body.scrollHeight;")  
        # Break the loop when the height we need to scroll to is larger than the total scroll height
        if i == 10:
            break 

    ##### Extract Reddit URLs #####
    soup = BeautifulSoup(driver.page_source, "html.parser")
    return soup

In [53]:
def get_news_tags(doc):
    """Get the list of tags containing news information"""
    news_class = "Ov(h) Pend(44px) Pstart(25px)" ## class name of div tag 
    news_list  = doc.find_all('div', {'class': news_class})
    return news_list

In [54]:
BASE_URL = 'https://finance.yahoo.com' #Global Variable 

def parse_news(news_tag):
    """Get the news data point and return dictionary"""
    news_source = news_tag.find('div').text #source
    news_headline = news_tag.find('a').text #heading
    news_url = news_tag.find('a')['href'] #link
    news_content = news_tag.find('p').text #content
    news_image = news_tag.findParent().find('img')['src'] #thumb image
    return { 'source' : news_source,
            'headline' : news_headline,
            'url' : BASE_URL + news_url,
            'content' : news_content,
            'image' : news_image
           }

In [55]:
import pandas as pd

In [56]:
def scrape_yahoo_news(url, path=None):
    """Get the yahoo finance market news and write them to CSV file """
    if path is None:
        path = 'stock-market-news.csv'
        
    print('Requesting html page')
    doc = get_page(url)

    print('Extracting news tags')
    news_list = get_news_tags(doc)

    print('Parsing news tags')
    news_data = [parse_news(news_tag) for news_tag in news_list]

    print('Save the data to a CSV')
    news_df = pd.DataFrame(news_data)
    news_df.to_csv(path, index=None)
    
    #This return statement is optional, we are doing this just analyze the final output 
    return news_df 

In [58]:
YAHOO_NEWS_URL = BASE_URL+'/topic/stock-market-news/'
news_df = scrape_yahoo_news(YAHOO_NEWS_URL)

Requesting html page


  driver = webdriver.Chrome(executable_path=r"E:\Chromedriver\chromedriver_win32_chrome83\chromedriver.exe")


Extracting news tags
Parsing news tags
Save the data to a CSV


In [59]:
news_df.head()

Unnamed: 0,source,headline,url,content,image
0,Business,"Stocks Mixed on China Reopening, Recession Fea...",https://finance.yahoo.com/news/asia-stocks-fac...,(Bloomberg) -- Stocks in Asia fluctuated follo...,https://s.yimg.com/uu/api/res/1.2/CKM9ymnkGony...
1,Business,Why This Market Rally Is So Dangerous; Tesla N...,https://finance.yahoo.com/m/4fb69026-f66a-32a5...,"The market rally is holding key levels, but th...",https://s.yimg.com/uu/api/res/1.2/4jm0NjL0spXk...
2,Business,Hong Kong Stocks Rebound on Report City May Re...,https://finance.yahoo.com/news/hong-kong-stock...,(Bloomberg) -- Hong Kong stocks jumped followi...,https://s.yimg.com/uu/api/res/1.2/YRnZMl7jn9Le...
3,Business,Asia stocks edge up despite global growth worries,https://finance.yahoo.com/news/asia-stocks-edg...,"Asian equities edged higher on Thursday, propp...",https://s.yimg.com/uu/api/res/1.2/5_bi.G5fM7G7...
4,Business,Are Options Traders Betting on a Big Move in P...,https://finance.yahoo.com/news/options-traders...,Investors need to pay close attention to Peabo...,https://s.yimg.com/uu/api/res/1.2/rbQppgrmFdTX...
