In [None]:
import jovian

In [None]:
# Execute this cell to save a new version of the notebook to the Jovian
# project named below.
jovian.commit(project="yahoo-finance-web-scraper")

In [None]:
from urllib.parse import urljoin
from urllib.request import Request, urlopen

import requests
from bs4 import BeautifulSoup

In [None]:
# Page to scrape.
my_url = 'https://finance.yahoo.com/topic/stock-market-news/'

# Browser-like User-Agent header; `headers` is also read by get_page().
headers = {"User-Agent": "Mozilla/5.0"}
# Consent cookie sent along with the request.
# NOTE(review): presumably meant to pre-accept a consent prompt - confirm it
# has an effect on this site.
cookies = {"CONSENT": "YES+cb.20210720-07-p0.en+FX+410"}

# Pass the cookies that were prepared above, and set an explicit timeout:
# requests waits indefinitely when none is given, which would hang the kernel
# on a stalled connection.
response = requests.get(my_url, headers=headers, cookies=cookies, timeout=30)

In [None]:
# Sanity check: show whether the request succeeded and which HTTP status code came back.
print("response.ok : {} , response.status_code : {}".format(response.ok , response.status_code))

In [None]:
# Peek at the first 500 characters of the response body only; the full text is not printed.
print("Preview of response.text : ", response.text[:500])

In [None]:
def get_page(url, timeout=30):
    """Download a webpage and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. requests
            waits indefinitely when no timeout is supplied, so a stalled
            connection would otherwise hang the notebook.

    Returns:
        BeautifulSoup: The response text parsed with the 'html.parser' parser.

    Raises:
        Exception: If the response is not OK (``response.ok`` is false).
        requests.exceptions.RequestException: If the connection fails or the
            timeout expires.
    """
    # `headers` and `cookies` are the module-level request settings.
    response = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)
    if not response.ok:
        print('Status code:', response.status_code)
        raise Exception('Failed to load page {}'.format(url))
    return BeautifulSoup(response.text, 'html.parser')

In [None]:
# Download and parse the page, then confirm the type of the object that came back.
doc = get_page(my_url)
print('Type of doc: ',type(doc))

In [None]:
# Look up the page's <title> element; as the cell's last expression, its value is displayed.
doc.find('title')

In [None]:
# Collect every <div> matching the class string below (the same class string
# that get_news_tags() searches for).
div_tags = doc.find_all('div', {'class': "Ov(h) Pend(44px) Pstart(25px)"})

In [None]:
# Number of matching <div> tags found on the page.
len(div_tags)

In [None]:
# Inspect the raw markup of one matched tag (index 1, i.e. the second match).
print(div_tags[1])

In [None]:
# Text of the first nested <div> and of the first <a> inside that tag,
# labelled here as the news source and the headline.
print("Source: ", div_tags[1].find('div').text)
print("Head Line : {}".format(div_tags[1].find('a').text))

In [None]:
def get_news_tags(doc):
    """Return the <div> tags in ``doc`` that carry the news-item class string."""
    # Attribute filter: the class string that marks a news <div>.
    attrs_filter = {'class': "Ov(h) Pend(44px) Pstart(25px)"}
    return doc.find_all('div', attrs_filter)

In [None]:
BASE_URL = 'https://finance.yahoo.com' # Site root used to resolve relative links

def parse_news(news_tag):
    """Extract the data points of one news tag into a dictionary.

    Args:
        news_tag: A parsed tag for one news item. Its first nested <div>, <a>
            and <p> are read, plus the first <img> found under its parent.

    Returns:
        dict: Keys 'source', 'headline', 'url', 'content' and 'image'.
        A value is None when the corresponding element (or its attribute)
        is missing, so one incomplete item does not abort a whole scrape.
    """
    source_tag = news_tag.find('div')   # source
    link_tag = news_tag.find('a')       # headline + link
    content_tag = news_tag.find('p')    # content
    # The thumbnail is looked up under the parent of the news tag.
    parent_tag = news_tag.find_parent()
    image_tag = parent_tag.find('img') if parent_tag is not None else None

    href = link_tag.get('href') if link_tag is not None else None
    return {
        'source': source_tag.text if source_tag is not None else None,
        'headline': link_tag.text if link_tag is not None else None,
        # urljoin leaves absolute links untouched and resolves relative ones
        # against BASE_URL; plain concatenation would corrupt absolute links.
        'url': urljoin(BASE_URL, href) if href else None,
        'content': content_tag.text if content_tag is not None else None,
        'image': image_tag.get('src') if image_tag is not None else None,
    }

In [None]:
import pandas as pd

In [None]:
def scrape_yahoo_news(url, path=None):
    """Scrape the news items from a Yahoo Finance page and write them to a CSV file.

    Args:
        url: Address of the news page to scrape.
        path: Destination CSV file; 'stock-market-news.csv' when None.

    Returns:
        pandas.DataFrame: One row per parsed news tag, the same data that is
        written to ``path``.
    """
    if path is None:
        path = 'stock-market-news.csv'

    print('Requesting html page')
    doc = get_page(url)

    print('Extracting news tags')
    news_list = get_news_tags(doc)
    if not news_list:
        # Without this hint an empty match silently produces an empty CSV.
        print('Warning: no news tags found; the page markup may have changed')

    print('Parsing news tags')
    news_data = [parse_news(news_tag) for news_tag in news_list]

    print('Save the data to a CSV')
    news_df = pd.DataFrame(news_data)
    # index=False is the documented way to omit the row index from the file.
    news_df.to_csv(path, index=False)

    # Returning the frame is optional; it is done so the final output can be inspected.
    return news_df

In [None]:
# Build the news-page address from the site root and run the full scrape,
# which writes the CSV to its default path and returns the DataFrame.
YAHOO_NEWS_URL = BASE_URL+'/topic/stock-market-news/'
news_df = scrape_yahoo_news(YAHOO_NEWS_URL)

In [None]:
# Preview the first rows of the scraped DataFrame.
news_df.head()