In [1]:
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import seaborn as sns
from datetime import date

import warnings
warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Environment settings: 
pd.set_option('display.max_column', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_seq_items', None)
pd.set_option('display.max_colwidth', 500)
pd.set_option('expand_frame_repr', True)

In [2]:
def get_page(url):
    """Download a webpage and return a beautiful soup doc"""
    ##### Web scrapper for infinite scrolling page #####
    driver = webdriver.Chrome(executable_path=r"E:\Chromedriver\chromedriver_win32_chrome83\chromedriver.exe")
    driver.get(url)
    time.sleep(2)  # Allow 2 seconds for the web page to open
    scroll_pause_time = 1 # You can set your own pause time. My laptop is a bit slow so I use 1 sec
    screen_height = driver.execute_script("return window.screen.height;")   # get the screen height of the web
    i = 1

    while True:
        # scroll one screen height each time
        driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))  
        i += 1
        time.sleep(scroll_pause_time)
        # update scroll height each time after scrolled, as the scroll height can change after we scrolled the page
        scroll_height = driver.execute_script("return document.body.scrollHeight;")  
        # Break the loop when the height we need to scroll to is larger than the total scroll height
        if i == 10:
            break 

    ##### Extract Reddit URLs #####
    soup = BeautifulSoup(driver.page_source, "html.parser")
    return soup

In [3]:
def get_news_tags(doc):
    """Get the list of tags containing news information"""
    news_class = "Ov(h) Pend(44px) Pstart(25px)" ## class name of div tag 
    news_list  = doc.find_all('div', {'class': news_class})
    return news_list

In [4]:
BASE_URL = 'https://finance.yahoo.com' #Global Variable 

def parse_news(news_tag):
    """Get the news data point and return dictionary"""
    news_source = news_tag.find_all('span')[0].text #source
    news_time = news_tag.find_all('span')[1].text #time
    news_headline = news_tag.find('a').text #heading
    news_url = news_tag.find('a')['href'] #link
    news_content = news_tag.find('p').text #content
    news_image = news_tag.findParent().find('img')['src'] #thumb image
    return { 'source' : news_source,
            'time' : news_time,    
            'headline' : news_headline,
            'url' : BASE_URL + news_url,
            'content' : news_content,
           }

In [5]:
def scrape_yahoo_news(url, path=None):
    """Get the yahoo finance market news and write them to CSV file """
    if path is None:
        path = 'stock-market-news.csv'
        
    #print('Requesting html page')
    doc = get_page(url)

    #print('Extracting news tags')
    news_list = get_news_tags(doc)

    #print('Parsing news tags')
    news_data = [parse_news(news_tag) for news_tag in news_list]

    #print('Save the data to a CSV')
    news_df = pd.DataFrame(news_data)
    #news_df.to_csv(path, index=None)
    
    #This return statement is optional, we are doing this just analyze the final output 
    return news_df 

In [6]:
# url = 'https://finance.yahoo.com/topic/stock-market-news/'
# doc = get_page(url)
# news_list = get_news_tags(doc)
# news_list

## Collecting Stock Market News From Yahoo Finance

In [7]:
# from datetime import datetime

# now = datetime.now()
# current_time = now.strftime("%H:%M:%S")
# print("Report Date = ", date.today())
# print("Report Time =", current_time)
# YAHOO_NEWS_URL = BASE_URL + '/topic/stock-market-news/'
# news_df = scrape_yahoo_news(YAHOO_NEWS_URL)

In [8]:
# from transformers import pipeline

### Sentiment Analysis of Yahoo Finance

In [9]:
# sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# sentiment = [None] * len(news_df)
# sentiment_score = [None] * len(news_df)
# index = -1
# for sentence in news_df['headline']:
#     index+=1
#     result = sentiment_pipeline(sentence[:512])[0]
#     sentiment[index] = result['label']
#     sentiment_score[index] = result['score']
# news_df['Sentiment'] = sentiment
# news_df['Score'] = sentiment_score


# sentiment = [None] * len(news_df)
# sentiment_score = [None] * len(news_df)
# index = -1
# for sentence in news_df['content']:
#     index+=1
#     result = sentiment_pipeline(sentence[:512])[0]
#     sentiment[index] = result['label']
#     sentiment_score[index] = result['score']
# news_df['Sentiment2'] = sentiment
# news_df['Score2'] = sentiment_score

In [10]:
# news_df[news_df['Score'] > 0.8]

### Sentiment of Headline

In [11]:
# sns.histplot(x = 'Sentiment', data = news_df[news_df['Score'] > 0.8], hue = 'Sentiment', hue_order=['POSITIVE', 'NEGATIVE']).set_title("Sentiment of Headline")

### Sentiment of Content

In [12]:
# sns.histplot(x = 'Sentiment2', data = news_df[news_df['Score2'] > 0.8], hue = 'Sentiment2', hue_order=['POSITIVE', 'NEGATIVE']).set_title("Sentiment of Content")

In [13]:
import sys
sys.path.insert(1, 'C:/Users/Woon/Desktop/Columbia/Applied Analytics/Term3/Sentiment_SNP')

import helper_functions
from helper_functions import *

In [14]:
from collections import defaultdict
from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)


## Each Articles - From Finviz

In [15]:
from urllib.request import Request, urlopen
from urllib.error import HTTPError
import socket
import urllib

In [16]:
from datetime import datetime

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Report Date = ", date.today())
print("Report Time =", current_time)

Report Date =  2022-12-17
Report Time = 13:12:56


In [17]:
# Scraping useful information

url = "https://finviz.com/news.ashx"


headers = {"User-Agent": "Mozilla/5.0"}
cookies = {"CONSENT": "YES+cb.20210720-07-p0.en+FX+410"}

req = Request(url, headers=headers)

try:
    contents = urlopen(req).read() 
    soup = BeautifulSoup(contents, features="html.parser")

    link2 = []
    headline2 = []
    date2 = []

    for row in soup.find_all('tr', class_ = 'nn'):
        headline = row.find('a', class_ = 'nn-tab-link')
       
        if headline == None:
            continue

        headline2.append(headline.text)
        link = row.find('a', class_ = 'nn-tab-link')
        link2.append(link['href'])
        date = row.find('td', class_ = 'nn-date')
        date2.append(date)

except urllib.error.HTTPError as err:
    print(err.code)

except socket.timeout as se:
    print("socket timeout")

In [18]:
#Turning the data into a dataframe
df = pd.DataFrame(date2)

In [19]:
df['Date'] = df[0]
df['Headline'] = headline2
df = pd.DataFrame(list(zip(df['Date'], df['Headline'])), columns=['Date', 'Headline'])
df['Links'] = link2

In [20]:
# # Filtering only links starting with https
# df['Links_True'] = list(
#     map(lambda x: x.startswith('http'), df['Links']))

# df = df[df['Links_True'] == True].reset_index(drop=True)

In [21]:
# Creating word count of headline (no longer needed)
i = []
for string in df['Headline']:
    i.append(count_words(string))
df['word_count'] = i

In [22]:
from transformers import pipeline

In [23]:
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

In [24]:
# Sentiment analysis of headline
df['Sentiment_Head'] = -2

sentiment = [None] * len(df)
sentiment_score = [None] * len(df)
index = -1
for sentence in df['Headline']:
    index+=1
    result = sentiment_pipeline(sentence[:512])[0]
    sentiment[index] = result['label']
    sentiment_score[index] = result['score']
df['Sentiment_Head'] = sentiment
df['Score_Head'] = sentiment_score

In [25]:
# Collect entire articles

headers = {"User-Agent": "Mozilla/5.0"}
cookies = {"CONSENT": "YES+cb.20210720-07-p0.en+FX+410"}

article = []

for url in df['Links']:
    req = Request(url, headers=headers)

    try:
        contents = urlopen(req).read() 
        soup = BeautifulSoup(contents, features="html.parser")

        p = soup.find_all('p')
        paragraphs = []
        for x in p:
            paragraphs.append(str(x))

        paragraphs = ''.join(paragraphs)

        article.append(paragraphs) 

    except urllib.error.HTTPError as err:
        print(err.code)

    except socket.timeout as se:
        print("socket timeout")

df['article'] = article


In [26]:
df = df[df['Score_Head'] > 0.9]

In [27]:
# Summary of Articles
summary_pipeline = pipeline("summarization", model="pszemraj/long-t5-tglobal-base-16384-book-summary")

summary = [None] * len(df)
index = -1
for articles in df['article']:
    index+=1
    result = summary_pipeline(articles[:512])[0]
    summary[index] = result
df['summary'] = summary

Your max_length is set to 512, but you input_length is only 127. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)
Your max_length is set to 512, but you input_length is only 191. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=95)
Your max_length is set to 512, but you input_length is only 186. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=93)
Your max_length is set to 512, but you input_length is only 160. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=80)
Your max_length is set to 512, but you input_length is only 198. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=99)
Your max_length is set to 512, but you input_length is only 251. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=125)
Your max_length is set to 512, but you input_length is only 160. You might 

In [28]:
# Sentiment analysis of Article Summary
df['Sentiment_Article'] = -2

sentiment = [None] * len(df)
sentiment_score = [None] * len(df)
index = -1
for sentence in df['Headline']:
    index+=1
    result = sentiment_pipeline(sentence[:512])[0]
    sentiment[index] = result['label']
    sentiment_score[index] = result['score']
df['Sentiment_Article'] = sentiment
df['Score_Article'] = sentiment_score

In [37]:
df

Unnamed: 0,Date,Headline,Links,word_count,Sentiment_Head,Score_Head,article,summary,Sentiment_Article,Score_Article
0,12:56PM,Stocks are tumbling because investors now fear recession more than inflation,https://www.marketwatch.com/story/stock-market-investors-now-fear-recession-more-than-inflation-heres-why-11671234234,11,NEGATIVE,0.998701,"<p>A stock-market paradox, in which bad news about the economy is seen as good news for equities, may have run its course. If so, investors should expect bad news to be bad news for stocks heading into the new year — and there may be plenty of it.</p><p>But first, why would good news be bad news? Investors have spent 2022 largely focused on the Federal Reserve and its rapid series of large rate hikes aimed at bringing inflation to heel. Economic news pointing to slower growth and less fuel f...","{'summary_text': 'A stock market paradox: In which bad news is seen as ""good news"" for Equities, there may be a run-up in the price of those stocks.'}",NEGATIVE,0.998701
1,12:53PM,Stocks Fall as Recession Concerns Mount,https://www.wsj.com/articles/global-stocks-markets-dow-update-12-16-2022-11671190874,6,NEGATIVE,0.999386,"<p>\nThis copy is for your personal, non-commercial use only. Distribution and use of this material are governed by\nour Subscriber Agreement and by copyright law. For non-personal use or to order multiple copies, please contact\nDow Jones Reprints at 1-800-843-0008 or visit www.djreprints.com.\n</p><p>https://www.wsj.com/articles/global-stocks-markets-dow-update-12-16-2022-11671190874</p><p class=""css-1evf9kl-ListenToArticleTitle e1f89e201"">Listen to article</p><p class=""css-17yomag-Minutes...","{'summary_text': 'This is a copyrighted work, and may not be used in any other way.'}",NEGATIVE,0.999386
2,12:41PM,Self-driving truck company TuSimple to lay off hundreds days before Christmas: report,https://foxbusiness.com/economy/self-driving-truck-company-tusimple-lay-off-hundreds-days-before-christmas-report,12,NEGATIVE,0.999238,"<p>\n Quotes displayed in real-time or delayed by at least 15 minutes. Market data provided by <a href=""https://www.factset.com/"">Factset</a>.\n Powered and implemented by <a href=""https://www.factset.com/solutions/business-needs/digital-solutions"">FactSet Digital Solutions</a>. \n <a href=""https://www.factset.com/privacy"">Legal Statement</a>. Mutual Fund and ETF data provided by <a href=""https://lipperalpha.refinitiv.com/"">Refinitiv Lipper</a>.\n </...","{'summary_text': 'The narrator gives us some background on the company and its business. It's pretty clear that it's not exactly a big deal, but it does give us an idea of how important it is to be in business.'}",NEGATIVE,0.999238
3,12:32PM,Puerto Rico Power Utility Plan to Cut Debt by 40%,https://www.bloomberg.com/news/articles/2022-12-17/puerto-rico-board-seeking-to-cut-power-utility-s-9-billion-of-debt?srnd=markets-vp,10,NEGATIVE,0.932489,"<p class=""continue"">To continue, please click the box below to let us know you're not a robot.</p><p class=""info__text"">Please make sure your browser supports JavaScript and cookies and that you are not\n blocking them from loading.\n For more information you can review our <a class=""info__link"" href=""/notices/tos"">Terms of\n Service</a> and <a class=""info__link"" href=""/notices/tos"">Cookie Policy</a>.</p><p class=""info__text"">For inquiries related to this...","{'summary_text': 'This page is designed to let you know that if you're a robot, you can continue.'}",NEGATIVE,0.932489
4,12:11PM,Moscow appeals for army recruits to fight in Ukraine,https://www.cnn.com/2022/12/17/europe/moscow-army-recruits-intl/index.html,9,POSITIVE,0.929347,"<p class=""paragraph inline-placeholder"" data-component-name=""paragraph"" data-editable=""text"" data-uri=""archive.cms.cnn.com/_components/paragraph/instances/paragraph_C90383D9-C6C7-76BF-E3B5-2092FB310E80@published"">\n Moscow has begun a new campaign to encourage Russians to enlist in the armed forces and fight in <a href=""https://www.cnn.com/europe/live-news/russia-ukraine-war-news-12-17-22/index.html"" target=""_blank"">Ukraine</a>, despite the Kremlin having denied needing more recruits. \...",{'summary_text': 'The Russians are preparing to fight in the U.S.'},POSITIVE,0.929347
5,11:53AM,Elon Musk offers journalists he banned from Twitter ability to return under certain condition,https://www.cnn.com/2022/12/17/business/elon-musk-twitter-ban-reverse-conditions/index.html,14,NEGATIVE,0.99467,"<p class=""market-feature-ribbon__column-header"">Markets <svg class=""right-arrow"" enable-background=""new 0 0 492.004 492.004"" version=""1.1"" viewbox=""0 0 492.004 492.004"" xml:space=""preserve"" xmlns=""http://www.w3.org/2000/svg"">\n<path d=""m484.14 226.89l-177.68-177.68c-5.072-5.072-11.832-7.856-19.04-7.856-7.216 0-13.972 2.788-19.044 7.856l-16.132 16.136c-5.068 5.064-7.86 11.828-7.86 19.04 0 7.208 2.792 14.2 7.86 19.264l103.66 103.88h-329.32c-14.848 0-26.58 11.624-26.58 26.476v22.812c0 14.852 11...","{'summary_text': 'The following morning, the UM has gone to work on his new machine.'}",NEGATIVE,0.99467
6,11:11AM,Hedge Fund Manager Netting 29% Gain Sees S&P 500 Going Nowhere,https://www.bloomberg.com/news/articles/2022-12-17/hedge-fund-manager-netting-29-gain-sees-s-p-500-going-nowhere,11,NEGATIVE,0.999123,"<p class=""continue"">To continue, please click the box below to let us know you're not a robot.</p><p class=""info__text"">Please make sure your browser supports JavaScript and cookies and that you are not\n blocking them from loading.\n For more information you can review our <a class=""info__link"" href=""/notices/tos"">Terms of\n Service</a> and <a class=""info__link"" href=""/notices/tos"">Cookie Policy</a>.</p><p class=""info__text"">For inquiries related to this...","{'summary_text': 'This page is designed to let you know that if you're a robot, you can continue.'}",NEGATIVE,0.999123
7,10:20AM,Taco Bell mulling plan to make fries permanent fixture on menu to compete with McDonald's,https://foxbusiness.com/lifestyle/taco-bell-mulling-plan-make-fries-permanent-fixture-menu-compete-mcdonalds,15,NEGATIVE,0.992,"<p>\n Quotes displayed in real-time or delayed by at least 15 minutes. Market data provided by <a href=""https://www.factset.com/"">Factset</a>.\n Powered and implemented by <a href=""https://www.factset.com/solutions/business-needs/digital-solutions"">FactSet Digital Solutions</a>. \n <a href=""https://www.factset.com/privacy"">Legal Statement</a>. Mutual Fund and ETF data provided by <a href=""https://lipperalpha.refinitiv.com/"">Refinitiv Lipper</a>.\n </...","{'summary_text': 'The narrator gives us some background on the company and its business. It's pretty clear that it's not exactly a big deal, but it does give us an idea of how important it is to be in business.'}",NEGATIVE,0.992
8,10:00AM,Meta Just Issued a Status Update. Wall Street Should Read Into It.,https://www.wsj.com/articles/meta-just-issued-a-status-update-wall-street-should-read-into-it-11671239013?mod=rss_markets_main,12,POSITIVE,0.93658,"<p>\nThis copy is for your personal, non-commercial use only. Distribution and use of this material are governed by\nour Subscriber Agreement and by copyright law. For non-personal use or to order multiple copies, please contact\nDow Jones Reprints at 1-800-843-0008 or visit www.djreprints.com.\n</p><p>https://www.wsj.com/articles/meta-just-issued-a-status-update-wall-street-should-read-into-it-11671239013</p><p class=""css-1evf9kl-ListenToArticleTitle e1f89e201"">Listen to article</p><p class...","{'summary_text': 'This is a copyrighted work, and may not be used in any other way.'}",POSITIVE,0.93658
9,09:33AM,"Russia's barrage of missiles leaves many Ukrainians without light, power or heat",https://www.edition.cnn.com/webview/europe/live-news/russia-ukraine-war-news-12-17-22/index.html,12,NEGATIVE,0.999702,"<p class=""sc-gZMcBi sc-gFaPwZ kYgNRY"" color=""#0C0C0C"" font-size=""inherit"" font-weight=""bold""><span class=""sc-bdVaJa ehlOSy""><img class=""sc-kgoBCf sc-cpmLhU glTLdn"" size=""6"" src=""//cdn.cnn.com/cnn/2019/images/03/20/live-story-status_light.gif""/></span><span class=""sc-fhYwyz sc-jzgbtB cfzbmM"">Live</span></p><p class=""sc-gZMcBi sc-gFaPwZ hgGOoE"" color=""#0C0C0C"" font-size=""inherit"">Russia's war in Ukraine</p><p class=""sc-gZMcBi sc-gFaPwZ kYgNRY"" color=""#0C0C0C"" font-size=""inherit"" font-weight=""b...",{'summary_text': 'Russia's War in Ukraine.'},NEGATIVE,0.999702


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 155 entries, 0 to 178
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Date               155 non-null    object 
 1   Headline           155 non-null    object 
 2   Links              155 non-null    object 
 3   word_count         155 non-null    int64  
 4   Sentiment_Head     155 non-null    object 
 5   Score_Head         155 non-null    float64
 6   article            155 non-null    object 
 7   summary            155 non-null    object 
 8   Sentiment_Article  155 non-null    object 
 9   Score_Article      155 non-null    float64
dtypes: float64(2), int64(1), object(7)
memory usage: 13.3+ KB


In [35]:
## Get the bar chart from 5 rated reviews ##
freq_dict = defaultdict(int)
for sent in df["summary"]:
    for word in generate_ngrams(sent,3):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace0 = horizontal_bar_chart(fd_sorted.head(10), 'green')

# Creating two subplots
fig = tools.make_subplots(rows=1, cols=1, vertical_spacing=0.04,
                          subplot_titles=["Frequent words"])
fig.append_trace(trace0, 1, 1)
#fig.append_trace(trace1, 1, 2)
fig['layout'].update(height=600, width=600, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
py.iplot(fig, filename='word-plots');

AttributeError: 'dict' object has no attribute 'split'