In [49]:

#https://medium.com/illumination/scraping-news-and-creating-a-word-cloud-in-python-10ea312c49ba
import requests
import urllib.request
import time
import spacy
from bs4 import BeautifulSoup
import pandas as pd
from flask import Flask, render_template, jsonify

In [56]:
# Map every S&P 500 ticker symbol to its company name, taken from the first
# table parsed out of the Wikipedia constituents page.

table = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
SP_dict = table[0].set_index('Symbol')['Security'].to_dict()

In [13]:
# Ticker to search for and the number of results requested from Google.
stock_name = "AMZN"
numResults = 100
# Google search restricted to news (tbm=nws) with an English UI, e.g.
# https://www.google.com/search?q=stock+symbol+AMZN&tbm=nws&hl=en&num=50
url = (
    f'https://www.google.com/search?q=stock+symbol+{stock_name}'
    f'&tbm=nws&hl=en&num={numResults}'
)

In [14]:
# Fetch the search results page and parse it as HTML.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (e.g. rate limiting) right here
# instead of letting the following cells silently parse an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

In [15]:
# Extract the headline, description and link of every search result.
# The three lists are kept index-aligned: headlines[i], descriptions[i] and
# links[i] always describe the same result. Everything is extracted first and
# appended together, so a result with a missing piece can no longer shift the
# lists against each other.
results = soup.find_all("div", attrs={"class": "ZINbbc"})
headlines = []
descriptions = []
links = []
for result in results:
    headline_tag = result.find("div", attrs={"class": "vvjwJb"})
    anchor = result.a
    # Skip containers that are not usable news results (no headline or link).
    if headline_tag is None or anchor is None or not anchor.get("href"):
        continue
    headline = headline_tag.get_text()
    if headline == "":
        continue

    # A result without a description keeps its headline and gets an empty
    # description so the indices stay aligned.
    description_tag = result.find("div", attrs={"class": "s3v9rd"})
    description = description_tag.get_text() if description_tag is not None else ""
    # Remove the "1 day ago ·" style prefix; when there is no "·", find()
    # returns -1 and the slice keeps the whole description.
    description = description[description.find("·") + 1:]

    headlines.append(headline)
    descriptions.append(description)
    links.append("google.com" + anchor["href"])

# Flatten into one string. The separating space keeps the last headline word
# from fusing with the first description word.
text_h = " ".join(headlines)
text_p = " ".join(descriptions)
text = " ".join(part for part in (text_h, text_p) if part)

In [16]:
# Load the 'en_core_web_sm' spaCy pipeline and run it over the scraped text.
sp = spacy.load('en_core_web_sm')
doc = sp(text)

In [85]:
# Token filter adapted from
# https://realpython.com/natural-language-processing-spacy-python/#word-frequency
def is_token_allowed(token, symbol=None, company_name=None):
    """Return True if `token` should be kept.

    A token is kept only when it is non-empty, is neither a stop word nor
    punctuation, is tagged as an adjective ("ADJ"), and its lowercase lemma is
    not a generic finance word, the ticker symbol, or a word of the company
    name.

    Args:
        token: spaCy token (any object exposing `is_stop`, `is_punct`, `pos_`
            and `lemma_` works).
        symbol: Ticker symbol to exclude. Defaults to the global `stock_name`.
        company_name: Company name whose words are excluded. Defaults to the
            `SP_dict` entry for `symbol`; an unknown symbol excludes no
            company words instead of raising KeyError.

    Returns:
        bool: True when the token passes every filter, False otherwise.
    """
    if symbol is None:
        symbol = stock_name
    if company_name is None:
        company_name = SP_dict.get(symbol, "")

    if not token or token.is_stop or token.is_punct:
        return False
    if token.pos_ != "ADJ":
        return False

    # Lemmas are compared in lowercase, so the symbol has to be lowercased as
    # well; comparing against the raw "AMZN" never matched anything.
    excluded = {"inc", "stock", "stocks", "price", "market", symbol.lower()}
    excluded.update(
        word.lower() for word in company_name.replace(".com", "").split()
    )
    return token.lemma_.strip().lower() not in excluded

def preprocess_token(token):
    """Return the token's lemma with surrounding whitespace removed, lowercased."""
    lemma = token.lemma_
    return lemma.strip().lower()

# Map the normalized lemma of every allowed token to its part-of-speech tag;
# when a lemma occurs more than once, the tag of the last occurrence wins.
pos_dict = {
    preprocess_token(token): token.pos_
    for token in doc
    if is_token_allowed(token)
}

In [86]:
# Build one record per allowed word: the word, its POS tag and how often its
# lemma occurs among the non-stop-word, non-punctuation tokens.
from collections import Counter

word_freq = Counter(
    token.lemma_.strip().lower()
    for token in doc
    if not (token.is_stop or token.is_punct)
)
words = [
    {"Words": lemma, "POS": pos, "Counts": word_freq[lemma]}
    for lemma, pos in pos_dict.items()
]

In [87]:
# Attach to each word the headlines (mapped to their links) that mention it,
# either in the headline itself or in its description.
# Every headline is checked, including the last one, and a headline without a
# matching description entry is treated as having an empty description instead
# of raising IndexError.
# NOTE: matching is by substring, so e.g. "3" also matches "2030".
for item in words:
    word = item["Words"]
    headline_dict = {}
    for i, (headline, link) in enumerate(zip(headlines, links)):
        description = descriptions[i] if i < len(descriptions) else ""
        if word in description.lower() or word in headline.lower():
            headline_dict[headline] = link
    item["links"] = headline_dict

In [88]:
## FOR DEVELOPMENT: WRITE LIST TO JSON FILE
import json

# Serialize the `words` records to static/data/data.json.
with open('static/data/data.json', 'w') as json_file:
    json.dump(words, json_file)

In [34]:
# Display the assembled word records.
words

[{'Words': '3',
  'POS': 'NUM',
  'Counts': 3,
  'links': {"3 Top Stocks That'll Make You Richer in June (and Beyond)": 'google.com/url?q=https://www.fool.com/investing/2021/06/01/3-top-stocks-thatll-make-you-richer-in-june/&sa=U&ved=2ahUKEwjc3IHNhvzwAhWSK5QKHRb7AckQxfQBMAB6BAgAEAE&usg=AOvVaw3mjcS8NDxvB4GMzagz60ff',
   'Amazon Stock: Is The MGM Acquisition A Financial Burden?': 'google.com/url?q=https://www.thestreet.com/amazon/news/amazon-stock-is-the-mgm-acquisition-a-financial-burden&sa=U&ved=2ahUKEwjc3IHNhvzwAhWSK5QKHRb7AckQxfQBMAF6BAhjEAE&usg=AOvVaw3ZTcIZ7_zltzgi9ngJaFJh',
   'Notable Thursday Option Activity: AMZN, LQDT, CMG': 'google.com/url?q=https://www.nasdaq.com/articles/notable-thursday-option-activity%253A-amzn-lqdt-cmg-2021-05-06&sa=U&ved=2ahUKEwjc3IHNhvzwAhWSK5QKHRb7AckQxfQBMAR6BAhgEAE&usg=AOvVaw17zb7bq77GMjC85LrfoWFZ',
   'Amazon stock may be 70% undervalued and the company worth $3 trillion: analyst': 'google.com/url?q=https://finance.yahoo.com/news/amazon-stock-may-be