In [46]:
import requests
import urllib.request
import time
import spacy
from bs4 import BeautifulSoup
import pandas as pd
from joblib import dump, load
import json
# Load the persisted model that later cells call through clf.predict(...).
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code, so only load a 'sentiment.joblib' that comes from a trusted source.
clf = load('sentiment.joblib')

In [26]:
# Ticker symbol to analyse; it is interpolated into the news-search URL and
# used as the lookup key into SP_dict when filtering tokens.
stock_name = "BAC"

In [27]:
# Translate the integer labels returned by clf.predict into display strings.
sentdict = {
    0: "Negative",
    1: "Positive",
}

In [28]:
# Build a {ticker symbol: company name} lookup from the first table on
# Wikipedia's list of S&P 500 companies.
table = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
SP_dict = table[0].set_index('Symbol')['Security'].to_dict()

In [29]:
numResults = 100
url = f'https://www.google.com/search?q=stock+symbol+{stock_name}&tbm=nws&hl=en&num={numResults}'
# e.g. https://www.google.com/search?q=stock+symbol+AMZN&tbm=nws&hl=en&num=50

# A timeout keeps the cell from hanging forever if the request stalls.
response = requests.get(url, timeout=30)
soup = BeautifulSoup(response.content, 'html.parser')

# Get the links, headline and description text.
# headlines, descriptions, links and combined are parallel lists: index i must
# describe the same search result in all of them. A result is therefore either
# appended to every list or skipped entirely, so a missing description can no
# longer leave the lists with different lengths.
results = soup.find_all("div", attrs={"class": "ZINbbc"})
headlines = []
descriptions = []
links = []
combined = []
for result in results:
    headline_div = result.find("div", attrs={"class": "vvjwJb"})
    description_div = result.find("div", attrs={"class": "s3v9rd"})
    anchor = result.a
    # Containers that are not complete news results lack one of these parts.
    if headline_div is None or description_div is None or anchor is None:
        continue

    headline = headline_div.get_text()
    href = anchor.get("href")
    if not headline or not href:
        continue

    description = description_div.get_text()
    # remove the "1 day ago" etc. (everything up to and including the "·");
    # when there is no "·", find() returns -1 and the text is kept whole.
    description = description[description.find("·")+1:]

    headlines.append(headline)
    links.append("google.com"+href)
    descriptions.append(description)
    combined.append(headline+" "+description)

# One label per scraped result, in the same order as the lists above.
sentiments = [sentdict[clf.predict([text])[0]] for text in combined]

In [30]:
# Tabulate the scraped results; the frame is the cell's displayed value.
pd.DataFrame(dict(
    Headline=headlines,
    Descriptions=descriptions,
    Links=links,
    Sentiment=sentiments,
))

Unnamed: 0,Headline,Descriptions,Links,Sentiment
0,February 2022 Options Now Available For Bank o...,... in Bank of America Corp (Symbol: BAC) saw...,google.com/url?q=https://www.nasdaq.com/articl...,Negative
1,BAC July 30th Options Begin Trading,Investors in Bank of America Corp (Symbol: BA...,google.com/url?q=https://www.nasdaq.com/articl...,Positive
2,Why Astra Space Stock Just Dropped 10%,"10 stocks we like better than Astra Space, In...",google.com/url?q=https://www.nasdaq.com/articl...,Negative
3,Buy WISH Stock Before It Joins GME and AMC as ...,WISH Stock has been jumping for a while now d...,google.com/url?q=https://www.nasdaq.com/articl...,Negative
4,"Wall Street and Your Fiends Both Like SoFi, So...",Everyone we've told about SOFI stock loves th...,google.com/url?q=https://www.nasdaq.com/articl...,Positive
...,...,...,...,...
95,Thinking about trading options or stock in Nio...,"NEW YORK, May 4, 2021 /PRNewswire/ -- Investo...",google.com/url?q=https://www.prnewswire.com/ne...,Positive
96,Diamond Hill Capital Trades Citigroup for Bank...,"The firm invested in 15,187,746 new shares of...",google.com/url?q=https://finance.yahoo.com/new...,Positive
97,"The Zacks Analyst Blog Highlights: JPMorgan, B...",Stocks recently featured in the blog include:...,google.com/url?q=https://finance.yahoo.com/new...,Positive
98,Bank of America: 3 Strong Value Stocks to Buy Now,Bank of America analysts have tagged three te...,google.com/url?q=https://finance.yahoo.com/new...,Positive


In [31]:
# Flatten all headlines and descriptions into one string for spaCy.
text_h = " ".join(headlines)
text_p = " ".join(descriptions)
# Join the two halves with a space; plain concatenation fused the last
# headline word and the first description word into a single bogus token.
text = " ".join(part for part in (text_h, text_p) if part)

sp = spacy.load('en_core_web_sm')
doc = sp(text)

In [32]:
# Only allow valid tokens which are not stop words and punctuation symbols.
# https://realpython.com/natural-language-processing-spacy-python/#word-frequency
def is_token_allowed(token):
    """Return True if ``token`` should be kept for the word cloud.

    A token is kept only when it is truthy, is not a stop word or punctuation,
    has ``pos_ == "ADJ"``, and its stripped, lower-cased lemma is not one of the
    excluded words: a few generic finance terms, the ticker in the global
    ``stock_name``, and the words of the company name found in the global
    ``SP_dict`` (with ".com" removed).

    If ``stock_name`` is not a key of ``SP_dict`` no company-name words are
    excluded, instead of raising KeyError on every token.
    """
    if not token or token.is_stop or token.is_punct or token.pos_ != "ADJ":
        return False

    company_name = SP_dict.get(stock_name, "")
    excluded = {"inc", "stock", "stocks", "price", "market", stock_name.lower()}
    excluded.update(company_name.lower().replace(".com", "").split())
    return token.lemma_.strip().lower() not in excluded

def preprocess_token(token):
    """Return the token's lemma with surrounding whitespace removed, lower-cased."""
    lemma = token.lemma_
    return lemma.strip().lower()

In [33]:
# Part of speech: map each allowed token's normalised lemma to its POS tag.
# A later occurrence of the same lemma overwrites the earlier entry.
pos_dict = {}
for token in doc:
    if not is_token_allowed(token):
        continue
    pos_dict[preprocess_token(token)] = token.pos_

# Count lemmas over every token that is neither a stop word nor punctuation,
# then build one record per allowed word with its POS tag and that count.
from collections import Counter
word_freq = Counter(
    tok.lemma_.strip().lower()
    for tok in doc
    if not (tok.is_stop or tok.is_punct)
)
words = [
    {"Words": lemma, "POS": pos, "Counts": word_freq[lemma]}
    for lemma, pos in pos_dict.items()
]

In [39]:
# Add headlines, links, and sentiment to every word record.
for item in words:
    word = item["Words"]
    headline_dict = {}
    # zip visits every scraped article (the old range(len(headlines)-1) skipped
    # the last one) and stops at the shortest list, so it cannot index past the
    # end if the lists ever differ in length.
    for headline, description, link, sentiment in zip(headlines, descriptions, links, sentiments):
        # Plain substring match against the lower-cased article text.
        if word in description.lower() or word in headline.lower():
            headline_dict[headline] = [link, sentiment]
    # NOTE(review): this cell used to read `sentiment_val`, which no cell in the
    # notebook defines, so it failed with NameError on a fresh kernel. Scoring
    # the word itself with the classifier is an assumption about the original
    # intent — confirm.
    sentiment_val = sentdict[clf.predict([word])[0]]
    item.update({"links": headline_dict, "sentiment": sentiment_val})

In [43]:
# Display the word records after the links and sentiment have been attached.
words

[{'Words': 'available',
  'POS': 'ADJ',
  'Counts': 2,
  'links': {'February 2022 Options Now Available For Bank of America (BAC)': ['google.com/url?q=https://www.nasdaq.com/articles/february-2022-options-now-available-for-bank-of-america-bac-2021-06-17&sa=U&ved=2ahUKEwjPvLDLydTxAhVE5uAKHai9BXgQxfQBMAB6BAgFEAE&usg=AOvVaw3bcCGO7fys2YMpCTXgyjmQ',
    'Negative'],
   'Interesting BAC Put And Call Options For May 7th': ['google.com/url?q=https://www.nasdaq.com/articles/interesting-bac-put-and-call-options-for-may-7th-2021-03-25&sa=U&ved=2ahUKEwjPvLDLydTxAhVE5uAKHai9BXgQxfQBMDp6BAgqEAE&usg=AOvVaw3behp8MJefh6DSUdQFdiLY',
    'Positive']},
  'sentiment': 'Positive'},
 {'Words': 'good',
  'POS': 'ADJ',
  'Counts': 4,
  'links': {},
  'sentiment': 'Positive'},
 {'Words': 'worth',
  'POS': 'ADJ',
  'Counts': 2,
  'links': {"If You Invested $1000 in Bank of America a Decade Ago, This is How Much It'd Be Worth Now": ['google.com/url?q=https://finance.yahoo.com/news/invested-1000-bank-america-decad

In [44]:
# Wrap the word records together with the overall share of positive articles.
# The labels already stored in `sentiments` are reused instead of running the
# classifier over every text a second time; counting sentdict[1] labels equals
# summing the 0/1 predictions. When nothing was scraped the share is None
# (JSON null) rather than a ZeroDivisionError.
pos_neg = (
    sum(label == sentdict[1] for label in sentiments) / len(sentiments)
    if sentiments
    else None
)
words = [{"Pos_Neg": pos_neg, "cloudData": words}]

In [45]:
# Display the final payload: the Pos_Neg share plus the word records under "cloudData".
words

[{'Pos_Neg': 0.46,
  'cloudData': [{'Words': 'available',
    'POS': 'ADJ',
    'Counts': 2,
    'links': {'February 2022 Options Now Available For Bank of America (BAC)': ['google.com/url?q=https://www.nasdaq.com/articles/february-2022-options-now-available-for-bank-of-america-bac-2021-06-17&sa=U&ved=2ahUKEwjPvLDLydTxAhVE5uAKHai9BXgQxfQBMAB6BAgFEAE&usg=AOvVaw3bcCGO7fys2YMpCTXgyjmQ',
      'Negative'],
     'Interesting BAC Put And Call Options For May 7th': ['google.com/url?q=https://www.nasdaq.com/articles/interesting-bac-put-and-call-options-for-may-7th-2021-03-25&sa=U&ved=2ahUKEwjPvLDLydTxAhVE5uAKHai9BXgQxfQBMDp6BAgqEAE&usg=AOvVaw3behp8MJefh6DSUdQFdiLY',
      'Positive']},
    'sentiment': 'Positive'},
   {'Words': 'good',
    'POS': 'ADJ',
    'Counts': 4,
    'links': {},
    'sentiment': 'Positive'},
   {'Words': 'worth',
    'POS': 'ADJ',
    'Counts': 2,
    'links': {"If You Invested $1000 in Bank of America a Decade Ago, This is How Much It'd Be Worth Now": ['google.com/url

In [48]:
# Serialise the payload to JSON text and write it to jsonsample.txt.
with open('jsonsample.txt', 'w') as outfile:
    outfile.write(json.dumps(words))