In [1]:
import requests
import urllib.request
import time
import spacy
from bs4 import BeautifulSoup
import pandas as pd
from joblib import dump, load
import json
# Classifier loaded from disk; its predict() output is used below to label each news text.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code --
# only load a 'sentiment.joblib' that comes from a trusted source.
clf = load('sentiment.joblib')

In [2]:
# Ticker symbol to analyse; it is interpolated into the news search URL and
# used as a key into SP_dict when filtering tokens.
stock_name = "BAC"

In [3]:
# Maps the classifier's predicted label (0 or 1) to the string shown in the results.
sentdict = {0: "Negative", 1: "Positive"}

In [4]:
# Build a {ticker symbol: company name} lookup from the first table on the
# Wikipedia S&P 500 constituents page.

table = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
SP_dict = table[0].set_index('Symbol')['Security'].to_dict()

In [5]:
# Fetch Google News results for the ticker and classify each one.
#
# Produces four parallel lists (same length, same order):
#   headlines    - headline text of each kept result
#   descriptions - snippet text with the leading "<age> ·" prefix removed
#   links        - "google.com" + the result's redirect href
#   combined     - headline + " " + description, the text given to the classifier
# and `sentiments`, the "Negative"/"Positive" label for every entry of `combined`.
numResults = 100
url = f'https://www.google.com/search?q=stock+symbol+{stock_name}&tbm=nws&hl=en&num={numResults}'
# Example: https://www.google.com/search?q=stock+symbol+AMZN&tbm=nws&hl=en&num=50

# A timeout keeps the cell from hanging forever; raise_for_status surfaces a
# blocked/failed request here instead of as a confusing empty result later on.
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Get the links, headline and description text
results = soup.find_all("div", attrs={"class": "ZINbbc"})
headlines = []
descriptions = []
links = []
combined = []
for result in results:
    headline_div = result.find("div", attrs={"class": "vvjwJb"})
    description_div = result.find("div", attrs={"class": "s3v9rd"})
    anchor = result.a
    href = anchor.get("href") if anchor is not None else None
    # Skip result blocks that are not complete news items. All four lists are
    # only appended to together, so they can never drift out of alignment.
    if headline_div is None or description_div is None or href is None:
        continue
    headline = headline_div.get_text()
    if headline == "":
        continue
    description = description_div.get_text()
    # remove the "1 day ago" etc. (find() returns -1 when there is no "·",
    # in which case the slice keeps the whole string)
    description = description[description.find("·")+1:]

    headlines.append(headline)
    links.append("google.com"+href)
    descriptions.append(description)
    combined.append(headline+" "+description)
sentiments = [sentdict[clf.predict([text])[0]] for text in combined]

In [6]:
# Tabular view of the scraped articles, one row per article.
pd.DataFrame(dict(Headline=headlines,
                  Descriptions=descriptions,
                  Links=links,
                  Sentiment=sentiments))

Unnamed: 0,Headline,Descriptions,Links,Sentiment
0,February 2022 Options Now Available For Bank o...,... in Bank of America Corp (Symbol: BAC) saw...,google.com/url?q=https://www.nasdaq.com/articl...,Negative
1,"ESGU, BAC, PG, KO: Large Inflows Detected at ETF",Among the largest underlying components of ES...,google.com/url?q=https://www.nasdaq.com/articl...,Negative
2,Bank of America Enters Oversold Territory,Bank of America Corp (Symbol: BAC) presently ...,google.com/url?q=https://www.nasdaq.com/articl...,Negative
3,People's United Financial a Top Ranked SAFE Di...,People's United Financial Inc (Symbol: PBCT) ...,google.com/url?q=https://www.nasdaq.com/articl...,Positive
4,Here's Why Provention Bio Stock Is Diving Today,"Shares of Provention Bio (NASDAQ: PRVB), a cl...",google.com/url?q=https://www.nasdaq.com/articl...,Negative
...,...,...,...,...
95,"Noteworthy ETF Inflows: IVE, BAC, DIS, VZ",Among the largest underlying components of IV...,google.com/url?q=https://www.nasdaq.com/articl...,Negative
96,Bruce Berkowitz's Fairholme Fund Swaps Berkshi...,(NYSE:BAC) and exited its holding of Berkshir...,google.com/url?q=https://finance.yahoo.com/new...,Negative
97,5 Top-Performing S&P 500 Bank Stocks of Q1 Wor...,Tip: Try a valid symbol or a specific company...,google.com/url?q=https://finance.yahoo.com/new...,Positive
98,Bank of America Announces Preliminary Voting R...,Bank of America Corporation stock (NYSE: BAC)...,google.com/url?q=https://finance.yahoo.com/new...,Positive


In [7]:
# Flatten all headlines and descriptions into one string and parse it with spaCy.
text_h = " ".join(headlines)
text_p = " ".join(descriptions)
# Keep a space between the two parts; without it the last headline word and
# the first description word are glued into a single bogus token.
text = text_h + " " + text_p

sp = spacy.load('en_core_web_sm')
doc = sp(text)

In [8]:
def is_token_allowed(token):
    """Return True when `token` should be kept for the word cloud.

    A token is rejected when it is falsy, a stop word, punctuation, not an
    adjective (``pos_ == "ADJ"``), or when its lowercase lemma is one of the
    generic finance words, the ticker symbol, or a word of the company name
    looked up in ``SP_dict`` (with ".com" stripped).

    Filtering approach based on:
    https://realpython.com/natural-language-processing-spacy-python/#word-frequency
    """
    if not token or token.is_stop or token.is_punct:
        return False
    if token.pos_ not in ["ADJ"]:
        return False

    generic_terms = ["inc", "stock", "stocks", "price", "market", stock_name.lower()]
    company_name = SP_dict[stock_name].lower().replace(".com", "")
    company_terms = [part.lower() for part in company_name.split()]
    lemma = token.lemma_.strip().lower()
    return lemma not in generic_terms + company_terms

def preprocess_token(token):
    """Return the token's lemma with surrounding whitespace removed, in lowercase."""
    lemma = token.lemma_
    trimmed = lemma.strip()
    return trimmed.lower()

In [9]:
from collections import Counter

# Part of speech for every allowed token, keyed by its normalised lemma
# (a later occurrence of the same lemma overwrites the earlier tag).
pos_dict = {preprocess_token(tok): tok.pos_ for tok in doc if is_token_allowed(tok)}

# Lemma frequencies over every token that is neither a stop word nor punctuation.
word_freq = Counter(
    tok.lemma_.strip().lower()
    for tok in doc
    if not (tok.is_stop or tok.is_punct)
)

# One record per allowed word: the word, its POS tag and its count.
words = [
    {"Words": lemma, "POS": pos, "Counts": word_freq[lemma]}
    for lemma, pos in pos_dict.items()
]

In [10]:
# Add headlines, links, and sentiment to every word record.
#
# For each word, collect the articles whose headline or description contains
# it (case-insensitive substring match) and compute WordScore, the fraction of
# those articles classified as Positive.
for item in words:
    headline_dict = {}
    sentcount_pos = 0
    sentcount_neg = 0
    needle = item["Words"]
    # zip walks every article, including the last one, and never indexes past
    # the end of a shorter list.
    for headline, description, link, sentiment in zip(headlines, descriptions, links, sentiments):
        if needle in description.lower() or needle in headline.lower():
            headline_dict[headline] = [link, sentiment]
            if sentiment == "Negative":
                sentcount_neg += 1
            else:
                sentcount_pos += 1
    matched = sentcount_pos + sentcount_neg
    # A lemma may not appear verbatim in any article text; score it as None
    # (JSON null) rather than reusing the previous word's score or failing.
    word_score = sentcount_pos / matched if matched else None
    item.update({"links": headline_dict, "WordScore": word_score})

In [11]:
# Display the per-word records (word, POS, count, matching articles, WordScore).
words

[{'Words': 'available',
  'POS': 'ADJ',
  'Counts': 1,
  'links': {'February 2022 Options Now Available For Bank of America (BAC)': ['google.com/url?q=https://www.nasdaq.com/articles/february-2022-options-now-available-for-bank-of-america-bac-2021-06-17&sa=U&ved=2ahUKEwjhgrOCnN7xAhUzrJUCHT3XC0QQxfQBMAB6BAgAEAE&usg=AOvVaw21K-VcFUGxOsJzWqk_AbKm',
    'Negative']},
  'WordScore': 0.0},
 {'Words': 'large',
  'POS': 'ADJ',
  'Counts': 7,
  'links': {'ESGU, BAC, PG, KO: Large Inflows Detected at ETF': ['google.com/url?q=https://www.nasdaq.com/articles/esgu-bac-pg-ko%253A-large-inflows-detected-at-etf-2021-06-29&sa=U&ved=2ahUKEwjhgrOCnN7xAhUzrJUCHT3XC0QQxfQBMAF6BAhjEAE&usg=AOvVaw1lKG32m0lqXWeU0U_CswCm',
    'Negative'],
   'Bank of America (BAC) Outpaces Stock Market Gains: What You Should Know': ['google.com/url?q=https://finance.yahoo.com/news/bank-america-bac-outpaces-stock-214509571.html&sa=U&ved=2ahUKEwjhgrOCnN7xAhUzrJUCHT3XC0QQxfQBMAl6BAhbEAE&usg=AOvVaw1C_YTgTcoXUl7i4kdtYnTn',
    'Nega

In [12]:
# Wrap the per-word records in the final payload together with Pos_Neg, the
# share of scraped texts labelled Positive. The labels already computed in
# `sentiments` are reused instead of running the classifier a second time;
# with no scraped texts the share is None rather than a ZeroDivisionError.
# NOTE: this rebinds `words`; re-running this cell on its own nests the payload.
positive_share = (
    sum(label == sentdict[1] for label in sentiments) / len(sentiments)
    if sentiments else None
)
words = [{"Pos_Neg": positive_share, "cloudData": words}]

In [13]:
# Display the final payload: the overall Pos_Neg ratio plus the per-word cloudData.
words

[{'Pos_Neg': 0.42,
  'cloudData': [{'Words': 'available',
    'POS': 'ADJ',
    'Counts': 1,
    'links': {'February 2022 Options Now Available For Bank of America (BAC)': ['google.com/url?q=https://www.nasdaq.com/articles/february-2022-options-now-available-for-bank-of-america-bac-2021-06-17&sa=U&ved=2ahUKEwjhgrOCnN7xAhUzrJUCHT3XC0QQxfQBMAB6BAgAEAE&usg=AOvVaw21K-VcFUGxOsJzWqk_AbKm',
      'Negative']},
    'WordScore': 0.0},
   {'Words': 'large',
    'POS': 'ADJ',
    'Counts': 7,
    'links': {'ESGU, BAC, PG, KO: Large Inflows Detected at ETF': ['google.com/url?q=https://www.nasdaq.com/articles/esgu-bac-pg-ko%253A-large-inflows-detected-at-etf-2021-06-29&sa=U&ved=2ahUKEwjhgrOCnN7xAhUzrJUCHT3XC0QQxfQBMAF6BAhjEAE&usg=AOvVaw1lKG32m0lqXWeU0U_CswCm',
      'Negative'],
     'Bank of America (BAC) Outpaces Stock Market Gains: What You Should Know': ['google.com/url?q=https://finance.yahoo.com/news/bank-america-bac-outpaces-stock-214509571.html&sa=U&ved=2ahUKEwjhgrOCnN7xAhUzrJUCHT3XC0QQxfQB

In [14]:
# Serialise the payload to JSON and write it to jsonsample.txt.
payload_json = json.dumps(words)
with open('jsonsample.txt', 'w') as outfile:
    outfile.write(payload_json)