Installations

In [None]:
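# Web Scraping Installations
# !pip install beautifulsoup4
# !pip install lxml  # parser backend requested by BeautifulSoup(..., "lxml") in the Web Scraper cell
# !pip install selenium
# !pip install requests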

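# Sentiment Analysis Installations
# !pip install nltk
# !pip install gensim
# !pip install contractions
# NLTK data used by the Sentiment Analyzer cell (tokenizer models, stop words, WordNet);
# on newer NLTK releases the tokenizer resource may be named "punkt_tab" instead of "punkt":
# import nltk; nltk.download("punkt"); nltk.download("stopwords"); nltk.download("wordnet")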

Imports

In [1]:
# Web Scraping Imports
from bs4 import BeautifulSoup
import requests
import selenium.webdriver as webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Sentiment Analysis Imports
import nltk
from nltk.util import ngrams
import gensim
import contractions
import pickle

Web Scraper

In [2]:
class WebScraper:
    """Collect Yahoo Finance news articles for a stock.

    A headless Chrome session (Selenium) is used to search for the stock and
    discover its page URL; the page and the linked articles are then fetched
    with ``requests`` and parsed with BeautifulSoup.

    Attributes:
        stock: The search term passed to the constructor.
        url: Start page opened by the browser.
        driver: The Selenium driver, or ``None`` when no browser is open.
        stock_name: Display name read from the stock page (set by
            ``scrape_articles``).
        hyperlink_list, headline_list, article_list: Parallel lists describing
            the articles that were scraped successfully; index ``i`` of each
            list refers to the same article.
    """

    # Browser-like User-Agent sent with every ``requests`` call so the stock
    # page and the article pages are requested the same way.
    REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
    # Seconds to wait for an HTTP response; without a timeout ``requests`` can block forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, stock):
        self.stock = stock
        self.url = "https://finance.yahoo.com/"
        self.driver = None
        self.stock_name = None
        self.hyperlink_list = []
        self.headline_list = []
        self.article_list = []

    def setup_driver(self):
        """Start a headless Chrome session and open ``self.url``."""
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        # Assign the driver before navigating so close_driver() can still quit
        # the browser if loading the page fails.
        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.get(self.url)
        self.driver.maximize_window()

    def accept_cookies(self):
        """Click the cookie-consent button if one becomes clickable within 5 seconds.

        When no consent button shows up (presumably the dialog is only served
        to some visitors -- verify for your region) the method returns quietly
        instead of aborting the whole run with a ``TimeoutException``.
        """
        # Local import keeps this class self-contained; selenium is already a
        # dependency of this notebook.
        from selenium.common.exceptions import TimeoutException

        try:
            consent_button = WebDriverWait(self.driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//*[@id='consent-page']/div/div/div/form/div[2]/div[2]/button[1]")))
        except TimeoutException:
            return  # nothing to accept
        # Click through JavaScript rather than element.click().
        self.driver.execute_script("arguments[0].click();", consent_button)

    def search_stock(self):
        """Type ``self.stock`` into the search box and wait for the result page.

        Raises:
            selenium.common.exceptions.TimeoutException: if the search box does
                not appear, the URL does not change, or no ``<h1>`` is present
                within 5 seconds each.
        """
        search_box = WebDriverWait(self.driver, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'input[id="ybar-sbq"][name="p"]')))
        initial_url = self.driver.current_url
        # The value is set via JavaScript; only the ENTER key is sent as a keystroke.
        self.driver.execute_script("arguments[0].value = arguments[1];", search_box, self.stock)
        search_box.send_keys(Keys.ENTER)
        WebDriverWait(self.driver, 5).until(lambda driver: driver.current_url != initial_url)
        WebDriverWait(self.driver, 5).until(EC.presence_of_element_located((By.TAG_NAME, "h1")))

    def scrape_articles(self):
        """Fetch the current stock page and every news article linked from it.

        Replaces ``stock_name``, ``hyperlink_list``, ``headline_list`` and
        ``article_list``. The three lists are rebuilt from scratch on every
        call, so they stay aligned even if the method is called repeatedly.
        Articles that cannot be fetched or parsed, or that contain no text,
        are left out of all three lists.

        Raises:
            requests.RequestException: if the stock page itself cannot be fetched.
        """
        response = requests.get(self.driver.current_url, headers=self.REQUEST_HEADERS, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")

        # The class-specific lookup is tried first; fall back to any <h1>, then
        # to the raw search term, instead of failing on ``None.text``.
        name_tag = soup.find("h1", class_="yf-xxbei9") or soup.find("h1")
        self.stock_name = name_tag.text.strip() if name_tag else self.stock

        # href=True skips anchors without an href attribute.
        candidate_links = [a["href"] for a in soup.find_all("a", class_="subtle-link fin-size-small thumb yf-1e4diqp", href=True)]

        kept_links, headlines, articles = [], [], []
        for hyperlink in candidate_links:
            try:
                response = requests.get(hyperlink, headers=self.REQUEST_HEADERS, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                headline, article_text = self._parse_article(BeautifulSoup(response.text, "lxml"))
            except Exception:
                # Best effort: e.g. an article locked behind a paywall or one
                # that diverts to another news website. Unlike a bare
                # ``except:``, Ctrl-C / SystemExit still propagate.
                continue
            if not article_text:
                continue  # nothing to analyse
            # Appending to fresh lists (instead of removing failures by value
            # from hyperlink_list) keeps the lists aligned when a link occurs twice.
            kept_links.append(hyperlink)
            headlines.append(headline)
            articles.append(article_text)

        self.hyperlink_list = kept_links
        self.headline_list = headlines
        self.article_list = articles

    @staticmethod
    def _end_sentence(text):
        """Return ``text`` terminated by '.', '?' or '!' (empty text is returned unchanged)."""
        if not text or text.endswith((".", "?", "!")):
            return text
        return text + "."

    @staticmethod
    def _parse_article(soup):
        """Return ``(headline, article_text)`` for one parsed article page.

        ``headline`` is ``None`` when no usable ``<h1>`` is found.
        ``article_text`` is the headline followed by every non-empty ``<p>``,
        each ending in sentence punctuation, joined by single spaces.
        """
        headline_tag = soup.find("h1", class_="cover-title yf-1o1tx8g")  # works for most yahoo finance articles
        headline = headline_tag.text.strip() if headline_tag else None
        if not headline:
            # Special case for some yahoo tech articles where "Yahoo Tech" is
            # the first h1: take the first non-empty h1 not starting with "Yahoo".
            h1_texts = (h.text.strip() for h in soup.find_all("h1"))
            headline = next((t for t in h1_texts if t and not t.startswith("Yahoo")), None)

        parts = []  # each element is the headline or one paragraph
        if headline:
            parts.append(WebScraper._end_sentence(headline))
        for paragraph in soup.find_all("p"):
            paragraph_text = paragraph.text.strip()
            if paragraph_text:
                parts.append(WebScraper._end_sentence(paragraph_text))
        return headline, " ".join(parts)

    def close_driver(self):
        """Quit the browser if one is open; calling it again is a no-op."""
        if self.driver:
            try:
                self.driver.quit()
            finally:
                self.driver = None

Sentiment Analyzer

In [3]:
class SentimentAnalyzer:
    """Classify the sentiment of an article with a pre-trained, pickled model.

    Attributes:
        dictionary: Object loaded from ``dictionary.pkl``; it must provide
            ``doc2bow`` (presumably a gensim ``Dictionary`` -- confirm).
        classifier: Object loaded from ``maxent_sentiment_classifier.pkl``; it
            must provide ``classify`` (presumably an NLTK classifier -- confirm).
        stop_list: English stop words from NLTK.
        lemmatizer: NLTK WordNet lemmatizer.
    """

    def __init__(self):
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load model files that come from a trusted source.
        with open("dictionary.pkl", "rb") as f:
            self.dictionary = pickle.load(f)
        with open("maxent_sentiment_classifier.pkl", "rb") as f:
            self.classifier = pickle.load(f)
        self.stop_list = nltk.corpus.stopwords.words('english')
        self.lemmatizer = nltk.stem.WordNetLemmatizer()

    def preprocess_text(self, text):
        """Turn raw text into a list of unigram and bigram features.

        Steps: tokenize; keep alphanumeric tokens that are not stop words;
        lowercase; expand contractions and lemmatize each token; append the
        space-joined bigrams of the resulting tokens.

        NOTE(review): the stop-word test runs on the token *before* it is
        lowercased, and ``contractions.fix`` is applied to single tokens after
        tokenization. Both are kept exactly as they were because the pickled
        dictionary/classifier were presumably built with this same pipeline;
        changing either would alter the features the model sees -- confirm
        against the training code before touching them.
        """
        # Set membership is O(1) per token; the list lookup was linear.
        stop_words = set(self.stop_list)
        tokens = [w.lower() for w in nltk.word_tokenize(text) if w.isalnum() and w not in stop_words]
        tokens = [self.lemmatizer.lemmatize(contractions.fix(w)) for w in tokens]
        # Build the bigram list completely before extending ``tokens``:
        # extending while the n-gram iterator is still reading the same list
        # would feed the new items back into it.
        bigrams = [' '.join(pair) for pair in ngrams(tokens, 2)]
        tokens.extend(bigrams)
        return tokens

    def predict_sentiment(self, text):
        """Return the classifier's label for ``text``.

        The text is preprocessed, mapped to ``(token_id, frequency)`` pairs by
        ``self.dictionary.doc2bow`` and reduced to presence features
        ``{token_id: 1}`` (frequencies are discarded) before classification.
        """
        features = self.preprocess_text(text)
        bag_of_words = self.dictionary.doc2bow(features)
        # ``token_id`` rather than ``id`` so the builtin is not shadowed.
        presence_features = {token_id: 1 for (token_id, _frequency) in bag_of_words}
        return self.classifier.classify(presence_features)

Stock Sentiment Application

In [4]:
class StockSentimentApp:
    """Scrape news for a stock, classify each article and print a summary.

    Attributes:
        stock: The search term supplied by the caller.
        scraper: ``WebScraper`` used to collect headlines, articles and links.
        analyzer: ``SentimentAnalyzer`` used to label each article.
        sentiment_count: Number of articles per lowercase sentiment label for
            the most recent ``run()``.
    """

    def __init__(self, stock):
        self.stock = stock
        self.scraper = WebScraper(stock)
        self.analyzer = SentimentAnalyzer()
        self.sentiment_count = {'optimistic': 0, 'neutral': 0, 'pessimistic': 0}

    def run(self):
        """Scrape, classify and print per-article results plus an overall verdict.

        The browser is always closed, even when a scraping step raises; the
        exception then propagates to the caller.
        """
        try:
            self.scraper.setup_driver()
            self.scraper.accept_cookies()
            self.scraper.search_stock()
            self.scraper.scrape_articles()
        finally:
            # Without this, a failure in any step above would leave a headless
            # Chrome process running.
            self.scraper.close_driver()

        # Start from zero so calling run() again does not add to earlier counts.
        self.sentiment_count = {'optimistic': 0, 'neutral': 0, 'pessimistic': 0}

        for (i, (headline, article, hyperlink)) in enumerate(zip(self.scraper.headline_list, self.scraper.article_list, self.scraper.hyperlink_list)):
            sentiment = self.analyzer.predict_sentiment(article)
            print(f"{i+1}) Headline: {headline}")
            print(f"   Sentiment: {sentiment}")
            print(f"   Article Link: {hyperlink}\n")
            label = sentiment.lower()
            # .get() tolerates a label that is not one of the three expected keys.
            self.sentiment_count[label] = self.sentiment_count.get(label, 0) + 1

        print("Summary:")
        for (sentiment, count) in self.sentiment_count.items():
            print(f"{sentiment.capitalize()}: {count}\t", end="")

        # Fall back to the search term when the scraper found no display name.
        display_name = self.scraper.stock_name or self.stock
        if not any(self.sentiment_count.values()):
            # max() over all-zero counts would otherwise report the first key.
            print(f"\nNo articles could be analyzed for {display_name}.")
            return

        # On a tie, max() returns the first label in insertion order.
        overall_sentiment = max(self.sentiment_count, key=self.sentiment_count.get)
        print(f"\nThe overall sentiment for {display_name} is {overall_sentiment.capitalize()}.")


# Entry point: ask for a stock and run the full scrape-and-classify pipeline.
if __name__ == "__main__":
    # Strip stray whitespace so it is not typed into the search box.
    stock = input("Enter a Stock: ").strip()
    if stock:
        app = StockSentimentApp(stock)
        app.run()
    else:
        # An empty search term gives the scraper nothing to look up.
        print("No stock entered; nothing to analyze.")

1) Headline: Apple and Google ‘should face investigation over mobile browser duopoly’
   Sentiment: Neutral
   Article Link: https://finance.yahoo.com/news/apple-google-face-investigation-over-131128843.html

2) Headline: Apple Inc (AAPL)’s AI Push: Key Insights and Market Reactions
   Sentiment: Optimistic
   Article Link: https://finance.yahoo.com/news/apple-inc-aapl-ai-push-125341804.html

3) Headline: EU Drops Probe of Apple’s Treatment of Rival Audiobook, Ebook Developers in App Store
   Sentiment: Pessimistic
   Article Link: https://finance.yahoo.com/m/9be27d96-cc6e-32fb-9e72-ce8cc15e2c7a/eu-drops-probe-of-apple%E2%80%99s.html

4) Headline: UK should use new powers to probe Apple-Google mobile browser duopoly, report says
   Sentiment: Optimistic
   Article Link: https://finance.yahoo.com/news/uk-regulator-consider-probing-apples-114555053.html

5) Headline: Apple reportedly developing conversational Siri using LLMs
   Sentiment: Neutral
   Article Link: https://finance.yahoo.co