# Combined Workflow
Notes: 
* Continued issues accessing articles behind a paywall or external source (e.g., https://finance.yahoo.com/news/top-midday-stories-pepsico-buy-160405890.html)

In [112]:
import re
import csv
from time import sleep
from bs4 import BeautifulSoup
import requests
import yfinance as yf
import pandas as pd
import numpy as np
import math
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from seleniumbase import Driver
from pymongo import MongoClient
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from bs4 import BeautifulSoup
import google.generativeai as genai
import json
from openai import OpenAI
from datetime import datetime

In [114]:
##################################
# Get website html data 
##################################
def get_basesoup(driver, url, wait=False, until_class='ClassOfMyElement'):
    """Load ``url`` in ``driver`` and return the page parsed with BeautifulSoup.

    Args:
        driver: Selenium-style driver; its ``get`` method and ``page_source``
            attribute are used.
        url: Address of the page to load.
        wait: When true, wait up to three seconds for an element with class
            ``until_class`` to be present before reading the page source.
        until_class: CSS class name to wait for (only used when ``wait`` is
            true).

    Returns:
        A ``BeautifulSoup`` object built from ``driver.page_source`` with the
        ``"html.parser"`` parser.
    """
    driver.get(url)

    if wait:
        timeout_seconds = 3
        locator = (By.CLASS_NAME, until_class)
        try:
            WebDriverWait(driver, timeout_seconds).until(
                EC.presence_of_element_located(locator)
            )
        except TimeoutException:
            # A timeout is only reported; whatever has loaded is still parsed.
            print("Loading took too much time!")

    return BeautifulSoup(driver.page_source, "html.parser")

In [116]:
##################################
#Return a list of strings for a given url, 
#where each string is a sentence in the linked article.
##################################
def get_news_info(url):
    """Return the paragraphs of the article at ``url`` as a list of strings.

    The page is fetched through ``get_basesoup`` using the module-level
    ``driver``. The article body is located through a fixed chain of CSS
    classes; when the outer article container is missing, or any later lookup
    fails, a message is printed and an empty list is returned.

    Args:
        url: Address of the article page.

    Returns:
        A list with the stripped text of each matching ``<p>`` element, or
        ``[]`` when the article could not be extracted.
    """
    soup = get_basesoup(driver, url)

    try:
        article = soup.find("div", class_="article-wrap no-bb")
        if not article:
            print(f"No articles found on page {url}")
            return []

        # Each ``find`` returns None when the element is absent, so a missing
        # wrapper surfaces as an AttributeError on the next lookup.
        body_wrap = article.find("div", class_="body-wrap yf-i23rhs")
        body = body_wrap.find("div", class_="body yf-5ef8bf")
        paragraphs = body.find_all("p", class_="yf-1pe5jgt")
        return [paragraph.text.strip() for paragraph in paragraphs]
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop a long scraping run.
        print(f"Error accessing articles on page {url}")
        return []
    
    

In [118]:
#########################################
#Gets URLs from the Yahoo Finance API
########################################
def get_urls_yfinance(ticker):
    """Return the set of distinct ``'link'`` values from ``yf.Ticker(ticker).news``.

    Args:
        ticker: Ticker symbol passed to ``yf.Ticker``.

    Returns:
        A ``set`` containing the ``'link'`` entry of every news item.
    """
    links = set()
    for item in yf.Ticker(ticker).news:
        links.add(item['link'])
    return links

In [120]:
#########################################
# Scrape all related articles
########################################
def get_list_all_articles_text_data(urls):
    """Scrape every URL in ``urls`` and return one result per URL.

    Args:
        urls: Iterable of article addresses.

    Returns:
        A list with the value of ``get_news_info(url)`` for each URL, in
        iteration order.
    """
    # Web-scraping step: one page load per URL.
    return [get_news_info(url) for url in urls]

In [122]:
#####################################
#Combine list of sentences
##################################
def combine_sentences(text_data):
    """Concatenate the strings in ``text_data`` into a single string.

    Every sentence is prefixed with one space, so a non-empty result starts
    with a space; an empty input gives ``""``.

    Args:
        text_data: Iterable of strings.

    Returns:
        The combined text.
    """
    return "".join(" " + sentence for sentence in text_data)

In [124]:
##################################
#get finbert sentiment scores for one text
#################################
_FINBERT_MODEL_NAME = 'yiyanghkust/finbert-tone'

# Class order of finbert-tone's output logits as given on the model card
# (LABEL_0: neutral, LABEL_1: positive, LABEL_2: negative).
_FINBERT_LABELS = ['Neutral', 'Positive', 'Negative']

# Holds the loaded tokenizer/model so they are only loaded once per session.
_FINBERT_CACHE = {}


def _load_finbert():
    """Return ``(tokenizer, model)`` for FinBERT, loading them on first use."""
    if 'pair' not in _FINBERT_CACHE:
        tokenizer = BertTokenizer.from_pretrained(_FINBERT_MODEL_NAME)
        model = BertForSequenceClassification.from_pretrained(_FINBERT_MODEL_NAME)
        _FINBERT_CACHE['pair'] = (tokenizer, model)
    return _FINBERT_CACHE['pair']


def use_finbert(text_data):
    """Classify the sentiment of one article with FinBERT.

    The sentences are joined with ``combine_sentences`` and tokenized with
    truncation to 512 tokens, so only the beginning of a long article is
    scored.

    Args:
        text_data: List of sentence strings for a single article.

    Returns:
        ``[predicted_sentiment, confidence]`` where ``predicted_sentiment`` is
        ``'Neutral'``, ``'Positive'`` or ``'Negative'`` and ``confidence`` is
        the softmax probability of that class.
    """
    text = combine_sentences(text_data)
    tokenizer, model = _load_finbert()
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # Turn the raw logits into class probabilities.
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

    predicted_class = torch.argmax(probs).item()
    confidence = torch.max(probs).item()

    return [_FINBERT_LABELS[predicted_class], confidence]

In [126]:
##############################################
#get aggregate sentiment scores from one day from finbert
##############################################
def get_sentiment_scores_finbert(article_texts):
    """Aggregate FinBERT sentiment over a collection of articles.

    Args:
        article_texts: List of articles, each one a list of sentence strings
            as accepted by ``use_finbert``.

    Returns:
        ``[neutral_count, positive_count, negative_count, average_confidence]``.
        When ``article_texts`` is empty the counts are 0 and the average
        confidence is ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    counts = {"Neutral": 0, "Positive": 0, "Negative": 0}
    total_confidence = 0
    for text in article_texts:
        sentiment, confidence = use_finbert(text)
        # Labels outside the three known classes are not counted, but their
        # confidence still contributes to the average.
        if sentiment in counts:
            counts[sentiment] += 1
        total_confidence += confidence

    num_articles = len(article_texts)
    average_confidence = total_confidence / num_articles if num_articles else 0.0
    return [counts["Neutral"], counts["Positive"], counts["Negative"], average_confidence]

In [128]:
##################################
#get gemini sentiment scores for one text
#################################
def get_gemini_sentiment_score_one_article(api_key_gemini, text):
    """Ask Gemini for sentiment scores of one article and parse the JSON reply.

    Args:
        api_key_gemini: API key passed to ``genai.configure``.
        text: Full article text to analyse.

    Returns:
        The parsed JSON object on success, ``None`` when the model answers
        with an empty JSON object (``"{}"``), or ``{}`` when the reply is not
        valid JSON. Both failure values are falsy.
    """
    genai.configure(api_key=api_key_gemini)
    model = genai.GenerativeModel("gemini-1.5-flash")

    # The example at the end must be well-formed and use exactly the key names
    # the aggregation code reads ("neutral-sentiment", "positive-sentiment",
    # "negative-sentiment"), otherwise the model may copy misspelled keys.
    prompt = (
        "Please conduct sentiment analysis on the following articles of interest. "
        "Here is the text: " + text + "\n"
        "Output text should be in JSON format with no extra information or text. "
        "Do NOT include extra formatting. "
        "Your response should start with { and end with }. Do not include `. "
        "Include categories neutral-sentiment, positive-sentiment, "
        "negative-sentiment, summary and related-stocks. "
        "The sentiment categories should include an integer from 0 to 9, "
        "where 0 means that the text doesn't fit that category and 9 means it fits well. "
        "The summary should be a one-sentence summary about the text. "
        "Related-stocks should be the tickers of stocks related to the articles. "
        'If no article is given, output an empty json "{}" only. '
        "Here is an example of required formatting: "
        '{"neutral-sentiment": #, '
        '"positive-sentiment": #, '
        '"negative-sentiment": #, '
        '"related-stocks": ["ABC", "DEF", "GHI"], '
        '"summary": "Include a one-sentence summary of the article text here."}'
    )
    response = model.generate_content(prompt)

    # Strip surrounding whitespace so a trailing newline does not defeat the
    # empty-object check below.
    response_string = response.text.strip()
    if response_string == "{}":
        return None
    try:
        return json.loads(response_string)
    except json.JSONDecodeError:
        return {}

In [130]:
##################################
#get OpenAI sentiment scores for one text
#make sure to change the api_key string to
#appropriate key
#################################
def get_openai_sentiment_score_one_article(text):
    """Ask the OpenAI chat API for sentiment scores of one article.

    The ``api_key`` string below is a placeholder and must be replaced with a
    valid key before use.

    Args:
        text: Full article text to analyse.

    Returns:
        The parsed JSON object on success, or ``{}`` when the model answers
        with an empty JSON object, with no content, or with text that is not
        valid JSON.
    """
    client = OpenAI(
        api_key=""
    )

    # The example at the end uses exactly the key names the aggregation code
    # reads ("neutral-sentiment", "positive-sentiment", "negative-sentiment").
    prompt = (
        "Please conduct sentiment analysis on the following articles of interest. "
        "Here is the text: " + text + "\n"
        "Output text should be in JSON format with no extra information or text. "
        "Do NOT include extra formatting. "
        "Your response should start with { and end with }. Do not include `. "
        "Include categories neutral-sentiment, positive-sentiment, "
        "negative-sentiment, summary and related-stocks. "
        "The sentiment categories should include an integer from 0 to 9, "
        "where 0 means that the text doesn't fit that category and 9 means it fits well. "
        "The summary should be a one-sentence summary about the text. "
        "Related-stocks should be the tickers of stocks related to the articles. "
        'If no article is given, output an empty json "{}" only. '
        "Do not include any newline characters in your response. "
        "Here is an example of required formatting: "
        '{"neutral-sentiment": #, '
        '"positive-sentiment": #, '
        '"negative-sentiment": #, '
        '"related-stocks": ["ABC", "DEF", "GHI"], '
        '"summary": "Include a one-sentence summary of the article text here."}'
    )

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="gpt-4o",
    )

    # The message content can be None; treat that like an unparsable reply.
    response_string = (chat_completion.choices[0].message.content or "").strip()
    if response_string == "{}":
        return {}
    try:
        return json.loads(response_string)
    except json.JSONDecodeError:
        return {}



In [132]:
##############################################
#get aggregate sentiment scores from one day from OpenAI
##############################################
def get_all_openai_scores(article_texts):
    """Average the OpenAI sentiment scores over a collection of articles.

    Articles for which ``get_openai_sentiment_score_one_article`` returns a
    falsy value are skipped, but the sums are still divided by the total
    number of articles.

    Args:
        article_texts: List of articles, each one a list of sentence strings.

    Returns:
        ``[avg_neutral, avg_positive, avg_negative]``; ``[0, 0, 0]`` when
        ``article_texts`` is empty.
    """
    num_articles = len(article_texts)
    if num_articles == 0:
        return [0, 0, 0]

    total_neu = 0
    total_pos = 0
    total_neg = 0
    for article in article_texts:
        text_data = combine_sentences(article)
        sentiment_scores = get_openai_sentiment_score_one_article(text_data)
        if not sentiment_scores:
            continue
        total_neu += sentiment_scores["neutral-sentiment"]
        total_pos += sentiment_scores["positive-sentiment"]
        total_neg += sentiment_scores["negative-sentiment"]

    # The scores were previously computed but never returned, so callers
    # always received None.
    return [total_neu / num_articles, total_pos / num_articles, total_neg / num_articles]

In [134]:
##############################################
#get aggregate sentiment scores from one day from gemini
##############################################
def get_all_gemini_sentiment_scores(article_texts):
    """Average the Gemini sentiment scores over a collection of articles.

    Articles whose Gemini result is falsy are skipped, but the sums are still
    divided by the total number of articles. The API key string is a
    placeholder defined inside the function.

    Args:
        article_texts: List of articles, each one a list of sentence strings.

    Returns:
        ``[avg_neutral, avg_positive, avg_negative]``; ``[0,0,0]`` when
        ``article_texts`` is empty.
    """
    api_key_gemini = ""
    score_keys = ("neutral-sentiment", "positive-sentiment", "negative-sentiment")
    article_count = len(article_texts)
    totals = [0, 0, 0]

    for article in article_texts:
        result = get_gemini_sentiment_score_one_article(
            api_key_gemini, combine_sentences(article)
        )
        if result:
            for position, key in enumerate(score_keys):
                totals[position] += result[key]

    if article_count == 0:
        return [0, 0, 0]
    return [total / article_count for total in totals]

In [136]:
##############################################
#make pandas dataframe for one stock one day
##############################################
def make_dataframe(ticker):
    """Build the dataframe for one ticker: price data plus sentiment and metadata.

    Steps, in order: download price data with ``yf.download``, scrape the
    ticker's news articles, add the FinBERT aggregate columns, add the
    LLM average-score columns, then add the bookkeeping columns.

    Args:
        ticker: Ticker symbol; must be a key of the module-level ``sectors``.

    Returns:
        The assembled ``pandas.DataFrame``.
    """
    # NOTE(review): start and end are both "now"; confirm that yfinance
    # returns the intended row(s) for such a range.
    price_data = yf.download(ticker, start=datetime.now(), end=datetime.now())
    df = pd.DataFrame(price_data)
    article_texts = get_list_all_articles_text_data(get_urls_yfinance(ticker))

    finbert_columns = (
        "neutral-count-finbert",
        "positive-count-finbert",
        "negative-count-finbert",
        "average-confidence-finbert",
    )
    finbert_values = get_sentiment_scores_finbert(article_texts)
    for position, column in enumerate(finbert_columns):
        df[column] = finbert_values[position]

    # The columns are labelled "gemini" but the values come from
    # get_all_openai_scores.
    llm_columns = (
        "average-neutral-score-gemini",
        "average-positive-score-gemini",
        "average-negative-score-gemini",
    )
    llm_values = get_all_openai_scores(article_texts)
    for position, column in enumerate(llm_columns):
        try:
            df[column] = llm_values[position]
        except TypeError:
            # Reached e.g. when the scores are None; the column is set to 0.
            df[column] = 0

    df["prediction-label"] = ''
    df['ticker'] = ticker
    df['number-employees'] = get_num_employees(ticker)
    df['date_object'] = ''
    df['sector'] = sectors[ticker]
    return df

In [32]:
##############################################
#Get number of employees using yahoo finance API
#If getting the number of employees is unsuccessful,
#add an entry in the num_employees_dict manually.
##############################################
def get_num_employees(ticker):
    """Return the number of full-time employees for ``ticker``.

    The value is read from ``yf.Ticker(ticker).info['fullTimeEmployees']``.
    When that key is missing, a small hand-maintained table is consulted.

    Args:
        ticker: Ticker symbol.

    Returns:
        The employee count, or ``None`` when neither source has an entry.
    """
    # Manual fallback values for tickers whose info lacks the key.
    fallback_counts = {
        'TSM': 76478,
        'SBUX': 381000,
        'V': 105400,
    }
    stock = yf.Ticker(ticker)
    try:
        return stock.info['fullTimeEmployees']
    except KeyError:
        return fallback_counts.get(ticker)

In [34]:
##############################################
#Update MongoDB database by adding the dataframe
#passed as an argument
##############################################
def update_db(df):
    """Insert the rows of ``df`` into the MongoDB ``stocks`` collection.

    The connection string below is a placeholder and must be replaced with a
    valid MongoDB URI. Database and collection names are fixed
    (``predictive-analysis-dataset`` / ``stocks``).

    Args:
        df: DataFrame whose rows are uploaded, one document per row.

    Returns:
        None. Prints ``"Not uploaded to MongoDB"`` when there is nothing to
        upload or the insert raises ``TypeError``.
    """
    connection_string = ""

    # One dictionary per row, keyed by column name.
    records = df.to_dict(orient="records")
    if not records:
        # insert_many rejects an empty document list, so skip the round trip.
        print("Not uploaded to MongoDB")
        return

    # The context manager closes the client when done; this function is called
    # once per ticker, so an unclosed client would leak on every call.
    with MongoClient(connection_string) as client:
        collection = client["predictive-analysis-dataset"]["stocks"]
        try:
            collection.insert_many(records)
        except TypeError:
            print("Not uploaded to MongoDB")
    

In [60]:
##############################################
#Updates the prediction by using the previous day's
#close and the next day's open. This updates the prediction
#of one stock.
##############################################
def update_prediction_one_stock(stock_ticker, filename):
    """Label each row of one ticker by comparing it with the ticker's previous row.

    For every row after the first of ``stock_ticker`` (in file order),
    ``prediction-label`` becomes 1 when the previous row's ``Close`` is below
    this row's ``Open`` and 0 otherwise. The first row keeps its existing
    label. Rows of other tickers are left untouched.

    Args:
        stock_ticker: Ticker whose rows are updated.
        filename: CSV file that is read and then overwritten in place. It
            must contain ``ticker``, ``Date``, ``Open``, ``Close`` and
            ``prediction-label`` columns.
    """
    # Read the file that is written back below (previously a hard-coded
    # 'stock_data.csv' was read regardless of ``filename``).
    df = pd.read_csv(filename)
    is_ticker = df['ticker'] == stock_ticker
    stock_df = df[is_ticker].reset_index(drop=True)

    # The number of distinct dates bounds how many rows are labelled.
    num_dates = len(stock_df['Date'].unique())
    for i in range(1, num_dates):
        prev_close = stock_df.loc[i - 1, 'Close']
        next_open = stock_df.loc[i, 'Open']
        if prev_close < next_open:
            stock_df.loc[i, 'prediction-label'] = 1
        elif next_open <= prev_close:
            # Neither branch runs when a price is NaN; the label is then kept.
            stock_df.loc[i, 'prediction-label'] = 0

    # Copy the labels back onto this ticker's rows in their original order.
    if is_ticker.any():
        df.loc[is_ticker, 'prediction-label'] = stock_df['prediction-label'].to_numpy()
    df.to_csv(filename, index=False)

In [62]:
##############################################
#Updates all predictions in the dataframe by comparing the 
#current day's open and close.
##############################################
def update_all_predictions(filename):
    """Rewrite ``prediction-label`` for every row from that row's own prices.

    A row gets 0 when its ``Open`` is greater than its ``Close`` and 1 in
    every other case.

    Args:
        filename: CSV file that is read and then overwritten in place.
    """
    frame = pd.read_csv(filename)
    for row_label, record in frame.iterrows():
        closed_lower = record['Open'] > record['Close']
        frame.loc[row_label, 'prediction-label'] = 0 if closed_lower else 1
    frame.to_csv(filename, index=False)

In [138]:
##############################################
#Updates all the entries in the number of employees
#column
##############################################
def update_num_employees(filename):
    """Fill in missing ``number-employees`` values in a CSV file.

    Rows whose ``number-employees`` cell is missing get the result of
    ``get_num_employees`` for that row's ticker; other rows are unchanged.

    Args:
        filename: CSV file that is read and then overwritten in place.
    """
    frame = pd.read_csv(filename)
    for row_label, record in frame.iterrows():
        if not pd.isna(frame.loc[row_label, 'number-employees']):
            continue
        frame.loc[row_label, 'number-employees'] = get_num_employees(record['ticker'])
    frame.to_csv(filename, index=False)

In [71]:
##############################################
#Updates all the entries in the sector
#column
##############################################
def update_all_sectors(filename):
    """Set the ``sector`` column of every row from the module-level ``sectors``.

    Args:
        filename: CSV file that is read and then overwritten in place.

    Raises:
        KeyError: If a row's ticker has no entry in ``sectors``.
    """
    frame = pd.read_csv(filename)
    for row_label, record in frame.iterrows():
        row_ticker = record['ticker']
        frame.loc[row_label, 'sector'] = sectors[row_ticker]
    frame.to_csv(filename, index=False)

In [73]:
##############################################
# Stock ticker list that can be updated with any tickers
# when more financial and sentiment data needs to be collected.
##############################################
# Tickers processed by the daily collection loop, in processing order.
stock_ticker_list = [
    'LCID', 'PFE', 'VZ', 'NVDA', 'JNJ', 'T', 'RTX', 'MDT', 'GOOGL', 'BSX',
    'META', 'TSLA', 'AAPL', 'ABNB', 'AMZN', 'MSFT', 'V', 'WMT', 'PYPL', 'MA',
    'DIS', 'NFLX', 'AMD', 'INTU', 'GS', 'MS', 'KO', 'XOM', 'IBM', 'CVX',
    'UNH', 'PEP', 'HD', 'NKE', 'MCD', 'CSCO', 'BABA', 'BA', 'UNP', 'CAT',
    'GE', 'ORCL', 'AMGN', 'LMT', 'COP', 'TXN', 'ZM', 'SQ', 'SO', 'AXP',
    'DHR', 'COST', 'DE', 'LOW', 'KHC', 'BIDU', 'TMO', 'UAL', 'WFC', 'CL',
    'UPS', 'PM', 'BHP', 'TSM', 'SAP', 'C', 'QCOM', 'INTC', 'SLB', 'VLO',
    'CSX', 'AMT', 'DUK', 'NSC', 'STZ', 'LLY', 'KMI', 'CHTR', 'PG', 'LUV',
    'F', 'PGR', 'TGT', 'MCO', 'PRU', 'PLD', 'AIG', 'SPG', 'DOW', 'SBUX',
    'MSCI', 'TRV', 'ZTS', 'MMM', 'EXC', 'FIS', 'ISRG',
]


In [75]:
##############################################
# Each stock ticker above is associated with a 
# sector in this dictionary. When a stock ticker
# is added/deleted, entries in the sector dict
# must be added/deleted too.

# Sector Categories:
# Technology: Companies involved in electronics, software, and hardware (e.g., Apple, Microsoft, NVIDIA, Intel).
# Finance: Banks, insurance, investment firms (e.g., JPMorgan, Goldman Sachs, Visa, PayPal).
# Healthcare: Pharmaceutical companies, medical devices, healthcare providers (e.g., Pfizer, Johnson & Johnson, Medtronic, Amgen).
# Consumer Goods: Companies that produce goods for personal consumption (e.g., Coca-Cola, PepsiCo, McDonald's, Procter & Gamble).
# Energy: Oil, gas, renewable energy, utilities (e.g., ExxonMobil, Chevron, Duke Energy).
# Industrials: Companies in manufacturing, construction, defense, aerospace (e.g., Boeing, Honeywell, Lockheed Martin).
# Telecommunications: Companies that provide communication services (e.g., Verizon, AT&T, Comcast).
# Entertainment: Companies in media and entertainment (e.g., Netflix, Disney).
# Automotive: Companies related to cars, trucks, and electric vehicles (e.g., Tesla, Ford, Lucid Motors).
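# Aerospace: Companies in aircraft production and space exploration (e.g., Boeing, Lockheed Martin).
# Airlines: Passenger airlines (e.g., Southwest Airlines).
# Real Estate: Real estate investment trusts (e.g., Prologis, Simon Property Group).
# Chemicals: Chemical manufacturers (e.g., Dow).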
##############################################
# Maps each ticker symbol to its sector label. make_dataframe and
# update_all_sectors look tickers up with sectors[ticker], so a ticker that
# has no entry here raises KeyError.
sectors = {
    'LCID': 'Automotive',
    'PFE': 'Healthcare',
    'VZ': 'Telecommunications',
    'NVDA': 'Technology',
    'JNJ': 'Healthcare',
    'T': 'Telecommunications',
    'RTX': 'Industrials',
    'MDT': 'Healthcare',
    'GOOGL': 'Technology',
    'BSX': 'Healthcare',
    'META': 'Technology',
    'TSLA': 'Automotive',
    'AAPL': 'Technology',
    'ABNB': 'Consumer Goods',
    'AMZN': 'Consumer Goods',
    'MSFT': 'Technology',
    'V': 'Finance',
    'WMT': 'Consumer Goods',
    'PYPL': 'Finance',
    'MA': 'Finance',
    'DIS': 'Entertainment',
    'NFLX': 'Entertainment',
    'AMD': 'Technology',
    'INTU': 'Technology',
    'GS': 'Finance',
    'MS': 'Finance',
    'KO': 'Consumer Goods',
    'XOM': 'Energy',
    'IBM': 'Technology',
    'CVX': 'Energy',
    'UNH': 'Healthcare',
    'PEP': 'Consumer Goods',
    'HD': 'Consumer Goods',
    'NKE': 'Consumer Goods',
    'MCD': 'Consumer Goods',
    'CSCO': 'Technology',
    'BABA': 'Technology',
    'BA': 'Aerospace',
    'UNP': 'Industrials',
    'CAT': 'Industrials',
    'GE': 'Industrials',
    'ORCL': 'Technology',
    'AMGN': 'Healthcare',
    'LMT': 'Aerospace',
    'COP': 'Energy',
    'TXN': 'Technology',
    'ZM': 'Technology',
    'SQ': 'Technology',
    'SO': 'Energy',
    'AXP': 'Finance',
    'DHR': 'Healthcare',
    'COST': 'Consumer Goods',
    'DE': 'Industrials',
    'LOW': 'Consumer Goods',
    'KHC': 'Consumer Goods',
    'BIDU': 'Technology',
    'TMO': 'Healthcare',
    # NOTE(review): UAL is labelled 'Aerospace' while LUV below is labelled
    # 'Airlines' — confirm whether both airlines should share one label.
    'UAL': 'Aerospace',
    'WFC': 'Finance',
    'CL': 'Consumer Goods',
    'UPS': 'Industrials',
    'PM': 'Consumer Goods',
    'BHP': 'Energy',
    'TSM': 'Technology',
    'SAP': 'Technology',
    'C': 'Finance',
    'QCOM': 'Technology',
    'INTC': 'Technology',
    'SLB': 'Energy',
    'VLO': 'Energy',
    'CSX': 'Industrials',
    'AMT': 'Telecommunications',
    'DUK': 'Energy',
    'NSC': 'Industrials',
    'STZ': 'Consumer Goods',
    'LLY': 'Healthcare',
    'KMI': 'Energy',
    # SPY has a sector entry but does not appear in stock_ticker_list.
    'SPY': 'Finance',
    'CHTR': 'Telecommunications',
    'PG': 'Consumer Goods',
    'LUV': 'Airlines',
    'F': 'Automotive',
    'PGR': 'Finance',
    'TGT': 'Consumer Goods',
    'MCO': 'Finance',
    'PRU': 'Finance',
    'PLD': 'Real Estate',
    'AIG': 'Finance',
    'SPG': 'Real Estate',
    'DOW': 'Chemicals',
    'SBUX': 'Consumer Goods',
    'MSCI': 'Finance',
    'TRV': 'Finance',
    'ZTS': 'Healthcare',
    'MMM': 'Industrials',
    'EXC': 'Energy',
    'FIS': 'Finance',
    'ISRG': 'Healthcare'
}

In [77]:
##############################################
#uncomment the line below if stock_data.csv file hasn't been created
#make_dataframe('TSLA') #only necessary if stock_data file has not been created
#This makes the dataframe and adds it to the csv file.
#THIS HAS TO BE RUN EVERY WEEKDAY TO COLLECT DATA EVERY DAY.
##############################################
# NOTE: ``options`` is created but not passed to ``Driver`` below; it is kept
# so that any later cell referring to it keeps working.
options = webdriver.ChromeOptions()

# ``driver`` must stay a module-level name: get_news_info reads it as a global.
driver = Driver(uc=True, incognito=True)
try:
    for stock in stock_ticker_list:
        curr_df = make_dataframe(stock)
        # Append without a header row; the CSV is expected to exist already.
        curr_df.to_csv('stock_data.csv', mode = 'a', header = False)
        update_db(curr_df)
finally:
    # Always shut the browser down, even when the loop is interrupted or a
    # ticker fails, so no browser process is left running.
    driver.quit()

KeyboardInterrupt: 