In [207]:
import os
import requests
import re
from textblob import TextBlob
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

import finnhub
from dotenv import load_dotenv
from pathlib import Path    
import numpy as np
import pandas as pd
import datetime
from bs4 import BeautifulSoup
import sys
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
sys.path.append('../') # Change the python path at runtime
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

# Self-created modules
from src.utils import path as path_yq








# Set up Variables

In [3]:
load_dotenv()

SUBJECTIVITY_API_KEY = os.environ.get('SUBJECTIVITY_API_KEY')
POLARITY_API_KEY = os.environ.get('POLARITY_API_KEY')
INTENSITY_API_KEY = os.environ.get('INTENSITY_API_KEY')
FINNHUB_API_KEY = os.environ.get('FINNHUB_API_KEY')

In [51]:
BACKTEST_START_DATE = '2023-11-01'
BACKTEST_END_DATE = '2024-01-28'

cur_dir = Path.cwd()
root_dir = path_yq.get_root_dir(cur_dir)

# Finnhub API

- Finnhub API
https://finnhub.io/docs/api/company-news

# Fetch News Data

UNIX timezone 1706743226
https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&cad=rja&uact=8&ved=2ahUKEwjMxeO_w5CEAxWByjgGHQwwCT8QFnoECA0QAw&url=https%3A%2F%2Fnote.nkmk.me%2Fen%2Fpython-unix-time-datetime%2F%23%3A~%3Atext%3DUnix%2520time%2520is%2520the%2520number%2CPosix%2520time%252C%2520and%2520so%2520on.&usg=AOvVaw0s2rDBPUsnO1N5cO74o2AR&opi=89978449

TODO: Need to remove similar news (might not be the same title)

- 2024-02-04: API down 

In [60]:
dates_path = Path.joinpath(root_dir, 'data', 'raw', 'trading_dates.csv')
dates = pd.read_csv(dates_path, index_col=False)

In [62]:
dates

Unnamed: 0,Date
0,2023-11-01
1,2023-11-02
2,2023-11-03
3,2023-11-06
4,2023-11-07
...,...
56,2024-01-24
57,2024-01-25
58,2024-01-26
59,2024-01-29


In [66]:
finnhub_client = finnhub.Client(api_key=FINNHUB_API_KEY)

# print(finnhub_client.general_news('general', min_id=0))

# Need to use _from instead of from to avoid conflict

#TODO: Cannot fetch all days together. Need to fetch one by one, but rmb rate limit

all_news = pd.DataFrame()

tar_dir = Path.joinpath(root_dir, 'data', 'raw', 'boeing_news')
tar_dir.mkdir(parents=True, exist_ok=True)

# 61 dates
for date in dates['Date']:
    print(date)
    try:
        resp = finnhub_client.company_news('BA', _from=date, to=date)
        df = pd.DataFrame(resp)
        if not df.empty:
            all_news = pd.concat([all_news, df], axis=0)

            csv_path = Path.joinpath(tar_dir, f'{date}.csv')
            df.to_csv(csv_path, index=False)
            
            print(f"Saved news for {date}")
        else:
            print(f"No news for {date}")

        time.sleep(2)  # Respect the API's rate limit
    except Exception as e:
        print(f"Error fetching news for {date}: {e}")

# Save the combined news DataFrame
all_news_path = Path.joinpath(tar_dir.parent, 'boeing_news_20231101_to_20240131.csv')
all_news.to_csv(all_news_path, index=False)
print("All news data fetched and saved.")


# TODO: Explore
# Company Peers
# print(finnhub_client.company_peers('AAPL'))

2023-11-01
Saved news for 2023-11-01
2023-11-02
Saved news for 2023-11-02
2023-11-03
Saved news for 2023-11-03
2023-11-06
Saved news for 2023-11-06
2023-11-07
Saved news for 2023-11-07
2023-11-08
Saved news for 2023-11-08
2023-11-09
Saved news for 2023-11-09
2023-11-10
Saved news for 2023-11-10
2023-11-13
Saved news for 2023-11-13
2023-11-14
Saved news for 2023-11-14
2023-11-15
Saved news for 2023-11-15
2023-11-16
Saved news for 2023-11-16
2023-11-17
Saved news for 2023-11-17
2023-11-20
Saved news for 2023-11-20
2023-11-21
Saved news for 2023-11-21
2023-11-22
Saved news for 2023-11-22
2023-11-24
Saved news for 2023-11-24
2023-11-27
Saved news for 2023-11-27
2023-11-28
Saved news for 2023-11-28
2023-11-29
Saved news for 2023-11-29
2023-11-30
Saved news for 2023-11-30
2023-12-01
Saved news for 2023-12-01
2023-12-04
Saved news for 2023-12-04
2023-12-05
Saved news for 2023-12-05
2023-12-06
Saved news for 2023-12-06
2023-12-07
Saved news for 2023-12-07
2023-12-08
Saved news for 2023-12-08
2

In [55]:
pd.to_datetime(1706023800, unit="s")

Timestamp('2024-01-23 15:30:00')

In [78]:
news_path = Path.joinpath(root_dir, 'data', 'raw', 'boeing_news_20231101_to_20240131.csv')
df = pd.DataFrame(pd.read_csv(news_path))
display(df)



Unnamed: 0,category,datetime,headline,id,image,related,source,summary,url
0,company,1698879069,UPDATE 4-Boeing says 'cyber incident' hit part...,123567202,https://s.yimg.com/cv/apiv2/social/images/yaho...,BA,Yahoo,"Boeing, one of the world's largest defense and...",https://finnhub.io/api/news?id=4f4d4f7f51b5495...
1,company,1698878899,Boeing says 'cyber incident' hit parts busines...,123567203,https://media.zenfs.com/en/reuters-finance.com...,BA,Yahoo,"WASHINGTON (Reuters) -Boeing, one of the world...",https://finnhub.io/api/news?id=5b36596aef91c5f...
2,company,1698877471,UPDATE 1-US Air Force blows up Minuteman III i...,123567205,https://s.yimg.com/cv/apiv2/social/images/yaho...,BA,Yahoo,The U.S. Air Force said on Wednesday it had bl...,https://finnhub.io/api/news?id=37c95272c9136be...
3,company,1698838200,He Left Boeing. Now He’s the Jet Maker’s Most ...,123544220,,BA,Yahoo,Pat Shanahan is taking over Spirit AeroSystems...,https://finnhub.io/api/news?id=0734bb1d9a76a47...
4,company,1698857220,Boeing shares higher as Goldman Sachs adds pla...,123567208,https://s.yimg.com/ny/api/res/1.2/E5UuO8v96Pop...,BA,Yahoo,Longer-term aircraft demand remains a more pow...,https://finnhub.io/api/news?id=bdae52e4e5ec724...
...,...,...,...,...,...,...,...,...,...
2115,company,1706580480,"Delta begins refresh on select Boeing jets, ex...",125411052,,BA,Thefly.com,Looking for stock market analysis and research...,https://finnhub.io/api/news?id=e028898028c7a4c...
2116,company,1706578515,Correction to Boeing Article,125396750,,BA,Finnhub,Boeing made the request to the Federal Aviatio...,https://finnhub.io/api/news?id=1da3575cc5d6fb2...
2117,company,1706578320,7 Retirement Stocks to Anchor Your Portfolio f...,125408289,,BA,InvestorPlace,Looking for stock market analysis and research...,https://finnhub.io/api/news?id=135cf06db0f1bc9...
2118,company,1706576280,Boeing drops request for MAX 7 exemption from ...,125418618,,BA,Alliance News,Looking for stock market analysis and research...,https://finnhub.io/api/news?id=f975f0323c4c576...


In [79]:
# General information
print(df.describe(include='all'))


       category      datetime                                     headline  \
count      2120  2.120000e+03                                         2120   
unique        1           NaN                                         1992   
top     company           NaN  What You Missed On Wall Street This Morning   
freq       2120           NaN                                           17   
mean        NaN  1.703565e+09                                          NaN   
std         NaN  2.322037e+06                                          NaN   
min         NaN  1.698804e+09                                          NaN   
25%         NaN  1.701427e+09                                          NaN   
50%         NaN  1.704498e+09                                          NaN   
75%         NaN  1.705546e+09                                          NaN   
max         NaN  1.706658e+09                                          NaN   

                  id                                           

In [80]:
# Check NA values
print(df.isna().sum())

# Convert UNIX to pandas datetime
df['datetime2'] = pd.to_datetime(df['datetime'], unit="s")
df.loc[pd.isna(df['summary']), :]


category       0
datetime       0
headline       0
id             0
image       1196
related        0
source         0
summary       11
url            0
dtype: int64


Unnamed: 0,category,datetime,headline,id,image,related,source,summary,url,datetime2
325,company,1700479008,Deutsche Bank Upgrades Boeing (BA),123991449,,BA,Fintel,,https://finnhub.io/api/news?id=97d67e03c4787d7...,2023-11-20 11:16:48
436,company,1701177335,RBC Capital Upgrades Boeing (BA),124139945,,BA,Fintel,,https://finnhub.io/api/news?id=6a765cba60e0c95...,2023-11-28 13:15:35
529,company,1701436102,Stifel Initiates Coverage of Boeing (BA) with ...,124222334,,BA,Fintel,,https://finnhub.io/api/news?id=508f1cee94cf1a0...,2023-12-01 13:08:22
685,company,1702415986,William Blair Initiates Coverage of Boeing (BA...,124467956,,BA,Fintel,,https://finnhub.io/api/news?id=1e22f078b43aa81...,2023-12-12 21:19:46
1293,company,1704874800,My Dividend Stock Portfolio: New December Divi...,124963549,,BA,SeekingAlpha,,https://finnhub.io/api/news?id=c9e29c89a81ddd0...,2024-01-10 08:20:00
1452,company,1705446325,Wells Fargo Downgrades Boeing (BA),125091559,,BA,Fintel,,https://finnhub.io/api/news?id=fcd7a145e602912...,2024-01-16 23:05:25
1883,company,1706199660,NEW YORK MARKET CLOSE: GDP and easing inflatio...,125326823,,BA,Alliance News,,https://finnhub.io/api/news?id=5df32569aba0983...,2024-01-25 16:21:00
1892,company,1706186105,United Airlines: A Bright Future Despite Boein...,125311137,,BA,SeekingAlpha,,https://finnhub.io/api/news?id=3d0a145860b5853...,2024-01-25 12:35:05
1973,company,1706246222,B of A Securities Downgrades Boeing (BA),125315964,,BA,Fintel,,https://finnhub.io/api/news?id=ad83d971b24c01c...,2024-01-26 05:17:02
2042,company,1706510801,Wall Street Breakfast Podcast: United Airlines...,125373285,,BA,SeekingAlpha,,https://finnhub.io/api/news?id=f8a6021d052f26e...,2024-01-29 06:46:41


In [87]:
df2 = df.drop(columns=['image'])
df2.dropna(inplace=True)
df2.reset_index(inplace=True, drop=True)
df2

Unnamed: 0,category,datetime,headline,id,related,source,summary,url,datetime2
0,company,1698879069,UPDATE 4-Boeing says 'cyber incident' hit part...,123567202,BA,Yahoo,"Boeing, one of the world's largest defense and...",https://finnhub.io/api/news?id=4f4d4f7f51b5495...,2023-11-01 22:51:09
1,company,1698878899,Boeing says 'cyber incident' hit parts busines...,123567203,BA,Yahoo,"WASHINGTON (Reuters) -Boeing, one of the world...",https://finnhub.io/api/news?id=5b36596aef91c5f...,2023-11-01 22:48:19
2,company,1698877471,UPDATE 1-US Air Force blows up Minuteman III i...,123567205,BA,Yahoo,The U.S. Air Force said on Wednesday it had bl...,https://finnhub.io/api/news?id=37c95272c9136be...,2023-11-01 22:24:31
3,company,1698838200,He Left Boeing. Now He’s the Jet Maker’s Most ...,123544220,BA,Yahoo,Pat Shanahan is taking over Spirit AeroSystems...,https://finnhub.io/api/news?id=0734bb1d9a76a47...,2023-11-01 11:30:00
4,company,1698857220,Boeing shares higher as Goldman Sachs adds pla...,123567208,BA,Yahoo,Longer-term aircraft demand remains a more pow...,https://finnhub.io/api/news?id=bdae52e4e5ec724...,2023-11-01 16:47:00
...,...,...,...,...,...,...,...,...,...
2104,company,1706580480,"Delta begins refresh on select Boeing jets, ex...",125411052,BA,Thefly.com,Looking for stock market analysis and research...,https://finnhub.io/api/news?id=e028898028c7a4c...,2024-01-30 02:08:00
2105,company,1706578515,Correction to Boeing Article,125396750,BA,Finnhub,Boeing made the request to the Federal Aviatio...,https://finnhub.io/api/news?id=1da3575cc5d6fb2...,2024-01-30 01:35:15
2106,company,1706578320,7 Retirement Stocks to Anchor Your Portfolio f...,125408289,BA,InvestorPlace,Looking for stock market analysis and research...,https://finnhub.io/api/news?id=135cf06db0f1bc9...,2024-01-30 01:32:00
2107,company,1706576280,Boeing drops request for MAX 7 exemption from ...,125418618,BA,Alliance News,Looking for stock market analysis and research...,https://finnhub.io/api/news?id=f975f0323c4c576...,2024-01-30 00:58:00


In [144]:
# Double confirm there is no more na
print(df2.isna().sum())
print('\n')
print(f"Number of unique dates: {len(df2['datetime2'].dt.strftime('%Y-%m-%d').unique())}")


category     0
datetime     0
headline     0
id           0
related      0
source       0
summary      0
url          0
datetime2    0
soup         0
dtype: int64




NameError: name 'df2' is not defined

In [94]:
# Number of duplicated values in id
print(df['id'].duplicated().sum())

# Print unique values in different columns
for column in ['category', 'source', 'related']:
    print(f"Unique values in '{column}':")
    print(df[column].unique())

0
Unique values in 'category':
['company']
Unique values in 'source':
['Yahoo' 'MarketWatch' 'Seeking Alpha' 'Market News Video' 'Thefly.com'
 'Benzinga' 'SeekingAlpha' 'TipRanks' 'GuruFocus' 'Associated Press, The'
 'Alliance News' 'InvestorPlace' 'PR Newswire' 'TalkMarkets' 'Finnhub'
 'StockMarket' 'Fintel' 'DowJones' 'Stock Options Channel' 'ETF Channel'
 'Business Wire' 'United Press International' '247WallSt'
 'Preferred Stock Channel']
Unique values in 'related':
['BA']


# Remove similar news

A lot of summary has something like this "Looking for stock market analysis and research with proves results? Zacks.com offers in-depth financial research with over 30years of proven results."
TODO: Find out why

In [None]:
# pd.to_datetime(1706519791 - 1706512579, unit='s')

def find_similar_news(df):
    """
    Find similar news in df.
    """
    # Example using 'news_content' column
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['summary'])

    # Compute cosine similarity matrix
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Define a threshold for identifying similar articles (e.g., 0.5)
    threshold = 0.6

    # Find articles that are similar based on the threshold
    similar_pairs = []
    for i in range(cosine_sim.shape[0]):
        for j in range(i+1, cosine_sim.shape[0]):
            if cosine_sim[i, j] > threshold:
                similar_pairs.append((i, j))

    # similar_pairs contains pairs of indices in your DataFrame that are similar
                
    print(len(similar_pairs))
    # print(similar_pairs)

    duplicated_idx = set()
    for i, j in similar_pairs:
        # Published less than 1 day
        if abs(df.loc[i, 'datetime'] - df.loc[j, 'datetime']) < 86400:
            duplicated_idx.add(j)
            print(f"Published on the same day")
        # print(df.loc[i, 'summary'])
        # print(df.loc[j, 'summary'])
        # print('\n')

    

    duplicated_list = sorted(list(duplicated_idx))
    # print(duplicated_list)
    return duplicated_list

duplicated_list = find_similar_news(df2)



In [96]:
# Drop the rows identified to be similar
df3 = df2.drop(index=duplicated_list, axis=0).reset_index(drop=True)
print(f"Dropped {len(duplicated_list)} rows with similar summaries.")

# Removed similar news
dedup_path = Path.joinpath(root_dir, 'data', 'proc', 'boeing_dedup_20231101_to_20240131.csv')
df3.to_csv(dedup_path, index=False)

Dropped 982 rows with similar summaries.


Dropped 982 rows with similar summaries.
Left with about 1k+ (about half)

# News Content

## Fetch News Content
- Problems
    - Premium subscription

In [235]:
# More robust way to fetch news for a df
# Need to think of how to "add in" the content for rows with problem (additional iterations)
def fetch_news_content(df, col):
    """
    Fetches news content for each URL in the DataFrame and updates/adds a column with the fetched content.
    
    Parameters:
    - df (pd.DataFrame): DataFrame containing news URLs.
    - col (str): Column name where the fetched content will be stored.
    """
    # Check if the column exists, if not, initialize it
    if col not in df.columns:
        df[col] = pd.NA  # Using pandas NA for missing data
        
    # Create an empty list to store news content
    content_list = []

    session = requests.Session()  # Use session for connection pooling

    # Set up save directory
    cur_dir = Path.cwd()
    tar_dir = Path.joinpath(path_yq.get_root_dir(cur_dir=cur_dir), 'data', 'proc')
    
    
    # executable_path='/Users/tangyiqwan/dev/tools/chromedriver-mac-arm64/chromedriver'

    
    # Correct instantiation of the WebDriver using WebDriver Manager
    # driver = webdriver.Chrome(ChromeDriverManager().install())
    # Set the page load timeout
    
    pattern = r'Thank you for your patience'

    for index, row in df.iterrows():
        try:# Count rows containing the pattern
            content = row.get(col, None)
            if pd.isna(content) or content in ['Timeout', 'Error', ''] or re.search(pattern, content):
                # Use Selenium for some Javascript loaded page
                
                # Set up driver every time (when I put outside loop cannot work)
                service = Service()
                options = Options()
                options.headless = False
                driver = webdriver.Chrome(options=options)
                driver.set_page_load_timeout(30)  # Timeout in seconds
                driver.get(row['url'])
                
                time.sleep(np.random.uniform(3, 5))
                soup = driver.page_source

                driver.quit()
                # print(soup)
                content_list.append(str(soup))  # Convert soup object to string to store in DataFrame
                
                # response = session.get(row['url'], timeout=5)
                # soup = BeautifulSoup(response.content, 'html.parser')
                # print(soup)
                # content_list.append(str(soup))  # Convert soup object to string to store in DataFrame
            else:
                content_list.append(row[col])
        except TimeoutException:
            print(f"Timeout occurred for index, url: {index}, {row['url']}")
            content_list.append("Timeout")
        except Exception as e:
            print(f"Error fetching content from {row['url']}: {e}")
            content_list.append("Error")
        
        # Save periodically
        # if index % 10 == 0:  # For example, save every 10 articles
        #     df.loc[:index, col] = pd.Series(content_list)
        #     partial_file_path = Path.joinpath(tar_dir, f'boeing_partial_{col}_20231101_to_20240131.csv')
        #     df.to_csv(partial_file_path, index=False)
        
        time.sleep(1)  # Delay to avoid server blocking
    
    # print(content_list)
    # Update the DataFrame with all the fetched content
    df[col] = pd.Series(content_list, index=df.index)
    # print(df)

    full_file_path = Path.joinpath(tar_dir, f'boeing_{col}_20231101_to_20240131.csv')
    # Save the final DataFrame to CSV
    df.to_csv(full_file_path, index=False)

    return df


In [229]:
dedup_path = Path.joinpath(root_dir, 'data', 'proc', 'boeing_dedup_20231101_to_20240131.csv')
df3 = pd.read_csv(dedup_path, index_col=False)

In [241]:
# Now has 267 errors
# Assuming it has a url column
# path = Path.joinpath(root_dir, 'data', 'proc', 'boeing_partial_soup_20231101_to_20240131.csv')
path = Path.joinpath(root_dir, 'data', 'proc', 'boeing_soup_20231101_to_20240131.csv')
df4 = pd.read_csv(path, index_col=False)

for i in range(5):
    df4 = fetch_news_content(df4, 'soup')
    error_df = find_error(df4.iloc[:412])
    print(f"The number of errors is: {len(error_df)}")


# Try to fill up NA (the df is defined later)
# fetch_news_content(df5)
    

  error_df = df[df['soup'].isin(['Timeout', 'Error', '']) | df['soup'].isna() | df4['soup'].str.contains(pattern, regex=True, na=False)]


Unnamed: 0,category,datetime,headline,id,related,source,summary,url,datetime2,soup
3,company,1698857220,Boeing shares higher as Goldman Sachs adds pla...,123567208,BA,Yahoo,Longer-term aircraft demand remains a more pow...,https://finnhub.io/api/news?id=bdae52e4e5ec724...,2023-11-01 16:47:00,Timeout
4,company,1698856800,"Boeing Co. stock rises Wednesday, outperforms ...",123561515,BA,MarketWatch,Shares of Boeing Co. rose 1.37% to $189.38 Wed...,https://finnhub.io/api/news?id=dc7aa107a6d496b...,2023-11-01 16:40:00,Timeout
5,company,1698852540,SpaceX Chief Elon Musk calls out his strugglin...,123567209,BA,Yahoo,One of Elon Musk's secondary goals with SpaceX...,https://finnhub.io/api/news?id=1c4a165f7274f89...,2023-11-01 15:29:00,Timeout
8,company,1698840660,"Goldman Sachs' 25 ""conviction list"" stocks to buy",123546609,BA,Seeking Alpha,Looking for stock market analysis and research...,https://finnhub.io/api/news?id=5eda28e6d58826b...,2023-11-01 12:11:00,Timeout
11,company,1698832800,Dow's 100-point rally highlighted by gains for...,123561517,BA,MarketWatch,Led by positive growth for shares of Microsoft...,https://finnhub.io/api/news?id=4f191e8aed3e191...,2023-11-01 10:00:00,Timeout


The number of errors is: 107


In [244]:
len(df4)

1127

- I think is using beautiful soup only?
    - First round has 394 "Thank you for your patience" in 1096 rows (after removing erros, NA, etc.)

- Selenium:
    - After 1st round have 267/1127 errors (df4 length is 1127)
    - Expect 2nd time have about 30% improvements

In [179]:
# pattern = r'Thank you for your patience'

# # Count rows containing the pattern
# rows_with_phrase = df4['soup'].str.contains(pattern, regex=True, na=False)
# count_with_phrase = rows_with_phrase.sum()
# count_with_phrase

393

In [143]:
soup_path = Path.joinpath(root_dir, 'data', 'proc', 'boeing_soup_20231101_to_20240131.csv')
df4.to_csv(soup_path, index=False)

In [146]:
# TODO: Fix Save news_df4 to csv first
df4 = pd.read_csv(soup_path, index_col=False)
df4

Unnamed: 0,category,datetime,headline,id,related,source,summary,url,datetime2,soup
0,company,1698879069,UPDATE 4-Boeing says 'cyber incident' hit part...,123567202,BA,Yahoo,"Boeing, one of the world's largest defense and...",https://finnhub.io/api/news?id=4f4d4f7f51b5495...,2023-11-01 22:51:09,"<!DOCTYPE html>\n\n<html lang=""en-us""><head>\n..."
1,company,1698877471,UPDATE 1-US Air Force blows up Minuteman III i...,123567205,BA,Yahoo,The U.S. Air Force said on Wednesday it had bl...,https://finnhub.io/api/news?id=37c95272c9136be...,2023-11-01 22:24:31,"<!DOCTYPE html>\n\n<html lang=""en-us""><head>\n..."
2,company,1698838200,He Left Boeing. Now He’s the Jet Maker’s Most ...,123544220,BA,Yahoo,Pat Shanahan is taking over Spirit AeroSystems...,https://finnhub.io/api/news?id=0734bb1d9a76a47...,2023-11-01 11:30:00,"<!DOCTYPE html>\n\n<html lang=""en-us""><head>\n..."
3,company,1698857220,Boeing shares higher as Goldman Sachs adds pla...,123567208,BA,Yahoo,Longer-term aircraft demand remains a more pow...,https://finnhub.io/api/news?id=bdae52e4e5ec724...,2023-11-01 16:47:00,"<!DOCTYPE html>\n\n<html lang=""en-us""><head>\n..."
4,company,1698856800,"Boeing Co. stock rises Wednesday, outperforms ...",123561515,BA,MarketWatch,Shares of Boeing Co. rose 1.37% to $189.38 Wed...,https://finnhub.io/api/news?id=dc7aa107a6d496b...,2023-11-01 16:40:00,"<!DOCTYPE html>\n\n<html data-env=""prod"" data-..."
...,...,...,...,...,...,...,...,...,...,...
1122,company,1706610048,Ryanair CEO says he has confidence in Boeing,125399932,BA,Yahoo,Ryanair CEO Michael o’Leary said on Tuesday he...,https://finnhub.io/api/news?id=5fd915e2ac70e41...,2024-01-30 10:20:48,"<!DOCTYPE html>\n<html class=""NoJs Fz(62.5%) P..."
1123,company,1706598983,News Highlights : Top Company News of the Day ...,125398834,BA,Finnhub,"GM's Fourth-Quarter Profit Dinged by Strike, E...",https://finnhub.io/api/news?id=21445f9ebdcf39e...,2024-01-30 07:16:23,"<!DOCTYPE html>\n\n<html lang=""en"">\n<head>\n<..."
1124,company,1706595240,Boeing Withdraws Safety Exemption Request for ...,125399933,BA,Yahoo,Boeing is withdrawing a request for a safety e...,https://finnhub.io/api/news?id=51c0e727c994bf1...,2024-01-30 06:14:00,"<!DOCTYPE html>\n\n<html lang=""en-us""><head>\n..."
1125,company,1706591102,"U.S. futures subdued; Microsoft, Alphabet to r...",125398829,BA,Yahoo,Investing.com -- U.S. futures hovered broadly ...,https://finnhub.io/api/news?id=5883fe6bad3f9ac...,2024-01-30 05:05:02,"<!DOCTYPE html>\n<html class=""NoJs Fz(62.5%) P..."


In [234]:
# Check if there is any unsuccessful events
def find_error(df):
    pattern = r'Thank you for your patience'
    error_df = df[df['soup'].isin(['Timeout', 'Error', '']) | df['soup'].isna() | df4['soup'].str.contains(pattern, regex=True, na=False)]

    if len(error_df):
        display(error_df.head())
        pass
    else:
        print("All content are successfully fetched.")

    return error_df

error_df = find_error(df4)
print(len(error_df))

Unnamed: 0,category,datetime,headline,id,related,source,summary,url,datetime2,soup
3,company,1698857220,Boeing shares higher as Goldman Sachs adds pla...,123567208,BA,Yahoo,Longer-term aircraft demand remains a more pow...,https://finnhub.io/api/news?id=bdae52e4e5ec724...,2023-11-01 16:47:00,Timeout
4,company,1698856800,"Boeing Co. stock rises Wednesday, outperforms ...",123561515,BA,MarketWatch,Shares of Boeing Co. rose 1.37% to $189.38 Wed...,https://finnhub.io/api/news?id=dc7aa107a6d496b...,2023-11-01 16:40:00,Timeout
5,company,1698852540,SpaceX Chief Elon Musk calls out his strugglin...,123567209,BA,Yahoo,One of Elon Musk's secondary goals with SpaceX...,https://finnhub.io/api/news?id=1c4a165f7274f89...,2023-11-01 15:29:00,Timeout
8,company,1698840660,"Goldman Sachs' 25 ""conviction list"" stocks to buy",123546609,BA,Seeking Alpha,Looking for stock market analysis and research...,https://finnhub.io/api/news?id=5eda28e6d58826b...,2023-11-01 12:11:00,Timeout
11,company,1698832800,Dow's 100-point rally highlighted by gains for...,123561517,BA,MarketWatch,Led by positive growth for shares of Microsoft...,https://finnhub.io/api/news?id=4f191e8aed3e191...,2023-11-01 10:00:00,Timeout


267


In [214]:
def drop_na(df):
    # Drop all the news_content with na
    print(df.isna().sum())
    df1 = df.dropna()
    df1.reset_index(inplace=True, drop=True)
    return df1

# df6['url'].duplicated().sum()
df5 = drop_na(df4)
df5


category     0
datetime     0
headline     0
id           0
related      0
source       0
summary      0
url          0
datetime2    0
soup         0
dtype: int64


Unnamed: 0,category,datetime,headline,id,related,source,summary,url,datetime2,soup
0,company,1698879069,UPDATE 4-Boeing says 'cyber incident' hit part...,123567202,BA,Yahoo,"Boeing, one of the world's largest defense and...",https://finnhub.io/api/news?id=4f4d4f7f51b5495...,2023-11-01 22:51:09,"<html id=""atomic"" class=""Fz(62.5%) Pos(r) fina..."
1,company,1698877471,UPDATE 1-US Air Force blows up Minuteman III i...,123567205,BA,Yahoo,The U.S. Air Force said on Wednesday it had bl...,https://finnhub.io/api/news?id=37c95272c9136be...,2023-11-01 22:24:31,"<html id=""atomic"" class=""Fz(62.5%) Pos(r) fina..."


In [223]:
def parse_soup(soup, min_length=10):
    """
    Extract text from <p> tags and filter out short or irrelevant content.
    
    Parameters:
    - soup (str): Soup content scraped beforehand.
    - min_length (int): Minimum length of text to keep.
    
    Returns:
    - str: Filtered and concatenated text from <p> tags.
    """
    soup = BeautifulSoup(soup, 'html.parser')
    # Target a specific division by class (or you can use id, etc.)
    target_class = 'caas-body' # FIXME: For Yahoo
    target_div = soup.find('div', class_=target_class)
    # print(soup)
    # If the target division is found, extract texts from its <p> tags
    if target_div:
        p_texts = [p.get_text().strip() for p in target_div.find_all('p')]
    else:
        # Fallback to all <p> tags if the target division is not found
        p_texts = [p.get_text().strip() for p in soup.find_all('p')]
    
    # Exclude texts based on keywords or if they're too short
    # keywords_to_exclude = ['Tip:', 'News •']
    # filtered_texts = [text for text in p_texts if len(text) >= min_length and not any(keyword in text for keyword in keywords_to_exclude)]
    
    # Filter out short or irrelevant content
    filtered_texts = [text for text in p_texts if len(text) >= min_length] # TODO:
    print(filtered_texts)  # Debugging print, can be removed
    return '\n'.join(filtered_texts)


In [182]:
print(df5.loc[0, 'url'])

https://finnhub.io/api/news?id=4f4d4f7f51b54950341f38e269b98d4323eb962b423de0b8f18bbd4a72275c86


In [224]:
df5['news_content'] = df5['soup'].apply(parse_soup)

['(Adds detail on webpages being down)', 'By Valerie Insinna and Zeba Siddiqui', "WASHINGTON, Nov 1 (Reuters) - Boeing, one of the world's largest defense and space contractors, said on Wednesday it was investigating a cyber incident that impacted elements of its parts and distribution business and cooperating with a law enforcement probe into it.", 'Boeing acknowledged the incident days after the Lockbit cybercrime gang said on Friday it had stolen "a tremendous amount" of sensitive data from the U.S. planemaker that it would dump online if Boeing didn\'t pay ransom by Nov. 2.', "The Lockbit threat was no longer on the gang's website as of Wednesday, and it didn't immediately respond to a request for comment. Boeing declined to comment on whether Lockbit was behind the cyber incident it disclosed.", '"This issue does not affect flight safety," a Boeing spokesperson said. "We are actively investigating the incident and coordinating with law enforcement and regulatory authorities. We ar

In [None]:
# Check how many news each company publishes in total

def parse_content(df):
    for idx, row in df.iterrows():
        source = df['source']

        if source == 'Yahoo':
            pass
        elif source == 'MarketWatch':
            pass
        elif source == 'Seeking Alpha':
            pass
        elif source == 'Market News Video':
            pass
        elif source == 'Thefly.com': # FIXME: premium?
            pass
        elif source == 'Benzinga':
            pass
        elif source == 'SeekingAlpha':
            pass
        elif source == 'TipRanks':
            pass
        elif source == 'GuruFocus':
            pass
        elif source == 'Associated Press, The':
            pass
        elif source == 'Alliance News':
            pass
        elif source == 'InvestorPlace':
            pass
        elif source == 'PR Newswire':
            pass
        elif source == 'TalkMarkets':
            pass
        elif source == 'Finnhub':
            pass
        elif source == 'StockMarket':
            pass
        elif source == 'Fintel':
            pass
        elif source == 'DowJones':
            pass
        elif source == 'Stock Options Channel':
            pass
        elif source == 'ETF Channel':
            pass
        elif source == 'Business Wire':
            pass
        elif source == 'United Press International':
            pass
        elif source == '247WallSt':
            pass
        elif source == 'Preferred Stock Channel':
            pass
# Checking the mean number of news each company publishes each day

## Parse News Content

### Testing the fetch_news_content function

In [None]:
# Don't use df4
# #  Testing to fetch one line from df3
# idx = 10
# start_time = time.time()
# print(df3.loc[idx, 'url'])
# df4 = df3.loc[[idx]]
# df4['news_content'] = df4['url'].apply(fetch_news_content)
# end_time = time.time()
# min, sec = divmod(end_time - start_time, 60)
# print(f"The elapsed time for 1 url is {int(min)} minutes, {sec:.2f} seconds")
# print(df4.loc[idx, 'news_content'])

## Double Check Errors

### Check if the News Content are Successfully Fetched
- Some are blank after I try to run the url
- About 94 problematic rows

In [105]:
# Read the csv with complete news content from urls, and 
news_path = Path.joinpath(root_dir, 'data', 'proc', 'boeing_news_content_20231101_to_20240131.csv')
df6 = pd.DataFrame(pd.read_csv(news_path))



## Cleaning News Content
- Need to do it separately for each type
- Different techniques have different outputs
- Need to name the outputs based on techniques

- Notes from prof
    - here is a sample use for English: sentic.net/api/en/KEY.py?text=senticnet+is+pretty+cool

    - input text does not require any special formatting so feel free to use spaces instead of '+' or '%20'

    - ampersand, hashtag, semicolons, and braces ('&', '#', ';', '{', '}'), however, are illegal characters

    - hence, they should be replaced with colons (':') or removed entirely in the preprocessing phase

    - please note that:

    - 1) API keys are case-sensitive

    - 2) API keys will be valid for about one month

    - 3) API keys are personal and confidential

    - do not share nor use them from different devices or IP addresses

    - or else they will get terminated earlier

    - the capacity limit for our server is 8000 characters 

    - so our recommendation is to cap your input at about 1000 words

    - if you need to process bigger texts, you will have to split them into smaller parts

    - this is also a good idea in case you want to perform a finer-grained analysis of your input

    - all APIs, in fact, are designed to give you an overall judgement about the whole input

    - for more details, split your text into paragraphs or sentences and feed them to the API one by one

In [108]:
def clean_text(text):
    # Remove sentences that include "reporting by" (case insensitive)
    text = re.sub(r'\n.*?reporting by.*?\n', '', text, flags=re.IGNORECASE)

    # Remove everything starting with "\nPublished:" until the next newline
    text = re.sub(r'\nPublished:.*?\n', '', text, flags=re.IGNORECASE)

    # Remove everything including and after "\n(END)"
    text = re.sub(r'\n\(END\).*', '', text, flags=re.IGNORECASE)

    # Remove all the links
    text = re.sub(r'https?://.*?(?=\s|$)', '', text)

    # Remove the line that starts with "\nRelated Quotes"
    text = re.sub(r'\nRelated Quotes.*?\n', '', text, flags=re.IGNORECASE)

    # Remove content starting with "©" until the next newline
    # This matches the "©" symbol followed by any characters (.*?), until a newline
    text = re.sub(r'©.*?\n', '', text, flags=re.IGNORECASE)

    # To remove the specific sentences directly
    text = re.sub(r'Thank you for your patience\.\s+Our engineers are working quickly to resolve the issue\.', '', text)
    
    # Remove lines with fewer than 5 characters
    text = re.sub(r'^.{1,4}$', '', text, flags=re.MULTILINE)

    # Remove illegal characters for Sentic APIs
    text = re.sub(r'[&#;{}]', '', text)

    # Split the text into paragraphs, remove leading and trailing white spaces, and filter out empty strings
    paragraphs = [paragraph.strip() for paragraph in text.split('\n') if paragraph.strip()]
    
    if paragraphs == []:
        return np.nan
    print(paragraphs)
    return paragraphs  # Join the paragraphs back into a single string separated by newlines


In [None]:
# display(df7)
# Trying to spot the pattern for unrelated content
# for i in range(len(df7)):
#     news_txt = df7.loc[i, 'news_content']
#     display(news_txt)

df6['cln_news'] = df6['news_content'].apply(clean_text)
# Clean headline
df6['cln_hdl'] = df6['news_content'].apply(clean_text)
# Clean summary
df6['cln_smr'] = df6['news_content'].apply(clean_text)

# TODO: Clean for headline and summary
char_limit = 8000


print(df6.isna().sum())
df6.dropna(inplace=True)
print(f"Total NA values: {df6.isna().sum().sum()}")

# passages2

# Sentiment analysis
- SenticNet
- TextBlob
- nltk
- BERT

TODO:
- Check whether need to unify the results
- Not included subjectivity yet
- Check char limit

In [116]:
# number of tweets pass through API


# Use SenticNet APIs to do sentiment analysis
# pol: polarity, sub: subjectivity
def sentic_api(text, KEY):
    APIURL = 'https://sentic.net/api/' + 'en' + '/' + KEY + '.py?text='
    
    # label is the concept, polarity that is returned from API
    label = str(requests.get(APIURL + text).content)[2:-3]
    return label

def sentic_anal_pol(text_list):
    polarity_list = []
    for i in range(len(text_list)):
        polarity_cat = sentic_api(text_list[i], POLARITY_API_KEY) # Polarity category (positive, negative)
        intensity = sentic_api(text_list[i], INTENSITY_API_KEY)
        if polarity_cat == "POSITIVE":
            polarity_list.append(intensity)
        elif polarity_cat == "NEGATIVE":
            polarity_list.append(-intensity)
        elif polarity_cat == "NEUTRAL":
            polarity_list.append(0)
        else:
            print(f"Unknown polarity: {polarity_cat}")
        
        if i % 10:
            print(f"{i}/{len(text_list)} is done.")
    return(np.mean(polarity_list))

def sentic_anal_sub(text_list):
    intensity_list = []
    for i in range(len(text_list)):
        intensity = sentic_api(text_list[i], SUBJECTIVITY_API_KEY)
        intensity_list.append(intensity)
    return(np.mean(intensity_list))



In [None]:
df6['news_pol_sentic'] = df6['cleaned_news'].apply(sentic_anal_pol)
# df6['news_sub_sentic'] = df6['cleaned_news'].apply(sentic_anal_sub)
# df6['hdl_pol_sentic'] = df6['cln_hdl'].apply(sentic_anal_pol)
# df6['hdl_sub_sentic'] = df6['cln_hdl'].apply(sentic_anal_sub)
# df6['smr_pol_sentic'] = df6['cln_smr'].apply(sentic_anal_pol)
# df6['smr_sub_sentic'] = df6['cln_smr'].apply(sentic_anal_sub)

# TODO: Subjectivity

In [125]:
def blob_anal_pol(text_list):
    pol_list = []
        
    # Assuming text_list is iterable; adjust if 'cleaned_news' is a singular text item
    for txt in text_list:
        # print(txt)
        blob = TextBlob(txt)
        polarity = blob.sentiment.polarity
        pol_list.append(polarity)
        
    # Handling the case where lists might be empty to avoid ZeroDivisionError
    mean_pol = np.mean(pol_list) if pol_list else 0
        
    return mean_pol

def blob_anal_sub(text_list):
    sub_list = []
        
    # Assuming text_list is iterable; adjust if 'cleaned_news' is a singular text item
    for txt in text_list:
        # print(txt)
        blob = TextBlob(txt)
        subjectivity = blob.sentiment.subjectivity
        sub_list.append(subjectivity)
        
    # Handling the case where lists might be empty to avoid ZeroDivisionError
    mean_sub = np.mean(sub_list) if sub_list else 0
        
    return mean_sub

In [126]:
df6['news_pol_blob'] = df6['cleaned_news'].apply(blob_anal_pol)
df6['news_sub_blob'] = df6['cleaned_news'].apply(blob_anal_sub)
df6['hdl_pol_blob'] = df6['cln_hdl'].apply(blob_anal_pol)
df6['hdl_sub_blob'] = df6['cln_hdl'].apply(blob_anal_sub)
df6['smr_pol_blob'] = df6['cln_smr'].apply(blob_anal_pol)
df6['smr_sub_blob'] = df6['cln_smr'].apply(blob_anal_sub)

In [129]:
print(np.min(df6['hdl_sub_blob']))

0.0


In [130]:


# nltk.download('vader_lexicon')  # Download the VADER lexicon if not already downloaded

sentence = 'OCTOBER 2018: A Lion Air MAX plane crashes in Indonesia, killing all 189 people on board.'

sid = SentimentIntensityAnalyzer()
sentiment_scores = sid.polarity_scores(sentence)

print(sentiment_scores['compound'])

# VADER compound score: https://stackoverflow.com/questions/40325980/how-is-the-vader-compound-polarity-score-calculated-in-python-nltk

-0.6597


In [None]:

# Load the sentiment analysis pipeline with the chosen BERT model
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
sentiment_analysis = pipeline("sentiment-analysis", model=model_name)

# Example sentence
sentence = 'OCTOBER 2018: A Lion Air MAX plane crashes in Indonesia, killing all 189 people on board.'

# Get sentiment prediction
result = sentiment_analysis(sentence)

# Print the result
print(result)

# label is 1 to 5 stars. 1 is very negative, 5 is very positive
# Score is the confidence, between 0 to 1

config.json: 100%|██████████| 953/953 [00:00<00:00, 1.35MB/s]
pytorch_model.bin: 100%|██████████| 669M/669M [00:11<00:00, 59.6MB/s] 
  return self.fget.__get__(instance, owner)()
tokenizer_config.json: 100%|██████████| 39.0/39.0 [00:00<00:00, 877B/s]
vocab.txt: 100%|██████████| 872k/872k [00:00<00:00, 1.54MB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 284kB/s]


[{'label': '1 star', 'score': 0.8441336154937744}]


In [131]:
# After performing sentiment
stm_path = root_dir.joinpath('data', 'proc', 'boeing_stm_20231101_to_20240131.csv')
df6.to_csv(path_or_buf=stm_path, index=False)