In [1]:
import os
import requests
import re

import logging

from textblob import TextBlob
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer


from transformers import pipeline
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

import finnhub
from dotenv import load_dotenv
from pathlib import Path    
import numpy as np
import pandas as pd
import datetime
from bs4 import BeautifulSoup
import sys
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
sys.path.append('../') # Change the python path at runtime
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

# Self-created modules
from src.utils import path as path_yq




  from .autonotebook import tqdm as notebook_tqdm


# Set up Variables

In [2]:
# Load API keys and other secrets from a local .env file into the process environment
load_dotenv()

# os.environ.get returns None for any key that is not set
SUBJECTIVITY_API_KEY = os.environ.get('SUBJECTIVITY_API_KEY')
POLARITY_API_KEY = os.environ.get('POLARITY_API_KEY')
INTENSITY_API_KEY = os.environ.get('INTENSITY_API_KEY')
FINNHUB_API_KEY = os.environ.get('FINNHUB_API_KEY')

# Backtest window: the *_DATE form is ISO formatted, the *_STR form is used in file names
BT_START_DATE = '2023-11-01'
BT_START_STR = '20231101'
BT_END_DATE = '2024-01-31'
BT_END_STR = '20240131'

cur_dir = Path.cwd()
root_dir = path_yq.get_root_dir(cur_dir)

# basicConfig(filename=...) opens the log file immediately and raises
# FileNotFoundError when the directory is missing, so create it first.
log_dir = Path.joinpath(root_dir, 'logs')
log_dir.mkdir(parents=True, exist_ok=True)

logging.basicConfig(filename=Path.joinpath(log_dir, 'trading_system.log'),
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    level=logging.DEBUG)

# Finnhub API

- Finnhub API
https://finnhub.io/docs/api/company-news

## Fetch News Data

The `datetime` field is a UNIX timestamp (seconds since the epoch), e.g. 1706743226.
Reference: https://note.nkmk.me/en/python-unix-time-datetime/

TODO: Need to remove similar news (might not be the same title)

- 2024-02-04: API down 

- 765 rows after removing image column, dropna and only get yahoo



In [None]:
# CSV of trading dates; its 'Date' column is iterated by the news download loop
dates_path = root_dir.joinpath('data', 'raw', 'trading_dates.csv')
dates = pd.read_csv(dates_path, index_col=False)

In [None]:
finnhub_client = finnhub.Client(api_key=FINNHUB_API_KEY)

# print(finnhub_client.general_news('general', min_id=0))

# Need to use _from instead of from to avoid conflict (`from` is a reserved word in Python)

# Cannot fetch all days together. Need to fetch one by one, but remember the rate limit

tar_dir = Path.joinpath(root_dir, 'data', 'raw', 'boeing_news')
tar_dir.mkdir(parents=True, exist_ok=True)

# Collect the per-day frames and concatenate once after the loop; calling
# pd.concat inside the loop re-copies every accumulated row on each iteration.
daily_frames = []

# 61 dates
for date in dates['Date']:
    print(date)
    try:
        resp = finnhub_client.company_news('BA', _from=date, to=date)
        day_news = pd.DataFrame(resp)
        if not day_news.empty:
            daily_frames.append(day_news)

            # Also keep one CSV per day next to the combined file
            csv_path = Path.joinpath(tar_dir, f'{date}.csv')
            day_news.to_csv(csv_path, index=False)

            print(f"Saved news for {date}")
        else:
            print(f"No news for {date}")
    except Exception as e:
        # Best effort: report the failed day and carry on with the next one
        print(f"Error fetching news for {date}: {e}")
    finally:
        # Respect the API's rate limit, including after a failed request
        time.sleep(2)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was fetched
all_news = pd.concat(daily_frames, axis=0) if daily_frames else pd.DataFrame()

# Save the combined news DataFrame
all_news_path = Path.joinpath(tar_dir.parent, f'BA_API_{BT_START_STR}_{BT_END_STR}.csv')
all_news.to_csv(all_news_path, index=False)
print("All news data fetched and saved.")


# TODO: Explore
# Company Peers
# print(finnhub_client.company_peers('AAPL'))

# Simple API Data Cleaning

## Data Exploration, Feature Engineering

- Sort based on datetime

In [3]:
# Reload the combined API response and order the articles by their `datetime` value
resp_path = root_dir.joinpath('data', 'raw', f'BA_API_{BT_START_STR}_{BT_END_STR}.csv')
df = pd.read_csv(resp_path).sort_values('datetime').reset_index(drop=True)
display(df)



Unnamed: 0,category,datetime,headline,id,image,related,source,summary,url
0,company,1698803700,Spirit AeroSystems rises to 3-month high on be...,123548289,,BA,Seeking Alpha,Looking for stock market analysis and research...,https://finnhub.io/api/news?id=a0fd2f98c3ba18b...
1,company,1698808020,Market Today: Goldman Sachs Reveals U.S. Convi...,123548596,,BA,GuruFocus,Looking for stock market analysis and research...,https://finnhub.io/api/news?id=7aa47e4fe31aae6...
2,company,1698812100,Boeing upgraded to Conviction Buy from Buy at ...,123549161,,BA,Thefly.com,Looking for stock market analysis and research...,https://finnhub.io/api/news?id=8bf7a9cb8674b4b...
3,company,1698813120,Wall Street Analysts Are Bullish on Top Indust...,123549160,,BA,TipRanks,Looking for stock market analysis and research...,https://finnhub.io/api/news?id=a0605cdf1e54e30...
4,company,1698814800,Top Defensive Stocks For Turbulent Times,123542215,https://static.seekingalpha.com/cdn/s3/uploads...,BA,SeekingAlpha,Investors are searching for safe haven industr...,https://finnhub.io/api/news?id=fe99f746625a4e7...
...,...,...,...,...,...,...,...,...,...
2115,company,1706649056,Boeing Seen Narrowing Q4 Loss Amid 737 Max Gro...,125415682,https://media.zenfs.com/en/ibd.com/9b23f2f0e09...,BA,Yahoo,Dow Jones giant Boeing reports Q4 results earl...,https://finnhub.io/api/news?id=17a69562336b7cb...
2116,company,1706653428,"Hawaiian Airlines ekes out Q4 revenue beat, ea...",125415680,https://s.yimg.com/ny/api/res/1.2/0SHY9n5yQb8c...,BA,Yahoo,Hawaiian Holdings (HA) — the parent company of...,https://finnhub.io/api/news?id=6ee3d0bf3808121...
2117,company,1706654340,Boeing’s Earnings Are Coming. Investors Are Wa...,125415679,,BA,Yahoo,The list of points to watch when the jet maker...,https://finnhub.io/api/news?id=0a10d35ba5fd463...
2118,company,1706655823,Boeing was once known for safety and engineeri...,125417521,https://media.cnn.com/api/v1/images/stellar/pr...,BA,Yahoo,Part of the fuselage blowing off shortly after...,https://finnhub.io/api/news?id=a947077acf5e67a...


In [4]:
# General information: summary statistics for every column
# (include='all' also reports count/unique/top/freq for the non-numeric columns)
print(df.describe(include='all'))


       category      datetime                                     headline  \
count      2120  2.120000e+03                                         2120   
unique        1           NaN                                         1992   
top     company           NaN  What You Missed On Wall Street This Morning   
freq       2120           NaN                                           17   
mean        NaN  1.703565e+09                                          NaN   
std         NaN  2.322037e+06                                          NaN   
min         NaN  1.698804e+09                                          NaN   
25%         NaN  1.701427e+09                                          NaN   
50%         NaN  1.704498e+09                                          NaN   
75%         NaN  1.705546e+09                                          NaN   
max         NaN  1.706658e+09                                          NaN   

                  id                                           

In [7]:
# Count the missing values in each column
na_per_column = df.isna().sum(axis=0)
print(na_per_column)

# Parse the UNIX-epoch seconds in `datetime` into pandas timestamps
df['datetime2'] = pd.to_datetime(df['datetime'], unit="s")

# Show the rows that have no summary text
df.loc[df['summary'].isna(), :]


category        0
datetime        0
headline        0
id              0
image        1196
related         0
source          0
summary        11
url             0
datetime2       0
dtype: int64


Unnamed: 0,category,datetime,headline,id,image,related,source,summary,url,datetime2
329,company,1700479008,Deutsche Bank Upgrades Boeing (BA),123991449,,BA,Fintel,,https://finnhub.io/api/news?id=97d67e03c4787d7...,2023-11-20 11:16:48
450,company,1701177335,RBC Capital Upgrades Boeing (BA),124139945,,BA,Fintel,,https://finnhub.io/api/news?id=6a765cba60e0c95...,2023-11-28 13:15:35
534,company,1701436102,Stifel Initiates Coverage of Boeing (BA) with ...,124222334,,BA,Fintel,,https://finnhub.io/api/news?id=508f1cee94cf1a0...,2023-12-01 13:08:22
727,company,1702415986,William Blair Initiates Coverage of Boeing (BA...,124467956,,BA,Fintel,,https://finnhub.io/api/news?id=1e22f078b43aa81...,2023-12-12 21:19:46
1258,company,1704874800,My Dividend Stock Portfolio: New December Divi...,124963549,,BA,SeekingAlpha,,https://finnhub.io/api/news?id=c9e29c89a81ddd0...,2024-01-10 08:20:00
1531,company,1705446325,Wells Fargo Downgrades Boeing (BA),125091559,,BA,Fintel,,https://finnhub.io/api/news?id=fcd7a145e602912...,2024-01-16 23:05:25
1923,company,1706186105,United Airlines: A Bright Future Despite Boein...,125311137,,BA,SeekingAlpha,,https://finnhub.io/api/news?id=3d0a145860b5853...,2024-01-25 12:35:05
1932,company,1706199660,NEW YORK MARKET CLOSE: GDP and easing inflatio...,125326823,,BA,Alliance News,,https://finnhub.io/api/news?id=5df32569aba0983...,2024-01-25 16:21:00
1953,company,1706246222,B of A Securities Downgrades Boeing (BA),125315964,,BA,Fintel,,https://finnhub.io/api/news?id=ad83d971b24c01c...,2024-01-26 05:17:02
1999,company,1706510801,Wall Street Breakfast Podcast: United Airlines...,125373285,,BA,SeekingAlpha,,https://finnhub.io/api/news?id=f8a6021d052f26e...,2024-01-29 06:46:41


## Data Cleaning

- Articles with source "Finnhub" are other outlets' news (Finnhub does not produce news itself)
- MarketWatch provides premium news
- Other sources seem to lead to MarketWatch

Yahoo                         765
Finnhub                       271
MarketWatch                   188
Thefly.com                    167
Seeking Alpha                 134
Benzinga                      129
TipRanks                      116

- Yahoo consolidates news from others (Reuters, etc.)

In [8]:
# Remove the `image` column, then every row that still has a missing value
df2 = df.drop(columns=['image']).dropna()

# Number of remaining articles per source, largest first
source_counts = df2.groupby('source')['source'].count().sort_values(ascending=False)
print(source_counts)

# Keep only the Yahoo articles and renumber the rows from 0
df2 = df2[df2['source'] == 'Yahoo'].reset_index(drop=True)

df2

source
Yahoo                         765
Finnhub                       271
MarketWatch                   188
Thefly.com                    167
Seeking Alpha                 134
Benzinga                      129
TipRanks                      116
Alliance News                  78
GuruFocus                      64
SeekingAlpha                   58
InvestorPlace                  42
TalkMarkets                    26
Associated Press, The          22
Market News Video              10
DowJones                        9
United Press International      9
Stock Options Channel           7
247WallSt                       5
PR Newswire                     4
ETF Channel                     2
StockMarket                     1
Business Wire                   1
Preferred Stock Channel         1
Name: source, dtype: int64


Unnamed: 0,category,datetime,headline,id,related,source,summary,url,datetime2
0,company,1698817191,"Ford, GM bumped to buy; Boeing gets 2 upgrades...",123559928,BA,Yahoo,Goldman Sachs upgraded Simon Property Group (N...,https://finnhub.io/api/news?id=edb2b1970709b9d...,2023-11-01 05:39:51
1,company,1698837180,"Stocks slip lower, Fed preview, AMD slides, CV...",123567214,BA,Yahoo,Editor's note: This will be the final issue of...,https://finnhub.io/api/news?id=f457fb892d3c699...,2023-11-01 11:13:00
2,company,1698838200,He Left Boeing. Now He’s the Jet Maker’s Most ...,123544220,BA,Yahoo,Pat Shanahan is taking over Spirit AeroSystems...,https://finnhub.io/api/news?id=0734bb1d9a76a47...,2023-11-01 11:30:00
3,company,1698838746,UPDATE 2-Spirit Aero cuts 737 fuselage deliver...,123544219,BA,Yahoo,Spirit AeroSystems on Wednesday announced $101...,https://finnhub.io/api/news?id=9a5fbbaec04442a...,2023-11-01 11:39:06
4,company,1698838790,Spirit Aero cuts 737 fuselage delivery forecast,123544218,BA,Yahoo,(Reuters) -Spirit AeroSystems on Wednesday ann...,https://finnhub.io/api/news?id=f9d73de255cf351...,2023-11-01 11:39:50
...,...,...,...,...,...,...,...,...,...
760,company,1706649056,Boeing Seen Narrowing Q4 Loss Amid 737 Max Gro...,125415682,BA,Yahoo,Dow Jones giant Boeing reports Q4 results earl...,https://finnhub.io/api/news?id=17a69562336b7cb...,2024-01-30 21:10:56
761,company,1706653428,"Hawaiian Airlines ekes out Q4 revenue beat, ea...",125415680,BA,Yahoo,Hawaiian Holdings (HA) — the parent company of...,https://finnhub.io/api/news?id=6ee3d0bf3808121...,2024-01-30 22:23:48
762,company,1706654340,Boeing’s Earnings Are Coming. Investors Are Wa...,125415679,BA,Yahoo,The list of points to watch when the jet maker...,https://finnhub.io/api/news?id=0a10d35ba5fd463...,2024-01-30 22:39:00
763,company,1706655823,Boeing was once known for safety and engineeri...,125417521,BA,Yahoo,Part of the fuselage blowing off shortly after...,https://finnhub.io/api/news?id=a947077acf5e67a...,2024-01-30 23:03:43


In [9]:
# Confirm that no missing values remain after the cleaning step
remaining_na = df2.isna().sum()
print(remaining_na)
print('\n')

# Count the distinct calendar days covered by the remaining articles
unique_days = df2['datetime2'].dt.strftime('%Y-%m-%d').unique()
print(f"Number of unique dates: {len(unique_days)}")


category     0
datetime     0
headline     0
id           0
related      0
source       0
summary      0
url          0
datetime2    0
dtype: int64


Number of unique dates: 60


In [13]:
# How many article ids repeat an earlier id
duplicate_id_count = df2['id'].duplicated().sum()
print(duplicate_id_count)

# List the distinct values of the categorical columns
for column in ('category', 'source', 'related'):
    distinct_values = df2[column].unique()
    print(f"Unique values in '{column}':")
    print(distinct_values)

0
Unique values in 'category':
['company']
Unique values in 'source':
['Yahoo']
Unique values in 'related':
['BA']


## Remove similar news

- 20240210 update
    - Similar news and similar sentiment are hard to define (subjective)
    - For example, if the market is full of HFT players, they might not react again to an updated version of an already-published story, to a similar story published by another outlet, to a summary of it, etc.
    

Many summaries contain boilerplate like this: "Looking for stock market analysis and research with proves results? Zacks.com offers in-depth financial research with over 30years of proven results."
TODO: Find out why

- Finnhub does not produce news themselves
company,1706227967,Boeing production woes 'will get resolved' says major customer Avolon,125318794,BA,Yahoo,"Boeing's production difficulties will ""get resolved"", its 737 MAX 10 aircraft will get certified and the impact of U.S. regulators freezing a planned production ramp-up will be minimal this year, the head of major customer Avolon said.  In the medium term, however, tighter regulation will cause delays and it could take until the end of the decade for the industry to make up the production shortfall experienced since the start of the COVID-19 pandemic, Avolon CEO Andy Cronin said in an interview.  The Federal Aviation Administration (FAA) on Wednesday froze planned increases in production of the 737 MAX following the blow-out of a panel on an Alaska Airlines Boeing jet, raising concerns for airlines and suppliers worldwide.",https://finnhub.io/api/news?id=0f7f8a1d89f6a7d99c2de28813eaec78a35a938d1f56818a743a3195255a5e4c,2024-01-26 00:12:47


company,1706209260,Boeing production woes 'will get resolved' says major customer Avolon,125312655,BA,Finnhub,"Boeing's productiondifficulties will get resolved, its 737 MAX 10 aircraft willget certified and the impact of U.S. regulators freezing aplanned production ramp-up will be minimal this year, the...",https://finnhub.io/api/news?id=7801ccd5fe582f7d41e91ed1ef17d277a7c9ae3c9617b15241583c45785dedf1,2024-01-25 19:01:00

- 2024-02-06
    - Dropped 86 rows with similar summaries.
    - Only used Yahoo
    - Left with 679 rows

In [19]:
# pd.to_datetime(1706519791 - 1706512579, unit='s')

def find_similar_news(df, col, threshold=0.6, period=86400, time_col='datetime', verbose=True):
    """
    Find news rows whose text is similar to an earlier row published close in time.

    Every pair of rows (i, j), with i before j in row order, whose TF-IDF cosine
    similarity on `col` is strictly greater than `threshold` counts as similar.
    When the two rows' `time_col` values are also less than `period` apart,
    row j (the later one in row order) is flagged as a duplicate.

    The df needs to be sorted based on the datetime so that the row kept from
    each pair is the earlier publication. datetime is in UNIX.

    Parameters:
    - df (pd.DataFrame): News items; may have any index.
    - col (str): Text column to compare.
    - threshold (float): Similarity above which two rows are considered similar.
    - period (int): Maximum gap between two `time_col` values (86400 s = 1 day).
    - time_col (str): Column holding the publication time.
    - verbose (bool): Print both texts of every similar pair when True.

    Returns:
    - list: Index labels of the flagged rows in row order, usable with
      `df.drop(index=...)`. For a default RangeIndex these equal the row positions.
    """
    # TfidfVectorizer cannot be fitted on zero documents; nothing to compare anyway
    if df.empty:
        print("Number of duplications: 0")
        return []

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df[col])

    # Compute cosine similarity matrix
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # The similarity matrix is addressed by row position, so read the frame
    # positionally as well instead of treating positions as index labels.
    texts = df[col].to_numpy()
    timestamps = df[time_col].to_numpy()

    # Strict upper triangle: every unordered pair once with i < j, in the same
    # row-major order as a nested `for i ... for j in range(i + 1, n)` loop.
    rows, cols = np.triu_indices(cosine_sim.shape[0], k=1)
    is_similar = cosine_sim[rows, cols] > threshold
    similar_pairs = zip(rows[is_similar].tolist(), cols[is_similar].tolist())

    duplicated_pos = set()
    for i, j in similar_pairs:
        # Published less than `period` apart
        within_period = abs(timestamps[i] - timestamps[j]) < period
        if within_period:
            duplicated_pos.add(j)

        if verbose:
            print("--------------------------")
            if within_period:
                print(f"Published within the period")
            print("--------Part 1--------")
            print(texts[i], '\n')
            print("--------Part 2--------")
            print(texts[j])
            print('\n')

    # Translate row positions back into index labels for the caller
    duplicated_list = df.index[sorted(duplicated_pos)].tolist()
    print(f"Number of duplications: {len(duplicated_list)}")
    return duplicated_list



In [15]:
# Preview the `summary` column
df2.summary


0      Goldman Sachs upgraded Simon Property Group (N...
1      Editor's note: This will be the final issue of...
2      Pat Shanahan is taking over Spirit AeroSystems...
3      Spirit AeroSystems on Wednesday announced $101...
4      (Reuters) -Spirit AeroSystems on Wednesday ann...
                             ...                        
760    Dow Jones giant Boeing reports Q4 results earl...
761    Hawaiian Holdings (HA) — the parent company of...
762    The list of points to watch when the jet maker...
763    Part of the fuselage blowing off shortly after...
764    Ryanair will take as many Boeing MAX 10 aircra...
Name: summary, Length: 765, dtype: object

In [23]:
# Flag rows whose summary has cosine similarity > 0.6 with an earlier row
# and whose `datetime` is less than 86400 seconds away from it
duplicated_list = find_similar_news(df=df2, col='summary', threshold=0.6, period=86400)

--------------------------
Published within the period
--------Part 1--------
Spirit AeroSystems on Wednesday announced $101 million in forward losses on key Boeing and Airbus aircraft production and cut its full-year forecast for 737 fuselage deliveries as it grapples with a production defect.  The aerospace supplier took a $47.3 million charge on the Boeing 787 Dreamliner program and a $22.7 million loss on the Airbus A350 program due to higher supply chain and labor costs.  Spirit now expects to deliver 345 to 360 narrowbody 737 fuselages to Boeing in 2023, compared with its prior forecast of 370 to 390 units. 

--------Part 2--------
(Reuters) -Spirit AeroSystems on Wednesday announced $101 million in forward losses on key Boeing and Airbus aircraft production and cut its full-year forecast for 737 fuselage deliveries as it grapples with a production defect.  The aerospace supplier took a $47.3 million charge on the Boeing 787 Dreamliner program and a $22.7 million loss on the Airb

In [24]:
# Drop the rows identified to be similar (index= already selects the row axis)
df3 = df2.drop(index=duplicated_list).reset_index(drop=True)
print(f"Dropped {len(duplicated_list)} rows with similar summaries.")

# Removed similar news
dedup_path = Path.joinpath(root_dir, 'data', 'proc', f'BA_dedup_{BT_START_STR}_{BT_END_STR}.csv')
# to_csv does not create missing directories
dedup_path.parent.mkdir(parents=True, exist_ok=True)
df3.to_csv(dedup_path, index=False)

Dropped 86 rows with similar summaries.


# News Content


## Fetch News Content

- outputs/error_df.txt
    - Records the errors for Yahoo consent and from fool.com
    - 7 requires [consent](https://consent.yahoo.com)
    - 10 is [Fool](https://www.fool.com)
    - 62 routes to https://sg.yahoo.com/?p=us (link not found)
    - 79 errors in total, which tallies with the count reported by find_error()

    The scrape needs at least 3 hours to run.

In [29]:
# More robust way to fetch news for a df
# Need to think of how to "add in" the content for rows with problem (additional iterations)
def _fetch_article_sel(url, index, service, options, timeout=20):
    """
    Load one URL in a fresh Chrome session and extract its <article> element.

    Parameters:
    - url (str): Address to open.
    - index: Row label, used only in the progress messages.
    - service, options: Selenium Service / Options passed to webdriver.Chrome.
    - timeout (int): Page-load and script timeout in seconds.

    Returns:
    - tuple (content, final_url): `content` is the <article> HTML as a string,
      "Article tag not found" when the page has no <article>, or "Error" when
      the page source could not be read. `final_url` is the browser URL after
      redirects, or None when it could not be read.
    """
    driver = webdriver.Chrome(service=service, options=options)
    content = "Error"
    final_url = None
    try:
        driver.set_page_load_timeout(timeout)  # Adjust timeout as necessary
        driver.set_script_timeout(timeout)  # Solves the issue for unknown errors

        try:
            driver.get(url)
        except Exception:
            # Typically a page-load timeout; whatever has loaded so far is still parsed below
            print(f"Attempting to stop for index, url: {index}, {url}")

        try:
            # Stop any pending loading, then read whatever the browser currently holds
            driver.execute_script("window.stop();")
            soup = BeautifulSoup(driver.page_source, 'html.parser')
        except Exception:
            # Without this reset a failure here would reuse a stale or undefined `soup`
            soup = None
            print(f"Soup has problem for index, url: {index}, {url}")

        if soup is not None:
            article_content = soup.find('article')
            if article_content:
                print(f"Article content found for index, url: {index}, {url}.")
                content = str(article_content)
            else:
                content = "Article tag not found"
                print(f"Cannot find article content with tag for index, url: {index}, {url}")

        try:
            final_url = driver.current_url  # Capture the final URL after any redirects
        except Exception:
            print(f"Cannot read final url for index, url: {index}, {url}")
    finally:
        # Always release the browser, even when something above raised
        driver.quit()

    return content, final_url


def scrape_news_sel(df, col):
    """
    Fetches news content for each URL in the DataFrame and updates/adds a column with the fetched content.
    Finds the article tag only.

    Rows are fetched only when their current `col` value is missing, is one of
    'Timeout' / 'Error' / '', or contains a known error text, and their
    'final_url' does not point to the sg.yahoo.com landing page. All other rows
    keep their existing values, so the function can be called repeatedly on its
    own output to retry the failed rows.

    The DataFrame is modified in place (columns `col` and 'final_url') and is
    written to data/proc/BA_{col}_sel_{BT_START_STR}_{BT_END_STR}.csv.

    Parameters:
    - df (pd.DataFrame): DataFrame containing news URLs in a 'url' column.
    - col (str): Column name where the fetched content will be stored.

    Returns:
    - pd.DataFrame: The same DataFrame object that was passed in.
    """
    # Check if the column exists, if not, initialize it
    if col not in df.columns:
        df[col] = pd.NA  # Using pandas NA for missing data

    content_list = []
    final_url_list = []

    # Set up save directory
    cur_dir = Path.cwd()
    tar_dir = Path.joinpath(path_yq.get_root_dir(cur_dir=cur_dir), 'data', 'proc')

    # Patterns for error detection
    pattern = r'Thank you for your patience|Article tag not found'
    pattern2 = r'sg\.yahoo\.com/\?p=us'
    failed_markers = ['Timeout', 'Error', '']

    options = Options()
    # TODO / NOTE(review): recent Selenium 4 releases appear to have removed the
    # `headless` setter, in which case this line has no effect and
    # options.add_argument('--headless=new') would be needed - confirm against the installed version.
    options.headless = True
    # Add these arguments to try solving unknown exceptions
    options.add_argument('--disable-extensions')  # Disable extensions
    options.add_argument('--disable-gpu')  # Disable GPU hardware acceleration
    options.add_argument('--no-sandbox')  # Bypass OS security model

    service = Service(log_path='chromedriver.log', enable_verbose_logging=True)

    # `position` counts processed rows independently of the index labels
    for position, (index, row) in enumerate(df.iterrows(), start=1):
        content = row.get(col, None)
        final_url = row.get('final_url', '')

        if pd.isna(content):
            needs_fetch = True
        else:
            # str() keeps re.search from raising on a non-string cell
            needs_fetch = content in failed_markers or re.search(pattern, str(content)) is not None

        # A missing final_url (NaN after a CSV round trip) is not a known dead link
        is_dead_link = isinstance(final_url, str) and re.search(pattern2, final_url) is not None

        # Don't run for those with pattern2
        if needs_fetch and not is_dead_link:
            content, fetched_url = _fetch_article_sel(row['url'], index, service, options)
            if fetched_url is not None:
                final_url = fetched_url
            print(f"Final url: {final_url}")
            time.sleep(1)  # Delay to avoid server blocking; only needed after a real request

        content_list.append(content)
        final_url_list.append(final_url)

        if position % 10 == 0:
            print(f"{position} urls have been processed.")
        # TODO: periodic partial saves (every 10 articles) were sketched here but never enabled

    # Update the DataFrame with all the fetched content
    df[col] = pd.Series(content_list, index=df.index)
    df['final_url'] = pd.Series(final_url_list, index=df.index)

    # Save the final DataFrame to CSV (to_csv does not create missing directories)
    tar_dir.mkdir(parents=True, exist_ok=True)
    full_file_path = Path.joinpath(tar_dir, f'BA_{col}_sel_{BT_START_STR}_{BT_END_STR}.csv')
    df.to_csv(full_file_path, index=False)

    return df


In [26]:
# Reload the de-duplicated news; build the file name from the same backtest-window
# constants that were used when the file was written, instead of hard-coding the dates
dedup_path = Path.joinpath(root_dir, 'data', 'proc', f'BA_dedup_{BT_START_STR}_{BT_END_STR}.csv')
df3 = pd.read_csv(dedup_path, index_col=False)

In [30]:

# Work on a deep copy so the frame starts without any previously scraped 'soup' column
df4 = df3.copy(deep=True)

# Three passes over the same frame: scrape_news_sel() only re-fetches rows whose
# 'soup' value is missing or marks a failure, so later passes retry the error rows.
# NOTE: find_error() is defined in a later cell ("Find Errors, Drop NA"); on a
# fresh kernel that cell has to be run before this one.
for _attempt in range(3):
    df4 = scrape_news_sel(df4, 'soup')
    error_df = find_error(df4, 'soup')
    n_errors, n_total = len(error_df), len(df4)
    print(f"The number of errors out of total is: {n_errors} / {n_total}")


Article content found for index, url: 0, https://finnhub.io/api/news?id=edb2b1970709b9d5f6026e03051cd5aea646205840ae9770a1f4bb974b7f8af5.
Final url: https://finance.yahoo.com/news/ford-gm-bumped-buy-boeing-053951435.html
Cannot find article content with tag for index, url: 1, https://finnhub.io/api/news?id=f457fb892d3c699358c15597655d0bca1f10a0e14b1f3cfaedd707936557a9ad
Final url: https://sg.yahoo.com/?p=us
Cannot find article content with tag for index, url: 2, https://finnhub.io/api/news?id=0734bb1d9a76a471ef05289ff43a082abcc0e54b571fc50323914cec74f7e3ec
Final url: https://sg.yahoo.com/?p=us
Article content found for index, url: 3, https://finnhub.io/api/news?id=9a5fbbaec04442abbac8db523ffaf4600f79942b7012f81e93f4de4df3733df0.
Final url: https://finance.yahoo.com/news/1-spirit-aero-cuts-737-113906164.html
Article content found for index, url: 4, https://finnhub.io/api/news?id=ca6571146b46d493b46e8dc2780853a854a82da54a8f354762785042b55a0a50.
Final url: https://finance.yahoo.com/news/c

Unnamed: 0,category,datetime,headline,id,related,source,summary,url,datetime2,soup,final_url
1,company,1698837180,"Stocks slip lower, Fed preview, AMD slides, CV...",123567214,BA,Yahoo,Editor's note: This will be the final issue of...,https://finnhub.io/api/news?id=f457fb892d3c699...,2023-11-01 11:13:00,Article tag not found,https://sg.yahoo.com/?p=us
2,company,1698838200,He Left Boeing. Now He’s the Jet Maker’s Most ...,123544220,BA,Yahoo,Pat Shanahan is taking over Spirit AeroSystems...,https://finnhub.io/api/news?id=0734bb1d9a76a47...,2023-11-01 11:30:00,Article tag not found,https://sg.yahoo.com/?p=us
6,company,1698852540,SpaceX Chief Elon Musk calls out his strugglin...,123567209,BA,Yahoo,One of Elon Musk's secondary goals with SpaceX...,https://finnhub.io/api/news?id=1c4a165f7274f89...,2023-11-01 15:29:00,Article tag not found,https://sg.yahoo.com/?p=us
7,company,1698857220,Boeing shares higher as Goldman Sachs adds pla...,123567208,BA,Yahoo,Longer-term aircraft demand remains a more pow...,https://finnhub.io/api/news?id=bdae52e4e5ec724...,2023-11-01 16:47:00,Article tag not found,https://sg.yahoo.com/?p=us
18,company,1699280922,"Ryanair CEO O'Leary on Boeing, Fares and Dividend",123664539,BA,Yahoo,"""We are committed to Boeing,"" Ryanair CEO Mich...",https://finnhub.io/api/news?id=1ceb2c9b5ddaf31...,2023-11-06 14:28:42,Article tag not found,https://sg.yahoo.com/?p=us


The number of errors out of total is: 81 / 679
10 urls have been processed.
20 urls have been processed.
30 urls have been processed.
Attempting to stop for index, url: 33, https://finnhub.io/api/news?id=7d855b570001766818a52e557469b6b8b2946f1bfd76862e665d3d33de596a82
Article content found for index, url: 33, https://finnhub.io/api/news?id=7d855b570001766818a52e557469b6b8b2946f1bfd76862e665d3d33de596a82.
Final url: https://finance.yahoo.com/news/boeing-data-published-lockbit-hacking-134722306.html
Attempting to stop for index, url: 34, https://finnhub.io/api/news?id=000b8c6d971b2b5744913f5869fd6998438077130cfde86f43ed1add477946bc
Article content found for index, url: 34, https://finnhub.io/api/news?id=000b8c6d971b2b5744913f5869fd6998438077130cfde86f43ed1add477946bc.
Final url: https://finance.yahoo.com/news/dubai-showcase-demand-jets-under-175243081.html
40 urls have been processed.
50 urls have been processed.
60 urls have been processed.
70 urls have been processed.
80 urls have been

Unnamed: 0,category,datetime,headline,id,related,source,summary,url,datetime2,soup,final_url
1,company,1698837180,"Stocks slip lower, Fed preview, AMD slides, CV...",123567214,BA,Yahoo,Editor's note: This will be the final issue of...,https://finnhub.io/api/news?id=f457fb892d3c699...,2023-11-01 11:13:00,Article tag not found,https://sg.yahoo.com/?p=us
2,company,1698838200,He Left Boeing. Now He’s the Jet Maker’s Most ...,123544220,BA,Yahoo,Pat Shanahan is taking over Spirit AeroSystems...,https://finnhub.io/api/news?id=0734bb1d9a76a47...,2023-11-01 11:30:00,Article tag not found,https://sg.yahoo.com/?p=us
6,company,1698852540,SpaceX Chief Elon Musk calls out his strugglin...,123567209,BA,Yahoo,One of Elon Musk's secondary goals with SpaceX...,https://finnhub.io/api/news?id=1c4a165f7274f89...,2023-11-01 15:29:00,Article tag not found,https://sg.yahoo.com/?p=us
7,company,1698857220,Boeing shares higher as Goldman Sachs adds pla...,123567208,BA,Yahoo,Longer-term aircraft demand remains a more pow...,https://finnhub.io/api/news?id=bdae52e4e5ec724...,2023-11-01 16:47:00,Article tag not found,https://sg.yahoo.com/?p=us
18,company,1699280922,"Ryanair CEO O'Leary on Boeing, Fares and Dividend",123664539,BA,Yahoo,"""We are committed to Boeing,"" Ryanair CEO Mich...",https://finnhub.io/api/news?id=1ceb2c9b5ddaf31...,2023-11-06 14:28:42,Article tag not found,https://sg.yahoo.com/?p=us


The number of errors out of total is: 79 / 679
10 urls have been processed.
20 urls have been processed.
30 urls have been processed.
40 urls have been processed.
50 urls have been processed.
60 urls have been processed.
70 urls have been processed.
80 urls have been processed.
90 urls have been processed.
100 urls have been processed.
110 urls have been processed.
120 urls have been processed.
130 urls have been processed.
140 urls have been processed.
150 urls have been processed.
160 urls have been processed.
170 urls have been processed.
180 urls have been processed.
190 urls have been processed.
200 urls have been processed.
210 urls have been processed.
220 urls have been processed.
230 urls have been processed.
240 urls have been processed.
250 urls have been processed.
260 urls have been processed.
270 urls have been processed.
280 urls have been processed.
290 urls have been processed.
300 urls have been processed.
310 urls have been processed.
Cannot find article content with

Unnamed: 0,category,datetime,headline,id,related,source,summary,url,datetime2,soup,final_url
1,company,1698837180,"Stocks slip lower, Fed preview, AMD slides, CV...",123567214,BA,Yahoo,Editor's note: This will be the final issue of...,https://finnhub.io/api/news?id=f457fb892d3c699...,2023-11-01 11:13:00,Article tag not found,https://sg.yahoo.com/?p=us
2,company,1698838200,He Left Boeing. Now He’s the Jet Maker’s Most ...,123544220,BA,Yahoo,Pat Shanahan is taking over Spirit AeroSystems...,https://finnhub.io/api/news?id=0734bb1d9a76a47...,2023-11-01 11:30:00,Article tag not found,https://sg.yahoo.com/?p=us
6,company,1698852540,SpaceX Chief Elon Musk calls out his strugglin...,123567209,BA,Yahoo,One of Elon Musk's secondary goals with SpaceX...,https://finnhub.io/api/news?id=1c4a165f7274f89...,2023-11-01 15:29:00,Article tag not found,https://sg.yahoo.com/?p=us
7,company,1698857220,Boeing shares higher as Goldman Sachs adds pla...,123567208,BA,Yahoo,Longer-term aircraft demand remains a more pow...,https://finnhub.io/api/news?id=bdae52e4e5ec724...,2023-11-01 16:47:00,Article tag not found,https://sg.yahoo.com/?p=us
18,company,1699280922,"Ryanair CEO O'Leary on Boeing, Fares and Dividend",123664539,BA,Yahoo,"""We are committed to Boeing,"" Ryanair CEO Mich...",https://finnhub.io/api/news?id=1ceb2c9b5ddaf31...,2023-11-06 14:28:42,Article tag not found,https://sg.yahoo.com/?p=us


The number of errors out of total is: 79 / 679


In [32]:
# Reload the scraped articles; the name mirrors the file written by
# scrape_news_sel(df, 'soup') rather than hard-coding the backtest dates
soup_sel_path = Path.joinpath(root_dir, 'data', 'proc', f'BA_soup_sel_{BT_START_STR}_{BT_END_STR}.csv')
df4 = pd.read_csv(soup_sel_path, index_col=False)

In [None]:
# def scrape_news_bs4(df, col):
#     """
#     This function is outdated.
#     Fetches news content for each URL in the DataFrame and updates/adds a column with the fetched content.
#     This uses beautiful soup to fetch the entire HTML content, which is very long and time-consuming.
#     The scrape_news_sel() checks for article tag only (and uses ways to manage the missing tag).
#     """
#     # Check if the column exists, if not, initialize it
        
#     # Create an empty list to store news content
#     content_list = []

#     session = requests.Session()  # Use session for connection pooling

#     # Set up save directory
#     cur_dir = Path.cwd()
#     tar_dir = Path.joinpath(path_yq.get_root_dir(cur_dir=cur_dir), 'data', 'proc')
    
#     pattern = r'Thank you for your patience'

#     for index, row in df.iterrows():
#         try:# Count rows containing the pattern
#             content = row.get(col, None)
#             if pd.isna(content) or content in ['Timeout', 'Error', ''] or re.search(pattern, content):
#                 response = session.get(row['url'], timeout=5)
#                 soup = BeautifulSoup(response.content, 'html.parser')
#                 # print(soup)
#                 content_list.append(str(soup))  # Convert soup object to string to store in DataFrame
#             else:
#                 content_list.append(row[col])
#         except TimeoutException:
#             print(f"Timeout occurred for index, url: {index}, {row['url']}")
#             content_list.append("Timeout")
#         except Exception as e:
#             print(f"Error fetching content from {row['url']}: {e}")
#             content_list.append("Error")
        
#         # Save periodically
#         # if index % 10 == 0:  # For example, save every 10 articles
#         #     df.loc[:index, col] = pd.Series(content_list)
#         #     partial_file_path = Path.joinpath(tar_dir, f'BA_partial_{col}_{BT_START_STR}_{BT_END_STR}.csv')
#         #     df.to_csv(partial_file_path, index=False)
        
#         time.sleep(1)  # Delay to avoid server blocking
    
#     # print(content_list)
#     # Update the DataFrame with all the fetched content
#     df[col] = pd.Series(content_list, index=df.index)
#     # print(df)

#     # full_file_path = Path.joinpath(tar_dir,  f'BA_{col}_bs4_{BT_START_STR}_{BT_END_STR}.csv')
#     # Save the final DataFrame to CSV
#     # df.to_csv(full_file_path, index=False)

#     return df


## Find Errors, Drop NA

Final length, original length, dropped: 600, 679, 79    

In [33]:
# Check if there is any unsuccessful events
def find_error(df, col):
    """
    Return the rows of `df` whose `col` value marks an unsuccessful fetch.

    A row counts as an error when `col` is exactly 'Timeout', 'Error' or '',
    is missing, or contains one of the placeholder messages
    'Thank you for your patience' / 'Article tag not found'.

    Parameters:
    - df (pd.DataFrame): Frame holding the scraped content.
    - col (str): Name of the column to inspect.

    Returns:
    - pd.DataFrame: The offending rows (empty when every row is fine).

    Side effects:
    - Displays the first offending rows, or prints a success message.
    """
    pattern = r'Thank you for your patience|Article tag not found'
    content = df[col]

    is_sentinel = content.isin(['Timeout', 'Error', ''])
    is_missing = content.isna()
    # Cast to str first so a column without string values (e.g. all NaN, which
    # pandas stores as float) does not make the .str accessor raise.
    has_placeholder = content.astype(str).str.contains(pattern, regex=True, na=False)

    # Rows whose final_url matches r'sg\.yahoo\.com/\?p=us' are deliberately not
    # treated as errors, since they won't be scraped again.
    error_df = df[is_sentinel | is_missing | has_placeholder]

    if len(error_df):
        display(error_df.head())
    else:
        print("All content are successfully fetched.")

    return error_df

# error_df = find_error(df4, 'soup')
# print(len(error_df))

In [34]:
def drop_na(df):
    """
    Drop every row that has a missing value in any column and renumber the index.

    Parameters:
    - df (pd.DataFrame): Input frame; it is not modified.

    Returns:
    - pd.DataFrame: Copy of `df` without NA rows, indexed 0..n-1.

    Side effects:
    - Prints the per-column NA counts before and after dropping.
    """
    print(f"Before dropping na: {df.isna().sum()}")
    df1 = df.dropna().reset_index(drop=True)
    # Report on the cleaned frame; the input still holds its NA values.
    print(f"After dropping na: {df1.isna().sum()}")
    return df1
# df6['url'].duplicated().sum()
# df5 = drop_na(df4)
# df5


In [35]:
def drop_errors(df, col):
    """
    Return `df` without the rows whose `col` value marks an unsuccessful fetch.

    A row is removed when `col` is exactly 'Timeout', 'Error' or '', is
    missing, or contains 'Thank you for your patience' / 'Article tag not found'.

    Parameters:
    - df (pd.DataFrame): Frame holding the scraped content; it is not modified.
    - col (str): Name of the column to inspect.

    Returns:
    - pd.DataFrame: The surviving rows, indexed 0..n-1.

    Side effects:
    - Prints the final length, original length and number of dropped rows.
    """
    pattern = r'Thank you for your patience|Article tag not found'
    content = df[col]

    # Cast to str first so a column without string values (e.g. all NaN, which
    # pandas stores as float) does not make the .str accessor raise.
    has_placeholder = content.astype(str).str.contains(pattern, regex=True, na=False)
    is_bad = content.isin(['Timeout', 'Error', '']) | content.isna() | has_placeholder

    # ~ inverts the mask: keep only the rows where no error condition holds.
    df_cleaned = df[~is_bad].reset_index(drop=True)
    print(f"Final length, original length, dropped: {len(df_cleaned)}, {len(df)}, {len(df) - len(df_cleaned)}")
    return df_cleaned

In [36]:
# Keep only the rows of df4 whose 'soup' was fetched successfully
# (drop_errors removes sentinel values, missing values and placeholder pages).
df5 = drop_errors(df4, 'soup')
df5

Final length, original length, dropped: 600, 679, 79


Unnamed: 0,category,datetime,headline,id,related,source,summary,url,datetime2,soup,final_url
0,company,1698817191,"Ford, GM bumped to buy; Boeing gets 2 upgrades...",123559928,BA,Yahoo,Goldman Sachs upgraded Simon Property Group (N...,https://finnhub.io/api/news?id=edb2b1970709b9d...,2023-11-01 05:39:51,"<article class=""caas-container"" role=""article""...",https://finance.yahoo.com/news/ford-gm-bumped-...
1,company,1698838746,UPDATE 2-Spirit Aero cuts 737 fuselage deliver...,123544219,BA,Yahoo,Spirit AeroSystems on Wednesday announced $101...,https://finnhub.io/api/news?id=9a5fbbaec04442a...,2023-11-01 11:39:06,"<article class=""caas-container"" role=""article""...",https://finance.yahoo.com/news/1-spirit-aero-c...
2,company,1698845429,"Compared to Estimates, Spirit Aerosystems (SPR...",123566505,BA,Yahoo,Although the revenue and EPS for Spirit Aerosy...,https://finnhub.io/api/news?id=ca6571146b46d49...,2023-11-01 13:30:29,"<article class=""caas-container"" role=""article""...",https://finance.yahoo.com/news/compared-estima...
3,company,1698848517,Morning Brew: AMD's Q4 Guidance Weighs on Stoc...,123545059,BA,Yahoo,Advanced Micro Devices (NASDAQ:AMD) stock was ...,https://finnhub.io/api/news?id=d8793b0d7c45c7f...,2023-11-01 14:21:57,"<article class=""caas-container"" role=""article""...",https://finance.yahoo.com/news/morning-brew-am...
4,company,1698877471,UPDATE 1-US Air Force blows up Minuteman III i...,123567205,BA,Yahoo,The U.S. Air Force said on Wednesday it had bl...,https://finnhub.io/api/news?id=37c95272c9136be...,2023-11-01 22:24:31,"<article class=""caas-container"" role=""article""...",https://finance.yahoo.com/news/1-us-air-force-...
...,...,...,...,...,...,...,...,...,...,...,...
595,company,1706649056,Boeing Seen Narrowing Q4 Loss Amid 737 Max Gro...,125415682,BA,Yahoo,Dow Jones giant Boeing reports Q4 results earl...,https://finnhub.io/api/news?id=17a69562336b7cb...,2024-01-30 21:10:56,"<article class=""caas-container"" role=""article""...",https://finance.yahoo.com/m/41bd2d93-9d2e-3af2...
596,company,1706653428,"Hawaiian Airlines ekes out Q4 revenue beat, ea...",125415680,BA,Yahoo,Hawaiian Holdings (HA) — the parent company of...,https://finnhub.io/api/news?id=6ee3d0bf3808121...,2024-01-30 22:23:48,"<article class=""caas-container"" role=""article""...",https://finance.yahoo.com/video/hawaiian-airli...
597,company,1706654340,Boeing’s Earnings Are Coming. Investors Are Wa...,125415679,BA,Yahoo,The list of points to watch when the jet maker...,https://finnhub.io/api/news?id=0a10d35ba5fd463...,2024-01-30 22:39:00,"<article class=""caas-container"" role=""article""...",https://finance.yahoo.com/m/e16582bc-be2a-302f...
598,company,1706655823,Boeing was once known for safety and engineeri...,125417521,BA,Yahoo,Part of the fuselage blowing off shortly after...,https://finnhub.io/api/news?id=a947077acf5e67a...,2024-01-30 23:03:43,"<article class=""article"" data-regwall-disabled...",https://edition.cnn.com/2024/01/30/business/bo...


# 5 Parse and Clean content

## Parse Soup

- Identify source:
    - img alt: 1533 a bit hard to search
    - <span class=""caas-attr-provider"">Investing.com</span> Have 597 only
        - <span class=""caas-attr-provider"">Barrons.com: 67
        - <span class=""caas-attr-provider"">Insider Monkey: 10
        - 

In [37]:
def parse_soup(soup):
    """
    Collect the paragraph text of a previously scraped HTML document.

    The HTML is parsed with BeautifulSoup. When a div with class 'caas-body'
    exists, only the paragraph (p) tags inside it are read; otherwise every
    paragraph tag of the document is read. No length filtering is applied.

    Parameters:
    - soup (str): HTML content scraped beforehand.

    Returns:
    - str: Stripped paragraph texts joined by newlines.

    Side effects:
    - Prints a separator line followed by the extracted text.
    """
    document = BeautifulSoup(soup, 'html.parser')

    # FIXME: 'caas-body' is the article container used by Yahoo pages
    article_body = document.find('div', class_='caas-body')
    # Fall back to the whole document when the container is absent
    scope = article_body or document

    paragraphs = []
    for tag in scope.find_all('p'):
        paragraphs.append(tag.get_text().strip())
    p_texts = '\n'.join(paragraphs)

    print('--------New soup--------')
    print(p_texts)

    return p_texts


In [38]:
# Extract the paragraph text of every stored HTML string into 'news_content'
df5['news_content'] = df5['soup'].apply(parse_soup)

--------New soup--------
Investing.com — Here is your Pro Recap of the biggest analyst picks you may have missed since yesterday: upgrades at Ford, Boeing, Saia , and Stellantis.
InvestingPro subscribers got this news first. Never miss another market-moving headline.
Ford Motor (NYSE:F) and General Motors (NYSE:GM) were each upgraded to Overweight from Equalweight at Barclays with price targets of $14.00 and $37.00, as reported in real time on InvestingPro.
The analysts said they "believe the different pressures on the business have created 'peak pain,' yielding trading multiples at historical lows."
They added, "While we acknowledge that structural concerns aren’t likely to dissipate any time soon, we believe even a modest reversal of sharply negative sentiment could drive attractive upside."
Barclays said that, of the two, they prefer GM over Ford.
Last week, Ford lost more than 12% after the company reported worse-than-expected Q3 results, disclosing a wider loss in its electric-veh

## Check Content Duplication
- Discuss: different news outlets might publish similar news on the same day
- How to identify the news outlet from the same source?

dup_list2 = find_similar_news(df=df5, col='news_content', threshold=0.8, period=90 * 86400)
- Can detect almost identical news
Dropped 49 rows with similar summaries.
Final length, original length, dropped: 551, 600, 49
Refer to outputs/error_df

- When threshold = 0.5, news about different topics are gathered.
    - I want the effect of reiterating the stance, which might drive stock investments




In [45]:
# find_similar_news is defined elsewhere in the notebook; the next cell uses its
# result as the index labels of the rows to drop.
# NOTE(review): period=90 * 86400 looks like 90 days expressed in seconds — confirm against find_similar_news.
dup_list2 = find_similar_news(df=df5, col='news_content', threshold=0.8, period=90 * 86400)

--------------------------
Published within the period
--------Part 1--------
Mark J. Suchinski; Senior VP & CFO; Spirit AeroSystems Holdings, Inc.
Patrick M. Shanahan; President, CEO & Director; Spirit AeroSystems Holdings, Inc.
Ryan Avey
Cai von Rumohr; MD & Senior Research Analyst; TD Cowen, Research Division
David Egon Strauss; Research Analyst; Barclays Bank PLC, Research Division
Douglas Stuart Harned; SVP and Senior Analyst; Sanford C. Bernstein & Co., LLC., Research Division
Gavin Eric Parsons; Analyst; UBS Investment Bank, Research Division
George D. Shapiro; CEO and Managing Partner; Shapiro Research
Kristine Liwag; Executive Director, Head of Aerospace & Defense Equity Research and Equity Analyst; Morgan Stanley, Research Division
Michael Frank Ciarmoli; Research Analyst; Truist Securities, Inc., Research Division
Myles Alexander Walton; MD & Senior Analyst; Wolfe Research, LLC
Noah Poponak; Equity Analyst; Goldman Sachs Group, Inc., Research Division
Peter J. Arment; Senior

In [46]:
# Drop the rows identified to be similar (dup_list2 holds their index labels in df5),
# then renumber the remaining rows from 0.
df5a = df5.drop(index=dup_list2, axis=0).reset_index(drop=True)
# The first message counts entries of dup_list2; the second compares frame lengths.
print(f"Dropped {len(dup_list2)} rows with similar summaries.")
print(f"Final length, original length, dropped: {len(df5a)}, {len(df5)}, {len(df5) - len(df5a)}")


Dropped 49 rows with similar summaries.
Final length, original length, dropped: 551, 600, 49


## Parse News Content

- TODO: Find out news outlet (Reuters), (Bloomberg), cannot do after lemmatisation (Everything breaks)

## Cleaning News Content
- Need to do it separately for each type
- Different techniques have different outputs
- Need to name the outputs based on techniques

- Notes from prof
    - here is a sample use for English: sentic.net/api/en/KEY.py?text=senticnet+is+pretty+cool

    - input text does not require any special formatting so feel free to use spaces instead of '+' or '%20'

    - ampersand, hashtag, semicolons, and braces ('&', '#', ';', '{', '}'), however, are illegal characters

    - hence, they should be replaced with colons (':') or removed entirely in the preprocessing phase

    - please note that:

    - 1) API keys are case-sensitive

    - 2) API keys will be valid for about one month

    - 3) API keys are personal and confidential

    - do not share nor use them from different devices or IP addresses

    - or else they will get terminated earlier

    - the capacity limit for our server is 8000 characters 

    - so our recommendation is to cap your input at about 1000 words

    - if you need to process bigger texts, you will have to split them into smaller parts

    - this is also a good idea in case you want to perform a finer-grained analysis of your input

    - all APIs, in fact, are designed to give you an overall judgement about the whole input

    - for more details, split your text into paragraphs or sentences and feed them to the API one by one



Remove stop words or not?
https://www.researchgate.net/publication/335013227_THE_IMPACT_OF_REMOVING_LESS_IMPORTANT_TERMS_ON_SENTIMENT_ANALYSIS

?! might have significant meaning (polarity)
up, down might have crucial meaning in finance

Sentence level sentiment analysis: https://www.researchgate.net/publication/330102084_Sentence-Level_Sentiment_Analysis_of_Financial_News_Using_Distributed_Text_Representations_and_Multi-Instance_Learning

In [47]:
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

def _strip_matches(text, pattern, label, flags=re.IGNORECASE):
    """Remove every match of `pattern` from `text` and log the removed snippets."""
    matches = re.findall(pattern, text, flags=flags)
    if matches:
        text = re.sub(pattern, '', text, flags=flags)
        logging.debug(f"Matches {label}: {matches}\n")
    return text


def clean_text(text):
    """
    Strip boilerplate from a news text and split it into sentences.

    Steps, in order: remove "(Reporting by ...; ...)" credits, "Click (here) to
    get/continue ..." prompts, "Suggested articles: ..." lines, "(Adds ...)"
    notes, short "By Name ..." bylines, "Image Source: ..." lines, copyright
    notices and lines of at most 20 characters; flatten newlines; remove quote
    characters and the characters that are illegal for the Sentic APIs
    (& # ; { }); tokenise into sentences; keep sentences longer than 20 and at
    most 8000 characters.

    Parameters:
    - text (str): Raw headline, summary or article text.

    Returns:
    - list of str: The cleaned sentences, or np.nan when `text` is not a string
      or no sentence survives the cleaning.
    """
    max_byline_len = 70      # TODO: Assume a sentence is at least 70 char
    min_sentence_len = 20
    max_sentence_len = 8000  # Sentic server capacity limit

    # Missing values (NaN / None) cannot be cleaned; report them as missing
    # instead of letting the regex calls raise TypeError.
    if not isinstance(text, str):
        logging.warning('Non-string input.')
        return np.nan

    # Credits such as "(Reporting by A; Editing by B)" (case insensitive)
    text = _strip_matches(text, r'\(\s*(?:Additional\s+)?Reporting by.*?;.*?\)', 'reporting')
    text = _strip_matches(text, r'Click\s+(?:here\s+)?to (?:get|continue).*', 'suggestion')
    # Suggested articles
    text = _strip_matches(text, r'Suggested articles:.*', 'article')
    text = _strip_matches(text, r'\(Adds.*?\)', 'adds')

    # "By" can open a byline or an ordinary sentence, so the pattern cannot be
    # removed wholesale: only short matches are treated as bylines.
    # The search stays case sensitive on purpose.
    for match in re.findall(r'By\s+[A-Z][a-z]+.*\n', text):
        if len(match) < max_byline_len:
            # Literal replacement: the matched text may contain regex
            # metacharacters such as parentheses, so it must not be reused as a pattern.
            text = text.replace(match, '')
            logging.debug(f"Matches by: {match}\n")
        else:
            # This is not removed
            logging.debug(f"Matches by but too long: {match}\n")

    text = _strip_matches(text, r'Image Source:.*\n', 'image')

    # Disabled rules, kept for reference:
    # # Remove everything starting with "\nPublished:" until the next newline
    # text = re.sub(r'\nPublished:.*?\n', '', text, flags=re.IGNORECASE)
    # # Remove everything including and after "\n(END)"
    # text = re.sub(r'\n\(END\).*', '', text, flags=re.IGNORECASE)
    # # Remove all the links
    # text = re.sub(r'https?://.*?(?=\s|$)', '', text)
    # # Remove the line that starts with "\nRelated Quotes"
    # text = re.sub(r'Related Quotes.*?\n', '', text, flags=re.IGNORECASE)
    # # To remove the specific sentences directly
    # text = re.sub(r'Thank you for your patience\.\s+Our engineers are working quickly to resolve the issue\.', '', text)

    # Copyright notice: from "©" to the end of its line (mostly the last
    # sentence, which has no trailing newline).
    text = _strip_matches(text, r'©.*', 'copyright')

    # Remove lines of 1 to 20 characters
    text = _strip_matches(text, r'^.{1,20}$', 'short', flags=re.MULTILINE)

    text = re.sub(r'\n+', ' ', text)  # Replace newlines with a single space
    text = re.sub(r"[\'‘’\"“”]", "", text)  # Remove all types of apostrophes and quotation marks

    # Remove illegal characters for Sentic APIs
    text = re.sub(r'[&#;{}]', '', text)

    text = text.strip()

    # Tokenise into sentences # TODO: Check if this will affect finance
    sentences = [
        sentence for sentence in sent_tokenize(text)
        if min_sentence_len < len(sentence) <= max_sentence_len
    ]

    if not sentences:
        logging.warning('Empty string.')
        return np.nan
    return sentences  # List of cleaned sentences


In [49]:
# Work on an independent copy so df5a keeps the uncleaned data
df6 = df5a.copy(deep=True)

# Source columns and the names of the cleaned columns derived from them
cols = ['headline', 'summary', 'news_content']
new_cols = {
    'headline': 'cln_hdl', 
    'summary': 'cln_smr',
    'news_content': 'cln_news'
}

# Each cleaned column holds a list of sentences per row, or NaN when nothing survives cleaning
for col in cols:
    new_col = new_cols.get(col, '')
    df6[new_col] = df6[col].apply(clean_text)
# # Clean content
# df6['cln_news'] = df6['news_content'].apply(clean_text)
# # Clean headline
# df6['cln_hdl'] = df6['headline'].apply(clean_text)
# # Clean summary
# df6['cln_smr'] = df6['summary'].apply(clean_text)


# Show the NA count per column, then drop rows with any NA and renumber the index
print(df6.isna().sum())
df6.dropna(inplace=True)
df6.reset_index(inplace=True, drop=True)
print(f"After dropping, total NA values: {df6.isna().sum().sum()}")

# passages2

category        0
datetime        0
headline        0
id              0
related         0
source          0
summary         0
url             0
datetime2       0
soup            0
final_url       0
news_content    0
cln_hdl         0
cln_smr         0
cln_news        0
dtype: int64
After dropping, total NA values: 0


In [50]:
# Spot-check: show the cleaned sentence list of the row labelled 223
df6.loc[223, 'cln_news']

['In this article, we will be looking at the 25 best online engineering degree programs heading into 2024.',
 'If you want to skip our detailed analysis, you can go directly to the 5 Best Online Engineering Degree Programs Heading Into 2024.',
 'According to a report by Mordor Intelligence, the global engineering services market was valued at $1.6 trillion in 2023.',
 'The market is projected to grow at a compound annual growth rate (CAGR) of 4.2% to reach $1.96 trillion by 2028.',
 'Digital transformation is enabling businesses to optimize processes, enhance customer experiences, and gain data-driven insights by using cutting-edge technologies including Artificial Intelligence (AI) and cloud computing.',
 'The enhanced business capabilities are helping enterprises optimize revenue which is causing a surge in demand for engineering services.',
 'The digital transformation and development of digital infrastructure is expected to fuel the growth of the market during the forecast period.'

## Find out sources

In [52]:
# Keep only the identifier, the timestamp and the three cleaned text columns.
# Note: this rebinds df6, so the raw columns are no longer reachable through it.
df6 = df6[['id', 'datetime2', 'cln_hdl', 'cln_smr', 'cln_news']]
df6

Unnamed: 0,id,datetime2,cln_hdl,cln_smr,cln_news
0,123559928,2023-11-01 05:39:51,"[Ford, GM bumped to buy Boeing gets 2 upgrades...",[Goldman Sachs upgraded Simon Property Group (...,[Investing.com — Here is your Pro Recap of the...
1,123544219,2023-11-01 11:39:06,[UPDATE 2-Spirit Aero cuts 737 fuselage delive...,[Spirit AeroSystems on Wednesday announced $10...,"[(Adjusts shares in paragraph 5, adds Airbus c..."
2,123566505,2023-11-01 13:30:29,"[Compared to Estimates, Spirit Aerosystems (SP...",[Although the revenue and EPS for Spirit Aeros...,"[For the quarter ended September 2023, Spirit ..."
3,123545059,2023-11-01 14:21:57,[Morning Brew: AMDs Q4 Guidance Weighs on Stoc...,[Advanced Micro Devices (NASDAQ:AMD) stock was...,[Advanced Micro Devices (NASDAQ:AMD) stock was...
4,123567205,2023-11-01 22:24:31,[UPDATE 1-US Air Force blows up Minuteman III ...,[The U.S. Air Force said on Wednesday it had b...,[Nov 1 (Reuters) - The U.S. Air Force said on ...
...,...,...,...,...,...
546,125415682,2024-01-30 21:10:56,[Boeing Seen Narrowing Q4 Loss Amid 737 Max Gr...,[Dow Jones giant Boeing reports Q4 results ear...,[Dow Jones giant Boeing reports Q4 results ear...
547,125415680,2024-01-30 22:23:48,"[Hawaiian Airlines ekes out Q4 revenue beat, e...",[Hawaiian Holdings (HA) — the parent company o...,[Hawaiian Holdings (HA) — the parent company o...
548,125415679,2024-01-30 22:39:00,"[Boeings Earnings Are Coming., Investors Are W...",[The list of points to watch when the jet make...,[The number of watch items in Boeings fourth-q...
549,125417521,2024-01-30 23:03:43,[Boeing was once known for safety and engineer...,[Part of the fuselage blowing off shortly afte...,[Part of the fuselage blowing off shortly afte...


## Refine Sentences (Lemma, stop)

In [53]:
def refine(sentences):
    """
    Remove English stop words from each sentence and lemmatise what remains.

    Parameters:
    - sentences (iterable of str): Sentences to process.

    Returns:
    - list of str: One string per input sentence, made of the lemmatised
      non-stop-word tokens joined by single spaces.
    """
    wnl = WordNetLemmatizer()
    english_stops = set(stopwords.words('english'))

    refined = []
    for raw_sentence in sentences:
        kept_tokens = []
        for token in word_tokenize(raw_sentence):
            # The stop-word check is case insensitive; the token keeps its case
            if token.lower() in english_stops:
                continue
            kept_tokens.append(wnl.lemmatize(token))
        refined.append(" ".join(kept_tokens))

    return refined

In [54]:
# Cleaned columns to refine
cols = ['cln_hdl', 'cln_smr', 'cln_news']

# Add a '<col>_lemma' column with the stop-word-free, lemmatised sentences of each column
for col in cols:
    new_col = col + '_lemma'
    df6[new_col] = df6[col].apply(refine)

In [55]:
# Persist the cleaned frame under <root>/data/proc; the backtest window is part of the file name
cleaned_path = Path.joinpath(root_dir, 'data', 'proc', f'BA_cln_{BT_START_STR}_{BT_END_STR}.csv')
df6.to_csv(cleaned_path, index=False)

# Sentiment analysis
- SenticNet
- TextBlob

- SID (from NLTK VADER)
- BERT
- finBERT: https://huggingface.co/ProsusAI/finbert/tree/main

TODO:
- Check whether need to unify the results
- Not included subjectivity yet
- Check char limit



In [56]:
# Columns that hold sentence lists: the cleaned texts and their lemmatised variants
cols = ['cln_hdl', 'cln_smr', 'cln_news',
        'cln_hdl_lemma', 'cln_smr_lemma', 'cln_news_lemma']

import ast

def convert_data(row):
    """
    Turn the string form of a Python literal back into the literal itself.

    Parameters:
    - row: A cell value, typically a string such as "['a', 'b']".

    Returns:
    - The evaluated literal (list, number, ...) when `row` can be parsed by
      ast.literal_eval; otherwise `row` unchanged.
    """
    try:
        # Whatever literal comes out (list, int, float, ...) is returned as is
        return ast.literal_eval(row)
    except (ValueError, SyntaxError):
        # Not a valid Python literal: keep the original value
        return row
    except Exception as e:
        print(f'Exception: {e}')
        return row

# Reload the cleaned data written earlier to the same path
cleaned_path = Path.joinpath(root_dir, 'data', 'proc', f'BA_cln_{BT_START_STR}_{BT_END_STR}.csv')
df7 = pd.read_csv(cleaned_path, index_col=False)

# The CSV stores each sentence list as its string representation;
# convert every such column back into real Python lists.
for col in cols:
    df7[col] = df7[col].apply(convert_data)

# print(df7.equals(df6))

## Sentic

In [57]:
# pol: polarity, sub: subjectivity
def sentic_api(text, KEY, max_attempt=5, timeout=120):
    """
    Query a Sentic API endpoint for `text`, retrying on failure.

    Parameters:
    - text (str): Text appended to the request URL as the `text` query value.
    - KEY (str): API key; it selects the endpoint in the URL.
    - max_attempt (int): Maximum number of requests before giving up.
    - timeout (float): Per-request timeout in seconds.

    Returns:
    - str: The response body with surrounding whitespace removed, or None when
      every attempt failed.
    """
    APIURL = f'https://sentic.net/api/en/{KEY}.py?text='

    for attempt in range(1, max_attempt + 1):
        try:
            response = requests.get(APIURL + text, timeout=timeout)
            # Decode the body and trim whitespace instead of slicing the repr
            # of the bytes, which cut off real characters whenever the
            # response did not end with a newline.
            return response.content.decode('utf-8', errors='replace').strip()
        except requests.exceptions.Timeout:
            # Handle timeout exception
            if attempt < max_attempt:
                print(f"Sentic request timed out. Attempt {attempt} of {max_attempt}.")
            else:
                print("No more retries.")
        except Exception as e:
            # Handle other requests exceptions
            if attempt < max_attempt:
                print(f"Attempt {attempt} of {max_attempt}. Error: {e}")
            else:
                print("No more retries.")

    return None

def sentic_anal_pol(text_list):
    """
    Score each sentence with the Sentic polarity and intensity APIs.

    For every sentence the polarity category and the intensity are requested.
    The intensity is divided by 100 and signed by the category: positive for
    "POSITIVE", negated for "NEGATIVE", 0 for "NEUTRAL".

    Parameters:
    - text_list (list of str): Sentences to analyse.

    Returns:
    - list: One entry per input sentence, in order. An entry is None when the
      category is not recognised or the request/conversion failed.
    """
    polarity_list = []
    total = len(text_list)
    for i, text in enumerate(text_list):
        try:
            polarity_cat = sentic_api(text, POLARITY_API_KEY)  # Polarity category (positive, negative)
            intensity = float(sentic_api(text, INTENSITY_API_KEY)) / 100
            # NOTE(review): the NEGATIVE branch negates the intensity, which assumes
            # the intensity API returns a non-negative magnitude — confirm.
            if polarity_cat == "POSITIVE":
                polarity_list.append(intensity)
            elif polarity_cat == "NEGATIVE":
                polarity_list.append(-intensity)
            elif polarity_cat == "NEUTRAL":
                polarity_list.append(0)
            else:
                print(f"Unknown polarity: {polarity_cat}")
                # Keep the result aligned with text_list: one entry per sentence
                polarity_list.append(None)

            print(f"{(i + 1)}/{total} sentic_anal_pol completed.")
        except Exception as e:
            polarity_list.append(None)
            print(f"sentic_anal_pol exception for text: {text}, {e}")

    print("sentic_anal_pol completed.")

    return polarity_list

# Need to add except block
# def sentic_anal_sub(text_list):
#     """
#     Analyze the subjectivity of sentences in a list.

#     This function calls an external API to determine the subjectivity of each
#     sentence in the input list. Each sentence is classified as either
#     "SUBJECTIVE" or "OBJECTIVE".

#     Parameters:
#     - text_list (list of str): A list of sentences to be analyzed for subjectivity.

#     Returns:
#     - list of str: A list containing "SUBJECTIVE" or "OBJECTIVE" for each input sentence,
#       corresponding to the sentence's classified subjectivity.

#     Note:
#     - Requires a valid SUBJECTIVITY_API_KEY set in the environment.
#     """
#     subjectivity_list = []
#     for i in range(len(text_list)):
#         subjectivity = sentic_api(text_list[i], SUBJECTIVITY_API_KEY)
#         subjectivity_list.append(subjectivity)
        
#         # if (i + 1) % 10 == 0:
#         #     print(f"{(i + 1)}/{len(text_list)} sentic_anal_sub completed.")
#         print(f"{(i + 1)}/{len(text_list)} sentic_anal_sub completed.")
#     # print(f"sentic_anal_sub completed.")
#     return(subjectivity_list)



In [58]:
# Columns scored by the batch_* functions: the cleaned texts and their lemmatised variants
cols = ['cln_hdl', 'cln_smr', 'cln_news',
        'cln_hdl_lemma', 'cln_smr_lemma', 'cln_news_lemma']
def batch_sentic_anal(df):
    """
    Add a Sentic polarity column for every entry of the global `cols`.

    Each new column is named '<col>_pol_stc'. `df` is modified in place and
    written to the global `stm_path` after each column, so partial results
    are on disk if a later column fails.
    """
    for source_col in cols:
        scores = df[source_col].apply(sentic_anal_pol)
        df[f'{source_col}_pol_stc'] = scores
        # df[f'{col}_sub_stc'] = df[col].apply(sentic_anal_sub)
        df.to_csv(stm_path, index=False)

# tmp_df = df7.head(2) 
# batch_sentic_anal(tmp_df)
# tmp_df

## Blob

In [59]:
def blob_anal_pol(text_list):
    """
    Compute the TextBlob polarity of every sentence.

    Parameters:
    - text_list (iterable of str): Sentences to analyse.

    Returns:
    - list: The `sentiment.polarity` value of each sentence, in input order.
    """
    return [TextBlob(sentence).sentiment.polarity for sentence in text_list]

# def blob_anal_sub(text_list):
#     sub_list = []
        
#     for txt in text_list:
#         # print(txt)
#         blob = TextBlob(txt)
#         subjectivity = blob.sentiment.subjectivity
#         sub_list.append(subjectivity)
        
#     # Handling the case where lists might be empty to avoid ZeroDivisionError
#     # mean_sub = np.mean(sub_list) if sub_list else 0
#     print(f"blob_anal_sub completed.")
#     return sub_list

In [60]:
def batch_blob_anal(df):
    """
    Add a TextBlob polarity column for every entry of the global `cols`.

    Each new column is named '<col>_pol_blob'. `df` is modified in place and
    written to the global `stm_path` after each column.
    """
    for source_col in cols:
        scores = df[source_col].apply(blob_anal_pol)
        df[f'{source_col}_pol_blob'] = scores
        # df[f'{col}_sub_blob'] = df[col].apply(blob_anal_sub)
        df.to_csv(stm_path, index=False)


# tmp_df = df7.head(3) # TODO:
# batch_blob_anal(tmp_df)
# tmp_df

## SID

In [61]:

def sid_anal_pol(text_list):
    """
    Compute the VADER compound score of every sentence.

    Parameters:
    - text_list (iterable of str): Sentences to analyse.

    Returns:
    - list: The 'compound' entry of `polarity_scores` for each sentence, in
      input order.
    """
    analyzer = SentimentIntensityAnalyzer()
    return [analyzer.polarity_scores(sentence)['compound'] for sentence in text_list]
# nltk.download('vader_lexicon')  # Download the VADER lexicon if not already downloaded

# VADER compound score: https://stackoverflow.com/questions/40325980/how-is-the-vader-compound-polarity-score-calculated-in-python-nltk

In [62]:
def batch_sid_anal(df, tech='sid'):
    """
    Add a VADER polarity column for every entry of the global `cols`.

    Each new column is named '<col>_pol_<tech>'. `df` is modified in place and
    written to the global `stm_path` after each column.
    """
    for source_col in cols:
        target_col = f'{source_col}_pol_{tech}'
        df[target_col] = df[source_col].apply(sid_anal_pol)
        df.to_csv(stm_path, index=False)

# batch_sid_anal(tmp_df)
# tmp_df

## Bert

BERT Original Paper: "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding" by Jacob Devlin et al. It introduces BERT and its underlying principles.

In [63]:
# label is 1 to 5 stars. 1 is very negative, 5 is very positive
# Score is the confidence, between 0 to 1

_BERT_MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment"
_BERT_PIPELINE_CACHE = {}


def _get_bert_pipeline():
    """Build the star-rating pipeline on first use and reuse it afterwards."""
    if _BERT_MODEL_NAME not in _BERT_PIPELINE_CACHE:
        _BERT_PIPELINE_CACHE[_BERT_MODEL_NAME] = pipeline("sentiment-analysis", model=_BERT_MODEL_NAME)
    return _BERT_PIPELINE_CACHE[_BERT_MODEL_NAME]


def bert_anal_pol(text_list):
    """
    Score each sentence with the multilingual star-rating BERT model.

    The model label starts with the star count (1 to 5) and comes with a
    confidence score. They are combined into one signed value:
    5 stars -> 0.75 + (score - 0.5) * 0.5, 4 stars -> 0.25 + (score - 0.5) * 0.5,
    3 stars -> 0, and 2 / 1 stars -> the negated 4 / 5 star formulas.

    Parameters:
    - text_list (iterable of str): Sentences to analyse.

    Returns:
    - list: One score per sentence, in input order; None for an unexpected
      star count.
    """
    # The pipeline is cached: this function runs once per DataFrame row, and
    # rebuilding the model on every call dominates the runtime.
    bert = _get_bert_pipeline()

    pol_list = []
    for txt in text_list:
        sentiment = bert(txt)[0]
        star_num = int(sentiment['label'][0])
        # Confidence shifts the score by at most +/-0.25 around the star's base value
        spread = (float(sentiment['score']) - 0.5) * 0.5

        if star_num == 5:
            final_score = 0.75 + spread
        elif star_num == 4:
            final_score = 0.25 + spread
        elif star_num == 3:
            final_score = 0
        elif star_num == 2:
            final_score = - (0.25 + spread)
        elif star_num == 1:
            final_score = - (0.75 + spread)
        else:
            print("WARNING: Unknown polarity.")
            final_score = None
        pol_list.append(final_score)

    # Report once per call rather than once per sentence
    print("bert_anal_pol completed.")
    return pol_list

# # Example sentence
# sentences = ['Table', 'Very strong positive cash flow', 'The stock fell by 10%', 'The stock market crashed']

# # Get sentiment prediction
# result = bert_anal_pol(sentences)

# # Print the result
# print(result)

In [64]:
def batch_bert_anal(df, tech='bert'):
    """
    Add a BERT polarity column for every entry of the global `cols`.

    Each new column is named '<col>_pol_<tech>'. `df` is modified in place and
    written to the global `stm_path` after each column.
    """
    for source_col in cols:
        target_col = f'{source_col}_pol_{tech}'
        df[target_col] = df[source_col].apply(bert_anal_pol)
        df.to_csv(stm_path, index=False)

# batch_bert_anal(tmp_df)
# tmp_df

## FinBert

- Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [65]:
_FINBERT_MODEL_NAME = "ProsusAI/finbert"
_FINBERT_PIPELINE_CACHE = {}


def _get_finbert_pipeline():
    """Build the FinBERT pipeline on first use and reuse it afterwards."""
    if _FINBERT_MODEL_NAME not in _FINBERT_PIPELINE_CACHE:
        _FINBERT_PIPELINE_CACHE[_FINBERT_MODEL_NAME] = pipeline("text-classification", model=_FINBERT_MODEL_NAME)
    return _FINBERT_PIPELINE_CACHE[_FINBERT_MODEL_NAME]


def finbert_anal_pol(text_list):
    """
    Score each sentence with the ProsusAI/finbert classifier.

    The confidence score is signed by the predicted label: kept for
    "positive", negated for "negative", replaced by 0 for "neutral".

    Parameters:
    - text_list (iterable of str): Sentences to analyse.

    Returns:
    - list: One score per sentence, in input order; None for an unexpected label.
    """
    # The pipeline is cached: this function runs once per DataFrame row, and
    # rebuilding the model on every call dominates the runtime.
    finbert = _get_finbert_pipeline()

    pol_list = []
    for txt in text_list:
        sentiment = finbert(txt)[0]
        polarity_cat = sentiment['label']
        intensity = float(sentiment['score'])
        if polarity_cat == "positive":
            pol_list.append(intensity)
        elif polarity_cat == "negative":
            pol_list.append(-intensity)
        elif polarity_cat == "neutral":
            pol_list.append(0)
        else:
            pol_list.append(None)
            print("WARNING: Unknown polarity category.")
    print("finbert_anal_pol completed.")
    return pol_list

# # Example sentence
# sentences = ['He', 'Positive cash flow', 'The stock fell by 10%', 'The stock market crashed']

# # Get sentiment prediction
# result = finbert_anal_pol(sentences)

# # Print the result
# print(result)

In [66]:
def batch_finbert_anal(df, tech='finbert'):
    """
    Add a FinBERT polarity column for every entry of the global `cols`.

    Each new column is named '<col>_pol_<tech>'. `df` is modified in place and
    written to the global `stm_path` after each column.
    """
    for source_col in cols:
        target_col = f'{source_col}_pol_{tech}'
        df[target_col] = df[source_col].apply(finbert_anal_pol)
        df.to_csv(stm_path, index=False)

# batch_finbert_anal(tmp_df)
# tmp_df

## Batch

In [67]:
# Count the sentences held in each cleaned column of df7 to size the sentiment workload
simple_cols = ['cln_hdl', 'cln_smr', 'cln_news']
counts = []
for col in simple_cols:
    # A falsy cell counts as zero sentences; any other cell contributes its length
    count = df7[col].apply(lambda x: len(x) if x else 0)
    sum_cnt = sum(count)
    counts.append(sum_cnt)
    print(f"Sentence count in {col} is: {sum_cnt}")
print(f"Total sentences in the 3 col is: {sum(counts)}")
# The total is doubled; presumably because `cols` pairs each column with a '_lemma' variant — confirm
print(f"({counts[0]} + {counts[1]} + {counts[2]}) * 2 = {sum(counts) * 2}")
print(f"Length of df7 is {len(df7)}")

Sentence count in cln_hdl is: 586
Sentence count in cln_smr is: 1309
Sentence count in cln_news is: 12279
Total sentences in the 3 col is: 14174
(586 + 1309 + 12279) * 2 = 28348
Length of df7 is 551


In [70]:
# Sentic runtime estimate in hours: scale the 9.6 h noted below for 4224 sentences up to 28348 sentences
28348 / 4224 * 9.6 

64.42727272727272

- After doing threshold=0.5, only do polarity, do starting from 2024-01-01 to 2024-01-14
    - Sentic
        - (119 + 270 + 1723) * 2 {with and w/o tokenization} = 4224 (9.6h)
    - Textblob
        - (119 + 270 + 1723) * 2: 1s
    - SID
        - (119 + 270 + 1723) * 2: 3.8s
    - Bert
        - (119 + 270 + 1723) * 2 / 91 * 157 / 60 / 60 = 2.02h
    - FinBert
        - (119 + 270 + 1723) * 2 / 124 * 103 / 60 / 60 = 1.78h

- 20240210
    Sentence count in cln_hdl is: 500
    Sentence count in cln_smr is: 1070
    Sentence count in cln_news is: 9582
    Total sentences in the 3 col is: 11152
    (500 + 1070 + 9582) * 2 = 22304
    Length of df7 is 467

    - Other than stc, all others used 93 minutes in total

- 20240220
    Sentence count in cln_hdl is: 586
    Sentence count in cln_smr is: 1309
    Sentence count in cln_news is: 12279
    Total sentences in the 3 col is: 14174
    (586 + 1309 + 12279) * 2 = 28348
    Length of df7 is 551

    - Sentic API needs 2.68 days or 64.4h

In [71]:
def batch_anal(df):
    """
    Run the non-Sentic batch analysers on ``df`` one after another.

    Calls ``batch_blob_anal``, ``batch_sid_anal``, ``batch_bert_anal`` and
    ``batch_finbert_anal`` in that order, printing a completion line after
    each, then writes ``df`` to the global ``stm_path`` as CSV without the index.
    """
    def _done(step):
        # Progress line emitted once an analyser has returned.
        print(f'{step} completed.')

    # TODO: Sentic API uses other parallel notebooks
    # batch_sentic_anal(df)
    # _done('batch_sentic_anal')

    batch_blob_anal(df)
    _done('batch_blob_anal')

    batch_sid_anal(df)
    _done('batch_sid_anal')

    batch_bert_anal(df)
    _done('batch_bert_anal')

    batch_finbert_anal(df)
    _done('batch_finbert_anal')

    # Final save once every analyser has finished.
    df.to_csv(stm_path, index=False)

# Text columns the batch_* functions iterate over (read as a global).
cols = [
    'cln_hdl', 'cln_smr', 'cln_news',
    'cln_hdl_lemma', 'cln_smr_lemma', 'cln_news_lemma',
]
# CSV that receives the sentiment results for the back-test window.
stm_path = root_dir / 'data' / 'proc' / f'BA_stm_{BT_START_STR}_{BT_END_STR}.csv'

# Optional date filter for a shorter trial run (currently disabled):
# df7['datetime2'] = pd.to_datetime(df7['datetime2'])
# df7 = df7[(df7['datetime2'] >= pd.to_datetime('2024-01-01')) & (df7['datetime2'] <= pd.to_datetime('2024-01-14'))]
# df7.reset_index(inplace=True, drop=True)

batch_anal(df7)


batch_blob_anal completed.
batch_sid_anal completed.


  return self.fget.__get__(instance, owner)()


bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.
bert_anal_pol completed.


# Aggregate scores

## Merge the dfs

8 similar columns
3 content * 2 lemma * 5 techniques = 30
In total: 38 columns

In [None]:
# Reload the sentiment results that the batch step wrote to disk.
stm_path = root_dir / 'data' / 'proc' / f'BA_stm_{BT_START_STR}_{BT_END_STR}.csv'
df8p = pd.read_csv(stm_path, index_col=False)



In [None]:
# Preview the first rows of the reloaded frame.
df8p.head()

In [None]:
from functools import reduce

# Columns every file has in common; they act as the join key. Any other column
# in a file is a sentiment score specific to one technique.
keys = ['id', 'datetime2', 'cln_hdl', 'cln_smr', 'cln_news', 'cln_hdl_lemma', 'cln_smr_lemma', 'cln_news_lemma']


def _left_join_on_keys(left, right):
    """Left-join ``right`` onto ``left`` using the shared ``keys`` columns."""
    return pd.merge(left, right, on=keys, how='left')


# stc1
# Start from the main results frame; the stc partial files are appended below
# (presumably the Sentic outputs of the parallel notebooks - confirm).
df_list = [df8p]

# Index 1 and 2
# cols = ['cln_hdl', 'cln_smr', 'cln_news']
# cols = ['cln_hdl_lemma', 'cln_smr_lemma', 'cln_news_lemma']

# TODO: Change for the number of parallel programming
for part_no in (1, 2, 3):
    stc_file = root_dir.joinpath('data', 'proc', f'BA_stm_stc{part_no}_{BT_START_STR}_{BT_END_STR}.csv')
    try:
        df_list.append(pd.read_csv(stc_file, index_col=False))
    except Exception as err:
        # Best effort: report the problem and carry on without this partial file.
        print(err)

# Fold the frames together from left to right.
df8 = reduce(_left_join_on_keys, df_list)
df8


In [None]:
# Pass every column through convert_data (defined elsewhere in this notebook),
# element by element, then parse the timestamp column.
for column_name in df8.columns:
    converted = df8[column_name].apply(convert_data)
    df8[column_name] = converted
df8['datetime2'] = pd.to_datetime(df8['datetime2'])

# Sanity check: show the type of the first timestamp after parsing.
first_stamp = df8['datetime2'][0]
print(type(first_stamp))

In [None]:
# Total number of missing values across the whole merged frame.
df8.isna().sum().sum()

In [None]:
from statistics import mean


def _mean_of_nonzero(scores):
    """Mean of the non-zero entries of ``scores``; 0 when there is none."""
    kept = [s for s in scores if s != 0]
    if not kept:
        return 0
    return mean(kept)


techs = ['stc', 'blob', 'sid', 'bert', 'finbert']
# techs = ['blob', 'sid']
cols = [
    'cln_hdl', 'cln_smr', 'cln_news',
    'cln_hdl_lemma', 'cln_smr_lemma', 'cln_news_lemma',
]

# Each `{col}_pol_{tech}` cell holds a list of scores, one per sentence of a
# news item, for one technique applied to one text column (with or without
# lemmatisation). Collapse every list into a single number: the mean of its
# non-zero scores (positive or negative), or 0 if all scores are zero.
for tech in techs:
    for col in cols:
        source_col = f'{col}_pol_{tech}'
        print(source_col)
        df8[f'{source_col}_score'] = df8[source_col].apply(_mean_of_nonzero)


In [None]:
# Inspect the last rows, including the newly added *_score columns.
df8.tail()

In [None]:
# Rows whose aggregated stc score for the cln_hdl column is negative.
df8[df8['cln_hdl_pol_stc_score'] < 0]

In [None]:
# Spot-check: cln_news_lemma value of the row labelled 0.
df8.loc[0, 'cln_news_lemma']

In [None]:
# Spot-check: cln_news value of the same row, for comparison with cln_news_lemma.
df8.loc[0, 'cln_news']

In [None]:

# Temporarily lift pandas' display limits so the full content of one row can be shown
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_colwidth', None)  # Show full width of each column

# print(tmp.loc[0])
# NOTE(review): the print above is disabled, so nothing is displayed while the
# lifted limits are active; `tmp` is not defined in this part of the notebook.

# Reset both options to pandas' defaults (not to whatever value was set before this cell)
pd.reset_option('display.max_columns')
pd.reset_option('display.max_colwidth')



In [None]:
# Persist the merged frame, including the aggregated *_score columns.
score_path = root_dir / 'data' / 'proc' / f'BA_score_{BT_START_STR}_{BT_END_STR}.csv'
df8.to_csv(score_path, index=False)


TODO: ANOVA? (Needs the lists of returns, which can be obtained from each trade; also need to check its conditions, e.g. normal distribution, ...)
TODO: Sharpe ratio(?), Treasury bond...
TODO: correlation between Airbus and Boeing