In [1]:
import os
import requests
import re

import logging

from textblob import TextBlob
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer


from transformers import pipeline
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

import finnhub
from dotenv import load_dotenv
from pathlib import Path    
import numpy as np
import pandas as pd
import datetime
from bs4 import BeautifulSoup
import sys
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
sys.path.append('../') # Change the python path at runtime
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

# Self-created modules
from src.utils import path as path_yq

# Populate the process environment via python-dotenv before the keys are read.
load_dotenv()

# API credentials; each is None when the variable is not set.
SUBJECTIVITY_API_KEY = os.getenv('SUBJECTIVITY_API_KEY')
POLARITY_API_KEY = os.getenv('POLARITY_API_KEY')
INTENSITY_API_KEY = os.getenv('INTENSITY_API_KEY')
FINNHUB_API_KEY = os.getenv('FINNHUB_API_KEY')

# Backtest window: ISO dates plus the dash-free form used in data file names.
BT_START_DATE = '2023-11-01'
BT_START_STR = BT_START_DATE.replace('-', '')
BT_END_DATE = '2024-01-31'
BT_END_STR = BT_END_DATE.replace('-', '')

# Resolve the project root starting from the notebook's working directory.
cur_dir = Path.cwd()
root_dir = path_yq.get_root_dir(cur_dir)

# Send all log records (DEBUG and above) to the shared trading-system log file.
logging.basicConfig(
    filename=root_dir.joinpath('logs', 'trading_system.log'),
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)



  from .autonotebook import tqdm as notebook_tqdm


In [12]:
import ast

def convert_data(row):
    """Turn the text form of a Python literal back into the object it spells.

    Intended for cells read from CSV, where lists and numbers arrive as
    strings such as ``"['a', 'b']"`` or ``"3.5"``.

    Args:
        row: A single cell value.

    Returns:
        The result of ``ast.literal_eval(row)`` when parsing succeeds,
        otherwise ``row`` itself, unchanged.
    """
    try:
        parsed = ast.literal_eval(row)
    except (ValueError, SyntaxError):
        # Not a valid literal (plain text or malformed syntax): keep as-is.
        return row
    except Exception as e:
        # Unexpected failure: report it, but still hand the cell back untouched.
        print(f'Exception: {e}')
        return row
    return parsed

# pol: polarity, sub: subjectivity
def sentic_api(text, KEY, max_attempt=5, timeout=120):
    """Send ``text`` to one Sentic API endpoint and return the label it answers with.

    The request is retried immediately (no delay) when it times out or raises
    any other exception, up to ``max_attempt`` requests in total.

    Args:
        text: The text to analyse. Must be a string.
        KEY: API key selecting the endpoint; it is inserted into the URL path.
        max_attempt: Maximum number of requests before giving up.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        The response body decoded as UTF-8 with surrounding whitespace
        removed, or ``None`` when ``text`` is not a string or every attempt
        failed.
    """
    from urllib.parse import quote

    if not isinstance(text, str):
        # A non-string can never be sent; retrying it would only repeat the
        # same failure, so give up straight away.
        print(f"Sentic request skipped: expected a string, got {type(text).__name__}.")
        return None

    # Percent-encode the text so characters such as '&', '#', '+' or '?' are
    # transmitted as part of the text instead of being parsed as URL syntax.
    url = f'https://sentic.net/api/en/{KEY}.py?text=' + quote(text, safe='')

    for attempt in range(1, max_attempt + 1):
        try:
            response = requests.get(url, timeout=timeout)
            # Decode the bytes properly rather than slicing their repr, so the
            # label is intact whether or not the body ends with a newline.
            return response.content.decode('utf-8', errors='replace').strip()
        except requests.exceptions.Timeout:
            if attempt < max_attempt:
                print(f"Sentic request timed out. Attempt {attempt} of {max_attempt}.")
            else:
                print("No more retries.")
        except Exception as e:
            # Any other failure (connection error, invalid URL, ...).
            if attempt < max_attempt:
                print(f"Attempt {attempt} of {max_attempt}. Error: {e}")
            else:
                print("No more retries.")

    return None

def sentic_anal_pol(text_list):
    """Compute one signed polarity score per text using the Sentic API.

    For every text, two requests are made through ``sentic_api``: one with
    ``POLARITY_API_KEY`` for the category label and one with
    ``INTENSITY_API_KEY`` for a numeric label, which is divided by 100.
    The category then decides the sign of the score.

    Args:
        text_list: Sequence of texts to score.

    Returns:
        A list with exactly one entry per input text, in order:
        ``intensity`` for "POSITIVE", ``-intensity`` for "NEGATIVE", ``0``
        for "NEUTRAL", and ``None`` when the label is unrecognised or the
        lookup raised.
    """
    polarity_list = []
    total = len(text_list)

    for position, text in enumerate(text_list, start=1):
        try:
            polarity_cat = sentic_api(text, POLARITY_API_KEY)  # Polarity category (positive, negative)
            intensity = float(sentic_api(text, INTENSITY_API_KEY)) / 100

            if polarity_cat == "POSITIVE":
                score = intensity
            elif polarity_cat == "NEGATIVE":
                score = -intensity
            elif polarity_cat == "NEUTRAL":
                score = 0
            else:
                # Keep a placeholder so scores stay aligned with their texts.
                print(f"Unknown polarity: {polarity_cat}")
                score = None

            polarity_list.append(score)
            print(f"{position}/{total} sentic_anal_pol completed.")
        except Exception as e:
            # Covers failed requests too: float(None) raises when sentic_api gave up.
            polarity_list.append(None)
            print(f"sentic_anal_pol exception for text: {text}, {e}")

    print("sentic_anal_pol completed.")

    return polarity_list

def batch_sentic_anal(df, stm_path, text_cols=None):
    """Fill the ``<col>_pol_stc`` score columns of ``df`` and checkpoint to CSV.

    ``df`` is modified in place. For each text column a companion column named
    ``f'{col}_pol_stc'`` is created (as empty lists) when missing; every cell
    of that column that is still empty is then replaced by
    ``sentic_anal_pol(row[col])``.

    Args:
        df: DataFrame holding the text columns; mutated in place.
        stm_path: Destination CSV path, written with ``index=False``.
        text_cols: Names of the text columns to score. When omitted, the
            notebook-level ``cols`` list is used.

    Returns:
        None.
    """
    if text_cols is None:
        # Existing callers rely on the notebook-level list.
        text_cols = cols

    def _needs_scoring(value):
        """True for an empty list, including its CSV round-tripped form '[]'."""
        if isinstance(value, str):
            # After to_csv/read_csv an empty list comes back as the text "[]",
            # which never compares equal to an actual empty list.
            return value.strip() == '[]'
        return isinstance(value, list) and not value

    # Make sure every score column exists, initialised to "not scored yet".
    for col in text_cols:
        pol_stc_col = f'{col}_pol_stc'
        if pol_stc_col not in df.columns:
            df[pol_stc_col] = [[] for _ in range(len(df))]

    checkpointed = False
    for idx, row in df.iterrows():
        row_changed = False
        for col in text_cols:
            pol_stc_col = f'{col}_pol_stc'
            if _needs_scoring(row[pol_stc_col]):
                df.at[idx, pol_stc_col] = sentic_anal_pol(row[col])
                row_changed = True

        # Checkpoint only after new scores were added, so an interrupted run
        # keeps its progress without rewriting the file for untouched rows.
        if row_changed:
            df.to_csv(stm_path, index=False)
            checkpointed = True

    # Nothing needed scoring: still write the file once for a non-empty frame.
    if not checkpointed and len(df) > 0:
        df.to_csv(stm_path, index=False)

def batch_anal(df, out_path=None):
    """Run the Sentic sentiment pass over ``df``.

    Args:
        df: DataFrame to score; passed straight to ``batch_sentic_anal``.
        out_path: CSV path handed to ``batch_sentic_anal``. When omitted, the
            notebook-level ``stm_path`` is used.

    Returns:
        None.
    """
    if out_path is None:
        # Existing callers rely on the notebook-level path.
        out_path = stm_path

    # No extra save here: batch_sentic_anal writes the CSV itself.
    batch_sentic_anal(df, out_path)
    print('batch_sentic_anal completed.')



In [13]:

# Text columns whose cells are stored in the CSV as string representations of lists.
cols = ['cln_hdl', 'cln_smr']

# Input for a fresh run, when the file has no sentiment columns yet:
# cleaned_path = Path.joinpath(root_dir, 'data', 'proc', f'BA_cln_{BT_START_STR}_{BT_END_STR}.csv')

# Input for resuming: the intermediate file, which is also where results are written.
cleaned_path = root_dir / 'data' / 'proc' / f'BA_stm_stc1_{BT_START_STR}_{BT_END_STR}.csv'
stm_path = root_dir / 'data' / 'proc' / f'BA_stm_stc1_{BT_START_STR}_{BT_END_STR}.csv'

df7 = pd.read_csv(cleaned_path, index_col=False)

# Parse each text column back from its string form into real Python lists.
for col in cols:
    df7[col] = df7[col].apply(convert_data)

In [14]:
# TODO: Change columns, and pathname, and df7
# (i.e. `cols`, the CSV paths and the frame loaded in the previous cell)

# Scores `df7` in place; batch_anal reads the notebook-level `stm_path`,
# and batch_sentic_anal reads the notebook-level `cols`.
batch_anal(df7)
print("All completed.")

batch_sentic_anal completed.
All completed.


In [15]:
# Display the frame, now including the *_pol_stc score columns.
df7

Unnamed: 0,id,datetime2,cln_hdl,cln_smr,cln_news,cln_hdl_lemma,cln_smr_lemma,cln_news_lemma,cln_hdl_pol_stc,cln_smr_pol_stc
0,123559928,2023-11-01 05:39:51,"[Ford, GM bumped to buy Boeing gets 2 upgrades...",[Goldman Sachs upgraded Simon Property Group (...,['Investing.com — Here is your Pro Recap of th...,"['Ford , GM bumped buy Boeing get 2 upgrade : ...",['Goldman Sachs upgraded Simon Property Group ...,['Investing.com — Pro Recap biggest analyst pi...,[0.8],[0.74]
1,123544219,2023-11-01 11:39:06,[UPDATE 2-Spirit Aero cuts 737 fuselage delive...,[Spirit AeroSystems on Wednesday announced $10...,"['(Adjusts shares in paragraph 5, adds Airbus ...",['UPDATE 2-Spirit Aero cut 737 fuselage delive...,['Spirit AeroSystems Wednesday announced $ 101...,"['( Adjusts share paragraph 5 , add Airbus com...",[0.83],"[-0.79, 0.6, 0.72]"
2,123566505,2023-11-01 13:30:29,"[Compared to Estimates, Spirit Aerosystems (SP...",[Although the revenue and EPS for Spirit Aeros...,"['For the quarter ended September 2023, Spirit...","['Compared Estimates , Spirit Aerosystems ( SP...",['Although revenue EPS Spirit Aerosystems ( SP...,"['quarter ended September 2023 , Spirit Aerosy...",[0.43],[0.61]
3,123545059,2023-11-01 14:21:57,[Morning Brew: AMDs Q4 Guidance Weighs on Stoc...,[Advanced Micro Devices (NASDAQ:AMD) stock was...,['Advanced Micro Devices (NASDAQ:AMD) stock wa...,['Morning Brew : AMDs Q4 Guidance Weighs Stock...,['Advanced Micro Devices ( NASDAQ : AMD ) stoc...,['Advanced Micro Devices ( NASDAQ : AMD ) stoc...,[-0.33],"[0.77, 0.39, 0.8]"
4,123567205,2023-11-01 22:24:31,[UPDATE 1-US Air Force blows up Minuteman III ...,[The U.S. Air Force said on Wednesday it had b...,['Nov 1 (Reuters) - The U.S. Air Force said on...,['UPDATE 1-US Air Force blow Minuteman III tes...,['U.S. Air Force said Wednesday blown Minutema...,['Nov 1 ( Reuters ) - U.S. Air Force said Wedn...,[0.6],"[0.39, 0.59, 0.83]"
...,...,...,...,...,...,...,...,...,...,...
546,125415682,2024-01-30 21:10:56,[Boeing Seen Narrowing Q4 Loss Amid 737 Max Gr...,[Dow Jones giant Boeing reports Q4 results ear...,['Dow Jones giant Boeing reports Q4 results ea...,['Boeing Seen Narrowing Q4 Loss Amid 737 Max G...,['Dow Jones giant Boeing report Q4 result earl...,['Dow Jones giant Boeing report Q4 result earl...,[-0.75],"[0.56, 0.33]"
547,125415680,2024-01-30 22:23:48,"[Hawaiian Airlines ekes out Q4 revenue beat, e...",[Hawaiian Holdings (HA) — the parent company o...,['Hawaiian Holdings (HA) — the parent company ...,"['Hawaiian Airlines ekes Q4 revenue beat , ear...",['Hawaiian Holdings ( HA ) — parent company Ha...,['Hawaiian Holdings ( HA ) — parent company Ha...,[0.54],"[0.59, 0.69, 0.59, -0.14]"
548,125415679,2024-01-30 22:39:00,"[Boeings Earnings Are Coming., Investors Are W...",[The list of points to watch when the jet make...,['The number of watch items in Boeings fourth-...,"['Boeings Earnings Coming .', 'Investors Watch...",['list point watch jet maker report latest res...,['number watch item Boeings fourth-quarter rep...,"[0.19, 0.86]",[-0.5]
549,125417521,2024-01-30 23:03:43,[Boeing was once known for safety and engineer...,[Part of the fuselage blowing off shortly afte...,['Part of the fuselage blowing off shortly aft...,"['Boeing known safety engineering .', 'critic ...","['Part fuselage blowing shortly takeoff , leav...","['Part fuselage blowing shortly takeoff , leav...","[0.18, 0.85]","[-0.63, -0.57, 0.81, -0.38]"
