# Main

## initialization

In [3]:
# hide output
%%capture output

!pip install setuptools
!pip install -U polygon-api-client

In [4]:
import os
import numpy as np
import pandas as pd
import requests
import json
import pytz
import warnings
import time
from polygon import RESTClient
from google.colab import drive
from datetime import datetime, timedelta
from bisect import bisect_right
from tqdm import tqdm



# define const
MAX_NUM_NEWS = 1000      # upper bound on the number of articles requested per stock
NA = 'null_val'          # placeholder written into cells that have no value yet
SCORE_ERROR = 100        # same value parse()/get_response() return when no score is available
SCORE_UNINIT = 200       # not referenced by the cells visible in this notebook

# silences every Python warning for the rest of the session
warnings.filterwarnings('ignore')



# access drive
drive.mount('/content/drive')
path = '/content/drive/MyDrive/Sentimental/'
records_path = os.path.join(path, 'data', 'records.csv')



# get api
# Keys are stripped: a trailing newline left by a text editor would otherwise end
# up inside the request URL / the `Authorization` header.
with open(os.path.join(path, 'data', 'api_key_eodhd.txt'), 'r') as file:
    api_key_eodhd = file.read().strip()

with open(os.path.join(path, 'data', 'api_key_gpt.txt'), 'r') as file:
    api_key_gpt = file.read().strip()

# NOTE: despite its name, this key is handed to polygon's RESTClient.
with open(os.path.join(path, 'data', 'api_key_yahoo.txt'), 'r') as file:
    api_key_yahoo = file.read().strip()
client = RESTClient(api_key_yahoo)



# get stock list
# `stocks` is indexed by company name; `stock_lst` holds [company, stock_code] pairs.
stocks = pd.read_csv(os.path.join(path, 'data', 'stock_list.csv'), index_col = 0)
stock_lst = [[company, stocks.loc[company, 'stock_code']] for company in stocks.index]
stocks

Mounted at /content/drive


Unnamed: 0,stock_code
apple,AAPL
tesla,TSLA
nvidia,NVDA
microsoft,MSFT
amazon,AMZN
google,GOOGL
meta,META
netflix,NFLX
disney,DIS
visa,V


*create dataframe*

In [5]:
#df = pd.DataFrame(columns = ['date', 'company', 'stock', 'datetime', 'title', 'news', 'link', 'open', 'high', 'low', 'close', 'volume', 'vwap', 'transactions', 'gpt_response', 'score'])
#df.to_csv(records_path, index = False)

In [6]:
def look():
    """Load the records CSV at ``records_path`` and return its first rows."""
    records = pd.read_csv(records_path)
    return records.head()
look()

Unnamed: 0,date,company,stock,datetime,title,news,link,open,high,low,close,volume,vwap,transactions,gpt_response,score
0,20250422,apple,AAPL,2025-04-22 12:42:20+00:00,Survey Shows Pent-Up Demand For New Apple iPhones,A survey of U.S. consumers in March indicates ...,https://finance.yahoo.com/m/f862527e-fa7e-335a...,196.12,201.59,195.97,198.91,25860823.0,198.8815,386273.0,Score: 4. Reasoning: The news indicates a stro...,4.0
1,20250422,apple,AAPL,2025-04-22 10:42:04+00:00,Apple Intelligence Gains Momentum as Users Sho...,Apple (NASDAQ:AAPL) might be winning over more...,https://finance.yahoo.com/news/apple-intellige...,196.12,201.59,195.97,198.91,25860823.0,198.8815,386273.0,Score: 4. Reasoning: The overall sentiment tow...,4.0
2,20250422,apple,AAPL,2025-04-22 10:02:00+00:00,"Sell Apple stock before it sinks another 30%, ...",Photo: Sean Gallup (Getty Images)\n\nA firm th...,https://finance.yahoo.com/news/sell-apple-stoc...,196.12,201.59,195.97,198.91,25860823.0,198.8815,386273.0,Score: -4. Reasoning: The financial news respo...,-4.0
3,20250422,apple,AAPL,2025-04-22 09:44:36+00:00,Apple (AAPL) Target Slashed to $141: ‘No Easy ...,We recently published a list of Top 10 AI Stoc...,https://finance.yahoo.com/news/apple-aapl-targ...,196.12,201.59,195.97,198.91,25860823.0,198.8815,386273.0,Score: -2. Reasoning: The sentiment towards Ap...,-2.0
4,20250422,apple,AAPL,2025-04-22 08:37:51+00:00,Cloud AI Update - Tonita Unveils AI Shopping A...,Tonita has launched a new AI-driven shopping a...,https://finance.yahoo.com/news/cloud-ai-tonita...,196.12,201.59,195.97,198.91,25860823.0,198.8815,386273.0,Score: 2. Reasoning: The overall sentiment tow...,2.0


## 1. Process News

> Fetch news from the [EODHD](https://eodhd.com/financial-apis/stock-market-financial-news-api/) API.

In [7]:
def get_news(stock, num_news = MAX_NUM_NEWS):
    """Fetch the latest news for ``stock`` from the EODHD news endpoint.

    Args:
        stock: ticker passed as the ``s`` query parameter.
        num_news: maximum number of articles requested (``limit`` parameter).

    Returns:
        A list of ``[stock, published, title, content, link]`` lists, where
        ``published`` is a timezone-aware ``datetime`` parsed from the
        article's ``date`` field.

    Raises:
        requests.exceptions.RequestException: on network failure, timeout or a
            non-2xx HTTP status.
        ValueError: if the response body is not JSON or not a JSON list.
    """
    response = requests.get(
        "https://eodhd.com/api/news",
        # `params` lets requests URL-encode the values instead of hand-formatting them
        params={'s': stock, 'offset': 0, 'limit': num_news, 'api_token': api_key_eodhd},
        timeout=30,  # never hang the notebook on a stalled connection
    )
    response.raise_for_status()

    payload = response.json()
    if not isinstance(payload, list):
        # Error replies would otherwise fail later with an opaque TypeError.
        raise ValueError('Unexpected news payload for {stock}: {payload!r}'.format(stock=stock, payload=payload))

    news = []
    for item in payload:
        published = datetime.fromisoformat(item['date'].replace('Z', '+00:00'))
        link = item['link']
        # Yahoo articles are shifted back by 4 hours; the tzinfo is left untouched.
        # NOTE(review): presumably a fixed UTC -> US Eastern (daylight time) offset - confirm.
        if 'yahoo.com' in (link or ''):
            published -= timedelta(hours=4)
        news.append([stock, published, item['title'], item['content'], link])
    return news

def process_news_single_stock(stock, num_news = MAX_NUM_NEWS):
    """Download news for one stock and append the unseen articles to the records CSV.

    An article is considered already recorded when its ``(stock, title)`` pair
    exists in the CSV, or when the same pair appeared earlier in the batch
    being processed. New rows get the date, company and news fields; every
    remaining column is filled with the ``NA`` placeholder.

    Args:
        stock: stock code; must be present in the first column of ``stocks``.
        num_news: number of articles to request, capped at ``MAX_NUM_NEWS``.

    Returns:
        None. Prints progress and rewrites the file at ``records_path``.
    """
    # Company names whose stock code matches; the last match wins, as before.
    matches = stocks.index[stocks.iloc[:, 0] == stock]
    if len(matches) == 0:
        print('Error: company does not exist, add it into stock list first.')
        return
    company = matches[-1]

    if num_news > MAX_NUM_NEWS:
        print('Warning: maximum processing limit per request exceeded. Latest {num} news will be processed'.format(num=MAX_NUM_NEWS))
        num_news = MAX_NUM_NEWS

    df = pd.read_csv(records_path)
    index = df.set_index(['stock', 'title']).index
    # `index` is a snapshot of the file; this set catches repeats inside the batch.
    seen_in_batch = set()

    data_news = get_news(stock, num_news)

    exist_record = 0
    new_record = 0
    print('\nStart processing news for \033[92m{stock}\033[0m...'.format(stock=stock))
    for data in tqdm(data_news):
        key = (data[0], data[2])
        if key in index or key in seen_in_batch:
            exist_record += 1
            continue
        seen_in_batch.add(key)
        new_record += 1

        date = [int(data[1].strftime("%Y%m%d"))]
        new_row = date + [company] + data
        # pad the columns that later steps fill in (financial data, GPT response, score)
        new_row += [NA] * (len(df.columns) - len(new_row))

        df.loc[len(df)] = new_row
    df.to_csv(records_path, index = False)
    print('\nProcessing completed. New records: \033[92m{a}\033[0m, old records: {b}\n'.format(a=new_record, b = exist_record))

def process_news(num_news = MAX_NUM_NEWS):
    """Run ``process_news_single_stock`` for every code in ``stocks['stock_code']``.

    A 55-character dashed line is printed before each stock and once more
    after the last one.
    """
    separator = '-' * 55
    for ticker in stocks['stock_code']:
        print(separator)
        process_news_single_stock(ticker, num_news)
    print(separator)

In [8]:
process_news(100)

-------------------------------------------------------

Start processing news for [92mAAPL[0m...


100%|██████████| 100/100 [00:00<00:00, 230.04it/s]



Processing completed. New records: [92m32[0m, old records: 68

-------------------------------------------------------

Start processing news for [92mTSLA[0m...


100%|██████████| 100/100 [00:00<00:00, 469.26it/s]



Processing completed. New records: [92m26[0m, old records: 74

-------------------------------------------------------

Start processing news for [92mNVDA[0m...


100%|██████████| 100/100 [00:00<00:00, 357.66it/s]



Processing completed. New records: [92m45[0m, old records: 55

-------------------------------------------------------

Start processing news for [92mMSFT[0m...


100%|██████████| 100/100 [00:00<00:00, 266.20it/s]



Processing completed. New records: [92m32[0m, old records: 68

-------------------------------------------------------

Start processing news for [92mAMZN[0m...


100%|██████████| 100/100 [00:00<00:00, 586.85it/s]



Processing completed. New records: [92m26[0m, old records: 74

-------------------------------------------------------

Start processing news for [92mGOOGL[0m...


100%|██████████| 100/100 [00:00<00:00, 808.05it/s]



Processing completed. New records: [92m17[0m, old records: 83

-------------------------------------------------------

Start processing news for [92mMETA[0m...


100%|██████████| 100/100 [00:00<00:00, 1022.69it/s]



Processing completed. New records: [92m15[0m, old records: 85

-------------------------------------------------------

Start processing news for [92mNFLX[0m...


100%|██████████| 100/100 [00:00<00:00, 415.90it/s]



Processing completed. New records: [92m35[0m, old records: 65

-------------------------------------------------------

Start processing news for [92mDIS[0m...


100%|██████████| 100/100 [00:00<00:00, 760.27it/s]



Processing completed. New records: [92m20[0m, old records: 80

-------------------------------------------------------

Start processing news for [92mV[0m...


100%|██████████| 100/100 [00:00<00:00, 2874.70it/s]



Processing completed. New records: [92m4[0m, old records: 96

-------------------------------------------------------

Start processing news for [92mJNJ[0m...


100%|██████████| 100/100 [00:00<00:00, 3153.73it/s]



Processing completed. New records: [92m4[0m, old records: 96

-------------------------------------------------------

Start processing news for [92mJPM[0m...


100%|██████████| 100/100 [00:00<00:00, 1690.73it/s]



Processing completed. New records: [92m8[0m, old records: 92

-------------------------------------------------------

Start processing news for [92mBAC[0m...


100%|██████████| 100/100 [00:00<00:00, 2519.27it/s]



Processing completed. New records: [92m3[0m, old records: 97

-------------------------------------------------------

Start processing news for [92mMA[0m...


100%|██████████| 100/100 [00:00<00:00, 4463.64it/s]



Processing completed. New records: [92m2[0m, old records: 98

-------------------------------------------------------

Start processing news for [92mUNH[0m...


100%|██████████| 100/100 [00:00<00:00, 3801.36it/s]



Processing completed. New records: [92m2[0m, old records: 98

-------------------------------------------------------

Start processing news for [92mPFE[0m...


100%|██████████| 100/100 [00:00<00:00, 3222.22it/s]



Processing completed. New records: [92m3[0m, old records: 97

-------------------------------------------------------

Start processing news for [92mHD[0m...


100%|██████████| 100/100 [00:00<00:00, 5864.93it/s]



Processing completed. New records: [92m1[0m, old records: 99

-------------------------------------------------------


## 2. Validate News:

> Find invalid news (text containing `continue reading`, or with `len <= 50`) and replace it manually.

In [9]:
def check_news(df):
    """Return the rows of ``df`` whose ``news`` text is unusable.

    A row is invalid when its news contains the phrase ``continue reading``
    or is at most 50 characters long. Missing news (NaN, e.g. an empty CSV
    cell) is treated as an empty string, so it is reported as invalid too
    instead of silently passing the check.

    Args:
        df: DataFrame with a ``news`` column.

    Returns:
        The subset of ``df`` (original index preserved) that needs fixing.
    """
    # Normalise first so the `.str` accessors always produce real booleans.
    news = df['news'].fillna('').astype(str)
    is_truncated = news.str.contains('continue reading')
    is_too_short = news.str.len() <= 50
    return df[is_truncated | is_too_short]

def validate():
    """Interactively replace the text of invalid news records.

    Every record reported by ``check_news`` is shown with its index and link,
    and the text typed by the user becomes the new ``news`` value. Entering an
    empty string stops the session. The CSV at ``records_path`` is rewritten
    afterwards and a summary of updated records is printed.
    """
    records = pd.read_csv(records_path)
    invalid = check_news(records)
    total = len(invalid.index)
    updated = 0

    def separator():
        # blank lines followed by a 100-character rule
        print('\n')
        print('='*100, '\n')

    print('\033[1m{inv_num}\033[0m records are invalid'.format(inv_num=total))
    print('Please update news, enter empty string to quit:')

    for idx in invalid.index:
        separator()
        print('index: {idx}, link: {link}'.format(idx=idx, link=invalid.loc[idx, 'link']))
        replacement = input()
        if replacement == '':
            # empty input ends the session; edits made so far are still saved below
            break
        records.loc[idx, 'news'] = replacement
        updated += 1

    records.to_csv(records_path, index = False)
    separator()
    print('\033[92m' + 'Updated: {a} / {b}\n'.format(a=updated, b=total) + '\033[0m')


def update_news_by_index():
    """Interactively overwrite the ``news`` text of a single record.

    Prompts for a row index and then for the replacement text, and rewrites
    the CSV at ``records_path``. Prints ``Invalid index`` and leaves the file
    untouched when the input is not an integer or is not an index of the file.
    """
    df = pd.read_csv(records_path)
    print('Enter index:')
    try:
        idx = int(input())
    except ValueError:
        # non-numeric input used to abort the cell with a traceback
        print('Invalid index')
        return

    if (idx not in df.index):
        print('Invalid index')
        return

    print('Enter news:')
    news = input()
    df.loc[idx, 'news'] = news

    df.to_csv(records_path, index = False)
    print('\nUpdate succeed.')

In [10]:
validate()

[1m3[0m records are invalid
Please update news, enter empty string to quit:



index: 16353, link: https://www.fool.com/investing/2025/05/05/why-shares-of-tesla-are-sinking-today/
Why Shares of Tesla Are Sinking Today By Bram Berkowitz – May 5, 2025 at 12:35PM Key Points  The electric vehicle maker is reportedly seeing shrinking sales in another European country. Struggles in Tesla's core EV business has dogged the stock all year. Investors are laser focused on an upcoming June demonstration of the company's new Robotaxi. Top Analyst Picks: 10 Stocks We'd Buy If We Were Starting Fresh Today ›Shares of the electric vehicle (EV) maker Tesla (TSLA -2.35%) traded roughly 3.3% lower as of 12:21 p.m. ET today. Sales continued to struggle in Europe and the stock also fell, despite an analyst recommending that investors hold the name.  Sales in Europe aren't improving Tesla's stock has been hit hard this year, a big part of which can be attributed to struggling global sales. The company repo

## 3. Process Financial Data

> add available financial data into records, api: [Polygon.io](https://polygon.io/docs/stocks/getting-started)

> **open, high, low, close, volume, vwap, transactions**

> Financial records are typically delayed, so the most recent news may not have financial data yet.

In [11]:
def get_fin_data(stock):
    """Fetch daily aggregate bars for ``stock`` through the polygon ``client``.

    The requested window runs from 700 days before now to 2 days after now.

    Args:
        stock: ticker symbol. It is sent unmodified; the previous ``stock[:4]``
            truncation turned five-letter tickers such as ``GOOGL`` into a
            different symbol (``GOOG``).

    Returns:
        dict mapping an integer ``YYYYMMDD`` date (derived from the bar's
        millisecond timestamp) to
        ``[open, high, low, close, volume, vwap, transactions]``.
    """
    now = pd.Timestamp(datetime.now())
    start_date = now - pd.Timedelta(700, unit='D')
    end_date = now + pd.Timedelta(2, unit='D')

    bars = client.get_aggs(stock, 1, 'day', start_date, end_date)
    return {
        int(pd.to_datetime(bar.timestamp, unit='ms').strftime("%Y%m%d")):
            [bar.open, bar.high, bar.low, bar.close, bar.volume, bar.vwap, bar.transactions]
        for bar in bars
    }

def process_fin_data():
    """Fill the financial columns of records that still hold the ``NA`` placeholder.

    Daily bars are fetched once per stock in ``stock_lst``. For every record
    whose ``open`` cell equals ``NA``, the seven financial columns (positions
    7-13: open, high, low, close, volume, vwap, transactions) are filled from
    the bar matching the record's ``stock`` and ``date``. Records with no
    matching bar, or whose stock is not in ``stock_lst``, are left untouched.
    The CSV at ``records_path`` is rewritten and a summary of the data still
    missing is printed.
    """
    df = pd.read_csv(records_path)

    big_dic = {stock: get_fin_data(stock) for _, stock in stock_lst}

    unava_dates = set()
    for idx, row in df.iterrows():
        # label-based access: positional `row[7]` on a labelled Series is deprecated
        if row['open'] != NA:
            continue

        # `.get` keeps a record of a stock missing from the stock list from raising KeyError
        daily = big_dic.get(row['stock'], {}).get(row['date'])
        if daily is None:
            unava_dates.add(row['date'])
        else:
            df.iloc[idx, 7:14] = daily

    df.to_csv(records_path, index = False)

    def percent(part, whole):
        # an empty records file must not end in ZeroDivisionError
        return int(100.0 * part / whole) if whole else 0

    num_dates = len(set(df['date']))
    unava_dates_per = percent(len(unava_dates), num_dates)

    num_records = len(df)
    unava_records = len(df[df['vwap'] == NA])
    unava_records_per = percent(unava_records, num_records)
    print('Financial data null percentage:')
    print('\n{a}%, {b} out of {c} dates'.format(a=unava_dates_per, b=len(unava_dates), c=num_dates))
    print('{a}%, {b} out of {c} records'.format(a=unava_records_per, b=unava_records, c=num_records))

In [12]:
process_fin_data()

Financial data null percentage:

29%, 127 out of 424 dates
8%, 1347 out of 16575 records


## 4. Process Sentiment Score

In [13]:
def parse(text, error_score = 100):
    """Extract the integer sentiment score from a model reply.

    Two rules are tried in order:

    1. Original rule: in the text before the first ``'.'``, whatever follows
       the first ``': '`` must be an optionally signed integer, as in
       ``'Score: -4. Reasoning: ...'``.
    2. Fallback: the first ``score: <int>`` found anywhere (case-insensitive).
       This recovers replies such as ``'Score: 4\\nReasoning: ...'`` or
       ``'Sure. Score: 2. ...'`` that the first rule rejects.

    Args:
        text: reply to parse. Anything that is not a ``str`` (``None``, NaN)
            yields ``error_score``.
        error_score: value returned when no score is found. The default 100
            is the value this function has always returned on failure.

    Returns:
        The parsed ``int``, or ``error_score``.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    if not isinstance(text, str):
        return error_score

    # `\d` only matches decimal digits, all of which int() accepts; the former
    # str.isdigit() check also let through characters like '²' that int() rejects.
    integer = r'[+-]?\d+'

    head = text.split('.', 1)[0]
    _, separator, candidate = head.partition(': ')
    if separator and re.fullmatch(integer, candidate):
        return int(candidate)

    labelled = re.search(r'score\s*:\s*(' + integer + r')', text, re.IGNORECASE)
    if labelled:
        return int(labelled.group(1))

    return error_score


def question(company, stock, score_min, score_max, title, news):
    """Assemble the sentiment-analysis prompt for one article.

    The fragments are concatenated without any extra separator. ``score_min``
    and ``score_max`` are converted with ``str``; every other argument must
    already be a string.
    """
    fragments = [
        "Analyze this finicial news responses to company: ",
        company,
        " with stock code: ",
        stock,
        ". Determine the overall sentiment. First, just give me an integer score in range: ",
        str(score_min),
        " to ",
        str(score_max),
        '. The output format should be: \'Score: x. Reasoning:... \'',
        '. Then give me one sentense resoning. ',
        'Title: ',
        title,
        '. ',
        'News: ',
        news,
    ]
    return ''.join(fragments)

def get_response(company, stock, title, news, score_min = -5, score_max = 5):
    """Ask the OpenAI chat completions endpoint for a sentiment score.

    Args:
        company: company name used in the prompt.
        stock: stock code; ``'.US'`` is appended before building the prompt.
        title: article title (text).
        news: article body (text).
        score_min: lower bound of the requested score range.
        score_max: upper bound of the requested score range.

    Returns:
        ``[answer, score]``: the model's reply and the integer extracted by
        ``parse``. On any failure (non-text input, network error, non-200
        status, malformed response) the first element is a string starting
        with ``'Error:'`` and the second is ``SCORE_ERROR``. No exception
        escapes, so one bad record cannot abort a whole scoring run.
    """
    if not isinstance(title, str) or not isinstance(news, str):
        # e.g. NaN read back from an empty CSV cell; question() cannot concatenate it
        return ["Error:title and news must be text", SCORE_ERROR]

    stock += '.US'

    headers = {
        'Authorization': f'Bearer {api_key_gpt}',
        'Content-Type': 'application/json',
    }

    data = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant and setimental analysis master"},
            {"role": "user", "content": question(company, stock, score_min, score_max, title, news)}
        ]
    }

    try:
        response = requests.post(
            'https://api.openai.com/v1/chat/completions',
            headers=headers,
            data=json.dumps(data),
            timeout=60,  # do not let one stalled request block the whole loop
        )
    except requests.exceptions.RequestException as exc:
        return ["Error:" + str(exc), SCORE_ERROR]

    if response.status_code != 200:
        return ["Error:" + str(response.status_code) + response.text, SCORE_ERROR]

    try:
        answer = response.json()['choices'][0]['message']['content']
    except (ValueError, KeyError, IndexError, TypeError):
        return ["Error:malformed response " + response.text, SCORE_ERROR]
    if not isinstance(answer, str):
        return ["Error:empty response " + response.text, SCORE_ERROR]

    return [answer, parse(answer)]

def _has_valid_score(value, score_min = -5, score_max = 5):
    """Return True when ``value`` converts to an integer within the score range."""
    try:
        return score_min <= int(float(value)) <= score_max
    except (TypeError, ValueError, OverflowError):
        # placeholder strings, NaN (empty cell), None or infinities: not scored yet
        return False


def process_score():
    """Score every record that does not have a valid sentiment score yet.

    Records flagged by ``check_news`` are skipped. A record counts as already
    scored when its ``score`` converts to an integer between -5 and 5. Every
    other record is sent to ``get_response`` and its ``gpt_response`` and
    ``score`` cells (column positions 14 and 15) are overwritten. The CSV at
    ``records_path`` is rewritten even when the loop is interrupted, so the
    responses already obtained are not lost. A summary is printed at the end.
    """
    df = pd.read_csv(records_path)
    inv = check_news(df)

    total = len(df)
    new_num = 0
    old_num = 0
    inv_num = len(inv)

    try:
        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
            if idx in inv.index:
                continue

            # label-based access: positional `row[-1]` on a labelled Series is deprecated
            if _has_valid_score(row['score']):
                old_num += 1
                continue

            new_num += 1
            df.iloc[idx, 14:16] = get_response(row['company'], row['stock'], row['title'], row['news'])
    finally:
        # persist partial progress on errors and on manual interruption as well
        df.to_csv(records_path, index = False)

    def percent(part):
        # an empty records file must not end in ZeroDivisionError
        return int(100 * part / total) if total else 0

    print('\n\nTotal number of records: {total}\n'.format(total=total))
    print('New score: {per}%, {new_num}'.format(per=percent(new_num), new_num=new_num))
    print('Old score: {per}%, {old_num}'.format(per=percent(old_num), old_num=old_num))
    print('Inva news: {per}%, {inv_num}'.format(per=percent(inv_num), inv_num=inv_num))

In [14]:
process_score()

100%|██████████| 16575/16575 [05:45<00:00, 48.03it/s]




Total number of records: 16575

New score: 1%, 275
Old score: 98%, 16300
Inva news: 0%, 0


# old code

In [15]:
# def get_news(stock, num_news = 1000):
#     url = "https://eodhd.com/api/news?s={stock}&offset=0&limit={num_news}&api_token={api_key}"
#     data = requests.get(url.format(stock=stock, num_news=str(num_news), api_key=api_key_eodhd)).json()
#     func = lambda date: datetime.fromisoformat(date.replace('Z', '+00:00'))
#     return [[stock, func(x['date']), x['title'], x['content'], x['link']] for x in data]

# def get_vwap(stock):
#     date = pd.Timestamp(datetime.now())
#     start_date = date - pd.Timedelta(700, unit='D')
#     end_date = date + pd.Timedelta(2, unit='D')

#     data = client.get_aggs(stock[:4], 1, 'day', start_date, end_date)
#     return dict([(int(pd.to_datetime(x.timestamp, unit='ms').strftime("%Y%m%d")), x.vwap) for x in data])

# def parse(text):
#     score = ''

#     f = 0
#     for i in range(len(text)):
#         if f == 0 and i > 1 and text[i-2:i] == ': ':
#             f = 1
#         elif text[i] == '.':
#             break

#         if f == 1:
#             score += text[i]

#     if score == '':
#         return 100


#     i = 0 + (score[0] in '+-')
#     isInt = score[i:].isdigit()

#     if isInt:
#         return int(score)
#     else:
#         return 100


# def question(company, stock, score_min, score_max, title, news):
#     q = "Analyze this finicial news responses to company: " +\
#     company + " with stock code: " +\
#     stock + ". Determine the overall sentiment. First, just give me an integer score in range: " + str(score_min) + " to " + str(score_max) +\
#     '. The output format should be: \'Score: x. Reasoning:... \'' +\
#     '. Then give me one sentense resoning. ' +\
#     'Title: ' +  title + '. ' +\
#     'News: ' + news
#     return q

# def get_response(company, stock, title, news, score_min = -5, score_max = 5):
#     stock += '.US'

#     headers = {
#         'Authorization': f'Bearer {api_key_gpt}',
#         'Content-Type': 'application/json',
#     }

#     data = {
#         "model": "gpt-4-0125-preview",
#         "messages": [
#             {"role": "system", "content": "You are a helpful assistant and setimental analysis master"},
#             {"role": "user", "content": question(company, stock, score_min, score_max, title, news)}
#         ]
#     }

#     response = requests.post('https://api.openai.com/v1/chat/completions', headers=headers, data=json.dumps(data))


#     if response.status_code == 200:
#         answer = response.json()['choices'][0]['message']['content']
#         return [answer, parse(answer)]
#     else:
#         return ["Error:" + str(response.status_code) + response.text, 100]

# def process_stock(stock, num_news = 1000):
#     company = 'null'
#     for i in range(len(stocks)):
#         if stocks.iloc[i, 0] == stock:
#             company = stocks.index[i]
#     if company == 'null':
#         print('Company does not exist, add it into stock list first.')
#         return

#     df = pd.read_csv(records_path)
#     index = df.set_index(['stock', 'title']).index

#     data_news = get_news(stock, num_news)
#     dict_vwap = get_vwap(stock)
#     keys = list(dict_vwap.keys())

#     f = lambda date: dict_vwap[keys[bisect_right(keys, int(date.strftime("%Y%m%d")), lo=0, hi=len(keys))-1]]
#     [x.append(f(x[1])) for x in data_news]


#     exist_record = 0
#     new_record = 0
#     print('\nStart processing news for {stock}...'.format(stock=stock))
#     for data in tqdm(data_news):
#         start_time = time.time()

#         if ((data[0], data[2]) in index):
#             exist_record += 1
#             continue
#         new_record += 1

#         date = [int(data[1].strftime("%Y%m%d"))]
#         response = get_response(company, stock, data[2], data[3])
#         new_row = date + [company] + data + response

#         df.loc[len(df)] = new_row


#         end_time = time.time()
#         elapesd_time = end_time - start_time
#         time.sleep(max(2-elapesd_time, 0))

#     print('\nProcessing completed. New records: {a}, old records: {b}\n'.format(a=new_record, b = exist_record))
#     df.to_csv(records_path, index = False)
#     return df

In [16]:
# create dataframe
# df = pd.DataFrame(columns = ['date', 'company', 'stock', 'datetime', 'title', 'news', 'link', 'vwap', 'gpt_response', 'score'])
# df.to_csv(os.path.join(path, 'data', 'records.csv'), index = False)

## daily process

In [17]:
# num_per_stock = 200

# for stock in stocks['stock_code']:
#     process_stock(stock, num_per_stock)

## Process single stock

In [18]:
# num = 1000
# stock = 'NVDA'

# process_stock(stock)

## Process all stocks

In [19]:
# num_per_stock = 200

# for stock in stocks['stock_code']:
#     process_stock(stock, num_per_stock)

In [20]:
# stock = 'AAPL'
# data = get_news(stock, 100)
# data[-1]

In [21]:
# stock = 'TSLA'
# data = get_news(stock, 100)
# data[-1]

In [22]:
# stock = 'MSFT'
# data = get_news(stock, 100)
# data[-1]

In [23]:
# stock = 'META'
# data = get_news(stock, 100)
# data[-1]

In [24]:
# stock = 'GOOG'
# data = get_news(stock, 100)
# data[-1]

In [25]:
# stock = 'AMZN'
# data = get_news(stock, 100)
# data[-1]

In [26]:
# stock = 'NVDA'
# data = get_news(stock, 100)
# data[-1]

## Strategy test

In [27]:
# stocks.loc['google'] = 'GOOG'
# stocks.loc['nvidia'] = 'NVDA'
# stocks

In [28]:
# num_per_stock = 200

# for stock in stocks[:1]['stock_code']:
#     process_stock(stock, num_per_stock)

In [29]:
# process_stock('TSLA', num_per_stock)

In [30]:
# process_stock('NVDA', num_per_stock)