In [2]:
import pandas as pd
import requests
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
import os

api_key = os.getenv('FMP_API_KEY')
stock_symbol = 'NVDA'

In [3]:
# Define the model path
model_name = "ProsusAI/finbert"

# Load tokenizer and model from Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Create a pipeline for sentiment analysis
finbert_sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)


Downloading:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

In [8]:
df = pd.read_csv("data_final.csv")
df.head()


Unnamed: 0,press_sentiment_score,news_sentiment_score,date,open,low,high,close,volume,log_return,log_volume_change,log_trading_range,previous_volatility_7,previous_volatility_3,future_volatility_7,weight_avg_sentiment,rolling_sentiment_7,garch_volatility
0,,0.4939,2021-02-02 09:30:00,133.3025,133.1125,135.1143,134.675,5534824,0.01887,0.543639,0.014926,0.006998,0.010666,0.006781,0.4939,0.4939,
1,,,2021-02-02 10:30:00,133.625,132.855,133.9825,133.0375,2662972,-0.012233,-0.731617,0.008451,0.009167,0.015571,0.002986,,0.4939,
2,,,2021-02-02 11:30:00,134.335,133.54,134.5675,133.5625,2017932,0.003938,-0.27737,0.007665,0.009159,0.015556,0.004752,,0.4939,
3,,,2021-02-02 12:30:00,133.8325,133.695,134.6338,134.31,1591220,0.005581,-0.237572,0.006997,0.009197,0.009845,0.00503,,0.4939,
4,,0.4404,2021-02-02 13:30:00,134.5025,134.005,134.7975,134.2231,1726548,-0.000647,0.081623,0.005897,0.009291,0.003228,0.00522,0.4404,0.46715,


In [19]:
press_list = []

for page in tqdm(range(100)):

    url = f'https://financialmodelingprep.com/api/v3/press-releases/{stock_symbol}?page={page}&apikey={api_key}'

    response = requests.get(url).json()

    for item in response:
        press_list.append({'text': item['title'], 'publishedDate': item['date']})


press_df = pd.DataFrame(press_list)

press_df

100%|██████████| 100/100 [02:20<00:00,  1.41s/it]


Unnamed: 0,text,publishedDate
0,FPT TO SHAPE THE FUTURE OF AI AND CLOUD ON A G...,2024-04-23 03:22:00
1,WEIGHTS & BIASES ANNOUNCES INTEGRATION WITH NV...,2024-04-18 14:00:00
2,"TELESTREAM DELIVERS CUTTING-EDGE, CLOUD-NATIVE...",2024-04-09 11:00:00
3,ALSET CAPITAL INC. ANNOUNCES SECURED LOAN TO A...,2024-04-02 03:05:00
4,BIATECH JOINS NVIDIA INCEPTION,2024-03-29 11:38:00
...,...,...
8292,PENGUIN SOLUTIONS CERTIFIED AS NVIDIA DGX-READ...,2023-09-26 07:58:00
8293,INFOSYS AND NVIDIA COLLABORATE TO HELP WORLD'S...,2023-09-20 06:28:00
8294,OPENFOLD AI RESEARCH CONSORTIUM WELCOMES THREE...,2023-09-12 05:30:00
8295,DT RESEARCH EXPANDS LINE OF MILITARY STANDARD ...,2023-09-12 03:00:00


In [21]:
sentiment_scores = []

# Analyze the sentiment of all financial texts and store the scores
for index, row in press_df.iterrows():
    text = row['text']
    sentiment = finbert_sentiment_pipeline(text)
    # Extract the sentiment score (assuming the first element of the result is the sentiment label)
    sentiment_score = sentiment[0]['score']
    #print(index,sentiment_score)
    sentiment_scores.append(sentiment_score)

In [22]:
press_df['sentiment_score'] = sentiment_scores
press_df


Unnamed: 0,text,publishedDate,sentiment_score
0,FPT TO SHAPE THE FUTURE OF AI AND CLOUD ON A G...,2024-04-23 03:22:00,0.758889
1,WEIGHTS & BIASES ANNOUNCES INTEGRATION WITH NV...,2024-04-18 14:00:00,0.519784
2,"TELESTREAM DELIVERS CUTTING-EDGE, CLOUD-NATIVE...",2024-04-09 11:00:00,0.799337
3,ALSET CAPITAL INC. ANNOUNCES SECURED LOAN TO A...,2024-04-02 03:05:00,0.837347
4,BIATECH JOINS NVIDIA INCEPTION,2024-03-29 11:38:00,0.928256
...,...,...,...
8292,PENGUIN SOLUTIONS CERTIFIED AS NVIDIA DGX-READ...,2023-09-26 07:58:00,0.809473
8293,INFOSYS AND NVIDIA COLLABORATE TO HELP WORLD'S...,2023-09-20 06:28:00,0.870120
8294,OPENFOLD AI RESEARCH CONSORTIUM WELCOMES THREE...,2023-09-12 05:30:00,0.638917
8295,DT RESEARCH EXPANDS LINE OF MILITARY STANDARD ...,2023-09-12 03:00:00,0.636374


In [32]:

#press_df = press_df.drop(columns=['date_test', 'text'])
press_df.to_csv('press_score.csv')

In [34]:

press_df['date_press'] = pd.to_datetime(press_df['publishedDate']).dt.strftime('%Y-%m-%d %H')

# Assuming 'date' is the column in df that you want to convert in the same way.
df['date_press'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d %H')

merged_df = df.merge(press_df, on='date_press', how='left')
merged_df

Unnamed: 0,press_sentiment_score,news_sentiment_score,date,open,low,high,close,volume,log_return,log_volume_change,...,previous_volatility_7,previous_volatility_3,future_volatility_7,weight_avg_sentiment,rolling_sentiment_7,garch_volatility,date_test,date_press,publishedDate,sentiment_score
0,,0.493900,2021-02-02 09:30:00,133.3025,133.1125,135.1143,134.6750,5534824,0.018870,0.543639,...,0.006998,0.010666,0.006781,0.493900,0.493900,,2021-02-02 09,2021-02-02 09,,
1,,,2021-02-02 10:30:00,133.6250,132.8550,133.9825,133.0375,2662972,-0.012233,-0.731617,...,0.009167,0.015571,0.002986,,0.493900,,2021-02-02 10,2021-02-02 10,,
2,,,2021-02-02 11:30:00,134.3350,133.5400,134.5675,133.5625,2017932,0.003938,-0.277370,...,0.009159,0.015556,0.004752,,0.493900,,2021-02-02 11,2021-02-02 11,,
3,,,2021-02-02 12:30:00,133.8325,133.6950,134.6338,134.3100,1591220,0.005581,-0.237572,...,0.009197,0.009845,0.005030,,0.493900,,2021-02-02 12,2021-02-02 12,,
4,,0.440400,2021-02-02 13:30:00,134.5025,134.0050,134.7975,134.2231,1726548,-0.000647,0.081623,...,0.009291,0.003228,0.005220,0.440400,0.467150,,2021-02-02 13,2021-02-02 13,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7545,,0.074433,2024-02-14 11:30:00,727.0000,721.8700,729.1000,726.5000,5366623,-0.017005,-0.599035,...,0.014699,0.022074,0.003390,0.074433,0.441258,0.000035,2024-02-14 11,2024-02-14 11,,
7546,,,2024-02-14 12:30:00,727.8700,724.5000,735.0002,729.2050,4553647,0.003716,-0.164270,...,0.014768,0.012784,0.003272,,0.387301,0.000135,2024-02-14 12,2024-02-14 12,,
7547,,0.757433,2024-02-14 13:30:00,729.6460,726.4001,731.7999,729.5450,4102110,0.000466,-0.104427,...,0.014465,0.011145,0.003403,0.757433,0.488874,0.000154,2024-02-14 13,2024-02-14 13,,
7548,,0.466850,2024-02-14 14:30:00,732.8400,730.0200,734.7200,730.8700,4407749,0.001815,0.071863,...,0.014457,0.001633,0.003386,0.466850,0.501686,20.559436,2024-02-14 14,2024-02-14 14,,


In [47]:
merged_df.to_csv('processed_dataset.csv')