In [None]:
import pandas as pd
# News / analyst-ratings table.
df_news = pd.read_csv('../data/raw_analyst_ratings.csv')

# One price CSV per ticker from the yfinance folder; the unpacking order
# below follows the order of the paths.
df_aapl, df_amzn, df_goog, df_meta, df_msft, df_nvda = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/yfinance/AAPL.csv',
        '../data/yfinance/AMZN.csv',
        '../data/yfinance/GOOG.csv',
        '../data/yfinance/META.csv',
        '../data/yfinance/MSFT.csv',
        '../data/yfinance/NVDA.csv',
    )
)

# Preview one of the price frames as a quick sanity check of the load.
display(df_aapl.head())

In [None]:
# Parse the 'date' column into timezone-aware UTC timestamps.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 and only produces a warning there.
df_news['date_parsed'] = pd.to_datetime(df_news['date'], errors='coerce', utc=True)

# Drop rows that failed to parse. The explicit .copy() makes the result an
# independent frame, so the column assignment below cannot warn about writing
# to a copy of another frame.
df_news = df_news.dropna(subset=['date_parsed']).copy()

# Extract only the calendar date (as datetime.date, taken from the UTC timestamp)
df_news['date_only'] = df_news['date_parsed'].dt.date

# Check
df_news[['date', 'date_parsed', 'date_only']].head()


In [None]:
# 2 Sentiment Analysis
from textblob import TextBlob
def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    ``text`` is passed through ``str()`` first, so non-string values are
    scored on their string representation.
    """
    blob = TextBlob(str(text))
    return blob.sentiment.polarity
# Score every headline and store the polarity next to it.
df_news['sentiment'] = df_news['headline'].map(get_sentiment)
# Show the first 30 headlines together with their scores.
display(df_news.head(30)[['headline', 'sentiment']])

In [None]:
# Analysis
# Aggregate daily sentiment: one row per calendar day.
# Group on 'date_only' (the calendar date extracted from the parsed timestamp)
# rather than on the raw 'date' string, which would produce one group per
# distinct raw string instead of one group per day.
# The key column is exposed as 'date' so code that reads
# daily_sentiment['date'] keeps working.
daily_sentiment = (
    df_news.groupby('date_only')['sentiment']
    .mean()
    .reset_index()
    .rename(columns={'date_only': 'date', 'sentiment': 'avg_sentiment'})
)
# calculate daily returns for all stocks
def compute_daily_returns(df, price_col='Close'):
    """Return a copy of ``df`` with a ``daily_return`` column added.

    ``daily_return`` is the row-over-row percentage change of ``price_col``
    (``Series.pct_change``), so the first row is NaN. Rows are used in their
    existing order; sort them chronologically beforehand if necessary.

    Parameters
    ----------
    df : pandas.DataFrame
        Price data containing the column ``price_col``.
    price_col : str, default 'Close'
        Name of the price column the returns are computed from.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra ``daily_return`` column. The input ``df``
        is left unmodified, so the raw price frames stay as loaded.

    Raises
    ------
    KeyError
        If ``price_col`` is not a column of ``df``.
    """
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(daily_return=df[price_col].pct_change())
# Compute the daily_return column for each ticker's price frame.
fd_aapl, fd_amzn, fd_goog, fd_meta, fd_msft, fd_nvda = map(
    compute_daily_returns,
    (df_aapl, df_amzn, df_goog, df_meta, df_msft, df_nvda),
)



In [None]:
# correlation analysis
# merge sentiment with stock returns
def merge_sentiment_with_returns(stock_df, sentiment_df):
    """Left-join sentiment onto a stock's rows by calendar date.

    Parameters
    ----------
    stock_df : pandas.DataFrame
        Stock data with a ``Date`` column parseable by ``pd.to_datetime``.
    sentiment_df : pandas.DataFrame
        Sentiment data with a ``date`` column parseable by ``pd.to_datetime``
        and, normally, an ``avg_sentiment`` column.

    Returns
    -------
    pandas.DataFrame
        Every row of ``stock_df`` (after ``reset_index``) plus the sentiment
        columns, indexed by ``date_only`` (``datetime.date``). Stock rows with
        no sentiment on that date get NaN in the sentiment columns.

    Notes
    -----
    * Neither input frame is modified; the helper ``date_only`` column is
      added to private copies only.
    * Sentiment dates are converted to UTC before the calendar date is taken.
    * If ``sentiment_df`` holds several rows for the same calendar date they
      are collapsed to one row first (unweighted mean of ``avg_sentiment``,
      first value of every other column), so the left join cannot duplicate
      stock rows.
    """
    stock = stock_df.reset_index()
    stock['date_only'] = pd.to_datetime(stock['Date']).dt.date

    # Work on a copy: the same sentiment frame is passed in for every ticker.
    sentiment = sentiment_df.copy()
    sentiment['date_only'] = pd.to_datetime(sentiment['date'], utc=True).dt.date

    # Guarantee at most one sentiment row per calendar date.
    if sentiment['date_only'].duplicated().any():
        agg_spec = {
            col: ('mean' if col == 'avg_sentiment' else 'first')
            for col in sentiment.columns
            if col != 'date_only'
        }
        sentiment = sentiment.groupby('date_only', as_index=False, sort=False).agg(agg_spec)

    merged = pd.merge(stock, sentiment, on='date_only', how='left')
    return merged.set_index('date_only')


# Join the shared daily sentiment table onto each ticker's returns frame.
merged_aapl, merged_amzn, merged_goog, merged_meta, merged_msft, merged_nvda = (
    merge_sentiment_with_returns(returns_frame, daily_sentiment)
    for returns_frame in (fd_aapl, fd_amzn, fd_goog, fd_meta, fd_msft, fd_nvda)
)
# compute correlations
def compute_correlation(merged_df):
    """Correlate the ``daily_return`` and ``avg_sentiment`` columns of ``merged_df``.

    Uses ``Series.corr`` with its default settings and returns its result.
    """
    returns = merged_df['daily_return']
    sentiment = merged_df['avg_sentiment']
    return returns.corr(sentiment)

# Ticker symbol -> correlation between that ticker's daily returns and the
# average sentiment, in the same order as the frames are listed.
correlations = {
    ticker: compute_correlation(merged_frame)
    for ticker, merged_frame in (
        ('AAPL', merged_aapl),
        ('AMZN', merged_amzn),
        ('GOOG', merged_goog),
        ('META', merged_meta),
        ('MSFT', merged_msft),
        ('NVDA', merged_nvda),
    )
}

# Print a header followed by one line per ticker, rounded to four decimals.
print("Pearson correlation between daily sentiment and stock returns:")
for stock, corr in correlations.items():
    print(f"{stock}: {corr:.4f}")