In [None]:
# Merge news and stock data on Date
combined_df = pd.merge(news_df, stock_df[['Date', 'Daily_Return']], on='Date', how='inner')


In [None]:
# Calculate average sentiment score per day (if multiple headlines per day)
daily_sentiment = combined_df.groupby('Date').agg({
    'vader_sentiment': 'mean',
    'textblob_sentiment': 'mean',
    'Daily_Return': 'mean'
}).reset_index()

# Calculate correlation matrix
correlation_matrix = daily_sentiment[['vader_sentiment', 'textblob_sentiment', 'Daily_Return']].corr()

# Output correlations
print("Correlation Matrix:")
print(correlation_matrix)


In [None]:
import matplotlib.pyplot as plt

# Plot sentiment scores vs daily returns
plt.figure(figsize=(12, 6))

# VADER Sentiment
plt.subplot(1, 2, 1)
plt.scatter(daily_sentiment['vader_sentiment'], daily_sentiment['Daily_Return'], alpha=0.5)
plt.title('VADER Sentiment vs Daily Returns')
plt.xlabel('VADER Sentiment')
plt.ylabel('Daily Returns')

# TextBlob Sentiment
plt.subplot(1, 2, 2)
plt.scatter(daily_sentiment['textblob_sentiment'], daily_sentiment['Daily_Return'], alpha=0.5)
plt.title('TextBlob Sentiment vs Daily Returns')
plt.xlabel('TextBlob Sentiment')
plt.ylabel('Daily Returns')

plt.tight_layout()
plt.show()


In [5]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Load the data
news_data = pd.read_csv('../data/raw_analyst_ratings.csv')
stock_data = pd.read_csv('../data/AAPL_historical_data.csv')

# Parse dates
news_data['date'] = pd.to_datetime(news_data['date'], errors='coerce')
stock_data['Date'] = pd.to_datetime(stock_data['Date'], errors='coerce')

# Remove timezone information to make both timezone-naive
news_data['date'] = news_data['date'].dt.tz_localize(None)
stock_data['Date'] = stock_data['Date'].dt.tz_localize(None)

# Sentiment analysis
analyzer = SentimentIntensityAnalyzer()
news_data['sentiment'] = news_data['headline'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

# Aggregate sentiment by date
daily_sentiment = news_data.groupby(news_data['date'])['sentiment'].mean().reset_index()

# Calculate daily stock returns
stock_data['daily_return'] = stock_data['Close'].pct_change()

# Merge sentiment data with stock data
merged_data = pd.merge(daily_sentiment, stock_data, left_on='date', right_on='Date')

# Calculate correlation
correlation = merged_data[['sentiment', 'daily_return']].corr()

print("Correlation between sentiment and stock returns: ", correlation.iloc[0, 1])


Correlation between sentiment and stock returns:  nan


In [6]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Load the data
news_data = pd.read_csv('../data/raw_analyst_ratings.csv')
stock_data = pd.read_csv('../data/AAPL_historical_data.csv')

# Parse dates
news_data['date'] = pd.to_datetime(news_data['date'], errors='coerce')
stock_data['Date'] = pd.to_datetime(stock_data['Date'], errors='coerce')

# Remove timezone information to make both timezone-naive
news_data['date'] = news_data['date'].dt.tz_localize(None)
stock_data['Date'] = stock_data['Date'].dt.tz_localize(None)

# Sentiment analysis
analyzer = SentimentIntensityAnalyzer()
news_data['sentiment'] = news_data['headline'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

# Aggregate sentiment by date
daily_sentiment = news_data.groupby(news_data['date'])['sentiment'].mean().reset_index()

# Calculate daily stock returns
stock_data['daily_return'] = stock_data['Close'].pct_change()

# Merge sentiment data with stock data
merged_data = pd.merge(daily_sentiment, stock_data, left_on='date', right_on='Date')

# Drop rows with missing data (NaN)
merged_data = merged_data.dropna(subset=['sentiment', 'daily_return'])

# Check if there are enough data points to compute correlation
if merged_data.empty:
    print("No valid data to compute correlation.")
else:
    # Calculate correlation
    correlation = merged_data[['sentiment', 'daily_return']].corr()

    print("Correlation between sentiment and stock returns: ", correlation.iloc[0, 1])


No valid data to compute correlation.


In [8]:
print("Daily Sentiment Data:")
print(daily_sentiment.head())
print(daily_sentiment.info())

print("Stock Data:")
print(stock_data.head())
print(stock_data.info())


print("Sentiment Date Range:", daily_sentiment['date'].min(), "to", daily_sentiment['date'].max())
print("Stock Date Range:", stock_data['Date'].min(), "to", stock_data['Date'].max())



Daily Sentiment Data:
                 date  sentiment
0 2011-04-27 21:01:48     0.0000
1 2011-04-28 13:49:29     0.0000
2 2011-04-28 15:00:36     0.2500
3 2011-04-29 13:47:06     0.0000
4 2011-04-29 16:11:05     0.7351
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36011 entries, 0 to 36010
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       36011 non-null  datetime64[ns]
 1   sentiment  36011 non-null  float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 562.8 KB
None
Stock Data:
        Date      Open      High       Low     Close  Adj Close     Volume  \
0 1980-12-12  0.128348  0.128906  0.128348  0.128348   0.098943  469033600   
1 1980-12-15  0.122210  0.122210  0.121652  0.121652   0.093781  175884800   
2 1980-12-16  0.113281  0.113281  0.112723  0.112723   0.086898  105728000   
3 1980-12-17  0.115513  0.116071  0.115513  0.115513   0.089049   86441600   
4 1980-12-18  0.

In [9]:
print("Sentiment Date Type:", daily_sentiment['date'].dtype)
print("Stock Date Type:", stock_data['Date'].dtype)


Sentiment Date Type: datetime64[ns]
Stock Date Type: datetime64[ns]


In [10]:
print("Sentiment Data NaNs in 'date':", daily_sentiment['date'].isna().sum())
print("Stock Data NaNs in 'Date':", stock_data['Date'].isna().sum())


Sentiment Data NaNs in 'date': 0
Stock Data NaNs in 'Date': 0


In [11]:
print("Daily Sentiment Missing Values:")
print(daily_sentiment.isna().sum())

print("Stock Data Missing Values:")
print(stock_data.isna().sum())

# Optionally, fill or drop missing values if necessary
daily_sentiment = daily_sentiment.dropna()
stock_data = stock_data.dropna()

Daily Sentiment Missing Values:
date         0
sentiment    0
dtype: int64
Stock Data Missing Values:
Date            0
Open            0
High            0
Low             0
Close           0
Adj Close       0
Volume          0
Dividends       0
Stock Splits    0
daily_return    1
dtype: int64


In [12]:
# Perform the merge again after ensuring both DataFrames have valid dates
merged_data = pd.merge(daily_sentiment, stock_data, left_on='date', right_on='Date')

# Drop rows with missing data in the merged DataFrame
merged_data = merged_data.dropna(subset=['sentiment', 'daily_return'])

# Recheck if there are any valid rows
print("Merged Data Info:")
print(merged_data.info())

Merged Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          0 non-null      datetime64[ns]
 1   sentiment     0 non-null      float64       
 2   Date          0 non-null      datetime64[ns]
 3   Open          0 non-null      float64       
 4   High          0 non-null      float64       
 5   Low           0 non-null      float64       
 6   Close         0 non-null      float64       
 7   Adj Close     0 non-null      float64       
 8   Volume        0 non-null      int64         
 9   Dividends     0 non-null      float64       
 10  Stock Splits  0 non-null      float64       
 11  daily_return  0 non-null      float64       
dtypes: datetime64[ns](2), float64(9), int64(1)
memory usage: 132.0 bytes
None


In [13]:
# Strip time information and ensure both columns are of the same type
daily_sentiment['date'] = daily_sentiment['date'].dt.date
stock_data['Date'] = stock_data['Date'].dt.date

print("Sentiment Date Range:", daily_sentiment['date'].min(), "to", daily_sentiment['date'].max())
print("Stock Date Range:", stock_data['Date'].min(), "to", stock_data['Date'].max())


Sentiment Date Range: 2011-04-27 to 2020-06-11
Stock Date Range: 1980-12-15 to 2024-07-30


In [15]:
print("Unique Dates in Sentiment Data:", daily_sentiment['date'].unique())
print("Unique Dates in Stock Data:", stock_data['Date'].unique())

Unique Dates in Sentiment Data: [datetime.date(2011, 4, 27) datetime.date(2011, 4, 28)
 datetime.date(2011, 4, 29) ... datetime.date(2020, 6, 9)
 datetime.date(2020, 6, 10) datetime.date(2020, 6, 11)]
Unique Dates in Stock Data: [datetime.date(1980, 12, 15) datetime.date(1980, 12, 16)
 datetime.date(1980, 12, 17) ... datetime.date(2024, 7, 26)
 datetime.date(2024, 7, 29) datetime.date(2024, 7, 30)]


In [17]:
stock_data['daily_return'] = stock_data['Close'].pct_change()
stock_data = stock_data.dropna(subset=['daily_return'])
# Merge sentiment data with stock data
merged_data = pd.merge(daily_sentiment, stock_data, left_on='date', right_on='Date')

# Drop rows with missing data
merged_data = merged_data.dropna(subset=['sentiment', 'daily_return'])

# Check the merged data
print("Merged Data Info:")
print(merged_data.info())
print(merged_data.head())


Merged Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35553 entries, 0 to 35552
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          35553 non-null  object 
 1   sentiment     35553 non-null  float64
 2   Date          35553 non-null  object 
 3   Open          35553 non-null  float64
 4   High          35553 non-null  float64
 5   Low           35553 non-null  float64
 6   Close         35553 non-null  float64
 7   Adj Close     35553 non-null  float64
 8   Volume        35553 non-null  int64  
 9   Dividends     35553 non-null  float64
 10  Stock Splits  35553 non-null  float64
 11  daily_return  35553 non-null  float64
dtypes: float64(9), int64(1), object(2)
memory usage: 3.3+ MB
None
         date  sentiment        Date       Open       High        Low  \
0  2011-04-27     0.0000  2011-04-27  12.580000  12.583929  12.396429   
1  2011-04-28     0.0000  2011-04-28  12.363929  12.491071

In [18]:
# Print a few rows from each DataFrame for inspection
print("Daily Sentiment Sample:")
print(daily_sentiment.head())

print("Stock Data Sample:")
print(stock_data.head())

Daily Sentiment Sample:
         date  sentiment
0  2011-04-27     0.0000
1  2011-04-28     0.0000
2  2011-04-28     0.2500
3  2011-04-29     0.0000
4  2011-04-29     0.7351
Stock Data Sample:
         Date      Open      High       Low     Close  Adj Close    Volume  \
3  1980-12-17  0.115513  0.116071  0.115513  0.115513   0.089049  86441600   
4  1980-12-18  0.118862  0.119420  0.118862  0.118862   0.091630  73449600   
5  1980-12-19  0.126116  0.126674  0.126116  0.126116   0.097223  48630400   
6  1980-12-22  0.132254  0.132813  0.132254  0.132254   0.101954  37363200   
7  1980-12-23  0.137835  0.138393  0.137835  0.137835   0.106257  46950400   

   Dividends  Stock Splits  daily_return  
3        0.0           0.0      0.024751  
4        0.0           0.0      0.028992  
5        0.0           0.0      0.061029  
6        0.0           0.0      0.048670  
7        0.0           0.0      0.042199  


In [24]:
correlation = merged_data[['sentiment', 'daily_return']].corr()

print("Correlation between sentiment and stock returns: ", correlation.iloc[0, 1])

correlation = merged_data[['sentiment', 'daily_return']].corr()

print("Correlation between sentiment and stock returns: ", correlation.iloc[0, 1])

Correlation between sentiment and stock returns:  nan
Correlation between sentiment and stock returns:  nan


In [21]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Load the data
news_data = pd.read_csv('../data/raw_analyst_ratings.csv')
stock_data = pd.read_csv('../data/AAPL_historical_data.csv')

# Parse dates
news_data['date'] = pd.to_datetime(news_data['date'], errors='coerce')
stock_data['Date'] = pd.to_datetime(stock_data['Date'], errors='coerce')

# Remove timezone information
news_data['date'] = news_data['date'].dt.tz_localize(None)
stock_data['Date'] = stock_data['Date'].dt.tz_localize(None)

# Sentiment Analysis
analyzer = SentimentIntensityAnalyzer()
news_data['sentiment'] = news_data['headline'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

# Aggregate sentiment by date
daily_sentiment = news_data.groupby(news_data['date'])['sentiment'].mean().reset_index()

# Calculate daily stock returns
stock_data['daily_return'] = stock_data['Close'].pct_change()

# Check for missing values
print("Daily Sentiment Missing Values:")
print(daily_sentiment.isna().sum())

print("Stock Data Missing Values:")
print(stock_data.isna().sum())

# Merge sentiment data with stock data
merged_data = pd.merge(daily_sentiment, stock_data, left_on='date', right_on='Date')

# Drop rows with missing data in the merged DataFrame
merged_data = merged_data.dropna(subset=['sentiment', 'daily_return'])

# Display the merged data
print("Merged Data Sample:")
print(merged_data.head())

# Calculate and print daily stock returns
print("Daily Stock Returns Sample:")
print(stock_data[['Date', 'daily_return']].head())

# Calculate and print average daily sentiment scores
print("Average Daily Sentiment Scores Sample:")
print(daily_sentiment.head())

# Calculate correlation
if merged_data.empty:
    print("No valid data to compute correlation.")
else:
    correlation = merged_data[['sentiment', 'daily_return']].corr()
    print("Correlation between sentiment and stock returns: ", correlation.iloc[0, 1])


Merged Data Sample:
Empty DataFrame
Columns: [date, sentiment, Date, Open, High, Low, Close, Adj Close, Volume, Dividends, Stock Splits, daily_return]
Index: []
Daily Stock Returns Sample:
        Date  daily_return
0 1980-12-12           NaN
1 1980-12-15     -0.052171
2 1980-12-16     -0.073398
3 1980-12-17      0.024751
4 1980-12-18      0.028992
Average Daily Sentiment Scores Sample:
                 date  sentiment
0 2011-04-27 21:01:48     0.0000
1 2011-04-28 13:49:29     0.0000
2 2011-04-28 15:00:36     0.2500
3 2011-04-29 13:47:06     0.0000
4 2011-04-29 16:11:05     0.7351
Correlation between sentiment and stock returns:  nan
