In [7]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
import nltk

# Make sure the VADER lexicon is available. Look for a local copy first so a
# re-run does not need network access; download only when it is missing.
# NOTE(review): the AttributeError recorded for this cell ("partially
# initialized module 'nltk'") is raised while importing nltk itself, which
# usually points to a local file or folder named `nltk` shadowing the
# installed package — confirm and rename it if present.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

# Load news dataset
news_file_path = '../data/raw_analyst_ratings.csv'
news_df = pd.read_csv(news_file_path)

# Initialize sentiment analyzers
vader_analyzer = SentimentIntensityAnalyzer()

# Define functions for sentiment analysis
def analyze_sentiment_vader(text):
    """Return the VADER compound sentiment score for ``text``.

    Parameters
    ----------
    text : str
        Headline to score. Anything that is not a string (for example the
        float NaN pandas uses for an empty CSV cell, or ``None``) is treated
        as missing.

    Returns
    -------
    float
        The ``'compound'`` entry of ``vader_analyzer.polarity_scores(text)``,
        or NaN when ``text`` is missing, so that missing headlines are
        skipped by pandas aggregations instead of being counted as neutral.
    """
    if not isinstance(text, str):
        # The analyzer expects a string; a NaN/None headline would raise.
        return float('nan')
    score = vader_analyzer.polarity_scores(text)
    return score['compound']

def analyze_sentiment_textblob(text):
    """Return the TextBlob polarity score for ``text``.

    Parameters
    ----------
    text : str
        Headline to score. Anything that is not a string (for example the
        float NaN pandas uses for an empty CSV cell, or ``None``) is treated
        as missing.

    Returns
    -------
    float
        ``TextBlob(text).sentiment.polarity``, or NaN when ``text`` is
        missing, so that missing headlines are skipped by pandas
        aggregations instead of being counted as neutral.
    """
    if not isinstance(text, str):
        # TextBlob only accepts strings; a NaN/None headline would raise.
        return float('nan')
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

# Score every headline with both analyzers and store the results as new columns
headlines = news_df['headline']
news_df['vader_sentiment'] = headlines.map(analyze_sentiment_vader)
news_df['textblob_sentiment'] = headlines.map(analyze_sentiment_textblob)

# Classify sentiment
def classify_sentiment(score, threshold=0.1):
    """Map a numeric sentiment score to a coarse label.

    Parameters
    ----------
    score : float
        Sentiment score to classify.
    threshold : float, optional
        Half-width of the neutral band. Scores strictly above ``threshold``
        are ``'positive'``, scores strictly below ``-threshold`` are
        ``'negative'``. Defaults to 0.1.

    Returns
    -------
    str
        ``'positive'``, ``'negative'`` or ``'neutral'``. Scores exactly on a
        boundary, and NaN (for which both comparisons are false), are
        ``'neutral'``.
    """
    if score > threshold:
        return 'positive'
    if score < -threshold:
        return 'negative'
    return 'neutral'

# Label each analyzer's score as positive / negative / neutral
news_df['vader_sentiment_class'] = news_df['vader_sentiment'].map(classify_sentiment)
news_df['textblob_sentiment_class'] = news_df['textblob_sentiment'].map(classify_sentiment)

# Parse 'Date' into datetimes, then put the headlines in chronological order
# with a fresh 0..n-1 index.
# NOTE(review): assumes the news CSV provides a 'Date' column — confirm.
news_df['Date'] = pd.to_datetime(news_df['Date'])
news_df = news_df.sort_values(by='Date', ignore_index=True)


AttributeError: partially initialized module 'nltk' has no attribute 'data' (most likely due to a circular import)

In [None]:
# Read the stock price history from disk
stock_file_path = '../data/AAPL_historical_data.csv'
stock_df = pd.read_csv(stock_file_path)

# Parse 'Date' into datetimes and order the rows chronologically with a
# fresh 0..n-1 index, so the return below compares consecutive rows in time.
stock_df['Date'] = pd.to_datetime(stock_df['Date'])
stock_df = stock_df.sort_values(by='Date', ignore_index=True)

# Row-over-row fractional change of the closing price
stock_df['Daily_Return'] = stock_df['Close'].pct_change()


In [None]:
# Merge news and stock data on calendar day.
def _to_calendar_day(timestamps):
    """Reduce a datetime Series to tz-naive midnight timestamps (one per day)."""
    timestamps = pd.to_datetime(timestamps)
    if timestamps.dt.tz is not None:
        # Drop the timezone but keep the local wall-clock date, so tz-aware
        # and tz-naive keys can be compared.
        timestamps = timestamps.dt.tz_localize(None)
    return timestamps.dt.normalize()


# The join key must be the day, not the exact timestamp: a headline stamped
# with a time of day would never equal a daily stock row and would be
# silently dropped by the inner join. Normalizing is a no-op for values that
# are already at midnight. The source frames are left untouched.
combined_df = pd.merge(
    news_df.assign(Date=_to_calendar_day(news_df['Date'])),
    stock_df[['Date', 'Daily_Return']].assign(Date=_to_calendar_day(stock_df['Date'])),
    on='Date',
    how='inner',
)


In [None]:
# Collapse to one row per date: the mean of each metric across that date's
# headlines (if multiple headlines per day)
metric_columns = ['vader_sentiment', 'textblob_sentiment', 'Daily_Return']
daily_sentiment = combined_df.groupby('Date', as_index=False)[metric_columns].mean()

# Pairwise correlations between the two sentiment scores and the daily return
correlation_matrix = daily_sentiment[metric_columns].corr()

# Show the result under a heading
print("Correlation Matrix:", correlation_matrix, sep="\n")


In [None]:
import matplotlib.pyplot as plt

# Plot sentiment scores vs daily returns: one scatter panel per analyzer,
# side by side. The explicit Figure/Axes interface is used so each panel is
# configured through its own `ax` instead of pyplot's implicit current axes.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# (sentiment column in daily_sentiment, human-readable label for the panel)
panels = [
    ('vader_sentiment', 'VADER Sentiment'),
    ('textblob_sentiment', 'TextBlob Sentiment'),
]

for ax, (sentiment_column, label) in zip(axes, panels):
    ax.scatter(daily_sentiment[sentiment_column], daily_sentiment['Daily_Return'], alpha=0.5)
    ax.set_title(f'{label} vs Daily Returns')
    ax.set_xlabel(label)
    ax.set_ylabel('Daily Returns')

fig.tight_layout()
plt.show()
