In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import os


In [30]:
# Download NLTK sentiment analyzer data
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\yetimeshet.tadesse\AppData\Roaming\nltk_data.
[nltk_data]     ..
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
# Define folder path and stock files
folder_path = '../data/raw/yfinance_data'
stock_files = {
    'AAPL': 'AAPL_historical_data.csv',
    'AMZN': 'AMZN_historical_data.csv',
    'GOOG': 'GOOG_historical_data.csv',
    'META': 'META_historical_data.csv',
    'MSFT': 'MSFT_historical_data.csv',
    'NVDA': 'NVDA_historical_data.csv',
    'TSLA': 'TSLA_historical_data.csv'
}

# Load News Data
news_df = pd.read_csv('../data/raw/raw_analyst_ratings.csv')  # Columns: [date, headline]

# Preprocess News Data
news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce').dt.date
news_df = news_df.dropna(subset=['date'])  # Drop rows with invalid dates
news_grouped = news_df.groupby('date')['headline'].apply(lambda x: ' '.join(x)).reset_index()

# Initialize Sentiment Analyzer
sia = SentimentIntensityAnalyzer()

# Function to calculate sentiment polarity using NLTK
def get_sentiment(text):
    sentiment = sia.polarity_scores(text)
    return sentiment['compound']

# Apply sentiment analysis to news headlines
news_grouped['sentiment_score'] = news_grouped['headline'].apply(get_sentiment)

# Iterate through stock files and perform analysis
for stock_name, file_name in stock_files.items():
    print(f"\n--- Analysis for {stock_name} ---")
    
    # Load stock data
    stock_df = pd.read_csv(os.path.join(folder_path, file_name))  # Columns: [date, close_price]
    stock_df['date'] = pd.to_datetime(stock_df['date'], errors='coerce').dt.date
    stock_df = stock_df.dropna(subset=['date'])  # Drop rows with invalid dates
    
    # Merge with sentiment data
    merged_df = pd.merge(news_grouped[['date', 'sentiment_score']], stock_df, on='date', how='inner')
    
    # Calculate Daily Stock Returns
    merged_df['daily_return'] = merged_df['close_price'].pct_change()
    merged_df = merged_df.dropna().reset_index(drop=True)
    
    # Correlation Analysis
    correlation, p_value = pearsonr(merged_df['sentiment_score'], merged_df['daily_return'])
    print("Sentiment vs. Stock Return Correlation Analysis")
    print(f"Pearson Correlation Coefficient: {correlation:.4f}")
    print(f"P-Value: {p_value:.4f}")
    
    # Visualization
    plt.figure(figsize=(10, 6))
    plt.scatter(merged_df['sentiment_score'], merged_df['daily_return'], alpha=0.7)
    plt.title(f'Correlation between Sentiment Score and {stock_name} Returns')
    plt.xlabel('Sentiment Score')
    plt.ylabel('Daily Stock Return')
    plt.axhline(0, color='red', linestyle='--', lw=1)
    plt.axvline(0, color='red', linestyle='--', lw=1)
    plt.grid(True)
    plt.show()
    
    # Summary Insights
    print("Key Insights:")
    if correlation > 0:
        print("Positive correlation: Higher sentiment scores align with stock price increases.")
    elif correlation < 0:
        print("Negative correlation: Lower sentiment scores align with stock price decreases.")
    else:
        print("No strong correlation observed between sentiment and stock returns.")

print("\nProactivity Reference:")
print("Tools used: Python libraries (pandas, nltk, scipy, matplotlib)")

  news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce').dt.date
  news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce').dt.date


AttributeError: Can only use .dt accessor with datetimelike values

In [23]:
news_df = pd.read_csv('../data/raw/raw_analyst_ratings.csv', encoding='utf-8')

In [32]:
print(news_df.columns)

Index(['Unnamed: 0', 'headline', 'url', 'publisher', 'date', 'stock'], dtype='object')


In [34]:
news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce')

In [35]:
print(news_df.isnull().sum())

Unnamed: 0         0
headline           0
url                0
publisher          0
date          872204
stock              0
dtype: int64


Check the distribution of articles over time: