In [None]:
import pandas as pd
import os
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns

# --- 1. Configuration ---
# NOTE(review): DATA_DIR is not referenced by any code visible here; the
# paths below are spelled out relative to '../data' instead. Confirm whether
# anything else still depends on it before changing or removing it.
DATA_DIR = "data"
STOCK_DIR = '../data/yfinance_data'
OUTPUT_DIR = "results"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- 2. Load and Prepare News Data ---
print("Loading news data...")
news = pd.read_csv('../data/raw_analyst_ratings/raw_analyst_ratings.csv')

# Parse straight into UTC. With utc=True, tz-naive values are interpreted as
# UTC and offset-aware values are converted to UTC, so the column always ends
# up as a single tz-aware datetime dtype. Without it, a column that mixes
# naive and offset-aware strings does not become one uniform datetime column
# and the `.dt` accessors below can fail. Unparseable values become NaT.
news['date'] = pd.to_datetime(news['date'], errors='coerce', utc=True)

# Drop rows where conversion failed
news = news.dropna(subset=['date'])

# Express timestamps in New York wall-clock time so they can be compared
# against the market close when aligning news to trading days.
news['date'] = news['date'].dt.tz_convert('America/New_York')

# --- 3. Sentiment Analysis ---
print("Performing sentiment analysis...")


def get_sentiment(text):
    """Return the TextBlob polarity score of ``text``.

    The value is coerced with ``str()`` first, so non-string inputs are
    scored on their string form. Polarity ranges from -1 (negative) to
    1 (positive).
    """
    return TextBlob(str(text)).sentiment.polarity


# One polarity score per headline.
headline_scores = news['headline'].apply(get_sentiment)
news['sentiment'] = headline_scores

# --- 4. Date Alignment Function ---
def align_to_trading_dates(news_df):
    """Align news dates to stock market trading days."""
    market_close = pd.Timestamp('16:00:00').time()  # 4 PM UTC-4
    
    def _align_row(row):
        if row['date'].time() < market_close:
            return row['date'].date()  # Same trading day
        return (row['date'] + pd.Timedelta(days=1)).date()  # Next trading day
    
    news_df['aligned_date'] = news_df.apply(_align_row, axis=1)
    return news_df

# Adds the 'aligned_date' column, which the per-ticker merge below uses as
# its join key against each stock's 'Date' column.
news = align_to_trading_dates(news)

# --- 5. Process All Stocks ---
# For every '<TICKER>_historical_data.csv' file in STOCK_DIR: join that
# ticker's news to its prices on the aligned trading date, correlate mean
# daily sentiment with the daily percentage return, save a scatter plot and
# the merged rows, and append a summary record to `results`.
print("Processing stock data...")
results = []
# Sorted so that the processing order (and the order of `results`) is
# reproducible; os.listdir() returns entries in arbitrary order.
stock_files = sorted(
    f for f in os.listdir(STOCK_DIR) if f.endswith('_historical_data.csv')
)

for stock_file in stock_files:
    ticker = stock_file.split('_')[0]
    print(f"\nProcessing {ticker}...")

    try:
        # Load stock data
        stock = pd.read_csv(os.path.join(STOCK_DIR, stock_file))
        stock['Date'] = pd.to_datetime(stock['Date']).dt.date

        # --- 6. Calculate Daily Metrics ---
        # Daily returns are computed on the price series itself, in
        # chronological order and BEFORE merging. After the merge each
        # article repeats its day's Close, so pct_change() there yields 0
        # for every article after the first one of a day, and measures
        # returns only between days that happen to have news.
        stock = stock.sort_values('Date').reset_index(drop=True)
        stock['daily_return'] = stock['Close'].pct_change() * 100  # Percentage

        # Filter news for current ticker
        news_filtered = news[news['stock'] == ticker]
        if news_filtered.empty:
            print(f"No news found for {ticker}")
            continue

        # Merge with aligned dates
        merged = pd.merge(
            news_filtered,
            stock,
            left_on=['aligned_date'],
            right_on=['Date'],
            how='inner'
        )

        if merged.empty:
            print(f"No date matches for {ticker}")
            continue

        # Average daily sentiment (multiple articles per day). The return is
        # identical on every row of a given day, so 'first' just picks it.
        # Days without a return (the first row of the price history) are
        # dropped because they cannot contribute to the correlation.
        daily_data = (
            merged.groupby('aligned_date')
            .agg({'sentiment': 'mean', 'daily_return': 'first'})
            .dropna()
            .reset_index()
        )

        # A correlation needs at least two observations.
        if len(daily_data) < 2:
            print(f"Not enough daily observations for {ticker}")
            continue

        # --- 7. Correlation Analysis ---
        corr = daily_data[['sentiment', 'daily_return']].corr().iloc[0, 1]

        # --- 8. Visualization ---
        fig = plt.figure(figsize=(10, 5))
        try:
            sns.regplot(data=daily_data, x='sentiment', y='daily_return')
            plt.title(f"{ticker}: Sentiment vs. Daily Returns (Corr = {corr:.2f})")
            plt.xlabel("Average Daily Sentiment Score")
            plt.ylabel("Daily Return (%)")
            plt.savefig(os.path.join(OUTPUT_DIR, f"{ticker}_correlation.png"))
        finally:
            # Always release the figure, even if plotting or saving fails.
            plt.close(fig)

        # Store results
        results.append({
            'ticker': ticker,
            'correlation': corr,
            'days_with_data': len(daily_data),
            'first_date': daily_data['aligned_date'].min(),
            'last_date': daily_data['aligned_date'].max()
        })

        # Save processed data
        merged.to_csv(os.path.join(OUTPUT_DIR, f"{ticker}_processed.csv"), index=False)

    except Exception as e:
        # Best-effort batch: report the failure and move on to the next ticker.
        print(f"Error processing {ticker}: {str(e)}")

# --- 9. Final Results ---
# Fix the column set up front so the summary frame (and the CSV header) is
# well-formed even when no ticker produced a result. Without it an empty
# `results` list gives a column-less frame and sort_values('correlation')
# raises KeyError.
results_df = pd.DataFrame(
    results,
    columns=['ticker', 'correlation', 'days_with_data', 'first_date', 'last_date'],
)
print("\n=== Final Correlation Results ===")
if results_df.empty:
    print("No tickers produced results.")
else:
    print(results_df.sort_values('correlation', ascending=False))

# Save results
results_df.to_csv(os.path.join(OUTPUT_DIR, "correlation_summary.csv"), index=False)
print(f"\nAll results saved to {OUTPUT_DIR} directory")

Loading news data...
Performing sentiment analysis...
Processing stock data...


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'data\\yfinance_data'