# Stock Market AI Agent - Comprehensive EDA

This notebook performs exploratory data analysis on:
1. Stock price data
2. News sentiment data
3. Macroeconomic indicators

**Objective**: Understand the data structure, distributions, and relationships to inform feature engineering and modeling decisions.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# NOTE(review): this silences every warning for the rest of the kernel session,
# including deprecation warnings raised by pandas/matplotlib in later cells.
warnings.filterwarnings('ignore')

# Set plotting style
# NOTE(review): the 'seaborn-v0_8' style name is only recognised by newer
# matplotlib releases (3.6+, as far as I know) — confirm the pinned version.
plt.style.use('seaborn-v0_8')
# Default colour palette for the seaborn/matplotlib figures drawn below.
sns.set_palette("husl")
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

print("Libraries imported successfully!")

## 1. Data Loading and Initial Inspection

In [None]:
# Read the three raw CSV inputs. The paths are relative, so they resolve
# against the kernel's current working directory.
print("Loading datasets...")
stock_csv = '../data/stock_prices.csv'
sentiment_csv = '../data/news_sentiment.csv'
macro_csv = '../data/macro_indicators.csv'

stock_data = pd.read_csv(stock_csv)
sentiment_data = pd.read_csv(sentiment_csv)
macro_data = pd.read_csv(macro_csv)

# Report (rows, columns) of each frame as a quick sanity check.
print(f"✅ Stock data loaded: {stock_data.shape}")
print(f"✅ Sentiment data loaded: {sentiment_data.shape}")
print(f"✅ Macro data loaded: {macro_data.shape}")

In [None]:
# Parse the 'Date' column of every dataset into datetime values.
# Each frame is updated in place; re-running the cell is harmless because
# pd.to_datetime leaves an already-converted column unchanged.
for frame in (stock_data, sentiment_data, macro_data):
    frame['Date'] = pd.to_datetime(frame['Date'])

print("Date columns converted to datetime format")

## 2. Stock Price Data Analysis

In [None]:
# Basic info about stock data: shape, covered date span and the ticker universe.
print("📊 Stock Data Overview")
print("=" * 50)
print(f"Shape: {stock_data.shape}")
print(f"Date range: {stock_data['Date'].min()} to {stock_data['Date'].max()}")
print(f"Number of unique tickers: {stock_data['Ticker'].nunique()}")
# unique() keeps NaN, and sorted() cannot order NaN (a float) against string
# tickers, so missing tickers are dropped first. nunique() above already
# ignores them, which keeps the count and the list consistent.
print(f"Tickers: {sorted(stock_data['Ticker'].dropna().unique())}")
print("\nFirst few rows:")
# Last expression of the cell: rendered as a rich table by the notebook.
stock_data.head()

In [None]:
# Summary statistics for the stock frame (as produced by DataFrame.describe()).
print("📈 Stock Data Statistics")
stock_stats = stock_data.describe()
# Last expression of the cell: rendered as a rich table by the notebook.
stock_stats

In [None]:
# Count missing cells per column in the stock frame, then the grand total.
print("🔍 Missing Values in Stock Data")
missing_stock = stock_data.isna().sum()
print(missing_stock)
print(f"\nTotal missing values: {missing_stock.sum()}")

In [None]:
# Closing-price history for a hard-coded list of ten tickers (labelled by the
# author as the top 10 by market cap), one panel per ticker on a 2 x 5 grid.
top_stocks = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'NVDA', 'META', 'TSLA', 'UNH', 'JNJ', 'JPM']
top_stock_data = stock_data.loc[stock_data['Ticker'].isin(top_stocks)]

plt.figure(figsize=(15, 10))
# Subplot positions are 1-based, hence start=1.
for panel, ticker in enumerate(top_stocks, start=1):
    history = top_stock_data.loc[top_stock_data['Ticker'] == ticker]
    plt.subplot(2, 5, panel)
    plt.plot(history['Date'], history['close'])
    plt.title(f'{ticker} Stock Price')
    plt.xticks(rotation=45)
    plt.ylabel('Price ($)')

plt.tight_layout()
plt.show()

In [None]:
# Trading-volume distribution across all rows: histogram on the left,
# box plot on the right, both with a logarithmic y-axis.
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 6))

hist_ax.hist(stock_data['volume'], bins=50, alpha=0.7)
hist_ax.set_title('Distribution of Trading Volume')
hist_ax.set_xlabel('Volume')
hist_ax.set_ylabel('Frequency')
hist_ax.set_yscale('log')

stock_data.boxplot(column='volume', ax=box_ax)
box_ax.set_title('Trading Volume Box Plot')
box_ax.set_ylabel('Volume')
box_ax.set_yscale('log')

plt.tight_layout()
plt.show()

## 3. News Sentiment Data Analysis

In [None]:
# Overview of the sentiment frame: shape, covered date span and ticker count.
print("📰 Sentiment Data Overview", "=" * 50, sep="\n")
print(f"Shape: {sentiment_data.shape}")
print(f"Date range: {sentiment_data['Date'].min()} to {sentiment_data['Date'].max()}")
print(f"Number of unique tickers: {sentiment_data['Ticker'].nunique()}")
print("\nFirst few rows:")
# Last expression of the cell: rendered as a rich table by the notebook.
sentiment_data.head()

In [None]:
# Summary statistics for the sentiment frame (as produced by DataFrame.describe()).
print("📊 Sentiment Data Statistics")
sentiment_stats = sentiment_data.describe()
# Last expression of the cell: rendered as a rich table by the notebook.
sentiment_stats

In [None]:
# Count missing cells per column in the sentiment frame, then the grand total.
print("🔍 Missing Values in Sentiment Data")
missing_sentiment = sentiment_data.isna().sum()
print(missing_sentiment)
print(f"\nTotal missing values: {missing_sentiment.sum()}")

In [None]:
# Three side-by-side views of the sentiment frame:
# sentiment-score histogram, sentiment-score box plot, news-count histogram.
fig, (score_hist_ax, score_box_ax, count_hist_ax) = plt.subplots(1, 3, figsize=(15, 5))

score_hist_ax.hist(sentiment_data['sentiment_score'], bins=50, alpha=0.7, color='skyblue')
score_hist_ax.set_title('Distribution of Sentiment Scores')
score_hist_ax.set_xlabel('Sentiment Score')
score_hist_ax.set_ylabel('Frequency')
# Dashed reference line at 0, labelled as the neutral score.
score_hist_ax.axvline(0, color='red', linestyle='--', alpha=0.7, label='Neutral')
score_hist_ax.legend()

sentiment_data.boxplot(column='sentiment_score', ax=score_box_ax)
score_box_ax.set_title('Sentiment Score Box Plot')
score_box_ax.set_ylabel('Sentiment Score')

count_hist_ax.hist(sentiment_data['news_count'], bins=20, alpha=0.7, color='lightgreen')
count_hist_ax.set_title('Distribution of News Count')
count_hist_ax.set_xlabel('News Count')
count_hist_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Per-ticker sentiment-score box plots, restricted to the tickers in
# `top_stocks` (defined in the earlier closing-price cell).
top_sentiment = sentiment_data.loc[sentiment_data['Ticker'].isin(top_stocks)]

fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=top_sentiment, x='Ticker', y='sentiment_score', ax=ax)
ax.set_title('Sentiment Score Distribution by Top 10 Stocks')
plt.xticks(rotation=45)
# Dashed reference line at 0, labelled as the neutral score.
ax.axhline(0, color='red', linestyle='--', alpha=0.7, label='Neutral')
ax.legend()
plt.show()

In [None]:
# Monthly mean sentiment for six selected tickers, one panel each (2 x 3 grid).
selected_stocks = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'NVDA', 'TSLA']
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

# Six tickers for six axes, so pairing them with zip visits every panel.
for ax, ticker in zip(axes, selected_stocks):
    # Averaging by month smooths the series for a cleaner plot.
    # NOTE(review): the 'M' alias is, as far as I know, deprecated in
    # pandas >= 2.2 in favour of 'ME' — confirm against the pinned version.
    monthly_sentiment = (
        sentiment_data.loc[sentiment_data['Ticker'] == ticker]
        .set_index('Date')
        .resample('M')['sentiment_score']
        .mean()
    )

    ax.plot(monthly_sentiment.index, monthly_sentiment.values)
    ax.set_title(f'{ticker} - Monthly Average Sentiment')
    ax.axhline(0, color='red', linestyle='--', alpha=0.5)
    ax.tick_params(axis='x', rotation=45)
    ax.set_ylabel('Sentiment Score')

plt.tight_layout()
plt.show()

## 4. Macroeconomic Data Analysis

In [None]:
# Overview of the macro frame: shape, covered date span and column names.
print("🏛️ Macroeconomic Data Overview", "=" * 50, sep="\n")
print(f"Shape: {macro_data.shape}")
print(f"Date range: {macro_data['Date'].min()} to {macro_data['Date'].max()}")
print(f"Columns: {list(macro_data.columns)}")
print("\nFirst few rows:")
# Last expression of the cell: rendered as a rich table by the notebook.
macro_data.head()

In [None]:
# Summary statistics for the macro frame (as produced by DataFrame.describe()).
print("📊 Macroeconomic Data Statistics")
macro_stats = macro_data.describe()
# Last expression of the cell: rendered as a rich table by the notebook.
macro_stats

In [None]:
# Count missing cells per column in the macro frame, then the grand total.
print("🔍 Missing Values in Macro Data")
missing_macro = macro_data.isna().sum()
print(missing_macro)
print(f"\nTotal missing values: {missing_macro.sum()}")

In [None]:
# Plot every macro indicator (each non-'Date' column) as a monthly-mean line,
# one panel per indicator on a two-column grid.
GRID_COLUMNS = 2
macro_cols = [col for col in macro_data.columns if col != 'Date']
n_cols = len(macro_cols)

if n_cols == 0:
    # plt.subplots cannot build a grid with zero rows.
    print("No macro indicator columns to plot")
else:
    # Size the grid from the number of indicators (ceiling division) instead of
    # assuming exactly six. Six indicators still give the original 3 x 2 grid
    # at figsize (15, 12).
    n_rows = -(-n_cols // GRID_COLUMNS)
    fig, axes = plt.subplots(n_rows, GRID_COLUMNS,
                             figsize=(15, 4 * n_rows), squeeze=False)
    # squeeze=False always returns a 2-D array, so flatten() works for any size.
    axes = axes.flatten()

    # Build the monthly resampler once; it does not depend on the column.
    # NOTE(review): the 'M' alias is, as far as I know, deprecated in
    # pandas >= 2.2 in favour of 'ME' — confirm against the pinned version.
    monthly_resampler = macro_data.set_index('Date').resample('M')

    for ax, col in zip(axes, macro_cols):
        # Monthly averaging gives a cleaner plot.
        monthly_data = monthly_resampler[col].mean()
        pretty_name = col.replace('_', ' ').title()

        ax.plot(monthly_data.index, monthly_data.values)
        ax.set_title(pretty_name)
        ax.tick_params(axis='x', rotation=45)
        ax.set_ylabel(pretty_name)

    # Hide the spare panel when the indicator count does not fill the grid.
    for unused_ax in axes[n_cols:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Pairwise correlation of the numeric macro columns, drawn as an annotated heatmap.
macro_numeric = macro_data.select_dtypes(include=[np.number])
correlation_matrix = macro_numeric.corr()

# Diverging colour map centred on 0; cells annotated with two decimals.
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    fmt='.2f',
    cbar_kws={"shrink": .8},
)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix of Macroeconomic Indicators')
plt.show()

## 5. Cross-Dataset Analysis

In [None]:
# Data coverage analysis: compare the calendar span of the three datasets and
# report the window in which all of them have data.
print("📅 Data Coverage Analysis")
print("=" * 50)

# Daily calendar ranges from each dataset's first to last date. The lengths
# count calendar days spanned, not the number of rows or distinct dates.
stock_dates = pd.date_range(stock_data['Date'].min(), stock_data['Date'].max())
sentiment_dates = pd.date_range(sentiment_data['Date'].min(), sentiment_data['Date'].max())
macro_dates = pd.date_range(macro_data['Date'].min(), macro_data['Date'].max())

print(f"Stock data: {len(stock_dates)} days ({stock_data['Date'].min().date()} to {stock_data['Date'].max().date()})")
print(f"Sentiment data: {len(sentiment_dates)} days ({sentiment_data['Date'].min().date()} to {sentiment_data['Date'].max().date()})")
print(f"Macro data: {len(macro_dates)} days ({macro_data['Date'].min().date()} to {macro_data['Date'].max().date()})")

# The shared window runs from the latest start to the earliest end.
overlap_start = max(stock_data['Date'].min(), sentiment_data['Date'].min(), macro_data['Date'].min())
overlap_end = min(stock_data['Date'].max(), sentiment_data['Date'].max(), macro_data['Date'].max())

if overlap_start <= overlap_end:
    print(f"\nOverlapping period: {overlap_start.date()} to {overlap_end.date()}")
    print(f"Overlapping days: {(overlap_end - overlap_start).days + 1}")
else:
    # If the ranges do not intersect, start falls after end; say so instead of
    # printing a reversed period and a zero or negative day count.
    print("\nNo overlapping period: the datasets' date ranges do not intersect")

In [None]:
# Sample correlation analysis between stock prices and sentiment for AAPL.
# NOTE(review): this correlates the price *level* with sentiment; daily returns
# may be a more meaningful comparison — confirm the intended analysis.
aapl_stock = stock_data[stock_data['Ticker'] == 'AAPL'].copy()
aapl_sentiment = sentiment_data[sentiment_data['Ticker'] == 'AAPL'].copy()

# Keep only the dates present in both frames.
aapl_merged = pd.merge(aapl_stock[['Date', 'close']],
                       aapl_sentiment[['Date', 'sentiment_score']],
                       on='Date', how='inner')
# Series.corr skips NaN pairs but np.polyfit does not, so drop incomplete rows
# up front; the correlation and the trend line then use the same observations.
aapl_merged = aapl_merged.dropna(subset=['close', 'sentiment_score'])

if not aapl_merged.empty:
    # Calculate correlation
    correlation = aapl_merged['close'].corr(aapl_merged['sentiment_score'])
    print(f"AAPL Stock Price vs Sentiment Correlation: {correlation:.3f}")

    # Plot scatter plot
    plt.figure(figsize=(10, 6))
    plt.scatter(aapl_merged['sentiment_score'], aapl_merged['close'], alpha=0.6)
    plt.xlabel('Sentiment Score')
    plt.ylabel('AAPL Closing Price ($)')
    plt.title(f'AAPL: Stock Price vs Sentiment Score (Correlation: {correlation:.3f})')

    # Add trend line. A degree-1 fit needs at least two distinct x values;
    # with a constant sentiment score the fit is degenerate, so skip it.
    if aapl_merged['sentiment_score'].nunique() > 1:
        z = np.polyfit(aapl_merged['sentiment_score'], aapl_merged['close'], 1)
        p = np.poly1d(z)
        plt.plot(aapl_merged['sentiment_score'], p(aapl_merged['sentiment_score']), "r--", alpha=0.8)

    plt.grid(True, alpha=0.3)
    plt.show()
else:
    print("No overlapping data found for AAPL stock and sentiment")

## 6. Data Quality Assessment

In [None]:
# Section banner for the data quality summary.
print("🔍 Data Quality Assessment", "=" * 50, sep="\n")

def assess_data_quality(df, name):
    """Print a short data-quality summary for one DataFrame.

    Reports the shape, the total number of missing cells and the number of
    fully duplicated rows. When the frame has a 'Date' column it also reports
    the covered date range and the number of distinct dates.

    Args:
        df: DataFrame to summarise.
        name: Label printed as the heading of the summary.

    Returns:
        None. The summary is written to stdout.
    """
    print(f"\n{name}:")
    print(f"  Shape: {df.shape}")
    print(f"  Missing values: {df.isnull().sum().sum()}")
    print(f"  Duplicate rows: {df.duplicated().sum()}")
    if 'Date' in df.columns:
        # Coerce to datetime and drop unusable entries so that an empty frame,
        # an all-NaT column or unparsed strings do not make `.date()` raise.
        # For an already-converted datetime column this changes nothing.
        dates = pd.to_datetime(df['Date'], errors='coerce').dropna()
        if dates.empty:
            print("  Date range: n/a (no valid dates)")
        else:
            print(f"  Date range: {dates.min().date()} to {dates.max().date()}")
        print(f"  Unique dates: {dates.nunique()}")

# Run the same quality summary over each dataset, in load order.
for frame, label in (
    (stock_data, "Stock Data"),
    (sentiment_data, "Sentiment Data"),
    (macro_data, "Macro Data"),
):
    assess_data_quality(frame, label)

## 7. Key Insights and Recommendations

Based on the EDA performed above, here are the key insights:

### Stock Data Insights:
- ✅ **Coverage**: Good coverage with ~61K records across 49 tickers
- ✅ **Quality**: No missing values in core price data
- 📊 **Patterns**: Different stocks show varying price patterns and volatility
- 📈 **Volume**: Trading volumes vary significantly across stocks

### Sentiment Data Insights:
- ✅ **Coverage**: Comprehensive with ~65K records
- 📊 **Distribution**: Sentiment scores appear roughly bell-shaped and centered near neutral (0) in the histogram; no formal normality test was run
- 🎯 **Variation**: Different stocks have different sentiment patterns
- 📰 **News Count**: Average of 3 news articles per day per stock

### Macro Data Insights:
- ✅ **Indicators**: 6 key economic indicators included
- 📊 **Trends**: Each indicator shows realistic economic patterns
- 🔗 **Correlations**: Some correlations between economic indicators

### Recommendations for Next Steps:
1. **Feature Engineering**: Create technical indicators, lagged variables, and rolling statistics
2. **Data Merging**: Merge stock and sentiment data on Date and Ticker, then join the macro indicators on Date only (they are not ticker-specific), with proper handling of missing values
3. **Forward Fill**: Implement forward fill for sentiment data as specified
4. **Model Selection**: Consider both traditional ML (Random Forest, XGBoost) and deep learning (LSTM) approaches
5. **Target Variable**: Define prediction target (next day/week closing price)
