# Feature Engineering for Stock Market AI Agent

This notebook performs feature engineering on our stock market datasets. It builds the following feature groups:

1. **Technical Indicators**: Moving averages, RSI, MACD, Bollinger Bands, etc.
2. **Price Features**: Returns, volatility, price ratios
3. **Volume Features**: Volume indicators and price-volume relationships
4. **Sentiment Features**: Sentiment moving averages, momentum, extremes
5. **Macro Features**: Economic indicator derivatives and regime indicators
6. **Lagged Features**: Historical values for time series modeling
7. **Target Variables**: Next-day and next-week price targets that the models will learn to predict

**Goal**: Create a comprehensive feature set for machine learning models to predict stock prices.

In [None]:
# Import libraries
import json
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Make the project-local modules in ../src importable before importing them.
sys.path.append('../src')

from feature_engineering import (
    DataMerger,
    MacroFeatureEngineer,
    SentimentFeatureEngineer,
    StockFeatureEngineer,
)
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and data-quality
# warnings raised by the libraries used below — consider narrowing it to
# specific warning categories.
warnings.filterwarnings('ignore')

# Set plotting style: matplotlib style sheet first, then the seaborn colour palette.
# NOTE(review): the 'seaborn-v0_8' style name is only available in newer
# matplotlib releases (3.6+, as far as I know) — confirm the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# IPython magic: render figures inline in the notebook output.
%matplotlib inline

print("✅ Libraries imported successfully!")

## 1. Load Data

In [None]:
# Raw input files, keyed by the label used in the progress output below.
print("Loading datasets...")
raw_sources = {
    'Stock': '../data/stock_prices.csv',
    'Sentiment': '../data/news_sentiment.csv',
    'Macro': '../data/macro_indicators.csv',
}

# Read every file first, then convert the 'Date' columns, in the same
# order as the sources are listed above.
raw_frames = {label: pd.read_csv(path) for label, path in raw_sources.items()}
for frame in raw_frames.values():
    frame['Date'] = pd.to_datetime(frame['Date'])

# Expose the frames under the names the rest of the notebook uses.
stock_data = raw_frames['Stock']
sentiment_data = raw_frames['Sentiment']
macro_data = raw_frames['Macro']

for label, frame in raw_frames.items():
    print(f"✅ {label} data: {frame.shape}")

## 2. Initialize Feature Engineers

In [None]:
# One helper object per processing stage: stock, sentiment and macro feature
# builders, plus the merger that later combines their outputs.
stock_fe, sentiment_fe, macro_fe, merger = (
    StockFeatureEngineer(),
    SentimentFeatureEngineer(),
    MacroFeatureEngineer(),
    DataMerger(),
)

print("✅ Feature engineers initialized!")

## 3. Sentiment Feature Engineering

In [None]:
print("🔄 Processing sentiment data...")

# Step 1: forward fill sentiment data per ticker (as specified in requirements).
sentiment_filled = sentiment_fe.forward_fill_sentiment(sentiment_data)
print("Forward fill completed")

# Step 2: derive the sentiment features from the forward-filled frame.
sentiment_processed = sentiment_fe.create_sentiment_features(sentiment_filled)

# The engineer records the names of the features it created; preview the first 10.
sentiment_feature_names = sentiment_fe.features_created
print(f"✅ Sentiment features created: {len(sentiment_feature_names)}")
print(f"Features: {sentiment_feature_names[:10]}...")

print(f"Final sentiment data shape: {sentiment_processed.shape}")

## 4. Macroeconomic Feature Engineering

In [None]:
print("🔄 Processing macroeconomic data...")

# Derive the macro features from the raw macro indicators frame.
macro_processed = macro_fe.create_macro_features(macro_data)

# The engineer records the names of the features it created; preview the first 10.
macro_feature_names = macro_fe.features_created
print(f"✅ Macro features created: {len(macro_feature_names)}")
print(f"Features: {macro_feature_names[:10]}...")

print(f"Final macro data shape: {macro_processed.shape}")

## 5. Stock Technical Feature Engineering

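We process each stock ticker separately so that indicators such as moving averages and lagged values are computed from that ticker's own price history only. Note that, by default, the cell below processes only the first five tickers as a demonstration.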

In [None]:
print("🔄 Processing stock data...")

# Maximum number of tickers to process in this run (demonstration default).
# Set to None to process every ticker in stock_data.
MAX_TICKERS = 5

tickers = stock_data['Ticker'].unique()
sample_tickers = tickers[:MAX_TICKERS]  # slicing with None keeps all tickers

print(f"Total tickers available: {len(tickers)}")
print(f"Tickers selected for this run: {len(sample_tickers)}")

# Feature builders applied to each ticker's frame, in this exact order.
feature_steps = (
    stock_fe.create_price_features,
    stock_fe.create_moving_averages,
    stock_fe.create_volatility_features,
    stock_fe.create_momentum_features,
    stock_fe.create_volume_features,
    stock_fe.create_lagged_features,
    stock_fe.create_target_variables,
)

# Collect the per-ticker frames in their own list so that the name
# `processed_stock_data` only ever refers to the combined DataFrame.
ticker_frames = []
for i, ticker in enumerate(sample_tickers, start=1):
    print(f"Processing {ticker} ({i}/{len(sample_tickers)})...")

    # Isolate this ticker's rows in chronological order before building features.
    ticker_data = stock_data[stock_data['Ticker'] == ticker].copy()
    ticker_data = ticker_data.sort_values('Date')

    for build_features in feature_steps:
        ticker_data = build_features(ticker_data)

    ticker_frames.append(ticker_data)

# pd.concat raises an opaque "No objects to concatenate" error on an empty
# list; fail with a message that points at the actual cause instead.
if not ticker_frames:
    raise ValueError("No tickers found in stock_data; nothing to process")

# Combine all processed stock data
processed_stock_data = pd.concat(ticker_frames, ignore_index=True)

# NOTE(review): if StockFeatureEngineer appends to features_created on every
# call, this count is inflated by the number of tickers processed — confirm.
print(f"✅ Stock features created: {len(stock_fe.features_created)}")
print(f"Final processed stock data shape: {processed_stock_data.shape}")
print(f"Sample features: {stock_fe.features_created[:15]}...")  # Show first 15

## 6. Data Merging

Now we'll merge all the processed datasets into one comprehensive dataset.

In [None]:
print("🔄 Merging all datasets...")

# Stage 1: combine the stock, sentiment and macro frames.
merged_data = merger.merge_all_data(
    processed_stock_data,
    sentiment_processed,
    macro_processed,
)
print("✅ Initial merge completed")

# Stage 2: clean the merged frame; this is the dataset used from here on.
final_data = merger.clean_merged_data(merged_data)
print("✅ Data cleaning completed")

# Headline numbers for the final dataset.
print("\n📊 Final Dataset Summary:")
print(f"Shape: {final_data.shape}")
first_date = final_data['Date'].min().date()
last_date = final_data['Date'].max().date()
print(f"Date range: {first_date} to {last_date}")
ticker_list = sorted(final_data['Ticker'].unique())
print(f"Tickers: {ticker_list}")
print(f"Total features: {len(final_data.columns)}")

## 7. Feature Analysis and Visualization

In [None]:
# Analyze feature categories
all_features = final_data.columns.tolist()


def _features_containing(features, substrings):
    """Return the names in `features` that contain at least one of `substrings`.

    Matching is a case-sensitive substring test, so one name can match
    several categories.
    """
    return [name for name in features if any(part in name for part in substrings)]


# Categorize features by substring. The categories are NOT mutually exclusive:
# e.g. 'sentiment_sma_7' contains both 'sma' (technical) and 'sentiment'.
price_features = _features_containing(all_features, ['price', 'return', 'close', 'open', 'high', 'low'])
volume_features = _features_containing(all_features, ['volume', 'obv', 'vpt'])
technical_features = _features_containing(all_features, ['sma', 'ema', 'rsi', 'macd', 'bb_', 'atr', 'stoch'])
sentiment_features = _features_containing(all_features, ['sentiment', 'news'])
macro_features = _features_containing(all_features, ['GDP', 'unemployment', 'inflation', 'federal', 'consumer', 'vix'])
target_features = _features_containing(all_features, ['target'])

# Because categories overlap, subtracting the category sizes from the total
# counts some features more than once. Count the uncategorized features directly.
categorized = set().union(
    price_features,
    volume_features,
    technical_features,
    sentiment_features,
    macro_features,
    target_features,
)
other_features = [name for name in all_features if name not in categorized]

print("📊 Feature Categories:")
print(f"Price features: {len(price_features)}")
print(f"Volume features: {len(volume_features)}")
print(f"Technical indicators: {len(technical_features)}")
print(f"Sentiment features: {len(sentiment_features)}")
print(f"Macro features: {len(macro_features)}")
print(f"Target variables: {len(target_features)}")
print(f"Other features: {len(other_features)}")

In [None]:
# Sample some key features for AAPL
if 'AAPL' in final_data['Ticker'].values:
    aapl_data = final_data[final_data['Ticker'] == 'AAPL'].copy()
    aapl_data = aapl_data.sort_values('Date')
    dates = aapl_data['Date']

    # 2x3 grid, flattened so the panels can be addressed as axes[0]..axes[5].
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    axes = axes.flatten()

    # Price and moving averages (guarded like every other panel, so a missing
    # 'close' column skips the panel instead of raising KeyError).
    if 'close' in aapl_data.columns:
        axes[0].plot(dates, aapl_data['close'], label='Close Price', alpha=0.8)
        if 'sma_20' in aapl_data.columns:
            axes[0].plot(dates, aapl_data['sma_20'], label='SMA 20', alpha=0.7)
        if 'sma_50' in aapl_data.columns:
            axes[0].plot(dates, aapl_data['sma_50'], label='SMA 50', alpha=0.7)
        axes[0].set_title('AAPL: Price and Moving Averages')
        axes[0].legend()
        axes[0].tick_params(axis='x', rotation=45)

    # RSI with the 70 / 30 reference lines
    if 'rsi_14' in aapl_data.columns:
        axes[1].plot(dates, aapl_data['rsi_14'])
        axes[1].axhline(70, color='red', linestyle='--', alpha=0.7, label='Overbought')
        axes[1].axhline(30, color='green', linestyle='--', alpha=0.7, label='Oversold')
        axes[1].set_title('AAPL: RSI (14)')
        axes[1].legend()
        axes[1].tick_params(axis='x', rotation=45)

    # Volatility
    if 'volatility_20d' in aapl_data.columns:
        axes[2].plot(dates, aapl_data['volatility_20d'])
        axes[2].set_title('AAPL: 20-Day Volatility')
        axes[2].tick_params(axis='x', rotation=45)

    # Sentiment
    if 'sentiment_score' in aapl_data.columns:
        # Label the raw series too, so the legend is never empty and always
        # identifies both lines.
        axes[3].plot(dates, aapl_data['sentiment_score'], label='Sentiment score', alpha=0.6)
        if 'sentiment_sma_7' in aapl_data.columns:
            axes[3].plot(dates, aapl_data['sentiment_sma_7'], label='7-day SMA')
        axes[3].axhline(0, color='red', linestyle='--', alpha=0.5)
        axes[3].set_title('AAPL: Sentiment Score')
        axes[3].legend()
        axes[3].tick_params(axis='x', rotation=45)

    # Volume ratio, with a reference line at 1
    if 'volume_ratio_20' in aapl_data.columns:
        axes[4].plot(dates, aapl_data['volume_ratio_20'])
        axes[4].axhline(1, color='red', linestyle='--', alpha=0.5)
        axes[4].set_title('AAPL: Volume Ratio (20-day)')
        axes[4].tick_params(axis='x', rotation=45)

    # Returns distribution
    if 'return_1d' in aapl_data.columns:
        axes[5].hist(aapl_data['return_1d'].dropna(), bins=50, alpha=0.7)
        axes[5].set_title('AAPL: Daily Returns Distribution')
        axes[5].axvline(0, color='red', linestyle='--', alpha=0.5)

    # Hide panels whose column was missing, rather than showing blank frames.
    for ax in axes:
        if not ax.has_data():
            ax.set_visible(False)

    plt.tight_layout()
    plt.show()
else:
    print("AAPL data not available for visualization")

## 8. Feature Correlation Analysis

In [None]:
# Absolute-correlation cut-off above which a feature pair is reported.
HIGH_CORR_THRESHOLD = 0.7

# Select key features for correlation analysis
key_features = [
    'close', 'volume', 'return_1d', 'volatility_20d',
    'rsi_14', 'sma_20_ratio', 'bb_position',
    'sentiment_score', 'sentiment_sma_7',
    'unemployment_rate', 'inflation_rate', 'vix'
]

# Filter to features that exist in our dataset, then keep only numeric columns.
# The size check below runs on this filtered frame so that it reflects the
# columns that are actually correlated.
available_features = [f for f in key_features if f in final_data.columns]
correlation_data = final_data[available_features].select_dtypes(include=[np.number])

if correlation_data.shape[1] > 2:
    # Calculate correlation matrix
    correlation_matrix = correlation_data.corr()

    # Plot correlation heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix,
                annot=True,
                cmap='coolwarm',
                center=0,
                square=True,
                fmt='.2f',
                cbar_kws={"shrink": .8})
    plt.title('Correlation Matrix of Key Features')
    plt.tight_layout()
    plt.show()

    # List each pair once (upper triangle only) whose absolute correlation
    # exceeds the threshold; the signed value is printed.
    print(f"🔍 High correlations (> {HIGH_CORR_THRESHOLD}):")
    column_names = correlation_matrix.columns
    found_pair = False
    for i in range(len(column_names)):
        for j in range(i + 1, len(column_names)):
            value = correlation_matrix.iloc[i, j]
            if abs(value) > HIGH_CORR_THRESHOLD:
                print(f"{column_names[i]} - {column_names[j]}: {value:.3f}")
                found_pair = True
    if not found_pair:
        print("   (none)")
else:
    print("Not enough features available for correlation analysis")

## 9. Missing Values Analysis

In [None]:
# Null count per column, and that count as a share of all rows.
missing_values = final_data.isnull().sum()
missing_percentage = (missing_values / len(final_data)) * 100

# One row per column: absolute and relative amount of missing data.
missing_summary = pd.DataFrame({
    'Missing_Count': missing_values,
    'Missing_Percentage': missing_percentage
})

# Keep only the columns that have gaps, worst first.
has_missing = missing_summary['Missing_Count'] > 0
missing_features = missing_summary[has_missing].sort_values('Missing_Percentage', ascending=False)

if missing_features.empty:
    print("✅ No missing values found in the dataset!")
else:
    print("🔍 Features with Missing Values:")
    print(missing_features.head(20))

    # Horizontal bar chart of the (up to) 20 worst columns, worst at the top.
    top_missing = missing_features.head(20)
    bar_positions = range(len(top_missing))
    plt.figure(figsize=(12, 8))
    plt.barh(bar_positions, top_missing['Missing_Percentage'])
    plt.yticks(bar_positions, top_missing.index)
    plt.xlabel('Missing Percentage (%)')
    plt.title('Top 20 Features with Missing Values')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

# Dataset-wide totals, derived from the per-column counts computed above.
total_missing = missing_values.sum()
total_cells = final_data.size
print(f"\n📊 Overall Missing Values: {total_missing} out of {total_cells} total values")
print(f"Missing percentage: {(total_missing / total_cells) * 100:.2f}%")

## 10. Save Processed Data

In [None]:
# Save the processed dataset
print("💾 Saving processed data...")
final_data.to_csv('../data/processed_features.csv', index=False)
print("✅ Processed data saved to '../data/processed_features.csv'")

# Values reused by both the JSON summary and the printed report; compute once.
tickers_in_data = sorted(final_data['Ticker'].unique().tolist())
first_date = final_data['Date'].min()
last_date = final_data['Date'].max()

# Create and save feature summary
feature_summary = {
    'total_features': len(final_data.columns),
    'total_records': len(final_data),
    'date_range': {
        'start': first_date.strftime('%Y-%m-%d'),
        'end': last_date.strftime('%Y-%m-%d')
    },
    'tickers': tickers_in_data,
    'feature_categories': {
        'price_features': len(price_features),
        'volume_features': len(volume_features),
        'technical_features': len(technical_features),
        'sentiment_features': len(sentiment_features),
        'macro_features': len(macro_features),
        'target_features': len(target_features)
    },
    'stock_features_created': stock_fe.features_created,
    'sentiment_features_created': sentiment_fe.features_created,
    'macro_features_created': macro_fe.features_created,
    'all_features': final_data.columns.tolist()
}

# `json` is imported in the notebook's import cell. default=str stringifies
# any value the JSON encoder cannot serialise natively.
with open('../data/feature_summary.json', 'w', encoding='utf-8') as f:
    json.dump(feature_summary, f, indent=2, default=str)

print("✅ Feature summary saved to '../data/feature_summary.json'")

# Display final summary
print("\n🎉 Feature Engineering Complete!")
print("=" * 50)
print("📊 Final Dataset:")
print(f"   Shape: {final_data.shape}")
print(f"   Total Features: {len(final_data.columns)}")
print(f"   Date Range: {first_date.date()} to {last_date.date()}")
print(f"   Tickers: {len(tickers_in_data)}")
# Floor division: the average is reported as a whole number of records.
print(f"   Records per ticker (avg): {len(final_data) // len(tickers_in_data)}")

print("\n🔧 Features Created:")
print(f"   Stock/Technical: {len(stock_fe.features_created)}")
print(f"   Sentiment: {len(sentiment_fe.features_created)}")
print(f"   Macro: {len(macro_fe.features_created)}")

print("\n📈 Ready for Model Training!")
print(f"   Target variables available: {target_features}")
print("   Next step: Build ML models using this feature-rich dataset")