# Feature Engineering Analysis
Complete analysis of the feature engineering pipeline.

In [None]:
import sys
from pathlib import Path
sys.path.append(str(Path('..').resolve()))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

from src.data.database_manager import DatabaseManager
from src.features import TechnicalFeatures, BehavioralFeatures, MarketRegimeFeatures, FeaturePipeline

# Notebook-wide configuration: pandas display limits and the plotting theme.
for option_name, option_value in (
    ('display.max_columns', 30),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)
plt.style.use('seaborn-v0_8-darkgrid')

# Single database handle shared by every cell below (path is relative to
# the notebook's working directory).
db = DatabaseManager("../data/trading_risk.db")
print("Database initialized")

In [None]:
# Load every account and report how they split across account types.
accounts = db.get_accounts()
print(f"Total accounts: {len(accounts)}")
print("\nAccount Types:")
print(accounts['account_type'].value_counts())

# Fetch one summary per account (lazily, in row order) and keep only those
# reporting at least 100 trading days.
candidate_summaries = (
    db.get_account_summary(account_row['account_id'])
    for _, account_row in accounts.iterrows()
)
account_summaries = [
    candidate
    for candidate in candidate_summaries
    if candidate['trading_days'] >= 100
]

print(f"\nAccounts with 100+ days: {len(account_summaries)}")

In [None]:
# Smoke-test feature generation on a handful of qualifying accounts and
# collect one row of summary statistics per account.

# One pipeline instance combining all three feature generators; it is
# reused for every account.
pipeline = FeaturePipeline([
    TechnicalFeatures(),
    BehavioralFeatures(),
    MarketRegimeFeatures()
])

feature_stats = []

for acc_summary in account_summaries[:5]:  # only the first 5 accounts
    account_id = acc_summary['account_id']

    # Raw inputs for this account.
    daily = db.get_account_daily_summary(account_id=account_id)
    fills = db.get_fills(account_id=account_id)

    features = pipeline.generate_features(daily, fills, account_id)

    # Accounts that yield no features contribute no statistics row.
    if features.empty:
        continue

    column_names = features.columns
    missing_cells = features.isnull().sum().sum()
    feature_stats.append({
        'account_id': account_id,
        'account_type': acc_summary['account_type'],
        'n_days': len(daily),
        'n_features': len(column_names) - 1,  # one column is the date
        'missing_pct': missing_cells / features.size * 100,
        'tech_features': sum(1 for name in column_names if name.startswith('tech_')),
        'behav_features': sum(1 for name in column_names if name.startswith('behav_')),
        'regime_features': sum(1 for name in column_names if name.startswith('regime_'))
    })

stats_df = pd.DataFrame(feature_stats)
print("Feature Generation Statistics:")
stats_df

In [None]:
# Analyze feature quality for a single account.
# NOTE(review): "best" is simply the first account that passed the 100-day
# filter; nothing in this notebook ranks the summaries by P&L or any other
# metric - confirm that is the intended choice.
if not account_summaries:
    # Fail with an actionable message instead of a bare IndexError.
    raise ValueError("No accounts with 100+ trading days are available to analyze")

best_account = account_summaries[0]
account_id = best_account['account_id']

print(f"Analyzing account {account_id}")
print(f"Total P&L: ${best_account['total_pl']:,.2f}")
print(f"Trading days: {best_account['trading_days']}")

# Get full data
daily = db.get_account_daily_summary(account_id=account_id)
fills = db.get_fills(account_id=account_id)

# Generate features
features = pipeline.generate_features(daily, fills, account_id)

# Attach each date's 'net' value from the daily summary to the feature rows.
feature_analysis = features.merge(
    daily[['date', 'net']],
    on='date',
    how='left'
)

# The forward target pairs each row with the NEXT row's 'net', so the rows
# must be in chronological order. Sort explicitly (stable, so already
# ordered data is left untouched) rather than relying on upstream ordering.
# Assumes 'date' values sort chronologically (datetimes or ISO-8601
# strings) - TODO confirm.
feature_analysis = (
    feature_analysis
    .sort_values('date', kind='stable')
    .reset_index(drop=True)
)

# Create forward target; the final row has no successor and is dropped.
feature_analysis['target'] = feature_analysis['net'].shift(-1)
feature_analysis = feature_analysis.dropna(subset=['target'])

print(f"\nFeature shape: {feature_analysis.shape}")

In [None]:
# Feature importance analysis: fit a shallow random forest on the forward
# target and rank features by the model's impurity-based importances.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Every column except the merge key, the raw 'net' column and the shifted
# 'target' is treated as a model input.
feature_cols = [c for c in feature_analysis.columns if c not in ['date', 'net', 'target']]

# fillna(0) alone leaves +/-inf in place (e.g. from a division by zero in
# a feature), and scikit-learn's input validation raises on infinite
# values. Map them to NaN first so they receive the same 0 fill.
X = feature_analysis[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
y = feature_analysis['target']

# Chronological split: shuffle=False keeps the last 20% of rows as the test
# set so the model is never evaluated on rows earlier than its training data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False  # Time series split
)

# Train model (fixed seed so the ranking is reproducible between runs)
rf = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
rf.fit(X_train, y_train)

# Get importance, highest first
importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

# Plot top 20 features
plt.figure(figsize=(10, 8))
top_features = importance.head(20)
plt.barh(range(len(top_features)), top_features['importance'])
plt.yticks(range(len(top_features)), top_features['feature'])
# barh places position 0 at the bottom; flip the axis so the most important
# feature is at the top of the chart.
plt.gca().invert_yaxis()
plt.xlabel('Importance')
plt.title('Top 20 Features by Importance')
plt.tight_layout()
plt.show()

# Score: R² on the training rows vs. the held-out (later) rows
train_score = rf.score(X_train, y_train)
test_score = rf.score(X_test, y_test)
print(f"\nRandom Forest R² - Train: {train_score:.3f}, Test: {test_score:.3f}")

In [None]:
# Inspect the column index of the merged analysis frame; as the only
# expression in this cell, its value is what the cell displays.
feature_analysis.columns

In [None]:
# Analyze feature categories: total model importance per feature-name prefix.
# Features whose names match none of the three prefixes are not represented.
category_importance = {
    'technical': importance[importance['feature'].str.startswith('tech_')]['importance'].sum(),
    'behavioral': importance[importance['feature'].str.startswith('behav_')]['importance'].sum(),
    'regime': importance[importance['feature'].str.startswith('regime_')]['importance'].sum()
}

plt.figure(figsize=(8, 6))
# Materialize the dict views as lists: pie() converts its data with NumPy,
# which may not accept a dict view but always accepts a plain sequence.
plt.pie(
    list(category_importance.values()),
    labels=list(category_importance.keys()),
    autopct='%1.1f%%'
)
plt.title('Feature Importance by Category')
plt.show()

In [None]:
# Correlation analysis: pairwise correlations among the 15 highest-ranked
# features and the forward target.
top_feature_names = importance['feature'].head(15).tolist()
correlation_data = feature_analysis[[*top_feature_names, 'target']]
correlation_matrix = correlation_data.corr()

# Annotated heatmap with the colour scale centred on zero correlation.
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Feature stability over time: for each of the three highest-ranked
# features, plot the raw series with a rolling mean and a +/- 1 rolling
# standard deviation band.
stability_window = 60

fig, axes = plt.subplots(3, 1, figsize=(15, 10))
dates = feature_analysis['date']

# One panel per feature; zip stops early if fewer than three features exist.
for ax, feat in zip(axes, importance.head(3)['feature']):
    series = feature_analysis[feat]
    window = series.rolling(stability_window)
    band_center = window.mean()
    band_halfwidth = window.std()

    ax.plot(dates, series, alpha=0.3, label='Value')
    ax.plot(dates, band_center, label=f'{stability_window}d Mean')
    ax.fill_between(
        dates,
        band_center - band_halfwidth,
        band_center + band_halfwidth,
        alpha=0.2
    )
    ax.set_title(f'{feat} - Stability Analysis')
    ax.legend()
    ax.set_xlabel('Date')

plt.tight_layout()
plt.show()

In [None]:
# Feature effectiveness by market regime.
# Regimes are terciles of the percentile rank of 'tech_volatility_20d';
# rows where that feature is NaN receive no regime and are ignored.
volatility = feature_analysis['tech_volatility_20d']
vol_percentile = volatility.rank(pct=True)

# Single source of truth for the labels, used for both binning and looping.
regime_labels = ['Low Vol', 'Med Vol', 'High Vol']
regimes = pd.cut(vol_percentile, bins=[0, 0.33, 0.67, 1.0], labels=regime_labels)

# Absolute correlation of each top feature with the target, per regime.
# Regimes with 20 or fewer rows are skipped.
regime_correlations = {}
for regime in regime_labels:
    mask = regimes == regime
    if mask.sum() > 20:
        regime_data = feature_analysis[mask]
        correlations = regime_data[top_feature_names].corrwith(regime_data['target']).abs()
        regime_correlations[regime] = correlations

# Plot
regime_corr_df = pd.DataFrame(regime_correlations)
if regime_corr_df.empty:
    # DataFrame.plot raises on a frame with no numeric data; report instead.
    print("No volatility regime has more than 20 observations; skipping regime plot")
else:
    regime_corr_df.plot(kind='bar', figsize=(12, 6))
    plt.title('Feature Correlation with Target by Volatility Regime')
    plt.xlabel('Features')
    plt.ylabel('Absolute Correlation')
    plt.xticks(rotation=45)
    plt.legend(title='Regime')
    plt.tight_layout()
    plt.show()

In [None]:
# Summary statistics: feature count, data-quality flags and the ten
# highest-ranked features.
print("Feature Engineering Summary:")
print(f"Total features generated: {len(feature_cols)}")

# A feature is "sparse" when more than 30% of its rows are missing.
missing_per_feature = feature_analysis[feature_cols].isnull().sum()
n_sparse = (missing_per_feature > len(feature_analysis) * 0.3).sum()
print(f"Features with >30% missing: {n_sparse}")

# A feature is "constant" when it has at most one distinct non-null value.
n_constant = (feature_analysis[feature_cols].nunique() <= 1).sum()
print(f"Constant features: {n_constant}")

print("\nTop 10 most important features:")
top_ten = importance.head(10)
for feature_name, score in zip(top_ten['feature'], top_ten['importance']):
    print(f"  {feature_name}: {score:.4f}")