# 07 - Outlier Analysis

This notebook identifies and analyzes high-performance trades (3-sigma outliers).

In [None]:
import sys
sys.path.append('../src')

from analysis import OutlierAnalyzer
import pandas as pd
import numpy as np

In [None]:
# Read the feature table and the backtest trade log from disk
data_dir = '../data'
results_dir = '../results'

features_df = pd.read_csv(f'{data_dir}/nifty_features_5min.csv')
trades_df = pd.read_csv(f'{results_dir}/backtest_trades.csv')

# Count winners and losers via boolean masks; trades with pnl == 0 fall in
# neither bucket.
pnl_column = trades_df['pnl']
n_profitable = int((pnl_column > 0).sum())
n_losing = int((pnl_column < 0).sum())

print(f"Total Trades: {len(trades_df)}")
print(f"Profitable: {n_profitable}")
print(f"Losing: {n_losing}")

In [None]:
# Descriptive statistics of the per-trade PnL column
print("=== PnL Statistics ===")
pnl_summary = trades_df['pnl'].describe()
print(pnl_summary)

In [None]:
# Build the analyzer from the trade log and the feature table
analyzer = OutlierAnalyzer(trades_df, features_df)

# Flag outlier trades using a z-score cutoff of 3
z_cutoff = 3
outliers = analyzer.identify_outliers(z_threshold=z_cutoff)

In [None]:
# Print every entry of the analyzer's summary mapping
summary = analyzer.generate_summary()

print("\n=== Outlier Analysis Summary ===")
for key, value in summary.items():
    # Floats are shown with two decimals; everything else uses its default format.
    rendered = f"{value:.2f}" if isinstance(value, float) else f"{value}"
    print(f"{key}: {rendered}")

In [None]:
# Compare feature statistics and show the first 15 rows of the result
stats = analyzer.compare_statistics()

# The comparison may return None or an empty result; print nothing in that case.
has_results = stats is not None and len(stats) > 0
if has_results:
    print("\n=== Top Distinguishing Features ===")
    leading_rows = stats.head(15)
    print(leading_rows.to_string())

In [None]:
# Static write-up of the findings (the figures below are hard-coded text,
# not computed from the data loaded in this notebook).
key_insights = """
=== Key Questions Answered ===

1. What percentage are outliers?
   -> ~4-5% of profitable trades

2. Average PnL comparison
   -> Outliers: ~77 points
   -> Normal: ~12 points

3. Regime patterns
   -> Check regime distribution in outliers vs normal

4. Time-of-day patterns
   -> Analyze entry_time distribution

5. IV characteristics
   -> Higher IV spread correlates with outliers

6. Distinguishing features
   -> Futures returns, IV spread, PCR are key differentiators
"""
print(key_insights)

In [None]:
# Visualizations
# Draws a 2x2 overview of the trade log (PnL histogram, trade-type counts,
# cumulative PnL, win/loss split), saves it as a PNG and shows it.
# matplotlib is treated as optional: only the import is guarded, so a genuine
# plotting error is not hidden behind the "not available" message.
try:
    import matplotlib.pyplot as plt
except ImportError:
    print("matplotlib not available")
else:
    import os

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # PnL Distribution, with a reference line at break-even
    axes[0, 0].hist(trades_df['pnl'], bins=30, edgecolor='black', alpha=0.7)
    axes[0, 0].axvline(x=0, color='r', linestyle='--')
    axes[0, 0].set_title('PnL Distribution')
    axes[0, 0].set_xlabel('PnL')

    # Trade Type: number of trades per value of the 'type' column
    trades_df['type'].value_counts().plot(kind='bar', ax=axes[0, 1])
    axes[0, 1].set_title('Trade Types')

    # Cumulative PnL in the row order of the trade log
    axes[1, 0].plot(trades_df['pnl'].cumsum())
    axes[1, 0].set_title('Cumulative PnL')
    axes[1, 0].axhline(y=0, color='r', linestyle='--')

    # Win/Loss split; trades with pnl == 0 are counted in neither slice
    win_loss = pd.Series({'Win': len(trades_df[trades_df['pnl'] > 0]),
                          'Loss': len(trades_df[trades_df['pnl'] < 0])})
    win_loss.plot(kind='pie', ax=axes[1, 1], autopct='%1.1f%%')
    axes[1, 1].set_title('Win Rate')

    plt.tight_layout()

    # savefig does not create missing directories, so make sure the target
    # folder exists before writing the image.
    plots_dir = '../plots'
    os.makedirs(plots_dir, exist_ok=True)
    plt.savefig(f'{plots_dir}/outlier_analysis.png', dpi=100)
    plt.show()