# Exploratory Data Analysis

This notebook performs exploratory data analysis on the financial data used for forecasting and strategy development. We analyze various statistical properties, seasonality, autocorrelation, and other characteristics of the price series to inform our strategy design.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import scipy.stats as stats

# Add src directory to path
import sys
sys.path.append('../')

# Import our data loading module
from src.data_loader import fetch_data

# Set up plotting style: matplotlib's 'ggplot' style sheet is applied first and
# seaborn's 'whitegrid' axes style second, so the seaborn call is the one that
# runs last on the shared plotting configuration.
plt.style.use('ggplot')
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

## Load Configuration & Data

In [None]:
# Read the YAML configuration that drives this analysis
with open('../config/config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# The `data` section holds the tickers to analyze and the date range
data_settings = config['data']
tickers = data_settings['tickers']
start_date = data_settings['start_date']
end_date = data_settings['end_date']

print(f"Analyzing data for tickers: {tickers}")
print(f"Period: {start_date} to {end_date}")

# Fetch each ticker's data with the project loader and report its row count
data_dict = {}
for ticker in tickers:
    ticker_data = fetch_data(ticker, start_date, end_date)
    data_dict[ticker] = ticker_data
    print(f"Loaded {ticker}: {len(ticker_data)} rows of data")

## Basic Price Analysis

In [None]:
# One stacked panel per ticker showing its 'Price' column
# squeeze=False keeps `axes` two-dimensional even for a single ticker
fig, axes = plt.subplots(len(tickers), 1, figsize=(14, 7 * len(tickers)), squeeze=False)

for ax, ticker in zip(axes[:, 0], tickers):
    data_dict[ticker]['Price'].plot(ax=ax)
    ax.set_title(f'{ticker} Price Series')
    ax.set_xlabel('Date')
    ax.set_ylabel('Price')
    ax.grid(True)

fig.tight_layout()
plt.show()

## Return Analysis

In [None]:
# Calculate simple daily returns from each ticker's 'Price' column.
# fill_method=None stops pct_change from forward-filling missing prices before
# differencing (its historical default, deprecated in recent pandas). With the
# implicit fill, a gap in the prices would show up as an artificial 0% return.
# dropna() then removes the undefined first observation and any return that
# touches a missing price.
returns_dict = {
    ticker: data_dict[ticker]['Price'].pct_change(fill_method=None).dropna()
    for ticker in tickers
}

# Plot daily returns, one stacked panel per ticker
# squeeze=False keeps `axes` two-dimensional even for a single ticker
fig, axes = plt.subplots(len(tickers), 1, figsize=(14, 7 * len(tickers)), squeeze=False)

for ax, ticker in zip(axes[:, 0], tickers):
    returns_dict[ticker].plot(ax=ax)
    ax.set_title(f'{ticker} Daily Returns')
    ax.set_xlabel('Date')
    ax.set_ylabel('Return')
    ax.grid(True)

fig.tight_layout()
plt.show()

## Statistical Properties

In [None]:
# Calculate and display key statistics for each ticker
def _summarize_returns(returns):
    """Return the summary statistics of one daily return series as a dict.

    The Sharpe ratio is the mean/std ratio scaled by sqrt(252); no risk-free
    rate is subtracted. 'Kurtosis' comes from pandas' `Series.kurtosis()`,
    which reports excess (Fisher) kurtosis. Days with a return of exactly zero
    count as neither positive nor negative, so the two percentages need not
    sum to 100.
    """
    mean_return = returns.mean()
    std_return = returns.std()
    return {
        'Mean': mean_return,
        'Median': returns.median(),
        'Min': returns.min(),
        'Max': returns.max(),
        'Std Dev': std_return,
        'Skewness': returns.skew(),
        'Kurtosis': returns.kurtosis(),
        'Sharpe Ratio (Annualized)': mean_return / std_return * np.sqrt(252),
        'Positive Days %': (returns > 0).mean() * 100,
        'Negative Days %': (returns < 0).mean() * 100,
    }


stats_dict = {ticker: _summarize_returns(returns_dict[ticker]) for ticker in tickers}

# Transpose so that each ticker is a row and each statistic a column
stats_df = pd.DataFrame(stats_dict).T
stats_df

## Return Distribution Analysis

In [None]:
# Two panels per ticker: histogram/KDE with a fitted normal curve on the left,
# normal Q-Q plot on the right
fig, axes = plt.subplots(len(tickers), 2, figsize=(14, 6 * len(tickers)), squeeze=False)

for (hist_ax, qq_ax), ticker in zip(axes, tickers):
    returns = returns_dict[ticker]

    sns.histplot(returns, kde=True, stat="density", ax=hist_ax)

    # Overlay the normal density that has the sample's mean and standard deviation
    x = np.linspace(returns.min(), returns.max(), 100)
    normal_density = stats.norm.pdf(x, returns.mean(), returns.std())
    hist_ax.plot(x, normal_density, 'r--', linewidth=2)

    hist_ax.set_title(f'{ticker} Return Distribution')
    hist_ax.set_xlabel('Return')
    hist_ax.set_ylabel('Density')
    hist_ax.grid(True)

    # Q-Q plot against the normal distribution to check normality
    stats.probplot(returns, dist="norm", plot=qq_ax)
    qq_ax.set_title(f'{ticker} Q-Q Plot')
    qq_ax.grid(True)

fig.tight_layout()
plt.show()

## Volatility Analysis

In [None]:
# Annualized rolling volatility: 21-day rolling standard deviation of daily
# returns (approximately one month) scaled by sqrt(252)
rolling_vol_dict = {
    ticker: returns_dict[ticker].rolling(window=21).std() * np.sqrt(252)
    for ticker in tickers
}

# Draw every ticker's rolling volatility on one shared axis
fig, ax = plt.subplots(figsize=(14, 7))

for ticker in tickers:
    rolling_vol_dict[ticker].plot(ax=ax, label=ticker)

ax.set_title('Annualized Rolling Volatility (21-day window)')
ax.set_xlabel('Date')
ax.set_ylabel('Volatility')
ax.legend()
ax.grid(True)
plt.show()

# Volatility clustering: autocorrelation of absolute returns for lags 0-20
fig, ax = plt.subplots(figsize=(14, 7))

for ticker in tickers:
    # Absolute returns serve as the volatility measure here
    abs_returns = returns_dict[ticker].abs()
    acf_values = acf(abs_returns, nlags=20, fft=True)
    lags = np.arange(len(acf_values))
    ax.plot(lags, acf_values, marker='o', label=ticker)

ax.set_title('Autocorrelation of Absolute Returns (Volatility Clustering)')
ax.set_xlabel('Lag')
ax.set_ylabel('Autocorrelation')
# Zero reference line
ax.axhline(y=0, color='r', linestyle='--')
ax.legend()
ax.grid(True)
plt.show()

# Volatility by calendar month: standard deviation of the daily returns that
# fall in each month (pooled across years), scaled by sqrt(252)
fig, ax = plt.subplots(figsize=(14, 7))

for ticker in tickers:
    ticker_returns = returns_dict[ticker]
    monthly_vol = ticker_returns.groupby(ticker_returns.index.month).std() * np.sqrt(252)
    # Bars of different tickers share the same x positions and overlap (alpha=0.5)
    ax.bar(monthly_vol.index, monthly_vol.values, alpha=0.5, label=ticker)

ax.set_title('Monthly Volatility Pattern')
ax.set_xlabel('Month')
ax.set_ylabel('Annualized Volatility')
ax.set_xticks(range(1, 13))
ax.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
ax.legend()
ax.grid(True)
plt.show()

## Autocorrelation Analysis

In [None]:
# Check for autocorrelation in returns (momentum/mean reversion effects)
# One row per ticker: ACF on the left, PACF on the right.
# squeeze=False keeps `axes` two-dimensional even for a single ticker; the
# figure height scales with the number of tickers like the other per-ticker grids.
fig, axes = plt.subplots(len(tickers), 2, figsize=(14, 5 * len(tickers)), squeeze=False)

for (acf_ax, pacf_ax), ticker in zip(axes, tickers):
    # plot_acf/plot_pacf open a brand-new figure unless they are handed an
    # Axes, so the target subplot must be passed explicitly via `ax=`.

    # Autocorrelation function
    plot_acf(returns_dict[ticker], lags=20, ax=acf_ax, title=f'{ticker} Return Autocorrelation')
    acf_ax.grid(True)

    # Partial autocorrelation function
    plot_pacf(returns_dict[ticker], lags=20, ax=pacf_ax, title=f'{ticker} Return Partial Autocorrelation')
    pacf_ax.grid(True)

fig.tight_layout()
plt.show()

## Stationarity Test

In [None]:
# Perform Augmented Dickey-Fuller test to check for stationarity
def _adf_summary(series):
    """Run `adfuller` on `series` and keep the fields reported below.

    Elements 0, 1 and 4 of the result are stored as the test statistic, the
    p-value and the critical values. 'Stationary' is True when the p-value is
    below 0.05.
    """
    result = adfuller(series)
    p_value = result[1]
    return {
        'ADF Statistic': result[0],
        'p-value': p_value,
        'Critical Values': result[4],
        'Stationary': p_value < 0.05,
    }


# Prices are expected to be non-stationary, returns to be stationary
adf_results = {
    ticker: {
        'Price Series': _adf_summary(data_dict[ticker]['Price']),
        'Return Series': _adf_summary(returns_dict[ticker].dropna()),
    }
    for ticker in tickers
}

# Display results: the price section follows the ticker header directly, the
# return section is preceded by a blank line
for ticker in tickers:
    print(f"\nStationarity Test Results for {ticker}:")
    for lead, label in (("", 'Price Series'), ("\n", 'Return Series')):
        summary = adf_results[ticker][label]
        print(f"{lead}{label}:")
        print(f"  ADF Statistic: {summary['ADF Statistic']:.4f}")
        print(f"  p-value: {summary['p-value']:.4f}")
        print(f"  Is Stationary: {summary['Stationary']}")

## Seasonality Analysis

In [None]:
# Analyze day-of-week effects
# `dayofweek` numbers the days 0 (Monday) through 6 (Sunday). Names are looked
# up per observed day instead of assigning a fixed five-name index, which
# raises ValueError whenever a series does not contain exactly five weekdays
# (e.g. assets that also trade on weekends, or very short samples).
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

plt.figure(figsize=(14, 7))
days_seen = set()

for ticker in tickers:
    ticker_returns = returns_dict[ticker]

    # Get average return by day of week, in percent
    dow_returns = ticker_returns.groupby(ticker_returns.index.dayofweek).mean() * 100

    # Label the result with day names, but plot at the numeric positions so
    # that days stay in calendar order even when tickers cover different days
    day_numbers = list(dow_returns.index)
    dow_returns.index = [day_names[day] for day in day_numbers]

    # Plot
    plt.bar(day_numbers, dow_returns.values, alpha=0.5, label=ticker)
    days_seen.update(day_numbers)

# Only label the days that actually occur in the data
ordered_days = sorted(days_seen)
plt.xticks(ordered_days, [day_names[day] for day in ordered_days])

plt.title('Average Daily Returns by Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Average Return (%)')
plt.legend()
plt.grid(True)
plt.show()

# Analyze month-of-year effects
# Names are looked up per observed month instead of assigning a fixed
# twelve-name index, which raises ValueError when a series does not contain
# all twelve calendar months (e.g. a sample shorter than one year).
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

plt.figure(figsize=(14, 7))

for ticker in tickers:
    ticker_returns = returns_dict[ticker]

    # Average daily return per calendar month, in percent, scaled by 21
    # (approximate trading days per month) to approximate a monthly return
    monthly_returns = ticker_returns.groupby(ticker_returns.index.month).mean() * 100 * 21

    # Label the result with month names (months are numbered from 1), but plot
    # at the numeric positions so the bars stay in calendar order
    month_numbers = list(monthly_returns.index)
    monthly_returns.index = [month_names[month - 1] for month in month_numbers]

    # Plot
    plt.bar(month_numbers, monthly_returns.values, alpha=0.5, label=ticker)

plt.title('Average Monthly Returns by Month')
plt.xlabel('Month')
plt.ylabel('Average Monthly Return (%)')
plt.xticks(range(1, 13), month_names)
plt.legend()
plt.grid(True)
plt.show()

## Correlation Analysis

In [None]:
# Collect the return series of all tickers as columns of one DataFrame
# NOTE(review): column assignment aligns each series to the frame's existing
# index, so later tickers are presumably restricted to the first ticker's
# dates — confirm this is intended.
combined_returns = pd.DataFrame()
for ticker in tickers:
    combined_returns[ticker] = returns_dict[ticker]

# Pairwise correlation of the return columns
correlation_matrix = combined_returns.corr()

# Heatmap of the correlation matrix on a fixed [-1, 1] colour scale
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, linewidths=0.5, ax=ax)
ax.set_title('Return Correlation Matrix')
fig.tight_layout()
plt.show()

# Rolling correlation is only shown when there are at least two tickers
if len(tickers) > 1:
    first_ticker, second_ticker = tickers[0], tickers[1]

    # 60-day rolling correlation between the first two tickers
    rolling_corr = combined_returns[first_ticker].rolling(window=60).corr(combined_returns[second_ticker])

    fig, ax = plt.subplots(figsize=(14, 7))
    ax.plot(rolling_corr.index, rolling_corr.values)
    ax.set_title(f'60-day Rolling Correlation between {tickers[0]} and {tickers[1]}')
    ax.set_xlabel('Date')
    ax.set_ylabel('Correlation')
    # Reference line at the full-period correlation of the same pair
    ax.axhline(y=correlation_matrix.loc[first_ticker, second_ticker], color='r', linestyle='--', label='Full-period Correlation')
    ax.legend()
    ax.grid(True)
    plt.show()

## Risk-Reward Analysis

In [None]:
# Calculate annualized return and risk metrics for each ticker.
# All values stored in `annualized_metrics` are plain fractions (0.10 == 10%);
# only the display table `annualized_df` is converted to percent below.
annualized_metrics = {}

for ticker in tickers:
    returns = returns_dict[ticker]

    # Annualize with 252 periods per year: mean scales linearly, volatility
    # with the square root
    annualized_return = returns.mean() * 252
    annualized_vol = returns.std() * np.sqrt(252)
    # Guard the zero-volatility case the same way Sortino and Calmar are guarded
    sharpe_ratio = annualized_return / annualized_vol if annualized_vol != 0 else np.nan

    # Downside risk: volatility of the negative returns only
    downside_returns = returns[returns < 0]
    downside_vol = downside_returns.std() * np.sqrt(252)
    sortino_ratio = annualized_return / downside_vol if downside_vol != 0 else np.nan

    # Maximum drawdown of the cumulative wealth curve. The curve starts from an
    # implicit wealth of 1.0 before the first return, so the running peak is
    # floored at 1.0; otherwise a loss that begins on the very first day would
    # be measured against an already-reduced peak and understated.
    cumulative_returns = (1 + returns).cumprod()
    running_max = cumulative_returns.cummax().clip(lower=1.0)
    drawdown = cumulative_returns / running_max - 1
    max_drawdown = drawdown.min()

    annualized_metrics[ticker] = {
        'Annualized Return': annualized_return,
        'Annualized Volatility': annualized_vol,
        'Sharpe Ratio': sharpe_ratio,
        'Downside Volatility': downside_vol,
        'Sortino Ratio': sortino_ratio,
        'Max Drawdown': max_drawdown,
        'Calmar Ratio': annualized_return / abs(max_drawdown) if max_drawdown != 0 else np.nan
    }

# Create DataFrame with annualized metrics (one row per ticker)
annualized_df = pd.DataFrame(annualized_metrics).T

# Format as percentages where appropriate
for col in ['Annualized Return', 'Annualized Volatility', 'Downside Volatility', 'Max Drawdown']:
    annualized_df[col] = annualized_df[col] * 100

# Display table. An explicit display() is required because this is not the
# last expression of the cell; a bare `annualized_df` here would show nothing.
display(annualized_df)

# Plot risk vs. return
# `annualized_metrics` stores fractions, while both axes are labelled in
# percent, so convert the plotted values to percent to match the labels.
volatility_pct = {
    ticker: annualized_metrics[ticker]['Annualized Volatility'] * 100 for ticker in tickers
}
return_pct = {
    ticker: annualized_metrics[ticker]['Annualized Return'] * 100 for ticker in tickers
}

plt.figure(figsize=(12, 8))
for ticker in tickers:
    point = (volatility_pct[ticker], return_pct[ticker])
    plt.scatter(point[0], point[1], s=100, label=ticker)
    # Put the ticker name next to its marker
    plt.annotate(ticker, xy=point, xytext=(5, 5), textcoords='offset points')

plt.title('Risk-Return Profile')
plt.xlabel('Annualized Volatility (%)')
plt.ylabel('Annualized Return (%)')
plt.grid(True)
plt.axhline(y=0, color='r', linestyle='--')

# Add Sharpe ratio lines: return = Sharpe * volatility, drawn out to 1.5x the
# largest observed volatility (both axes are in percent, so the slope is the
# Sharpe ratio itself)
x = np.linspace(0, max(volatility_pct.values()) * 1.5, 100)
for sharpe in [0.5, 1, 1.5, 2]:
    y = sharpe * x
    plt.plot(x, y, 'k--', alpha=0.3)
    plt.annotate(f'Sharpe = {sharpe}', xy=(x[-1], y[-1]), xytext=(5, 0), textcoords='offset points', alpha=0.5)

plt.legend()
plt.tight_layout()
plt.show()

## Conclusion and Trading Strategy Implications

In [None]:
# Summary of conclusions, rendered as Markdown in the cell output.
# NOTE: the text below is a static template, not generated from the results
# computed above. The bracketed placeholders ("[Summarize ...]", "[X]", "[Y]")
# still have to be filled in by hand after reviewing the plots and tables.

markdown_text = """
### Key Findings

1. **Stationarity**:
   - As expected, price series are non-stationary (random walk behavior)
   - Return series are stationary, allowing for predictive modeling

2. **Volatility Characteristics**:
   - Evidence of volatility clustering (periods of high/low volatility tend to persist)
   - Seasonal patterns in volatility (monthly/yearly)
   
3. **Return Patterns**:
   - Day-of-week effects: [Summarize any patterns observed]
   - Month-of-year effects: [Summarize any patterns observed]
   
4. **Autocorrelation**:
   - Short-term: [Summarize findings - presence of momentum or mean-reversion]
   - Long-term: [Summarize findings]

### Trading Strategy Implications

1. **Momentum Strategy Potential**:
   - Positive autocorrelation at certain lags suggests momentum strategies may be effective
   - Optimal lookback periods appear to be [X] days based on autocorrelation
   
2. **Mean Reversion Strategy Potential**:
   - Negative autocorrelation at certain lags suggests mean-reversion strategies may be effective
   - Z-score calculation with [Y] day window might be effective

3. **Volatility-Based Strategies**:
   - Position sizing should account for volatility clustering
   - Consider volatility breakout strategies during transitions between regimes
   
4. **Seasonal Strategies**:
   - Consider monthly rotation strategies based on observed patterns
   - Day-of-week effects can inform optimal trade entry timing

5. **Risk Management**:
   - Non-normal return distributions suggest higher risk of extreme events than standard models predict
   - Fat tails observed in return distributions require robust risk measures (like VaR)
   - Drawdown control is essential given the observed maximum drawdowns

### Next Steps

1. Implement these findings in strategy development
2. Test various lookback periods for momentum/mean-reversion based on autocorrelation findings
3. Incorporate volatility forecasting into position sizing algorithms
4. Develop ensemble approach combining multiple strategy types to exploit different market regimes
"""

# Wrap the text in a Markdown object so it is rendered rather than printed raw.
# NOTE(review): `display` is not imported in the visible part of this file;
# presumably it is the name IPython makes available in notebook sessions — confirm.
from IPython.display import Markdown
display(Markdown(markdown_text))

# Persist the key results so that other notebooks can load them
import pickle
import os

# Make sure the output directory is present before writing into it
os.makedirs('../data', exist_ok=True)

# Bundle the objects computed earlier in this notebook under descriptive keys
metrics_data = dict(
    returns_dict=returns_dict,
    annualized_metrics=annualized_metrics,
    correlation_matrix=correlation_matrix,
    adf_results=adf_results,
)

with open('../data/exploratory_metrics.pkl', 'wb') as pickle_file:
    pickle.dump(metrics_data, pickle_file)

print("Analysis complete! Key metrics saved to '../data/exploratory_metrics.pkl'")

## Additional Analysis: Market Regimes