# 📊 Market Data Exploration Guide

This notebook shows you how to load and explore parquet data from your Personal Quant Desk.

## What You'll Learn:
- Load parquet files
- Explore OHLCV data
- Visualize price action
- Calculate basic statistics
- Check data quality

In [None]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Plotting setup
# The matplotlib style sheet is applied before the seaborn palette is set;
# keep this order when editing.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably need a recent
# matplotlib release — confirm against the project's pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline
sns.set_palette("husl")

# Add parent directory to path for imports
# ('..' is a relative entry, so it depends on the directory the kernel runs in)
import sys
sys.path.append('..')

print("✅ Imports successful!")

## 1️⃣ Finding Your Data

First, let's see what parquet files you have:

In [None]:
# Define data directory (relative to the notebook's working directory)
data_dir = Path('../data/processed')

# Always bind `parquet_files`, even when the directory is missing, so that
# later cells can reference it without raising NameError. The list is sorted
# so the listing (and any "first file" suggestion) is stable between runs.
parquet_files = sorted(data_dir.glob('*.parquet')) if data_dir.exists() else []

if not data_dir.exists():
    print(f"⚠️  Directory not found: {data_dir}")
    print("\n💡 Create it with: mkdir -p ../data/processed")
elif parquet_files:
    print(f"📁 Found {len(parquet_files)} parquet files:\n")
    for f in parquet_files:
        # st_size is in bytes; convert to mebibytes for display
        size_mb = f.stat().st_size / (1024 * 1024)
        print(f"  • {f.name:30s} ({size_mb:.2f} MB)")
else:
    print("⚠️  No parquet files found!")
    print("\n💡 First download some data:")
    print("   cd ../data")
    print("   python main.py update --symbols SPY --days 100")

## 2️⃣ Loading a Parquet File

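Let's load one file and explore it. The cells below assume the file has a datetime index and `Open`, `High`, `Low`, `Close`, and `Volume` columns: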

In [None]:
# Choose a file (change this to your actual file)
symbol = 'SPY'  # Change to your symbol
file_path = data_dir / f'{symbol}.parquet'

# Load the parquet file
if file_path.exists():
    df = pd.read_parquet(file_path)

    print(f"✅ Loaded {symbol} data")
    print(f"\n📊 Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
    # Calling .date() on the index bounds requires a datetime-like index
    print(f"📅 Date range: {df.index.min().date()} to {df.index.max().date()}")
    print(f"\n🔍 Columns: {', '.join(df.columns.tolist())}")
else:
    print(f"❌ File not found: {file_path}")
    # Look the candidates up here rather than reading a variable created by
    # another cell: that variable is not guaranteed to exist (e.g. when the
    # data directory is missing), which would turn this hint into a NameError.
    available = sorted(p.stem for p in data_dir.glob('*.parquet')) if data_dir.is_dir() else []
    if available:
        print("\n💡 Available files:")
        print(f"   Try: symbol = '{available[0]}'")
    else:
        print(f"\n💡 No parquet files available in {data_dir} - download some data first")

In [None]:
# Show the top of the frame; the bare last expression gets the rich table display
print("📈 First 5 rows:")
df.head(n=5)

In [None]:
# Show the bottom of the frame; the bare last expression gets the rich table display
print("📉 Last 5 rows:")
df.tail(n=5)

## 3️⃣ Data Quality Check

In [None]:
# Structural overview of the frame (index, columns, dtypes, non-null counts)
print("📋 Data Info:")
df.info(buf=None)  # buf=None keeps the default destination (standard output)

In [None]:
# Count nulls per column, then report only the columns that actually have gaps
missing = df.isna().sum()
columns_with_gaps = missing[missing > 0]

if columns_with_gaps.empty:
    print("✅ No missing values!")
else:
    print("⚠️  Missing values found:")
    print(columns_with_gaps)

In [None]:
# Descriptive statistics for the numeric columns (quartiles spelled out explicitly)
print("📊 Statistical Summary:")
df.describe(percentiles=[0.25, 0.5, 0.75])

## 4️⃣ Price Visualization

In [None]:
# Close price as a line, with the High-Low range shaded behind it
fig, ax = plt.subplots(figsize=(14, 6))

dates = df.index
ax.fill_between(dates, df['Low'], df['High'], alpha=0.2, label='High-Low Range')
ax.plot(dates, df['Close'], linewidth=1.5, label='Close Price')

# Titles and axis labels
axis_font = {'fontsize': 12}
ax.set_title(f'{symbol} Price History', fontsize=16, fontweight='bold')
ax.set_xlabel('Date', **axis_font)
ax.set_ylabel('Price ($)', **axis_font)

# Put the legend back in draw order of the original (line first, then band)
handles, labels = ax.get_legend_handles_labels()
order = [labels.index('Close Price'), labels.index('High-Low Range')]
ax.legend([handles[i] for i in order], [labels[i] for i in order], loc='best')

ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Two stacked panels sharing the x-axis: OHLC as lines/band on top, volume bars below
# (this is a line-based view of OHLC, not a true candlestick chart)
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10), sharex=True)
dates = df.index

# --- Top panel: price ---
price_color = '#2E86DE'
ax1.plot(dates, df['Close'], linewidth=2, label='Close', color=price_color)
ax1.plot(dates, df['Open'], linewidth=1, alpha=0.7, label='Open', color='#54A0FF')
ax1.fill_between(dates, df['Low'], df['High'], alpha=0.15, color=price_color)
ax1.set_title(f'{symbol} OHLC Data', fontsize=16, fontweight='bold')
ax1.set_ylabel('Price ($)', fontsize=12)
ax1.legend(loc='best')

# --- Bottom panel: volume ---
ax2.bar(dates, df['Volume'], width=0.8, alpha=0.6, color='#EE5A6F')
ax2.set_xlabel('Date', fontsize=12)
ax2.set_ylabel('Volume', fontsize=12)

# Same light grid on both panels
for panel in (ax1, ax2):
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5️⃣ Calculate Returns

In [None]:
# Simple percentage returns between consecutive closes (the first row has no
# predecessor, so it is NaN)
df['Returns'] = df['Close'].pct_change()

ret = df['Returns']
mean_ret = ret.mean()
std_ret = ret.std()
# Annualised with sqrt(252); this assumes one bar per trading day and
# subtracts no risk-free rate
sharpe = mean_ret / std_ret * np.sqrt(252)

print("📈 Returns Statistics:")
print(f"  Mean return: {mean_ret:.4%}")
print(f"  Std dev:     {std_ret:.4%}")
print(f"  Min return:  {ret.min():.4%}")
print(f"  Max return:  {ret.max():.4%}")
print(f"  Sharpe (annualized): {sharpe:.2f}")

In [None]:
# Side-by-side view of returns: time series on the left, histogram on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

ret_series = df['Returns']
ret_mean = ret_series.mean()
title_font = {'fontsize': 14, 'fontweight': 'bold'}

# Left: returns over time with a zero reference line
ax1.plot(df.index, ret_series, linewidth=0.8, alpha=0.7)
ax1.axhline(y=0, color='red', linestyle='--', alpha=0.5)
ax1.set_title('Daily Returns Over Time', **title_font)
ax1.set_xlabel('Date')
ax1.set_ylabel('Return')

# Right: histogram (NaNs removed first) with the mean marked
ax2.hist(ret_series.dropna(), bins=50, alpha=0.7, edgecolor='black')
ax2.axvline(x=ret_mean, color='red', linestyle='--', label=f"Mean: {ret_mean:.4%}")
ax2.set_title('Returns Distribution', **title_font)
ax2.set_xlabel('Return')
ax2.set_ylabel('Frequency')
ax2.legend()

for panel in (ax1, ax2):
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6️⃣ Simple Technical Indicators

In [None]:
close = df['Close']

# Simple moving averages over 20, 50 and 200 rows
for window in (20, 50, 200):
    df[f'SMA_{window}'] = close.rolling(window=window).mean()

# Bollinger Bands: 20-row rolling mean, plus/minus two rolling standard deviations
bb_roll = close.rolling(window=20)
df['BB_middle'] = bb_roll.mean()
df['BB_std'] = bb_roll.std()
band_offset = df['BB_std'] * 2
df['BB_upper'] = df['BB_middle'] + band_offset
df['BB_lower'] = df['BB_middle'] - band_offset

print("✅ Calculated moving averages and Bollinger Bands")

In [None]:
# Close price overlaid with the three moving averages and the Bollinger Band envelope
fig, ax = plt.subplots(figsize=(14, 7))
dates = df.index

# Price first, then each moving average with a shared line style
ax.plot(dates, df['Close'], linewidth=2, label='Close', color='black')
sma_styles = [
    ('SMA_20', 'SMA 20', 'blue'),
    ('SMA_50', 'SMA 50', 'orange'),
    ('SMA_200', 'SMA 200', 'red'),
]
for column, legend_name, line_color in sma_styles:
    ax.plot(dates, df[column], linewidth=1.5, label=legend_name, color=line_color, alpha=0.7)

# Shaded region between the upper and lower bands
ax.fill_between(dates, df['BB_upper'], df['BB_lower'], alpha=0.1, color='gray', label='Bollinger Bands')

ax.set_title(f'{symbol} Price with Technical Indicators', fontsize=16, fontweight='bold')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Price ($)', fontsize=12)
ax.legend(loc='best')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 7️⃣ Load Multiple Symbols

In [None]:
# Load multiple symbols
symbols = ['SPY', 'QQQ', 'GLD']  # Change to your symbols
data_dict = {}

# The loop uses its own names (`ticker`, `ticker_path`) so it does not
# overwrite the notebook-level `symbol` / `file_path` that the single-symbol
# cells above rely on; otherwise re-running those cells would label the
# single-symbol `df` with the last ticker in this list.
for ticker in symbols:
    ticker_path = data_dir / f'{ticker}.parquet'
    if ticker_path.exists():
        data_dict[ticker] = pd.read_parquet(ticker_path)
        print(f"✅ Loaded {ticker}: {len(data_dict[ticker])} rows")
    else:
        print(f"⚠️  {ticker}.parquet not found")

if data_dict:
    # Create combined dataframe with closing prices (one column per symbol).
    # `frame` rather than `df` keeps the comprehension from reading like it
    # touches the single-symbol frame.
    closes = pd.DataFrame({ticker: frame['Close'] for ticker, frame in data_dict.items()})
    print(f"\n📊 Combined data shape: {closes.shape}")
else:
    print("\n⚠️  No data loaded. Download some data first!")

In [None]:
# Normalize and plot multiple symbols
if data_dict:
    # Rebase every symbol to 100 at its own first available close.
    # Dividing by `closes.iloc[0]` alone would turn a whole column into NaN
    # whenever that symbol has no price on the first row of the combined index
    # (e.g. symbols whose histories start on different dates). Back-filling
    # before taking row 0 yields each column's first non-missing value instead.
    base_prices = closes.bfill().iloc[0]
    normalized = (closes / base_prices) * 100

    fig, ax = plt.subplots(figsize=(14, 6))

    for col in normalized.columns:
        ax.plot(normalized.index, normalized[col], linewidth=2, label=col)

    ax.set_title('Normalized Price Comparison (Base = 100)', fontsize=16, fontweight='bold')
    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('Normalized Price', fontsize=12)
    ax.legend(loc='best')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

## 8️⃣ Correlation Analysis

In [None]:
# A correlation matrix only makes sense with at least two loaded symbols
if len(data_dict) >= 2:
    # Percentage returns per symbol; rows with any missing value are dropped
    returns = closes.pct_change().dropna()

    # Pairwise correlation of the return columns
    corr = returns.corr()

    # Annotated heatmap drawn on the axes created just above
    fig, ax = plt.subplots(figsize=(8, 6))
    heatmap_options = dict(
        annot=True,
        fmt='.3f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )
    sns.heatmap(corr, ax=ax, **heatmap_options)
    ax.set_title('Returns Correlation Matrix', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

    print("\n📊 Correlation Matrix:")
    print(corr)

## 🎯 Next Steps

Now that you know how to load and explore data, you can:

1. **Use the data ingestion system**:
   ```python
   from data.ingestion import HybridDataManager
   manager = HybridDataManager()
   data, metadata = manager.download_instrument('AAPL', '2024-01-01', '2024-12-31')
   ```

2. **Use the feature engineering pipeline**:
   ```python
   from data.features.feature_pipeline import FeaturePipeline
   pipeline = FeaturePipeline()
   features = pipeline.generate_features(df, 'SPY')
   ```

3. **Explore backtesting**:
   - Check out the backtesting module
   - Test strategies on historical data

4. **Build custom strategies**:
   - Use the strategies module
   - Create your own trading logic

Happy exploring! 🚀