# Extracting Specific Symbols (AAPL and SPY)

This notebook demonstrates how to create a new ITCH file containing only data for specific symbols of interest.

Filtering large ITCH files to specific symbols can significantly reduce file size and processing time for analysis focused on particular securities.

In [None]:
from pathlib import Path
from meatpy.itch50 import ITCH50MessageReader, ITCH50Writer
import time

# Locations of the raw ITCH capture and of the filtered file we will produce
data_dir = Path("data")
input_file = data_dir.joinpath("S081321-v50.txt.gz")
output_file = data_dir.joinpath("S081321-v50-AAPL-SPY.itch50")

# Tickers to keep in the filtered output
target_symbols = ["AAPL", "SPY"]

print(f"Input file: {input_file}")
print(f"Output file: {output_file}")
print(f"Target symbols: {target_symbols}")

# Report whether the source file is available and, if so, how large it is
if input_file.exists():
    print(f"✅ Input file found: {input_file}")
    input_size_gb = input_file.stat().st_size / (1024**3)
    print(f"Input file size: {input_size_gb:.2f} GB")
else:
    print(f"❌ Input file not found: {input_file}")
    print("Please place an ITCH 5.0 file (e.g., S081321-v50.txt.gz) in the data/ directory")

In [None]:
def extract_symbols(input_file, output_file, symbols):
    """Extract data for specific symbols from an ITCH file.

    Every message read from ``input_file`` is counted. A message is forwarded
    to the writer when its type is one of the administrative types listed
    below, when it has no ``stock`` attribute, or when its ``stock`` value
    matches one of ``symbols``. Progress is printed every 1,000,000 messages
    and a summary is printed at the end.

    Args:
        input_file: Path to input ITCH file
        output_file: Path to output filtered ITCH file
        symbols: List of symbol strings to extract

    Returns:
        tuple of (total_messages_processed, filtered_messages_written)
    """
    # Encode each symbol and right-pad it with spaces to 8 bytes so it can be
    # compared directly against ``message.stock``.
    symbols_bytes = [symbol.encode().ljust(8) for symbol in symbols]
    print(f"Looking for symbols: {[s.decode() for s in symbols_bytes]}")

    # Built once, outside the loop, for constant-time membership tests.
    wanted_stocks = frozenset(symbols_bytes)
    # Always include system and administrative messages.
    # These provide context needed for proper file interpretation.
    always_keep_types = frozenset(
        [b'S', b'R', b'H', b'Y', b'L', b'V', b'W', b'K', b'J']
    )

    message_count = 0
    filtered_count = 0
    # perf_counter is monotonic, so elapsed time is immune to clock changes.
    start_time = time.perf_counter()

    with ITCH50MessageReader(input_file) as reader, \
            ITCH50Writer(output_file, symbols=symbols) as writer:
        for message in reader:
            message_count += 1

            if (message.type in always_keep_types
                    or not hasattr(message, 'stock')
                    or message.stock in wanted_stocks):
                writer.process_message(message)
                filtered_count += 1

            # Progress indicator
            if message_count % 1_000_000 == 0:
                elapsed = time.perf_counter() - start_time
                rate = message_count / elapsed if elapsed > 0 else 0
                print(f"Processed {message_count:,} messages in {elapsed:.1f}s "
                      f"({rate:,.0f} msg/s), kept {filtered_count:,} messages "
                      f"({100*filtered_count/message_count:.1f}%)")

    elapsed = time.perf_counter() - start_time
    # An empty input yields zero messages; avoid dividing by zero in that case.
    kept_pct = 100 * filtered_count / message_count if message_count else 0.0
    print(f"\n✅ Extraction complete in {elapsed:.1f} seconds")
    print(f"   Total messages processed: {message_count:,}")
    print(f"   Messages written: {filtered_count:,}")
    print(f"   Filtering ratio: {kept_pct:.2f}%")

    return message_count, filtered_count

In [None]:
# Perform the extraction
if input_file.exists():
    print("🚀 Starting symbol extraction...\n")

    total_messages, filtered_messages = extract_symbols(input_file, output_file, target_symbols)

    # Check output file size
    if output_file.exists():
        output_size_gb = output_file.stat().st_size / (1024**3)
        input_size_gb = input_file.stat().st_size / (1024**3)

        print("\n📁 File size comparison:")
        print(f"   Input file:  {input_size_gb:.2f} GB")
        print(f"   Output file: {output_size_gb:.2f} GB")

        # NOTE(review): the input name ends in .gz while the output name has no
        # compression suffix, so this presumably compares compressed input
        # bytes with uncompressed output bytes - confirm before quoting it.
        if input_size_gb > 0:
            compression_ratio = output_size_gb / input_size_gb * 100
            print(f"   Size reduction: {100-compression_ratio:.1f}% smaller")
            print(f"   Compression ratio: {compression_ratio:.1f}%")
        else:
            # A zero-byte input would otherwise raise ZeroDivisionError.
            print("   Size ratio unavailable: input file is empty")
    else:
        print(f"❌ Output file was not created: {output_file}")
else:
    print("⚠️  Cannot run extraction without input file")

In [None]:
# Verify the output by reading a sample of messages
if output_file.exists():
    print("\n🔍 Verifying output file - first 20 messages:")

    # Labels for messages that have no ``stock`` attribute. Built once here
    # rather than on every loop iteration.
    msg_descriptions = {
        'S': 'System Event',
        'R': 'Stock Directory',
        'H': 'Trading Halt',
        'Y': 'Reg SHO',
        'L': 'Market Participant Position'
    }

    with ITCH50MessageReader(output_file) as reader:
        for i, message in enumerate(reader):
            # Only the first 20 messages are sampled.
            if i >= 20:
                break

            msg_type = message.type.decode()

            if hasattr(message, 'stock'):
                # The stock value is decoded and stripped of surrounding whitespace.
                symbol = message.stock.decode().strip()
                print(f"  {i+1:2d}. Type {msg_type} - Symbol: {symbol}")
            else:
                # No symbol available: fall back to a description of the type
                desc = msg_descriptions.get(msg_type, 'System message')
                print(f"  {i+1:2d}. Type {msg_type} - {desc}")

    print("\n✅ Output file verification complete")
    print(f"   File contains filtered data for symbols: {target_symbols}")
    print(f"   File path: {output_file}")

## Key Points

- **System Messages**: Always include system and administrative messages (S, R, H, Y, L, V, W, K, J) as they provide essential context
- **Symbol Filtering**: All other messages that carry a stock symbol are kept only when that symbol is one of the target symbols; messages without a stock symbol are always kept
- **Significant Size Reduction**: Filtering to specific symbols can reduce file size by 90%+ depending on how many symbols are in the original file
- **Processing Speed**: Smaller filtered files process much faster for subsequent analysis
- **Output Format**: The output is a valid ITCH 5.0 file that can be processed by any ITCH-compatible tool

## Performance Tips

- **Early Filtering**: Filter as early as possible in your data pipeline to reduce downstream processing time
- **Multiple Symbols**: You can filter for multiple symbols in a single pass
- **Memory Usage**: The ITCH50Writer buffers data efficiently to minimize memory usage during filtering
- **Parallel Processing**: For multiple symbol sets, consider running extractions in parallel

## Next Steps

With your filtered file, you can now:
1. Process order book data much faster
2. Generate snapshots at regular intervals
3. Calculate trading metrics and statistics
4. Create visualizations and reports