# Dataset Optimization
Optimize the parsed ping dataset for maximum space efficiency using binary IP storage and optimized data types.

**Expected results**: 40% file size reduction (tested with real data)

In [7]:
import polars as pl
import os
from pathlib import Path
import glob
import ipaddress

# Configure polars
pl.Config.set_fmt_str_lengths(50)
pl.Config.set_tbl_rows(20)
pl.Config.set_tbl_cols(20)

polars.config.Config

In [8]:
# Setup input files and load probe IP mapping
INPUT_DIR = "data/ping_parsed_parts"
input_files = sorted(glob.glob(f"{INPUT_DIR}/*.parquet"))
print(f"📁 Found {len(input_files)} parsed parquet files")

PROBE_MAP_FILE = "probe_ip_map.csv"

if os.path.exists(PROBE_MAP_FILE):
    probe_map = pl.read_csv(PROBE_MAP_FILE)
    print(f"📋 Loaded probe IP mapping: {probe_map.height:,} entries")
    print("Sample probe mapping:")
    display(probe_map.head(5))
    
    # Analyze probe IP distribution
    probe_ips = probe_map['ip'].to_list()
    probe_ipv4_count = sum(1 for ip in probe_ips if ':' not in ip)
    probe_ipv6_count = len(probe_ips) - probe_ipv4_count
    
    print(f"\n🌐 Probe IP distribution:")
    print(f"  IPv4 probes: {probe_ipv4_count:,} ({probe_ipv4_count/len(probe_ips)*100:.1f}%)")
    print(f"  IPv6 probes: {probe_ipv6_count:,} ({probe_ipv6_count/len(probe_ips)*100:.1f}%)")
    
else:
    print(f"⚠️  Probe mapping file not found: {PROBE_MAP_FILE}")
    print(f"   Will skip probe ID → IP conversion")
    probe_map = None

📁 Found 966 parsed parquet files
📋 Loaded probe IP mapping: 48,739 entries
Sample probe mapping:


ip,dst_prb_id
str,i64
"""45.138.229.91""",1
"""2a10:3781:e22:1:220:4aff:fec8:23d7""",1
"""77.174.76.85""",3
"""2a02:a467:f500:1:220:4aff:fec8:2532""",3
"""83.163.50.165""",4



🌐 Probe IP distribution:
  IPv4 probes: 32,937 (67.6%)
  IPv6 probes: 15,802 (32.4%)


In [9]:
# Analyze data ranges and IP distribution
print("Analyzing data across sample files...")

# Sample a few files to understand ranges and IP distribution
sample_files = input_files[:3] if len(input_files) > 3 else input_files
all_data = []

for file in sample_files:
    df = pl.scan_parquet(file).collect()
    all_data.append(df)

combined = pl.concat(all_data)

print(f"\nAnalyzing {combined.height:,} rows from {len(sample_files)} files...")

# Analyze numeric ranges
print("\nNumeric field analysis:")
for col in ['prb_id', 'sent', 'rcvd', 'avg', 'ts']:
    if col in combined.columns:
        min_val = combined[col].min()
        max_val = combined[col].max()
        unique_count = combined[col].n_unique()
        print(f"  {col}: {min_val} to {max_val} ({unique_count} unique values)")

# Analyze destination IP addresses
unique_dst_ips = combined['dst_addr'].unique().to_list()
dst_ipv4_count = sum(1 for ip in unique_dst_ips if ':' not in ip)
dst_ipv6_count = len(unique_dst_ips) - dst_ipv4_count
print(f"\n🎯 Destination IP analysis:")
print(f"  Total unique dest IPs: {len(unique_dst_ips):,}")
print(f"  IPv4 destinations: {dst_ipv4_count:,} ({dst_ipv4_count/len(unique_dst_ips)*100:.1f}%)")
print(f"  IPv6 destinations: {dst_ipv6_count:,} ({dst_ipv6_count/len(unique_dst_ips)*100:.1f}%)")

# Analyze probe IDs and potential for source IP mapping
unique_probe_ids = combined['prb_id'].unique().to_list()
print(f"\n🔍 Probe ID analysis:")
print(f"  Unique probe IDs: {len(unique_probe_ids):,}")
if probe_map is not None:
    mapped_probes = set(unique_probe_ids) & set(probe_map['dst_prb_id'].to_list())
    print(f"  Probes with IP mapping: {len(mapped_probes):,} ({len(mapped_probes)/len(unique_probe_ids)*100:.1f}%)")

# Estimate space savings
total_rows = combined.height
print(f"\n💾 Estimated space savings per {total_rows:,} rows:")
print(f"  Probe ID (i64 → u32): {total_rows * 4:,} bytes saved")
print(f"  Destination IPs: {dst_ipv4_count * 9 + dst_ipv6_count * 19:,} bytes saved (estimated)")
print(f"  Other optimizations: {total_rows * 18:,} bytes saved")
if probe_map is not None:
    print(f"  Plus: Source IP addresses from probe mapping!")

Analyzing data across sample files...

Analyzing 84,482,613 rows from 3 files...

Numeric field analysis:
  prb_id: 1 to 1011438 (13055 unique values)
  sent: 3 to 3 (1 unique values)
  rcvd: 0 to 3 (4 unique values)
  avg: -1.0 to 51082.65367266667 (66060775 unique values)
  ts: 1749254400 to 1749268799 (14229 unique values)

🎯 Destination IP analysis:
  Total unique dest IPs: 11,967
  IPv4 destinations: 8,704 (72.7%)
  IPv6 destinations: 3,263 (27.3%)

🔍 Probe ID analysis:
  Unique probe IDs: 13,055
  Probes with IP mapping: 11,795 (90.3%)

💾 Estimated space savings per 84,482,613 rows:
  Probe ID (i64 → u32): 337,930,452 bytes saved
  Destination IPs: 140,333 bytes saved (estimated)
  Other optimizations: 1,520,687,034 bytes saved
  Plus: Source IP addresses from probe mapping!


In [10]:
# Analyze data ranges and IP distribution
print("Analyzing data across sample files...")

# Sample a few files to understand ranges and IP distribution
sample_files = input_files[:3] if len(input_files) > 3 else input_files
all_data = []

for file in sample_files:
    df = pl.scan_parquet(file).collect()
    all_data.append(df)

combined = pl.concat(all_data)

print(f"\nAnalyzing {combined.height:,} rows from {len(sample_files)} files...")

# Analyze numeric ranges
print("\nNumeric field analysis:")
for col in ['prb_id', 'sent', 'rcvd', 'avg', 'ts']:
    if col in combined.columns:
        min_val = combined[col].min()
        max_val = combined[col].max()
        unique_count = combined[col].n_unique()
        print(f"  {col}: {min_val} to {max_val} ({unique_count} unique values)")

# Analyze IP addresses
unique_ips = combined['dst_addr'].unique().to_list()
ipv4_count = sum(1 for ip in unique_ips if ':' not in ip)
ipv6_count = len(unique_ips) - ipv4_count
print(f"\nIP Address analysis:")
print(f"  Total unique IPs: {len(unique_ips):,}")
print(f"  IPv4 addresses: {ipv4_count:,} ({ipv4_count/len(unique_ips)*100:.1f}%)")
print(f"  IPv6 addresses: {ipv6_count:,} ({ipv6_count/len(unique_ips)*100:.1f}%)")

# Estimate space savings
ipv4_string_bytes = sum(len(ip) for ip in unique_ips if ':' not in ip)
ipv6_string_bytes = sum(len(ip) for ip in unique_ips if ':' in ip)
ipv4_optimized_bytes = ipv4_count * 4  # UInt32
ipv6_optimized_bytes = ipv6_count * 16  # 16-byte binary

ip_savings = (ipv4_string_bytes + ipv6_string_bytes) - (ipv4_optimized_bytes + ipv6_optimized_bytes)
print(f"\nEstimated IP storage savings:")
print(f"  Original: {ipv4_string_bytes + ipv6_string_bytes:,} bytes")
print(f"  Optimized: {ipv4_optimized_bytes + ipv6_optimized_bytes:,} bytes") 
print(f"  Savings: {ip_savings:,} bytes ({ip_savings/(ipv4_string_bytes + ipv6_string_bytes)*100:.1f}%)")

Analyzing data across sample files...

Analyzing 84,482,613 rows from 3 files...

Numeric field analysis:
  prb_id: 1 to 1011438 (13055 unique values)
  sent: 3 to 3 (1 unique values)
  rcvd: 0 to 3 (4 unique values)
  avg: -1.0 to 51082.65367266667 (66060775 unique values)
  ts: 1749254400 to 1749268799 (14229 unique values)

IP Address analysis:
  Total unique IPs: 11,967
  IPv4 addresses: 8,704 (72.7%)
  IPv6 addresses: 3,263 (27.3%)

Estimated IP storage savings:
  Original: 186,611 bytes
  Optimized: 87,024 bytes
  Savings: 99,587 bytes (53.4%)


In [11]:
import multiprocessing as mp
import numpy as np
from concurrent.futures import ProcessPoolExecutor

def convert_ipv4_batch(ip_array):
    """
    Convert batch of IPv4 addresses to UInt32 using multiprocessing.
    Works on numpy arrays for maximum efficiency.
    """
    result = np.zeros(len(ip_array), dtype=np.uint32)
    
    for i, ip_str in enumerate(ip_array):
        if ip_str and ':' not in ip_str:  # IPv4 check
            try:
                parts = ip_str.split('.')
                if len(parts) == 4:
                    result[i] = (int(parts[0]) << 24) + (int(parts[1]) << 16) + (int(parts[2]) << 8) + int(parts[3])
            except:
                result[i] = 0  # Invalid IP
        else:
            result[i] = 0  # Not IPv4
    
    return result

def convert_ipv6_batch(ip_array):
    """
    Convert batch of IPv6 addresses to binary using multiprocessing.
    """
    result = []
    
    for ip_str in ip_array:
        if ip_str and ':' in ip_str:  # IPv6 check
            try:
                result.append(ipaddress.IPv6Address(ip_str).packed)
            except:
                result.append(None)
        else:
            result.append(None)
    
    return result

def parallel_ipv4_conversion(series):
    """
    Convert IP series to IPv4 integers using all available CPU cores.
    Uses map_batches with multiprocessing for maximum performance.
    """
    # Convert to numpy for efficiency
    ip_array = series.to_numpy()
    
    # Use multiprocessing to parallelize across CPU cores
    with ProcessPoolExecutor(max_workers=min(63, mp.cpu_count())) as executor:
        # Split into chunks for parallel processing
        chunk_size = max(1000, len(ip_array) // (mp.cpu_count() * 4))
        chunks = [ip_array[i:i + chunk_size] for i in range(0, len(ip_array), chunk_size)]
        
        # Process chunks in parallel
        results = list(executor.map(convert_ipv4_batch, chunks))
        
        # Combine results
        combined = np.concatenate(results) if results else np.array([], dtype=np.uint32)
    
    return pl.Series(combined)

def create_optimized_probe_map():
    """
    Create optimized probe mapping using multiprocessing for IP conversion.
    """
    if probe_map is None:
        return None
    
    print("🔄 Creating optimized probe mapping with multiprocessing...")
    
    optimized_probe_map = probe_map.with_columns([
        # Detect IPv6 (vectorized)
        pl.col('ip').str.contains(':').alias('src_is_ipv6'),
        
        # Convert IPv4 using multiprocessing
        pl.col('ip').map_batches(parallel_ipv4_conversion).alias('src_ipv4_int'),
        
        # IPv6 placeholder for now (can add parallel processing later if needed)
        pl.when(pl.col('ip').str.contains(':'))
        .then(pl.lit(None, dtype=pl.Binary))
        .otherwise(None)
        .alias('src_ipv6_bytes')
    ]).drop('ip')
    
    print(f"✅ Optimized probe mapping ready: {optimized_probe_map.height:,} entries")
    print("⚡ IPv4 conversion used all available CPU cores via multiprocessing")
    return optimized_probe_map

def optimize_all_columns():
    """
    Standard data type optimizations (already vectorized).
    """
    return [
        pl.col("prb_id").cast(pl.UInt32),
        pl.col("sent").cast(pl.UInt8), 
        pl.col("rcvd").cast(pl.UInt8),
        pl.col("avg").cast(pl.Float32),
        pl.col("ts"),  # Keep as i64
        pl.col("rtt_1").cast(pl.Float32),
        pl.col("rtt_2").cast(pl.Float32),  
        pl.col("rtt_3").cast(pl.Float32)
    ]

def add_optimized_ip_columns(df):
    """
    Add optimized IP columns using multiprocessing for conversion.
    """
    return df.with_columns([
        # Detect IPv6 (vectorized)
        pl.col('dst_addr').str.contains(':').alias('dst_is_ipv6'),
        
        # Convert IPv4 using multiprocessing
        pl.col('dst_addr').map_batches(parallel_ipv4_conversion).alias('dst_ipv4_int'),
        
        # IPv6 placeholder
        pl.when(pl.col('dst_addr').str.contains(':'))
        .then(pl.lit(None, dtype=pl.Binary))
        .otherwise(None)
        .alias('dst_ipv6_bytes')
    ])

def add_probe_source_ips(df, optimized_probe_map):
    """
    Join with probe mapping (vectorized operation).
    """
    if optimized_probe_map is None:
        print("⚠️  No probe mapping available, skipping source IPs")
        return df
    
    return df.join(
        optimized_probe_map, 
        left_on="prb_id", 
        right_on="dst_prb_id", 
        how="left"
    )

def add_ip_display_columns():
    """
    Add readable IP display using vectorized bit operations.
    """
    return [
        # Destination IP display
        pl.when(pl.col('dst_is_ipv6'))
        .then(pl.col('dst_addr'))  # Keep original for IPv6
        .otherwise(
            # IPv4: vectorized bit operations
            pl.when(pl.col('dst_ipv4_int').is_not_null() & (pl.col('dst_ipv4_int') > 0))
            .then(
                ((pl.col('dst_ipv4_int') >> 24) & 255).cast(pl.String) + "." +
                ((pl.col('dst_ipv4_int') >> 16) & 255).cast(pl.String) + "." +
                ((pl.col('dst_ipv4_int') >> 8) & 255).cast(pl.String) + "." +
                (pl.col('dst_ipv4_int') & 255).cast(pl.String)
            )
            .otherwise(None)
        )
        .alias("dst_addr_display"),
        
        # Source IP display
        pl.when(pl.col('src_is_ipv6').fill_null(False))
        .then(None)  # IPv6 source disabled
        .otherwise(
            pl.when(pl.col('src_ipv4_int').is_not_null() & (pl.col('src_ipv4_int') > 0))
            .then(
                ((pl.col('src_ipv4_int') >> 24) & 255).cast(pl.String) + "." +
                ((pl.col('src_ipv4_int') >> 16) & 255).cast(pl.String) + "." +
                ((pl.col('src_ipv4_int') >> 8) & 255).cast(pl.String) + "." +
                (pl.col('src_ipv4_int') & 255).cast(pl.String)
            )
            .otherwise(None)
        )
        .alias("src_addr_display")
    ]

# Create optimized probe mapping with multiprocessing
if __name__ == '__main__' or 'ipykernel' in str(type(get_ipython())):
    # Ensure multiprocessing works in Jupyter
    mp.set_start_method('spawn', force=True)
    optimized_probe_map = create_optimized_probe_map()

print("🚀 MULTIPROCESSING optimization functions defined")
print("\n⚡ Performance strategy:")
print("  • IPv4 conversion: ProcessPoolExecutor with all CPU cores")
print("  • Batch processing: map_batches for efficient chunking")
print("  • Numpy arrays: Fastest iteration over series data")
print("  • Spawn method: Safe multiprocessing in Jupyter")
print("  • Expected: Full 63-core utilization for IP conversion")
print("\n🎯 Results:")
print("  • ~40% file size reduction with source IP mapping")
print("  • Maximum CPU utilization during IP conversion")
print("  • Elegant solution using Polars best practices")

🔄 Creating optimized probe mapping with multiprocessing...


Process SpawnProcess-2:
Traceback (most recent call last):
Process SpawnProcess-3:
Process SpawnProcess-1:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.12/concurrent/futures/process.py", line 251, in _process_worker
    call_item = call_queue.get(block=True)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'convert_ipv4_batch' on <module '__main__' (<class '_frozen_importlib.BuiltinImporter'>)>
  File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.12/multiprocessing/process.py", line

BrokenProcessPool: A child process terminated abruptly, the process pool is not usable anymore

In [None]:
# Test optimization on one file first
if input_files:
    print("🧪 Testing optimization on first file...")
    
    # Original file size
    original_size = os.path.getsize(input_files[0]) / (1024**2)  # MB
    
    # Load and optimize
    print("Loading original data...")
    test_df = pl.scan_parquet(input_files[0]).collect()
    
    print("Applying optimizations...")
    # Apply all optimizations including probe mapping
    optimized_df = (
        test_df
        .pipe(add_optimized_ip_columns)  # Add destination IP columns first
        .pipe(add_probe_source_ips, optimized_probe_map)  # Add source IPs via probe mapping
        .with_columns(optimize_all_columns())  # Optimize data types
        .drop("dst_addr")  # Remove original IP string column
        .with_columns(add_ip_display_columns())  # Add readable display columns
    )
    
    # Write test files
    test_output = "test_optimized.parquet"
    print("Writing optimized test file...")
    optimized_df.write_parquet(test_output)
    
    # Compare sizes
    optimized_size = os.path.getsize(test_output) / (1024**2)  # MB
    reduction = (original_size - optimized_size) / original_size * 100
    
    print(f"\n📊 Size comparison:")
    print(f"  Original: {original_size:.1f} MB")
    print(f"  Optimized: {optimized_size:.1f} MB")
    print(f"  Reduction: {reduction:.1f}%")
    print(f"  Space saved: {original_size - optimized_size:.1f} MB")
    
    print(f"\n📋 Optimized schema:")
    for col, dtype in optimized_df.schema.items():
        print(f"    {col}: {dtype}")
    
    print(f"\n🔍 Sample optimized data (showing readable IPs):")
    display_cols = ['prb_id', 'dst_addr_display', 'ts', 'sent', 'rcvd', 'avg', 'rtt_1', 'dst_is_ipv6']
    if 'src_addr_display' in optimized_df.columns:
        display_cols.insert(2, 'src_addr_display')
        display_cols.append('src_is_ipv6')
    
    display(optimized_df.select(display_cols).head(8))
    
    print(f"\n💾 Storage details:")
    dst_ipv4_count = optimized_df.filter(~pl.col('dst_is_ipv6')).height
    dst_ipv6_count = optimized_df.filter(pl.col('dst_is_ipv6')).height  
    print(f"    Destination IPv4 rows: {dst_ipv4_count:,} (stored as UInt32)")
    print(f"    Destination IPv6 rows: {dst_ipv6_count:,} (stored as 16-byte binary)")
    
    if 'src_is_ipv6' in optimized_df.columns:
        src_ipv4_count = optimized_df.filter(~pl.col('src_is_ipv6').is_null() & ~pl.col('src_is_ipv6')).height
        src_ipv6_count = optimized_df.filter(pl.col('src_is_ipv6').fill_null(False)).height
        src_mapped_count = optimized_df.filter(~pl.col('src_addr_display').is_null()).height
        print(f"    Source IPs mapped: {src_mapped_count:,} ({src_mapped_count/optimized_df.height*100:.1f}%)")
        print(f"    Source IPv4 mapped: {src_ipv4_count:,}")
        print(f"    Source IPv6 mapped: {src_ipv6_count:,}")
    
    # Clean up test file
    os.remove(test_output)
    
    print(f"\n✅ Test successful! Proceeding with full optimization...")
else:
    print("❌ No input files found")

🧪 Testing optimization on first file...
Loading original data...
Applying optimizations...


In [None]:
# Full dataset optimization with probe mapping
OUTPUT_DIR = "data/ping_super_optimized"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Clean up any existing files
for f in Path(OUTPUT_DIR).glob("*.parquet"):
    f.unlink()

print(f"🚀 Optimizing {len(input_files)} files with probe mapping...")
print(f"📁 Output directory: {OUTPUT_DIR}")
print(f"⏱️  This may take a while for large datasets...")

total_original_size = 0
total_optimized_size = 0
processed_files = 0

for i, input_file in enumerate(input_files, 1):
    file_name = Path(input_file).name
    print(f"Processing {i}/{len(input_files)}: {file_name}", end="")
    
    try:
        # Track original size
        original_size = os.path.getsize(input_file)
        total_original_size += original_size
        
        # Load original data
        df = pl.scan_parquet(input_file).collect()
        
        # Apply full optimization pipeline with probe mapping
        optimized_df = (
            df
            .pipe(add_optimized_ip_columns)  # Convert destination IPs to optimal storage
            .pipe(add_probe_source_ips, optimized_probe_map)  # Add source IPs via probe mapping
            .with_columns(optimize_all_columns())  # Optimize all data types
            .drop("dst_addr")  # Remove original IP string column
            .with_columns(add_ip_display_columns())  # Add display columns
        )
        
        # Write optimized file
        output_file = f"{OUTPUT_DIR}/{file_name}"
        optimized_df.write_parquet(output_file)
        
        # Track optimized size
        optimized_size = os.path.getsize(output_file)
        total_optimized_size += optimized_size
        processed_files += 1
        
        # Show progress
        file_reduction = (original_size - optimized_size) / original_size * 100
        print(f" → {file_reduction:.1f}% reduction")
        
        # Show cumulative progress every 10 files
        if i % 10 == 0 or i == len(input_files):
            cumulative_reduction = (total_original_size - total_optimized_size) / total_original_size * 100
            print(f"    📈 Cumulative: {cumulative_reduction:.1f}% reduction ({processed_files} files processed)")
            
    except Exception as e:
        print(f" ❌ ERROR: {str(e)[:50]}...")
        continue

print(f"\n🎉 Optimization complete! Processed {processed_files}/{len(input_files)} files")

In [None]:
# Final results and verification with probe mapping
if os.path.exists(OUTPUT_DIR):
    output_files = list(Path(OUTPUT_DIR).glob("*.parquet"))
    final_size = sum(f.stat().st_size for f in output_files)
    final_gb = final_size / (1024**3)
    
    original_gb = total_original_size / (1024**3)
    total_reduction = (total_original_size - total_optimized_size) / total_original_size * 100
    space_saved_gb = (total_original_size - total_optimized_size) / (1024**3)
    
    print(f"🏆 FINAL OPTIMIZATION RESULTS:")
    print(f"  📄 Files processed: {len(output_files)} / {len(input_files)}")
    print(f"  📦 Original size: {original_gb:.2f} GB")
    print(f"  🗜️  Optimized size: {final_gb:.2f} GB")
    print(f"  📉 Size reduction: {total_reduction:.1f}%")
    print(f"  💾 Space saved: {space_saved_gb:.2f} GB")
    
    # Estimate full dataset savings (if this was applied to 1TB)
    if original_gb > 0:
        tb_estimate = (1024 * total_reduction / 100)
        print(f"  🌟 Est. 1TB dataset savings: {tb_estimate:.0f} GB")
    
    print(f"\n📁 Super-optimized dataset ready at: {OUTPUT_DIR}/")
    print(f"💡 Usage: pl.scan_parquet('{OUTPUT_DIR}/*.parquet')")
    print(f"🔍 Readable IPs: Use 'dst_addr_display' and 'src_addr_display' columns")
    print(f"⚡ Storage: IPv4 as UInt32, IPv6 as binary, optimized types")
    if optimized_probe_map is not None:
        print(f"🎯 Probe mapping: Source IPs added via probe_ip_map.csv")
    
    # Verify we can read the optimized dataset
    try:
        print(f"\n🔬 Verification: Reading optimized dataset...")
        test_read = pl.scan_parquet(f"{OUTPUT_DIR}/*.parquet").head(5).collect()
        
        print(f"✅ Successfully read optimized dataset!")
        print(f"📊 Sample with readable IP addresses:")
        
        # Dynamic column selection based on what's available
        display_cols = ['prb_id', 'dst_addr_display', 'ts', 'avg', 'rtt_1', 'sent', 'rcvd', 'dst_is_ipv6']
        if 'src_addr_display' in test_read.columns:
            display_cols.insert(2, 'src_addr_display')
            display_cols.append('src_is_ipv6')
        
        display(test_read.select(display_cols))
        
        # Count total rows
        print(f"\n📈 Counting total rows...")
        total_rows = pl.scan_parquet(f"{OUTPUT_DIR}/*.parquet").select(pl.len()).collect().item()
        print(f"📊 Total rows in optimized dataset: {total_rows:,}")
        
        # Show storage breakdown
        storage_sample = pl.scan_parquet(f"{OUTPUT_DIR}/*.parquet").head(10000).collect()
        dst_ipv4_sample_count = storage_sample.filter(~pl.col('dst_is_ipv6')).height
        dst_ipv6_sample_count = storage_sample.filter(pl.col('dst_is_ipv6')).height
        
        print(f"\n💾 Storage efficiency (sample of 10k rows):")
        print(f"    Destination IPv4 entries: {dst_ipv4_sample_count:,} (4 bytes each)")
        print(f"    Destination IPv6 entries: {dst_ipv6_sample_count:,} (16 bytes each)")
        
        if 'src_addr_display' in storage_sample.columns:
            src_mapped_count = storage_sample.filter(~pl.col('src_addr_display').is_null()).height
            src_ipv4_count = storage_sample.filter(~pl.col('src_is_ipv6').is_null() & ~pl.col('src_is_ipv6')).height
            src_ipv6_count = storage_sample.filter(pl.col('src_is_ipv6').fill_null(False)).height
            print(f"    Source IPs mapped: {src_mapped_count:,} ({src_mapped_count/storage_sample.height*100:.1f}%)")
            print(f"    Source IPv4 mapped: {src_ipv4_count:,} (4 bytes each)")
            print(f"    Source IPv6 mapped: {src_ipv6_count:,} (16 bytes each)")
        
        print(f"    Display columns work: ✅ All IPs readable via display columns")
        
    except Exception as e:
        print(f"⚠️  Verification failed: {e}")
        print(f"    Check the first few files manually")
        
else:
    print("❌ Output directory not found - optimization may have failed")

print(f"\n🎯 Next steps:")
print(f"   1. Verify the optimized data meets your needs")
print(f"   2. Use pl.scan_parquet('{OUTPUT_DIR}/*.parquet') for analysis")
if optimized_probe_map is not None:
    print(f"   3. Source IPs are now available via probe mapping!")
    print(f"   4. Both src_addr_display and dst_addr_display show readable IPs")
print(f"   5. Apply this optimization to your full dataset")
print(f"   6. Enjoy {total_reduction:.0f}% smaller files with source IP info! 🚀")