In [None]:
# Load Required Data                                                                                                                       
                                                                                                                                              
import sys                                                                                                                                  
import pandas as pd                                                                                                                         
import numpy as np                                                                                                                          
from pathlib import Path                                                                                                                    
                                                                                                                                          
# Add src to path                                                                                                                           
sys.path.append('../')  # or adjust path as needed                                                                                          
from src.features.sector_mapping import build_enhanced_sector_mappings, validate_sector_assignments                                         
from src.data.loader import load_stock_universe, load_etf_universe                                                                          
                                                                                                                                          
# Load universe data with sectors.
# NOTE(review): max_symbols=None presumably disables the symbol cap (the
# original "Limit for testing" remark did not match this value); pass an int
# to shorten test runs -- confirm against src.data.loader.
stocks, sectors = load_stock_universe(max_symbols=None, include_sectors=True)
# NOTE(review): the original comment says this returns all 46 ETFs; not verified here.
etfs = load_etf_universe()

# Both universes are indexed by metric name; 'Close' is a wide frame with one column per symbol.
print(f"✅ Loaded {len(stocks['Close'].columns)} stocks and {len(etfs['Close'].columns)} ETFs")
print(f"📊 Sector mappings: {len(sectors)} symbols")

In [None]:
# Convert to Symbol-Keyed Format                                                                                                           
                                                                                                                                          
# Convert wide format to symbol-keyed DataFrames (like orchestrator does)                                                                   
def convert_to_symbol_dict(data_dict):
    """Pivot metric-keyed wide frames into one DataFrame per symbol.

    Parameters
    ----------
    data_dict : mapping of str -> pandas.DataFrame
        Each value is a wide frame whose columns are symbols; the key is the
        metric name (e.g. ``'Close'``).

    Returns
    -------
    dict
        Maps each symbol to a DataFrame with one column per metric, named with
        the lower-cased metric key. A symbol's frame takes its index from the
        first wide frame in which that symbol appears.
    """
    frames_by_symbol = {}
    for metric_name, wide_frame in data_dict.items():
        for ticker in wide_frame.columns:
            per_symbol = frames_by_symbol.get(ticker)
            if per_symbol is None:
                # First sighting of this symbol: start an empty frame on this metric's index.
                per_symbol = pd.DataFrame(index=wide_frame.index)
                frames_by_symbol[ticker] = per_symbol
            per_symbol[metric_name.lower()] = wide_frame[ticker]
    return frames_by_symbol

# Re-key both universes by symbol: stocks first, then ETFs.
stocks_by_symbol, etfs_by_symbol = (
    convert_to_symbol_dict(wide_frames) for wide_frames in (stocks, etfs)
)

print(f"Converted to {len(stocks_by_symbol)} stock DataFrames")

In [None]:
# Build Enhanced Mappings                                                                                                                  
                                                                                                                                              
# Find universe CSV automatically                                                                                                           
from src.data.loader import _discover_universe_csv                                                                                          
# NOTE(review): _discover_universe_csv is a private (underscore-prefixed) loader
# helper and the cache path is relative to the notebook's working directory --
# confirm both remain valid if the loader or directory layout changes.
universe_csv = _discover_universe_csv('../cache/stock_data.pkl')

# Build enhanced mappings with correlation analysis, using the symbol-keyed
# stock/ETF frames and the base sector assignments loaded earlier.
enhanced_mappings = build_enhanced_sector_mappings(
    universe_csv=universe_csv,
    stock_data=stocks_by_symbol,
    etf_data=etfs_by_symbol,
    base_sectors=sectors,
)

print(f"✅ Enhanced mappings for {len(enhanced_mappings)} symbols")

In [None]:
# 4. Analyze the Results                                                                                                                      
                                                                                                                                          
# Validation report: summary dict produced from the enhanced mappings.
validation_report = validate_sector_assignments(enhanced_mappings)

print("=== VALIDATION REPORT ===")
print(f"Total symbols: {validation_report['total_symbols']}")
print(f"Confidence distribution: {validation_report['confidence_distribution']}")
print(f"Average sector correlation: {validation_report['avg_sector_correlation']:.3f}")
print(f"Subsector coverage: {validation_report['subsector_coverage']:.1%}")

# Show top subsector improvements
print("\n=== TOP SUBSECTOR IMPROVEMENTS ===")
improvements_df = pd.DataFrame(validation_report['subsector_improvements'])
if improvements_df.empty:
    # Say so explicitly rather than leaving a bare header with nothing under it.
    print("No subsector improvements reported.")
else:
    display(improvements_df)

In [None]:
# 5. Explore Individual Mappings

# Look at specific examples; symbols absent from the mappings are skipped.
examples = ['AAPL', 'NVDA', 'JPM', 'GOOGL', 'AMZN']
for symbol in examples:
    if symbol in enhanced_mappings:
        mapping = enhanced_mappings[symbol]
        print(f"\n{symbol}:")
        # Arrow shows the CSV sector label alongside the sector ETF stored for it.
        print(f"  Sector: {mapping['csv_sector']} → {mapping['sector_etf']}")
        print(f"  Subsector: {mapping['subsector_etf']}")
        print(f"  Correlations: {mapping['correlations']}")
        print(f"  Confidence: {mapping['confidence']}")


In [None]:
# 6. Summary Statistics

# Overall mapping statistics: one row per symbol, one column per mapping field.
mapping_df = pd.DataFrame.from_dict(enhanced_mappings, orient='index')

print("=== SECTOR ETF DISTRIBUTION ===")
print(mapping_df['sector_etf'].value_counts())

print("\n=== SUBSECTOR ETF DISTRIBUTION ===")
# value_counts() already excludes missing keys by default, so no dropna() is needed.
print(mapping_df['subsector_etf'].value_counts())

print("\n=== CONFIDENCE DISTRIBUTION ===")
print(mapping_df['confidence'].value_counts())

# Correlation statistics
sector_corrs = [m['correlations'].get('sector', np.nan) for m in enhanced_mappings.values()]
subsector_corrs = [m['correlations'].get('subsector', np.nan) for m in enhanced_mappings.values()]

# Float Series coerce None to NaN and skip NaN in mean()/median() without the
# RuntimeWarning that np.nanmean/np.nanmedian emit on empty or all-NaN input.
sector_corr_series = pd.Series(sector_corrs, dtype='float64')
subsector_corr_series = pd.Series(subsector_corrs, dtype='float64')

print(f"\nSector correlation: mean={sector_corr_series.mean():.3f}, median={sector_corr_series.median():.3f}")
print(f"Subsector correlation: mean={subsector_corr_series.mean():.3f}, median={subsector_corr_series.median():.3f}")
