# Save ItemDataLoader DataFrame

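This notebook loads item data using `ItemDataLoader` and saves the resulting DataFrames to CSV files (under `tests/item_data_output/`, relative to the project root) for testing and inspection purposes. It expects to be started from a directory directly below the project root.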

In [None]:
import os
import sys

def find_project_root(start_dir):
    """Return the project root for a notebook started in ``start_dir``.

    The root is taken to be the directory that contains the ``services``
    package imported later in this cell. If ``start_dir`` already contains
    it, ``start_dir`` is returned unchanged; otherwise its parent directory
    is returned, which matches launching the notebook from a directory
    directly below the root.

    Args:
        start_dir: Path of the directory to start from (normally the
            current working directory).

    Returns:
        Path of the directory to use as project root.
    """
    if os.path.isdir(os.path.join(start_dir, 'services')):
        return start_dir
    return os.path.dirname(start_dir)


# Add project root to path. Resolving the root through find_project_root keeps
# this cell idempotent: re-running it after the chdir below no longer climbs
# one more directory level on every execution.
project_root = find_project_root(os.getcwd())
if sys.path[:1] != [project_root]:
    # Keep the root first for import precedence without stacking duplicates.
    sys.path.insert(0, project_root)
os.chdir(project_root)

import pandas as pd
from services.item_data_loader import ItemDataLoader

## 1. Initialize ItemDataLoader

In [None]:
# Build the loader with data_source set to 'local'; later cells reuse `loader`.
DATA_SOURCE = 'local'
loader = ItemDataLoader(data_source=DATA_SOURCE)

## 2. Load and Prepare Items

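If `load_and_prepare_items()` raises an exception (for example because of pandas version issues), the cell below falls back to running the loader's pipeline step by step, skipping `expand_build_aliases` if that step fails as well.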

In [None]:
# Preferred path: let the loader run its whole pipeline in a single call.
try:
    item_df, alias_df = loader.load_and_prepare_items()
    print(f"Successfully loaded: item_df={item_df.shape}, alias_df={alias_df.shape}")
except Exception as e:
    # Deliberate best-effort fallback: any failure above triggers the manual
    # pipeline below. The exception class is printed together with the message
    # because str() of some exceptions (e.g. KeyError) is only the offending
    # key, which hides what actually went wrong.
    # NOTE(review): the step order presumably mirrors what
    # load_and_prepare_items() does internally - confirm against the loader.
    print(f"Error with load_and_prepare_items: {type(e).__name__}: {e}")
    print("Loading step by step...")

    # Step 1: Load raw data
    raw_data = loader.load_raw_data()
    print(f"Raw data: {raw_data.shape}")

    # Step 2: Normalize columns
    normalized_data = loader.normalize_columns(raw_data)
    print(f"Normalized: {normalized_data.shape}")

    # Step 3: Filter by domain
    filtered_data = loader.filter_by_domain(normalized_data)
    print(f"Filtered: {filtered_data.shape}")

    # Step 4: Load alias rules
    alias_pdf = loader.load_alias_rules()
    print(f"Alias rules: {alias_pdf.shape}")

    # Step 5: Skip expand_build_aliases if it fails (uses problematic query).
    # On failure alias_pdf keeps the un-expanded rules from step 4, so the
    # saved output can differ from a run where this step succeeds.
    try:
        alias_pdf = loader.expand_build_aliases(alias_pdf, filtered_data)
    except Exception as e2:
        print(f"Skipping expand_build_aliases: {type(e2).__name__}: {e2}")

    # Step 6: Create bidirectional aliases
    alias_pdf = loader.create_bidirectional_aliases(alias_pdf)
    print(f"After bidirectional: {alias_pdf.shape}")

    # Step 7: Apply cascading alias rules
    with_aliases = loader.apply_cascading_alias_rules(filtered_data, alias_pdf)
    print(f"After cascading: {with_aliases.shape}")

    # Step 8: Add user defined entities
    # NOTE(review): None presumably means "no user-defined entities" - confirm.
    with_user_entities = loader.add_user_defined_entities(with_aliases, None)

    # Step 9: Add domain name column
    with_domain_names = loader.add_domain_name_column(with_user_entities)

    # Step 10: Filter test items
    item_df = loader.filter_test_items(with_domain_names)
    alias_df = alias_pdf

    print(f"Final: item_df={item_df.shape}, alias_df={alias_df.shape}")

## 3. Inspect Item DataFrame

In [None]:
# Summarise the prepared item table: dimensions first, then the column names.
print(
    f"Item DataFrame Shape: {item_df.shape}",
    f"Columns: {list(item_df.columns)}",
    sep="\n",
)

In [None]:
# Preview the first 10 rows, restricted to the four columns listed below.
item_df.head(10)[['item_nm', 'item_id', 'item_nm_alias', 'item_dmn']]

## 4. Inspect Alias DataFrame

In [None]:
# Summarise the alias table: dimensions first, then the column names.
print(
    f"Alias DataFrame Shape: {alias_df.shape}",
    f"Columns: {list(alias_df.columns)}",
    sep="\n",
)

In [None]:
# Show the first 10 rows of the alias table.
alias_df.iloc[:10]

## 5. Inspect Raw Alias DataFrame

In [None]:
# Inspect the loader's `alias_pdf_raw` table, but only when one is present.
raw_alias_preview = loader.alias_pdf_raw
if raw_alias_preview is not None:
    print(f"Raw Alias DataFrame Shape: {raw_alias_preview.shape}")
    display(raw_alias_preview.head(10))

## 6. Save DataFrames to CSV

In [None]:
def save_frame_as_csv(frame, directory, filename, label):
    """Write ``frame`` to ``directory/filename`` as CSV and report the path.

    The file is written without the index column and with the ``utf-8-sig``
    encoding. A line of the form ``Saved <label>: <path>`` is printed once
    the file has been written.

    Args:
        frame: DataFrame to save.
        directory: Existing directory that receives the file.
        filename: Name of the CSV file inside ``directory``.
        label: Human-readable name used in the printed confirmation.

    Returns:
        The path the frame was written to.
    """
    path = os.path.join(directory, filename)
    frame.to_csv(path, index=False, encoding='utf-8-sig')
    print(f"Saved {label}: {path}")
    return path


# Relative path: resolved against the current working directory.
output_dir = 'tests/item_data_output'
os.makedirs(output_dir, exist_ok=True)

# Save item DataFrame
item_output_path = save_frame_as_csv(item_df, output_dir, 'item_df.csv', 'Item DataFrame')

# Save alias DataFrame
alias_output_path = save_frame_as_csv(alias_df, output_dir, 'alias_df.csv', 'Alias DataFrame')

# Save raw alias DataFrame (only when the loader holds one)
if loader.alias_pdf_raw is not None:
    raw_alias_path = save_frame_as_csv(
        loader.alias_pdf_raw, output_dir, 'alias_raw_df.csv', 'Raw Alias DataFrame'
    )

## 7. Query Examples

In [None]:
# Find all aliases for a specific item
item_name = 'iPhone 17'
# regex=False matches item_name as literal text. With the default regex=True,
# names containing characters such as '+', '(' or '.' would be interpreted as
# a pattern and either match the wrong rows or raise an error.
item_aliases = item_df[
    item_df['item_nm'].str.contains(item_name, case=False, na=False, regex=False)
]
print(f"Aliases for '{item_name}':")
# Duplicate rows are dropped so each (name, alias, domain) combination shows once.
item_aliases[['item_nm', 'item_nm_alias', 'item_dmn']].drop_duplicates()