# Test SmartAutoDataLoader - Excel Files

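This notebook tests the Excel loading functionality of `SmartAutoDataLoader`: format detection, single- and multi-sheet loading, the universal `load` method, date parsing, reporting, error handling, and memory estimation.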

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys

# Add the parent directory to path (go up one level from test folder)
# NOTE: '../' is a relative entry, so it is resolved against the current working
# directory. The import below only succeeds when the kernel's working directory
# is the test folder; otherwise it fails with ModuleNotFoundError.
sys.path.append('../')

# Project-local module under test (expected to live in the parent directory).
from smart_auto_data_loader import SmartAutoDataLoader

ModuleNotFoundError: No module named 'smart_auto_data_loader'

## 1. Create Test Excel Files

In [None]:
# Scratch directory that will hold every workbook generated by this notebook
test_dir = Path('test_data')
test_dir.mkdir(exist_ok=True)

# Five rows covering integer, string, date-as-string, float, categorical-like
# string and boolean columns
sample_data = dict(
    ID=[1, 2, 3, 4, 5],
    Name=['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    Date=['2023-01-15', '2023-02-20', '2023-03-25', '2023-04-30', '2023-05-15'],
    Amount=[100.5, 200.75, 150.25, 300.0, 175.5],
    Category=['A', 'B', 'A', 'C', 'B'],
    Active=[True, False, True, True, False],
)
df_sample = pd.DataFrame(sample_data)

# Show the frame and its inferred dtypes in a single print call
print(
    "Sample data created:",
    df_sample,
    f"\nData types: {df_sample.dtypes}",
    sep="\n",
)

Sample data created:
   ID     Name        Date  Amount Category  Active
0   1    Alice  2023-01-15  100.50        A    True
1   2      Bob  2023-02-20  200.75        B   False
2   3  Charlie  2023-03-25  150.25        A    True
3   4    Diana  2023-04-30  300.00        C    True
4   5      Eve  2023-05-15  175.50        B   False

Data types: ID            int64
Name         object
Date         object
Amount      float64
Category     object
Active         bool
dtype: object


In [None]:
# Create single sheet Excel file
excel_file_single = test_dir / 'test_single_sheet.xlsx'
df_sample.to_excel(excel_file_single, index=False)
print(f"✅ Created single sheet Excel: {excel_file_single}")

# Create multi-sheet Excel file
# 'Large_Sheet' doubles only the numeric columns. Multiplying the whole frame by 2
# would also repeat every string ('AliceAlice', '2023-01-152023-01-15') and turn
# the booleans into 0/2, leaving the sheet without valid dates or flags.
numeric_columns = df_sample.select_dtypes(include='number').columns
df_large_values = df_sample.copy()
df_large_values[numeric_columns] = df_large_values[numeric_columns] * 2

excel_file_multi = test_dir / 'test_multi_sheet.xlsx'
with pd.ExcelWriter(excel_file_multi) as writer:
    df_sample.to_excel(writer, sheet_name='Sheet1', index=False)
    df_sample.iloc[:3].to_excel(writer, sheet_name='Small_Sheet', index=False)
    df_large_values.to_excel(writer, sheet_name='Large_Sheet', index=False)

print(f"✅ Created multi-sheet Excel: {excel_file_multi}")

# Create Excel with different date formats
# The same four December 2023 days are written as strings in three notations
# (ISO, day/month/year, day.month.year).
date_data = {
    'ID': [1, 2, 3, 4],
    'ISO_Date': ['2023-12-01', '2023-12-02', '2023-12-03', '2023-12-04'],
    'EU_Date': ['01/12/2023', '02/12/2023', '03/12/2023', '04/12/2023'],
    'German_Date': ['01.12.2023', '02.12.2023', '03.12.2023', '04.12.2023'],
    'Value': [10, 20, 30, 40]
}

df_dates = pd.DataFrame(date_data)
excel_file_dates = test_dir / 'test_dates.xlsx'
df_dates.to_excel(excel_file_dates, index=False)
print(f"✅ Created date test Excel: {excel_file_dates}")

✅ Created single sheet Excel: test_data/test_single_sheet.xlsx
✅ Created multi-sheet Excel: test_data/test_multi_sheet.xlsx
✅ Created date test Excel: test_data/test_dates.xlsx


## 2. Initialize SmartAutoDataLoader

In [None]:
# Initialize loader with verbose mode
# The single `loader` instance created here is reused by every test cell below.
# NOTE(review): verbose=True presumably enables the loader's own progress
# messages seen in the recorded outputs — confirm against the loader source.
loader = SmartAutoDataLoader(verbose=True)
print("SmartAutoDataLoader initialized!")

🎯 SmartAutoDataLoader ready!
SmartAutoDataLoader initialized!


## 3. Test Format Detection

In [None]:
# Every generated workbook must be reported as 'excel' by the loader
print("=== FORMAT DETECTION TEST ===")
excel_fixtures = (excel_file_single, excel_file_multi, excel_file_dates)
for workbook_path in excel_fixtures:
    fmt = loader.detect_format(str(workbook_path))
    print(f"File: {workbook_path.name} -> Format: {fmt}")
    assert fmt == 'excel', f"Expected 'excel', got '{fmt}'"

print("✅ Format detection passed!")

=== FORMAT DETECTION TEST ===
🔍 Format detected: excel
File: test_single_sheet.xlsx -> Format: excel
🔍 Format detected: excel
File: test_multi_sheet.xlsx -> Format: excel
🔍 Format detected: excel
File: test_dates.xlsx -> Format: excel
✅ Format detection passed!


## 4. Test Single Sheet Excel Loading

In [None]:
print("=== SINGLE SHEET EXCEL TEST ===")
try:
    # No sheet_name is passed, so sheet selection is left to the loader.
    df_loaded = loader.load_excel(str(excel_file_single))

    print(f"\n📊 Loaded DataFrame info:")
    print(f"Shape: {df_loaded.shape}")
    print(f"Columns: {list(df_loaded.columns)}")
    print(f"Data types: {df_loaded.dtypes}")
    print(f"\nFirst few rows:")
    print(df_loaded.head())

    # Verify data integrity: same shape and same column names as the frame
    # that was written to the workbook.
    assert len(df_loaded) == 5, f"Expected 5 rows, got {len(df_loaded)}"
    assert len(df_loaded.columns) == 6, f"Expected 6 columns, got {len(df_loaded.columns)}"
    assert list(df_loaded.columns) == list(df_sample.columns), (
        f"Expected columns {list(df_sample.columns)}, got {list(df_loaded.columns)}"
    )

    print("\n✅ Single sheet Excel loading passed!")

except Exception as e:
    print(f"❌ Error: {e}")
    # Re-raise so a failed assertion actually fails the cell (and stops
    # "Run All") instead of being reduced to a printed message.
    raise

=== SINGLE SHEET EXCEL TEST ===
📈 Loading Excel file...
   📋 Available sheets: ['Sheet1']
   ✅ Selected sheet: 'Sheet1'
🗓️ Searching for date columns...
   ✅ Found date column: 'Date' (%Y-%m-%d)
   📅 Total date columns found: 1
✅ Excel loaded: 5 rows, 6 columns
   📊 Column names: ['ID', 'Name', 'Date', 'Amount', 'Category', 'Active']

📊 Loaded DataFrame info:
Shape: (5, 6)
Columns: ['ID', 'Name', 'Date', 'Amount', 'Category', 'Active']
Data types: ID                   int64
Name                object
Date        datetime64[ns]
Amount             float64
Category            object
Active                bool
dtype: object

First few rows:
   ID     Name       Date  Amount Category  Active
0   1    Alice 2023-01-15  100.50        A    True
1   2      Bob 2023-02-20  200.75        B   False
2   3  Charlie 2023-03-25  150.25        A    True
3   4    Diana 2023-04-30  300.00        C    True
4   5      Eve 2023-05-15  175.50        B   False

✅ Single sheet Excel loading passed!


## 5. Test Multi-Sheet Excel Loading

In [None]:
print("=== MULTI-SHEET EXCEL TEST ===")
try:
    # Test auto-detection (should pick first or largest sheet)
    df_auto = loader.load_excel(str(excel_file_multi))
    print(f"Auto-detected sheet loaded: {df_auto.shape}")

    # Test specific sheet selection
    df_small = loader.load_excel(str(excel_file_multi), sheet_name='Small_Sheet')
    print(f"Small_Sheet loaded: {df_small.shape}")

    df_large = loader.load_excel(str(excel_file_multi), sheet_name='Large_Sheet')
    print(f"Large_Sheet loaded: {df_large.shape}")

    # The first sheet and the largest sheet both hold 5 rows, so either
    # auto-selection strategy must yield 5 rows (never the 3-row Small_Sheet).
    assert len(df_auto) == 5, f"Expected 5 rows in auto-selected sheet, got {len(df_auto)}"

    # Verify different sheet sizes
    assert len(df_small) == 3, f"Expected 3 rows in small sheet, got {len(df_small)}"
    assert len(df_large) == 5, f"Expected 5 rows in large sheet, got {len(df_large)}"

    print("\n✅ Multi-sheet Excel loading passed!")

except Exception as e:
    print(f"❌ Error: {e}")
    # Re-raise so a failed assertion actually fails the cell instead of being
    # reduced to a printed message.
    raise

=== MULTI-SHEET EXCEL TEST ===
📈 Loading Excel file...
   📋 Available sheets: ['Sheet1', 'Small_Sheet', 'Large_Sheet']
   ✅ Selected sheet: 'Sheet1'
🗓️ Searching for date columns...
   ✅ Found date column: 'Date' (%Y-%m-%d)
   📅 Total date columns found: 1
✅ Excel loaded: 5 rows, 6 columns
   📊 Column names: ['ID', 'Name', 'Date', 'Amount', 'Category', 'Active']
Auto-detected sheet loaded: (5, 6)
📈 Loading Excel file...
🗓️ Searching for date columns...
   ✅ Found date column: 'Date' (%Y-%m-%d)
   📅 Total date columns found: 1
✅ Excel loaded: 3 rows, 6 columns
   📊 Column names: ['ID', 'Name', 'Date', 'Amount', 'Category', 'Active']
Small_Sheet loaded: (3, 6)
📈 Loading Excel file...
🗓️ Searching for date columns...
   ✅ Found date column: 'Date' (%Y-%m-%d)
   📅 Total date columns found: 1
✅ Excel loaded: 5 rows, 6 columns
   📊 Column names: ['ID', 'Name', 'Date', 'Amount', 'Category', 'Active']
Large_Sheet loaded: (5, 6)

✅ Multi-sheet Excel loading passed!


## 6. Test Universal Load Method

In [None]:
print("=== UNIVERSAL LOAD METHOD TEST ===")
try:
    # Test universal load method (should auto-delegate to load_excel)
    df_universal = loader.load(str(excel_file_single))

    print(f"Universal load result: {df_universal.shape}")
    print(f"Columns: {list(df_universal.columns)}")

    # Verify it works the same as direct Excel loading
    df_direct = loader.load_excel(str(excel_file_single))

    assert df_universal.shape == df_direct.shape, "Universal and direct loading should match"
    assert list(df_universal.columns) == list(df_direct.columns), "Columns should match"

    print("\n✅ Universal load method passed!")

except Exception as e:
    print(f"❌ Error: {e}")
    # Re-raise so a failed assertion actually fails the cell instead of being
    # reduced to a printed message.
    raise

=== UNIVERSAL LOAD METHOD TEST ===
🎯 Loading file: test_single_sheet.xlsx
🔍 Format detected: excel
📈 Loading Excel file...
   📋 Available sheets: ['Sheet1']
   ✅ Selected sheet: 'Sheet1'
🗓️ Searching for date columns...
   ✅ Found date column: 'Date' (%Y-%m-%d)
   📅 Total date columns found: 1
✅ Excel loaded: 5 rows, 6 columns
   📊 Column names: ['ID', 'Name', 'Date', 'Amount', 'Category', 'Active']
Universal load result: (5, 6)
Columns: ['ID', 'Name', 'Date', 'Amount', 'Category', 'Active']
📈 Loading Excel file...
   📋 Available sheets: ['Sheet1']
   ✅ Selected sheet: 'Sheet1'
🗓️ Searching for date columns...
   ✅ Found date column: 'Date' (%Y-%m-%d)
   📅 Total date columns found: 1
✅ Excel loaded: 5 rows, 6 columns
   📊 Column names: ['ID', 'Name', 'Date', 'Amount', 'Category', 'Active']

✅ Universal load method passed!


## 7. Test DateTime Detection and Parsing

In [None]:
print("=== DATETIME DETECTION TEST ===")
try:
    df_dates_loaded = loader.load_excel(str(excel_file_dates))

    print(f"\nLoaded date test file:")
    print(f"Shape: {df_dates_loaded.shape}")
    print(f"Data types: {df_dates_loaded.dtypes}")
    print(f"\nData preview:")
    print(df_dates_loaded.head())

    # Check for detected time columns
    time_columns = loader.detect_time_columns(df_dates_loaded)
    print(f"\nDetected time columns: {time_columns}")

    # Verify at least some date columns were detected.
    # Use the pandas dtype predicate rather than matching on the dtype's string
    # form; it covers both naive and timezone-aware datetime64 columns.
    date_columns_count = sum(
        pd.api.types.is_datetime64_any_dtype(dtype)
        for dtype in df_dates_loaded.dtypes
    )
    print(f"Date columns converted: {date_columns_count}")
    assert date_columns_count > 0, "Expected at least one column to be converted to datetime"

    print("\n✅ DateTime detection test completed!")

except Exception as e:
    print(f"❌ Error: {e}")
    # Re-raise so a failed assertion actually fails the cell instead of being
    # reduced to a printed message.
    raise

=== DATETIME DETECTION TEST ===
📈 Loading Excel file...
   📋 Available sheets: ['Sheet1']
   ✅ Selected sheet: 'Sheet1'
🗓️ Searching for date columns...
   ✅ Found date column: 'ISO_Date' (%Y-%m-%d)
   ✅ Found date column: 'EU_Date' (%d/%m/%Y)
   ✅ Found date column: 'German_Date' (%d.%m.%Y)
   📅 Total date columns found: 3
✅ Excel loaded: 4 rows, 5 columns
   📊 Column names: ['ID', 'ISO_Date', 'EU_Date', 'German_Date', 'Value']

Loaded date test file:
Shape: (4, 5)
Data types: ID                      int64
ISO_Date       datetime64[ns]
EU_Date        datetime64[ns]
German_Date    datetime64[ns]
Value                   int64
dtype: object

Data preview:
   ID   ISO_Date    EU_Date German_Date  Value
0   1 2023-12-01 2023-12-01  2023-12-01     10
1   2 2023-12-02 2023-12-02  2023-12-02     20
2   3 2023-12-03 2023-12-03  2023-12-03     30
3   4 2023-12-04 2023-12-04  2023-12-04     40
🕒 Found 3 datetime columns: ['ISO_Date', 'EU_Date', 'German_Date']

Detected time columns: ['ISO_Date',

## 8. Test Comprehensive Reporting

In [None]:
print("=== COMPREHENSIVE REPORTING TEST ===")
try:
    # Generate report for Excel file
    report = loader.build_report(str(excel_file_single))

    print(f"\n📊 Load Report:")
    print(f"File: {report.file_path}")
    print(f"Size: {report.file_size_mb:.2f} MB")
    print(f"Format: {report.detected_format}")
    print(f"Rows: {report.total_rows}")
    print(f"Columns: {report.total_columns}")
    print(f"Date columns: {report.date_columns_found}")
    print(f"Quality score: {report.quality_score}")
    print(f"Success: {report.success}")
    print(f"Loading time: {report.loading_time_seconds:.3f}s")

    # Errors and warnings are only shown when the report carries any.
    if report.errors:
        print(f"Errors: {report.errors}")
    if report.warnings:
        print(f"Warnings: {report.warnings}")

    # Verify report completeness
    assert report.detected_format == 'excel', f"Expected 'excel', got '{report.detected_format}'"
    assert report.success, "Report should indicate success"
    assert report.total_rows > 0, "Should have rows"
    assert report.total_columns > 0, "Should have columns"

    print("\n✅ Comprehensive reporting passed!")

except Exception as e:
    print(f"❌ Error: {e}")
    # Re-raise so a failed assertion actually fails the cell instead of being
    # reduced to a printed message.
    raise

=== COMPREHENSIVE REPORTING TEST ===
🎯 Loading file: test_single_sheet.xlsx
🔍 Format detected: excel
📈 Loading Excel file...
   📋 Available sheets: ['Sheet1']
   ✅ Selected sheet: 'Sheet1'
🗓️ Searching for date columns...
   ✅ Found date column: 'Date' (%Y-%m-%d)
   📅 Total date columns found: 1
✅ Excel loaded: 5 rows, 6 columns
   📊 Column names: ['ID', 'Name', 'Date', 'Amount', 'Category', 'Active']
🕒 Found 1 datetime columns: ['Date']
🔍 Format detected: excel
🔍 Format detected: excel
🔍 Format detected: excel
📊 Report generated for test_single_sheet.xlsx

📊 Load Report:
File: test_data/test_single_sheet.xlsx
Size: 0.00 MB
Format: excel
Rows: 5
Columns: 6
Date columns: ['Date']
Quality score: 100
Success: True
Loading time: 0.006s

✅ Comprehensive reporting passed!


## 9. Test Error Handling

In [None]:
print("=== ERROR HANDLING TEST ===")


def _assert_load_fails(description, *args, **kwargs):
    """Call ``loader.load_excel`` and fail the cell unless it raises.

    Args:
        description: Short label for the scenario, used in the printed message
            and in the assertion error.
        *args, **kwargs: Forwarded unchanged to ``loader.load_excel``.

    Raises:
        AssertionError: If ``loader.load_excel`` returns without raising.
    """
    try:
        loader.load_excel(*args, **kwargs)
    except Exception as e:
        print(f"✅ Correctly caught error for {description}: {type(e).__name__}")
    else:
        # Raised from the else branch so the handler above cannot swallow it.
        raise AssertionError(f"Should have raised an error for {description}")


# Test non-existent file
_assert_load_fails("non-existent file", 'nonexistent_file.xlsx')

# Test invalid sheet name
_assert_load_fails("non-existent sheet", str(excel_file_single), sheet_name='NonExistentSheet')

print("\n✅ Error handling tests passed!")

=== ERROR HANDLING TEST ===
📈 Loading Excel file...
❌ Error loading Excel file: Excel file not found: nonexistent_file.xlsx
✅ Correctly caught error for non-existent file: ValueError
📈 Loading Excel file...
❌ Error loading Excel file: Worksheet named 'NonExistentSheet' not found
✅ Correctly caught error for non-existent sheet: ValueError

✅ Error handling tests passed!


## 10. Performance Test

In [None]:
print("=== PERFORMANCE TEST ===")
try:
    # Test memory estimation
    memory_estimate = loader.estimate_memory_usage(str(excel_file_single))

    # Verify estimation structure first, so a missing key is reported by a
    # descriptive assertion rather than by a KeyError in the prints below.
    assert 'file_size_mb' in memory_estimate, "Missing file_size_mb"
    assert 'estimated_memory_mb' in memory_estimate, "Missing estimated_memory_mb"
    assert memory_estimate['file_size_mb'] > 0, "File size should be positive"

    print(f"\n💾 Memory Estimation:")
    print(f"File size: {memory_estimate['file_size_mb']:.3f} MB")
    print(f"Estimated memory: {memory_estimate['estimated_memory_mb']:.3f} MB")

    # The chunk size is optional output: it is only printed when present and
    # truthy. NOTE(review): assumes memory_estimate is dict-like (.get) — confirm.
    recommended_chunksize = memory_estimate.get('recommended_chunksize')
    if recommended_chunksize:
        print(f"Recommended chunk size: {recommended_chunksize}")

    print("\n✅ Performance test passed!")

except Exception as e:
    print(f"❌ Error: {e}")
    # Re-raise so a failed assertion actually fails the cell instead of being
    # reduced to a printed message.
    raise

=== PERFORMANCE TEST ===
💾 File size: 0.0MB, estimated memory: 0.0MB

💾 Memory Estimation:
File size: 0.005 MB
Estimated memory: 0.012 MB

✅ Performance test passed!


## Summary

In [None]:
import shutil

# Closing banner plus the list of exercised features, emitted in one call;
# each entry ends up on its own line exactly as separate print calls would.
summary_lines = [
    "\n" + "="*50,
    "🎯 SMARTAUTODATALOADER EXCEL TESTING COMPLETE",
    "="*50,
    "\n✅ All tests completed successfully!",
    "\n📋 Features tested:",
    "   • Format detection for Excel files",
    "   • Single sheet Excel loading",
    "   • Multi-sheet Excel handling",
    "   • Universal load method delegation",
    "   • DateTime detection and parsing",
    "   • Comprehensive reporting",
    "   • Error handling",
    "   • Performance estimation",
    "\n🎉 SmartAutoDataLoader Excel functionality is working correctly!",
]
print(*summary_lines, sep="\n")

# Cleanup: remove the scratch directory and every workbook inside it
if test_dir.exists():
    shutil.rmtree(test_dir)
    print(f"\n🧹 Cleaned up test directory: {test_dir}")


🎯 SMARTAUTODATALOADER EXCEL TESTING COMPLETE

✅ All tests completed successfully!

📋 Features tested:
   • Format detection for Excel files
   • Single sheet Excel loading
   • Multi-sheet Excel handling
   • Universal load method delegation
   • DateTime detection and parsing
   • Comprehensive reporting
   • Error handling
   • Performance estimation

🎉 SmartAutoDataLoader Excel functionality is working correctly!

🧹 Cleaned up test directory: test_data
