In [12]:
import sys
import os
from pathlib import Path

# Make the project's data_loader directory importable from this notebook.
# NOTE: this is an absolute path on the author's machine.
data_loader_path = '/Users/svitlanakovalivska/layered-populate-data-pool-da/db_population_utils/data_loader'
if data_loader_path not in sys.path:
    sys.path.insert(0, data_loader_path)

print(f"Added data_loader directory to path: {data_loader_path}")

# Report the kernel's working directory. If it cannot be resolved
# (os.getcwd() raises FileNotFoundError), switch to the project root instead.
try:
    current_dir = os.getcwd()
except FileNotFoundError:
    print("Current working directory is not accessible, but path is set correctly")
    # Set to a known working directory
    os.chdir('/Users/svitlanakovalivska/layered-populate-data-pool-da')
    print(f"Changed to project root: {os.getcwd()}")
else:
    print(f"Current working directory: {current_dir}")

# Confirm the loader module is present before later cells try to import it
smart_loader_file = Path(data_loader_path).joinpath('smart_auto_data_loader.py')
if not smart_loader_file.exists():
    print(f"❌ smart_auto_data_loader.py not found at {smart_loader_file}")
else:
    print(f"✅ Found smart_auto_data_loader.py")

Added data_loader directory to path: /Users/svitlanakovalivska/layered-populate-data-pool-da/db_population_utils/data_loader
Current working directory: /Users/svitlanakovalivska/layered-populate-data-pool-da
✅ Found smart_auto_data_loader.py


# CSV loading test: SmartAutoDataLoader

In [13]:
# Load the sample CSV file through SmartAutoDataLoader.
# `loader` and `df` stay bound in the notebook namespace for later cells.
from smart_auto_data_loader import SmartAutoDataLoader

csv_path = "/Users/svitlanakovalivska/layered-populate-data-pool-da/db_population_utils/data/test.csv"

loader = SmartAutoDataLoader()
df = loader.load(csv_path)
    

🎯 SmartAutoDataLoader ready!
🎯 Loading file: test.csv
🔍 Format detected: csv
📊 Loading CSV file...
🔤 Encoding detected: utf-8
🗓️ Searching for date columns...
   📅 No date columns detected
✅ CSV loaded: 3263 rows, 4 columns


In [14]:
# Display the column labels of the DataFrame currently bound to `df`
df.columns


Index(['id', 'keyword', 'location', 'text'], dtype='object')

# Excel loading test: SmartAutoDataLoader

In [15]:
# Load the sample Excel workbook through SmartAutoDataLoader and preview
# the first rows of the resulting DataFrame.
from smart_auto_data_loader import SmartAutoDataLoader

excel_path = "/Users/svitlanakovalivska/layered-populate-data-pool-da/db_population_utils/data/statistischer-bericht-auslaend-bevoelkerung-2010200247005.xlsx"

loader = SmartAutoDataLoader()
df = loader.load(excel_path)

print(df.head())

🎯 SmartAutoDataLoader ready!
🎯 Loading file: statistischer-bericht-auslaend-bevoelkerung-2010200247005.xlsx
🔍 Format detected: excel
📈 Loading Excel file...
   📋 Available sheets: ['Titel', 'Inhaltsübersicht', 'GENESIS-Online', 'Impressum', 'Informationen zur Statistik', '12521-01', '12521-02', '12521-03', '12521-04', '12521-05', '12521-06', '12521-07', '12521-08', '12521-09', '12521-10', '12521-11', '12521-12', '12521-13', '12521-14', '12521-15', '12521-16', 'Erläuterung_zu_CSV-Tabellen', 'csv-12521-01', 'csv-12521-02', 'csv-12521-03', 'csv-12521-04', 'csv-12521-05', 'csv-12521-06', 'csv-12521-07', 'csv-12521-08', 'csv-12521-09', 'csv-12521-10', 'csv-12521-11', 'csv-12521-12', 'csv-12521-13', 'csv-12521-14', 'csv-12521-15', 'csv-12521-16']
   ✅ Selected sheet: 'csv-12521-15'
🗓️ Searching for date columns...
   ✅ Found date column: 'Stichtag' (%d.%m.%Y)
   📅 Total date columns found: 1
✅ Excel loaded: 1309 rows, 24 columns
   📊 Column names: ['Statistik_Code', 'Statistik_Label', 'Stich

# JSON loading test: SmartAutoDataLoader

In [16]:
# Load a JSON file through SmartAutoDataLoader and preview the first rows.
# NOTE: "___.json" is a placeholder file name; point it at a real file.
from smart_auto_data_loader import SmartAutoDataLoader

json_path = "___.json"

loader = SmartAutoDataLoader()
df = loader.load(json_path)

print(df.head())

🎯 SmartAutoDataLoader ready!
🎯 Loading file: ___.json
🔍 Format detected: json
🗂️ Loading JSON file with intelligent analysis...
❌ Error loading JSON: [Errno 2] No such file or directory: '___.json'
Empty DataFrame
Columns: []
Index: []


Traceback (most recent call last):
  File "/Users/svitlanakovalivska/layered-populate-data-pool-da/db_population_utils/data_loader/smart_auto_data_loader.py", line 251, in load_json
    with open(source, 'r', encoding='utf-8') as f:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: '___.json'


In [17]:
# DIAGNOSIS: Why still only 22 columns?
import json
import pandas as pd

json_file = "____.json"

print("=== 🔍 JSON FLATTENING DIAGNOSIS ===")

# Read the raw JSON document from disk
with open(json_file, 'r', encoding='utf-8') as json_fh:
    raw_data = json.load(json_fh)

record_count = len(raw_data)
print(f"Source data: {record_count} records")

# Inspect the nesting of the first record only
first_item = raw_data[0]
workflow_steps = first_item.get('workflowSteps', [])
step_count = len(workflow_steps)
print(f"workflowSteps: {step_count} elements")

if step_count > 0:
    first_step = workflow_steps[0]
    input_rows = first_step.get('inputRows', [])
    print(f"inputRows in first step: {len(input_rows)} elements")

# Baseline for comparison: flatten with pandas' built-in normaliser
df_pandas = pd.json_normalize(raw_data, sep='_', max_level=None)
print(f"pandas.json_normalize: {df_pandas.shape}")
# Deep flattening
def deep_flatten(data, sep='_', prefix=''):
    """Flatten a nested dict into a single-level dict with joined keys.

    Nested dict keys are joined with ``sep``. A non-empty list whose first
    element is a dict is expanded element by element, using the element's
    index as a key segment. Any other list is stored as its ``str()``
    representation, or as ``''`` when the list is empty.

    Args:
        data: The object to flatten. Anything that is not a dict yields an
            empty result.
        sep: Separator placed between key segments, including list indices.
        prefix: Key prefix accumulated by the recursive calls; leave empty
            when calling from outside.

    Returns:
        dict: A flat mapping from joined key to leaf value.
    """
    result = {}
    if not isinstance(data, dict):
        return result

    for key, value in data.items():
        new_key = f"{prefix}{sep}{key}" if prefix else key
        if isinstance(value, dict):
            result.update(deep_flatten(value, sep, new_key))
        elif isinstance(value, list):
            if value and isinstance(value[0], dict):
                for i, item in enumerate(value):
                    # Use `sep` (not a hard-coded '_') so index segments
                    # follow the same separator as the rest of the key.
                    item_key = f"{new_key}{sep}{i}"
                    if isinstance(item, dict):
                        result.update(deep_flatten(item, sep, item_key))
                    else:
                        # Keep non-dict items of a mixed list instead of
                        # silently dropping them.
                        result[item_key] = item
            else:
                result[new_key] = str(value) if value else ''
        else:
            result[new_key] = value
    return result

# Flatten every record, then build the frame from the resulting dicts in one step
flattened_records = list(map(deep_flatten, raw_data))
df_deep = pd.DataFrame(flattened_records)
print(f"Deep flattening: {df_deep.shape}")

=== 🔍 JSON FLATTENING DIAGNOSIS ===


FileNotFoundError: [Errno 2] No such file or directory: '____.json'

In [None]:
# Compare the column counts produced by the two flattening approaches
deep_col_count = df_deep.shape[1]
pandas_col_count = df_pandas.shape[1]
print(f"✅ Deep flattening gave {deep_col_count} columns!")
print(f"❌ pandas.json_normalize gave only {pandas_col_count} columns")

# Collect the flattened columns whose names mention 'workflow' / 'input'
# (case-insensitive match on the column label)
lowered_columns = [(col, col.lower()) for col in df_deep.columns]
workflow_cols = [col for col, lowered in lowered_columns if 'workflow' in lowered]
input_cols = [col for col, lowered in lowered_columns if 'input' in lowered]

print(f"\nColumns with 'workflow': {len(workflow_cols)}")
print(f"Columns with 'input': {len(input_cols)}")

if len(workflow_cols) > 0:
    print("Workflow column examples:", workflow_cols[:3])

print(f"\n💡 CONCLUSION:")
# NOTE(review): 22 is hard-coded; presumably the column count observed
# before deep flattening. Confirm it still matches the current data.
print(f"Proper flattening gives {deep_col_count} columns instead of 22")
print(f"Need to replace pd.json_normalize with deep_flatten in SmartAutoDataLoader!")

In [None]:
# Reload the loader module so recent edits to smart_auto_data_loader.py are
# picked up, then check whether the smart JSON methods are available.
import importlib
import smart_auto_data_loader
importlib.reload(smart_auto_data_loader)

from smart_auto_data_loader import SmartAutoDataLoader

print("=== 🔍 CHECKING SMART METHODS ===")

loader = SmartAutoDataLoader(verbose=True)

# Method names to look for on the loader instance
methods_to_check = ['_analyze_json_complexity', '_smart_deep_flatten_json']

for method in methods_to_check:
    print(f"Has {method}: {hasattr(loader, method)}")

# Show the current load_json parameters and whether auto_deep_flatten is one of them
import inspect
try:
    signature = inspect.signature(loader.load_json)
    parameter_names = list(signature.parameters.keys())
    print(f"\nCurrent load_json parameters: {parameter_names}")

    # Check for auto_deep_flatten parameter
    flag_status = (
        f"✅ load_json has auto_deep_flatten parameter"
        if 'auto_deep_flatten' in signature.parameters
        else f"❌ load_json missing auto_deep_flatten parameter"
    )
    print(flag_status)

except Exception as exc:
    print(f"Cannot inspect load_json: {exc}")

# Test simple loading until methods are added
print(f"\n=== 🧪 TESTING CURRENT FUNCTIONALITY ===")
orders_json_path = "/Users/svitlanakovalivska/layered-populate-data-pool-da/db_population_utils/data/KG_Export_Aufträge.json"
try:
    df_test = loader.load(orders_json_path)
    print(f"Current result: {df_test.shape}")

    # NOTE(review): 400 is the author's threshold; presumably the minimum
    # column count expected once deep flattening is active. Confirm.
    if not df_test.shape[1] > 400:
        print("📝 NOTE: Still using basic flattening, need to add smart methods to file")
    else:
        print("🎉 SUCCESS! Smart flattening is working!")

except Exception as exc:
    print(f"Error during testing: {exc}")

print(f"\n💡 NEXT STEPS:")
next_steps = (
    f"1. Add _analyze_json_complexity method to smart_auto_data_loader.py",
    f"2. Add _smart_deep_flatten_json method to smart_auto_data_loader.py",
    f"3. Update load_json method with auto_deep_flatten parameter",
    f"4. Restart kernel and test again",
)
for step in next_steps:
    print(step)