In [None]:
import os
import pandas as pd
from pyspark.sql import SparkSession
from delta.tables import DeltaTable
import shutil

def get_latest_parquet_file(folder_path):
    """Return the path of the "latest" Parquet entry inside ``folder_path``.

    "Latest" means the entry whose name sorts last lexicographically among
    the names ending in ``.parquet``; modification times are not consulted.

    Args:
        folder_path: Directory to scan.

    Returns:
        ``folder_path`` joined with the selected name, or ``None`` when the
        folder holds no matching entry or any error occurs while scanning
        (a message is printed in both cases).
    """
    try:
        # Keep only the names that carry the Parquet extension
        candidates = []
        for entry in os.listdir(folder_path):
            if entry.endswith('.parquet'):
                candidates.append(entry)

        if len(candidates) == 0:
            print(f"No Parquet files found in {folder_path}")
            return None

        # The greatest name is the first element of a descending sort
        newest_name = max(candidates)
        return os.path.join(folder_path, newest_name)

    except Exception as e:
        # Best effort: report the problem and signal "nothing to process"
        print(f"Error finding latest Parquet file in {folder_path}: {e}")
        return None

def identify_primary_keys(df):
    """Choose the columns to use as merge keys for ``df``.

    A column qualifies when its lower-cased name contains ``sequence_id``.
    Qualifying names are returned once each, in their original order.
    When no column qualifies, every column of ``df`` is returned instead.

    Args:
        df: Object exposing a ``columns`` iterable of string names
            (the caller passes a pandas DataFrame).

    Returns:
        List of column names to merge on.
    """
    keywords = ['sequence_id']

    # A dict keeps first-seen order while dropping repeated names
    matched = {}
    for column in df.columns:
        lowered = column.lower()
        for keyword in keywords:
            if keyword in lowered:
                matched.setdefault(column, None)
                break

    if matched:
        return list(matched)

    # Nothing looked like a key: fall back to all columns
    return list(df.columns)

def generate_merge_condition(merge_keys):
    """Build the SQL join condition used to merge ``new`` into ``existing``.

    Each key contributes one equality test between the ``existing`` and
    ``new`` aliases; the tests are combined with ``AND``. Column names are
    wrapped in backticks (embedded backticks doubled) so that names
    containing characters such as ``/``, ``.``, ``-`` or spaces, or names
    that collide with reserved words, still parse as a single identifier.

    Plain ``=`` is used, so rows holding NULL in a key column never compare
    equal and are treated as unmatched.

    Args:
        merge_keys: Iterable of column names to match on.

    Returns:
        The condition string, or an empty string when ``merge_keys`` is empty.
    """
    def _quote(name):
        # Spark SQL escapes a backtick inside a quoted identifier by doubling it
        return "`" + str(name).replace("`", "``") + "`"

    return " AND ".join(
        f"existing.{_quote(key)} = new.{_quote(key)}" for key in merge_keys
    )

def process_sap_table(table_name):
    """Load the latest unprocessed Parquet file of one table into Delta.

    Steps:
      1. Ensure the table's ``processed`` folder exists.
      2. Pick one file from the ``unprocessed`` folder via
         ``get_latest_parquet_file``; return early if there is none.
      3. Read it with pandas and convert it to a Spark DataFrame.
      4. If ``Tables/dbo/<table_name>`` is already a Delta table, upsert the
         rows on the keys from ``identify_primary_keys``; otherwise write a
         new Delta table.
      5. Move the file into the ``processed`` folder.

    Any exception raised in steps 3-5 is caught and printed; nothing is
    re-raised and the file is left where it was.

    Relies on a ``spark`` session being available as a global; it is not
    defined in this cell.

    NOTE(review): only one file is handled per call. Files left in
    ``unprocessed`` are picked up by later calls, so an older file can be
    merged after a newer one - confirm that this is intended.

    Args:
        table_name: Name of the table's sub-folder under ``inbound-sap``,
            also used as the Delta table name.
    """
    # Source folder, archive folder and Delta target for this table
    source_dir = f'/lakehouse/default/Files/inbound-sap/{table_name}/unprocessed'
    archive_dir = f'/lakehouse/default/Files/inbound-sap/{table_name}/processed'
    target_path = f'Tables/dbo/{table_name}'

    # The archive folder must exist before a file can be moved into it
    os.makedirs(archive_dir, exist_ok=True)

    latest_file = get_latest_parquet_file(source_dir)
    if not latest_file:
        print(f"No files to process for {table_name}")
        return

    try:
        # pandas does the read; Spark needs its own DataFrame for Delta
        pandas_frame = pd.read_parquet(latest_file)
        spark_frame = spark.createDataFrame(pandas_frame)

        merge_keys = identify_primary_keys(pandas_frame)
        print(f"Identified merge keys for {table_name}: {merge_keys}")

        if not DeltaTable.isDeltaTable(spark, target_path):
            # First load for this table: write the data as a new Delta table
            writer = spark_frame.write.format("delta").mode("overwrite")
            writer.save(target_path)
            print(f"Created new Delta table for {table_name}")
        else:
            # Table exists: update matching rows, insert the rest
            target_table = DeltaTable.forPath(spark, target_path)
            condition = generate_merge_condition(merge_keys)

            upsert = target_table.alias('existing').merge(
                spark_frame.alias('new'), condition
            )
            upsert = upsert.whenMatchedUpdateAll().whenNotMatchedInsertAll()
            upsert.execute()

            print(f"Merged data for {table_name}")

        # Archive the file only after the load succeeded
        file_name = os.path.basename(latest_file)
        processed_file = os.path.join(archive_dir, file_name)
        shutil.move(latest_file, processed_file)
        print(f"Moved {latest_file} to {processed_file}")

    except Exception as e:
        print(f"Error processing {table_name}: {e}")

def process_all_sap_tables():
    """Run ``process_sap_table`` for every sub-folder of the inbound area.

    Each directory directly under ``/lakehouse/default/Files/inbound-sap``
    is treated as a table name. The directory list is collected in full
    before any table is processed.
    """
    base_path = '/lakehouse/default/Files/inbound-sap'

    # Collect the sub-folder names first; plain files are ignored
    table_names = []
    for entry in os.listdir(base_path):
        if os.path.isdir(os.path.join(base_path, entry)):
            table_names.append(entry)

    for table_name in table_names:
        print(f"Processing table: {table_name}")
        process_sap_table(table_name)

# Entry point
def main(table_name=None):
    """Process one table, or every table when no name is given.

    Args:
        table_name: Name of a single table to process. When omitted (or
            otherwise falsy), all tables found in the inbound area are
            processed.
    """
    if not table_name:
        process_all_sap_tables()
        return

    print(f"Processing single table: {table_name}")
    process_sap_table(table_name)


# Run the load when this cell/script is executed directly
if __name__ == "__main__":
    main()

In [32]:
%%sql
-- Ad-hoc check: returns every row and column of the sap_ekko table.
-- NOTE(review): presumably the Delta table written by the load cell above for
-- the sap_ekko inbound folder - confirm.
select * from sap_ekko

StatementMeta(, e638685a-0e46-4b38-b4dc-09b5fce3ab40, 34, Finished, Available, Finished)

<Spark SQL result set with 53 rows and 18 fields>