# Excel File Row Difference Analyzer

This notebook provides a function that compares two Excel files, identifies the rows that differ between them, and writes the results to a multi-sheet Excel report.

In [None]:
from pathlib import Path

import pandas as pd

In [ ]:
def diff_xlsx_files(file1_path, file2_path, output_path, join_columns=None):
    """
    Compare two Excel files and write a multi-sheet row-difference report.

    Both files are loaded with ``pd.read_excel`` and must have identical
    column names in identical order. Rows are matched with an outer merge
    on the comparison columns, and each merged row is classified as present
    only in file 1, only in file 2, or in both. The report workbook contains
    a ``Summary`` sheet, one sheet per non-empty category, and copies of
    both original tables.

    Args:
        file1_path: Path to first Excel file (reference)
        file2_path: Path to second Excel file (comparison)
        output_path: Path for difference report Excel file; missing parent
            directories are created
        join_columns: Column name, or iterable of column names, used to
            match rows. If None or empty, all columns are compared.

    Returns:
        dict: Summary of row differences found. When both files are empty
        the dict is ``{"status": "both_empty", "differences": 0}`` and no
        report is written. Otherwise it has the keys ``status``,
        ``file1_rows``, ``file2_rows``, ``only_in_file1``,
        ``only_in_file2``, ``common_rows``, ``total_differences`` and
        ``comparison_columns``.

    Raises:
        FileNotFoundError: If input files don't exist
        ValueError: If the column layouts differ or a join column is
            missing from the files

    Note:
        Matching is done by a merge, so a key combination that occurs more
        than once in both files is paired many-to-many. In that case the
        "common" and "Total unique rows" counts can exceed the number of
        rows in either input.
    """
    # Convert to Path objects
    file1 = Path(file1_path)
    file2 = Path(file2_path)
    output = Path(output_path)

    # Validate input files exist before touching the filesystem
    for source in (file1, file2):
        if not source.exists():
            raise FileNotFoundError(f"File not found: {source}")

    # Create output directory if needed
    output.parent.mkdir(parents=True, exist_ok=True)

    try:
        # Read files
        df1 = pd.read_excel(file1)
        df2 = pd.read_excel(file2)

        print(f"File 1: {len(df1)} rows, {len(df1.columns)} columns")
        print(f"File 2: {len(df2)} rows, {len(df2.columns)} columns")

        # Nothing to compare or report when neither file has data
        if df1.empty and df2.empty:
            print("Warning: Both files are empty")
            return {"status": "both_empty", "differences": 0}

        # Ensure same columns for comparison
        if list(df1.columns) != list(df2.columns):
            raise ValueError(
                f"Column mismatch - File 1: {list(df1.columns)}, "
                f"File 2: {list(df2.columns)}"
            )

        # Normalise join_columns to a fresh list. A bare string must be
        # wrapped explicitly: iterating it would split the column name into
        # single characters. Copying also keeps the returned summary from
        # aliasing the caller's list.
        if join_columns is None:
            requested_cols = []
        elif isinstance(join_columns, str):
            requested_cols = [join_columns]
        else:
            requested_cols = list(join_columns)

        # Determine comparison columns
        if requested_cols:
            # Validate join columns exist
            missing_cols = set(requested_cols) - set(df1.columns)
            if missing_cols:
                raise ValueError(f"Join columns not found: {missing_cols}")
            comparison_cols = requested_cols
        else:
            # Use all columns for comparison
            comparison_cols = list(df1.columns)

        print(f"Comparing rows using columns: {comparison_cols}")

        # Outer merge keeps every row from both sides; the indicator column
        # records which side(s) each merged row came from.
        merged = pd.merge(
            df1, df2,
            on=comparison_cols,
            how='outer',
            indicator=True
        )

        # Separate different types of rows
        only_in_file1 = merged[merged['_merge'] == 'left_only'].drop('_merge', axis=1)
        only_in_file2 = merged[merged['_merge'] == 'right_only'].drop('_merge', axis=1)
        common_rows = merged[merged['_merge'] == 'both'].drop('_merge', axis=1)

        # Create summary
        summary_df = pd.DataFrame({
            'Metric': [
                'Rows only in File 1',
                'Rows only in File 2',
                'Common rows',
                'Total rows in File 1',
                'Total rows in File 2',
                'Total unique rows'
            ],
            'Count': [
                len(only_in_file1),
                len(only_in_file2),
                len(common_rows),
                len(df1),
                len(df2),
                len(merged)
            ]
        })

        # (frame, sheet name, label used in the progress message)
        category_sheets = (
            (only_in_file1, 'Only_in_File1', 'rows only in File 1'),
            (only_in_file2, 'Only_in_File2', 'rows only in File 2'),
            (common_rows, 'Common_Rows', 'common rows'),
        )

        # Write results to Excel with multiple sheets
        with pd.ExcelWriter(output, engine='openpyxl') as writer:
            summary_df.to_excel(writer, sheet_name='Summary', index=False)

            # Category sheets are only created when they have rows
            for frame, sheet_name, label in category_sheets:
                if len(frame) > 0:
                    frame.to_excel(writer, sheet_name=sheet_name, index=False)
                    print(f"Found {len(frame)} {label}")

            # Original files for reference
            df1.to_excel(writer, sheet_name='Original_File1', index=False)
            df2.to_excel(writer, sheet_name='Original_File2', index=False)

        # Prepare summary
        total_differences = len(only_in_file1) + len(only_in_file2)
        summary = {
            "status": "completed",
            "file1_rows": len(df1),
            "file2_rows": len(df2),
            "only_in_file1": len(only_in_file1),
            "only_in_file2": len(only_in_file2),
            "common_rows": len(common_rows),
            "total_differences": total_differences,
            "comparison_columns": comparison_cols
        }

        print("Row difference analysis completed.")
        print(f"Total differences: {total_differences} rows")
        print(f"Report saved to: {output}")

        return summary

    except Exception as e:
        # Report the failure for interactive users, then let callers handle it
        print(f"Error processing files: {e}")
        raise

## Usage Examples

### Example 1: Compare all columns to find row differences

In [ ]:
# Example inputs: with no join columns given, every column takes part in row matching
file1_path = Path('data1.xlsx')
file2_path = Path('data2.xlsx')
output_path = Path('row_differences.xlsx')

try:
    summary = diff_xlsx_files(file1_path, file2_path, output_path)
except FileNotFoundError as err:
    # One of the input workbooks is missing
    print(f"File error: {err}")
except ValueError as err:
    # The workbooks could not be compared (e.g. column mismatch)
    print(f"Data error: {err}")
except Exception as err:
    print(f"Unexpected error: {err}")
else:
    print("Row comparison completed successfully!")
    print(f"Summary: {summary}")

### Example 2: Compare using specific key columns to match rows

In [ ]:
# Example inputs: rows are matched on the key columns only
file1_path = Path('dataset1.xlsx')
file2_path = Path('dataset2.xlsx')
output_path = Path('row_differences_by_key.xlsx')
key_cols = ['id', 'name']  # Columns that together identify a row

try:
    summary = diff_xlsx_files(file1_path, file2_path, output_path, join_columns=key_cols)
except FileNotFoundError as err:
    # One of the input workbooks is missing
    print(f"File error: {err}")
except ValueError as err:
    # The workbooks could not be compared (e.g. missing key column)
    print(f"Data error: {err}")
except Exception as err:
    print(f"Unexpected error: {err}")
else:
    print("Row comparison completed successfully!")
    print(f"Summary: {summary}")

## View Results

If the comparison succeeded, you can preview the summary:

In [None]:
# Print each summary entry, provided a `summary` variable exists in this session
if 'summary' not in locals():
    print("No comparison results available")
else:
    print("Difference Analysis Summary:")
    for metric, count in summary.items():
        print(f"  {metric}: {count}")

In [None]:
# Optional: preview the 'Summary' sheet of the generated report, if the file exists
if not ('output_path' in locals() and Path(output_path).exists()):
    print("No difference report file found")
else:
    # Load only the summary sheet from the report workbook
    summary_df = pd.read_excel(output_path, sheet_name='Summary')
    print("Difference Report Summary:")
    display(summary_df)