In [8]:
import pandas as pd
from pathlib import Path

# Directories holding the CSV exports to compare
part_a_dir = Path('./PARTA-output')
part_b_dir = Path('./PARTB-output')

# Sorted lists (not one-shot generators) so the read order is deterministic:
# the order decides which row survives drop_duplicates() further down.
part_a_files = sorted(part_a_dir.glob('*.csv'))
part_b_files = sorted(part_b_dir.glob('*.csv'))


def read_csv_files(files):
    """Read each CSV path in *files* and stack the rows into one DataFrame.

    Args:
        files: Iterable of CSV file paths, read in the order given.

    Returns:
        The concatenated rows with a fresh 0..n-1 index. An empty DataFrame
        (no rows, no columns) is returned when *files* is empty, because
        ``pd.concat`` raises ``ValueError`` when given nothing to concatenate.
    """
    frames = [pd.read_csv(csv_path) for csv_path in files]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


def lastname_differences(indexed_a, indexed_b):
    """Return the rows whose index label occurs in only one of the two frames.

    The frames are placed side by side under the column groups ``PARTA`` and
    ``PARTB`` (outer join on the index), then every label present in both
    indexes is dropped. Membership is decided from the indexes themselves
    rather than from NaN cells, so a label found in both frames is never
    reported just because one of its cells is empty, and labels are still
    compared when the frames carry no columns besides the index.

    Args:
        indexed_a: DataFrame for part A with a unique index.
        indexed_b: DataFrame for part B with a unique index.

    Returns:
        DataFrame with two-level columns (``PARTA`` / ``PARTB``) holding only
        the labels that are missing from one side; the missing side is NaN.
    """
    side_by_side = pd.concat(
        [indexed_a, indexed_b],
        axis='columns',
        keys=['PARTA', 'PARTB'],
        sort=False,
    )
    in_both = indexed_a.index.intersection(indexed_b.index)
    return side_by_side[~side_by_side.index.isin(in_both)]


# Read the CSV files into DataFrames (empty DataFrame if a directory has none)
df_a = read_csv_files(part_a_files)
df_b = read_csv_files(part_b_files)

# Ensure that 'LastName' column exists in both DataFrames
if 'LastName' in df_a.columns and 'LastName' in df_b.columns:
    # Keep the first row per LastName so it can serve as a unique index
    df_a = df_a.drop_duplicates(subset='LastName').set_index('LastName')
    df_b = df_b.drop_duplicates(subset='LastName').set_index('LastName')

    # Keep only the last names that appear in one part but not the other
    diff = lastname_differences(df_a, df_b)

    # Save the differences to a new CSV file
    diff.to_csv('differencesAB.csv')
    print('The differences have been saved to differencesAB.csv')
else:
    print('Error: The column "LastName" was not found in both DataFrames.')


The differences have been saved to differencesAB.csv
