In [4]:
import os
import pandas as pd
from collections import defaultdict

def read_csv_files(folder):
    """Load every ``.csv`` file found directly inside *folder*.

    Entries are taken from ``os.listdir(folder)``; only names ending in
    ``'.csv'`` are read. Sub-folders are not descended into.

    Returns:
        A dict mapping each CSV file name (not the full path) to the
        DataFrame produced by ``pd.read_csv`` for that file.
    """
    csv_names = [name for name in os.listdir(folder) if name.endswith('.csv')]
    return {name: pd.read_csv(os.path.join(folder, name)) for name in csv_names}

def compare_dataframes(df1, df2):
    """Compare two DataFrames row-wise and count the differences.

    Rows are matched with an outer ``pd.merge`` using its default join
    keys (the columns the two frames have in common), and the merge
    indicator tells which side each row came from.

    Args:
        df1: The older DataFrame.
        df2: The newer DataFrame.

    Returns:
        A tuple ``(added, removed, changed)`` of ints where
        ``added`` is the number of rows found only in ``df2``,
        ``removed`` is the number of rows found only in ``df1``, and
        ``changed`` is the number of rows not present in both frames
        (i.e. ``added + removed``). No key column is used, so an edited
        row shows up as one removed row plus one added row.
    """
    merged = pd.merge(df1, df2, how='outer', indicator=True)
    origin = merged['_merge']

    # The previous implementation used concat + drop_duplicates(keep=False)
    # for both counts, which is the symmetric difference in either order and
    # therefore always reported added == removed. The indicator column gives
    # the direction of each difference.
    added = int((origin == 'right_only').sum())
    removed = int((origin == 'left_only').sum())

    # Same value as counting merged rows whose indicator is not 'both'.
    changed = added + removed

    return added, removed, changed

def _compare_file_sets(old_files, new_files, label, differences):
    """Compare two {file name: DataFrame} mappings and return the pair's stats.

    For every file with at least one added, removed or changed row, a
    ``(label, added, removed, changed)`` tuple is appended to
    ``differences[file_name]``.
    """
    stats = {'files': 0, 'added': 0, 'removed': 0, 'changed': 0, 'new_files': 0, 'removed_files': 0}

    # Sorted so that the order of entries in `differences` is reproducible.
    for file_name in sorted(set(old_files) | set(new_files)):
        added, removed, changed = 0, 0, 0
        if file_name in old_files and file_name in new_files:
            added, removed, changed = compare_dataframes(old_files[file_name], new_files[file_name])
        elif file_name in new_files:
            # File exists only in the newer version: all of its rows count as added.
            added = len(new_files[file_name])
            stats['new_files'] += 1
        else:
            # File exists only in the older version: all of its rows count as removed.
            removed = len(old_files[file_name])
            stats['removed_files'] += 1

        if added > 0 or removed > 0 or changed > 0:
            differences[file_name].append((label, added, removed, changed))
            stats['files'] += 1
            stats['added'] += added
            stats['removed'] += removed
            stats['changed'] += changed

    return stats


def compare_versions(folder1, folder2, folder3):
    """Compare the CSV files of three folder versions pairwise (1->2, 2->3).

    Args:
        folder1: Folder holding the first version of the CSV files.
        folder2: Folder holding the second version.
        folder3: Folder holding the third version.

    Returns:
        A tuple ``(differences, summary)``:

        * ``differences`` is a ``defaultdict(list)`` mapping a file name to
          a list of ``(label, added, removed, changed)`` tuples, one per
          version pair in which that file had any difference.
        * ``summary`` maps ``'1->2'`` and ``'2->3'`` to a dict with the
          totals ``files``, ``added``, ``removed``, ``changed``,
          ``new_files`` and ``removed_files`` for that pair.
    """
    version_1 = read_csv_files(folder1)
    version_2 = read_csv_files(folder2)
    version_3 = read_csv_files(folder3)

    differences = defaultdict(list)
    summary = {
        '1->2': _compare_file_sets(version_1, version_2, "Version 1 -> Version 2", differences),
        '2->3': _compare_file_sets(version_2, version_3, "Version 2 -> Version 3", differences),
    }

    return differences, summary

def print_summary(summary):
    """Print the per-version-pair totals held in *summary*.

    *summary* maps a version-pair label to a dict with the keys
    ``files``, ``added``, ``removed``, ``changed``, ``new_files`` and
    ``removed_files``; one indented line is printed for each of them
    under a header naming the pair.
    """
    def report_lines(version_pair, stats):
        # A generator, so each line is formatted right before it is printed.
        yield f"\nSummary for {version_pair}:"
        yield f"  Number of files with changes: {stats['files']}"
        yield f"  Total added rows: {stats['added']}"
        yield f"  Total removed rows: {stats['removed']}"
        yield f"  Total changed rows: {stats['changed']}"
        yield f"  Number of new files: {stats['new_files']}"
        yield f"  Number of removed files: {stats['removed_files']}"

    for pair_label, pair_stats in summary.items():
        for line in report_lines(pair_label, pair_stats):
            print(line)

def print_differences(differences):
    """Print the per-file change counts recorded in *differences*.

    *differences* maps a file name to a list of
    ``(version, added, removed, changed)`` tuples; each file gets a
    header line followed by one indented line per tuple.
    """
    for file_name, changes in differences.items():
        header = f"\nChanges in file: {file_name}"
        print(header)
        for record in changes:
            version, added, removed, changed = record
            detail = f"  {version}: {added} added, {removed} removed, {changed} changed"
            print(detail)


# Usage example: compare three snapshot folders of CSV files.
# The folders are passed to compare_versions in order, which compares
# folder_1 against folder_2 and folder_2 against folder_3.
folder_1 = './folders_to_compare/6 days set'
folder_2 = './folders_to_compare/9 days set'
folder_3 = './folders_to_compare/12 days set'

# `differences` holds the per-file change tuples; `summary` holds the
# totals for the '1->2' and '2->3' comparisons.
differences, summary = compare_versions(folder_1, folder_2, folder_3)

# Print the concise summary (totals per version pair)
print_summary(summary)

# Optional: Print detailed per-file changes (can be skipped if you only want the summary)
# print_differences(differences)



Summary for 1->2:
  Number of files with changes: 916
  Total added rows: 82350
  Total removed rows: 90
  Total changed rows: 0
  Number of new files: 915
  Number of removed files: 1

Summary for 2->3:
  Number of files with changes: 2909
  Total added rows: 261810
  Total removed rows: 0
  Total changed rows: 0
  Number of new files: 2909
  Number of removed files: 0
