<a href="https://colab.research.google.com/github/suzannefox/burrow/blob/main/compare_metadata_dataframes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [2]:
# Helpers and entry point for reporting similarities and differences
# between 2 metadata dataframes.
def _resolve_frame_name(frame, explicit_name, fallback):
    """Pick the label used for ``frame`` in the report.

    An explicitly supplied name always wins. Otherwise the module globals are
    searched for a variable bound to this exact object; names without a
    leading underscore are preferred so that IPython's output-cache aliases
    (``_``, ``__``, ...) do not shadow the user's own variable name. If the
    object is not bound to any global name, ``fallback`` is returned instead
    of raising.
    """
    if explicit_name is not None:
        return str(explicit_name)

    # Snapshot the items so the scan is unaffected by concurrent namespace changes.
    bound_names = [name for name, value in list(globals().items()) if value is frame]
    public_names = [name for name in bound_names if not name.startswith("_")]
    if public_names:
        return public_names[0]
    if bound_names:
        return bound_names[0]
    return fallback


def _is_missing(value):
    """Return True when ``value`` is a scalar missing marker (None, NaN, NA, NaT)."""
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def _values_differ(left, right):
    """Compare two cell values, treating two missing values as equal."""
    left_missing, right_missing = _is_missing(left), _is_missing(right)
    if left_missing or right_missing:
        # Missing on exactly one side is a difference; missing on both is not.
        return left_missing != right_missing
    return left != right


def _first_row_by_variable(frame):
    """Map each value in 'Variables' to the (Order, Excel) pair of its first row.

    The dict preserves the order in which variables first appear in ``frame``.
    """
    lookup = {}
    rows = zip(frame['Variables'].values, frame['Order'].values, frame['Excel'].values)
    for var, order, excel in rows:
        # setdefault keeps the first occurrence when a variable is repeated.
        lookup.setdefault(var, (order, excel))
    return lookup


def generate_report(z1, z2, z1name=None, z2name=None):
    """Build a plain-text report comparing two metadata dataframes.

    Both frames must have the columns 'Variables', 'Order' and 'Excel'.
    The report lists the variables found only in ``z1``, the variables found
    only in ``z2``, and, for each variable present in both, any difference in
    its 'Order' or 'Excel' value (using the first row for that variable).

    Variables are listed in order of first appearance (``z1`` order for the
    common variables), so the report is identical from run to run.

    Parameters
    ----------
    z1, z2 : pandas.DataFrame
        The metadata frames to compare.
    z1name, z2name : str, optional
        Labels used for the frames in the report. When omitted, the name of a
        module-level variable bound to the frame is used if one exists,
        otherwise the labels default to "z1" and "z2".

    Returns
    -------
    str
        The report, with sections separated by blank lines.

    Raises
    ------
    KeyError
        If either frame lacks one of the required columns.
    """
    z1name = _resolve_frame_name(z1, z1name, "z1")
    z2name = _resolve_frame_name(z2, z2name, "z2")

    # One pass per frame instead of re-filtering both frames for every variable.
    z1_rows = _first_row_by_variable(z1)
    z2_rows = _first_row_by_variable(z2)

    unique_z1_vars = [var for var in z1_rows if var not in z2_rows]
    unique_z2_vars = [var for var in z2_rows if var not in z1_rows]
    common_vars = [var for var in z1_rows if var in z2_rows]

    # str() keeps the join safe when variable names are not strings.
    report = [
        f"Unique Variables in {z1name}:\n" + "\n".join(map(str, unique_z1_vars)),
        f"\nUnique Variables in {z2name}:\n" + "\n".join(map(str, unique_z2_vars)),
        "\nDifferences in common variables:",
    ]

    for var in common_vars:
        if _is_missing(var):
            # A missing variable name cannot be matched to a row; nothing to compare.
            continue
        z1_order, z1_excel = z1_rows[var]
        z2_order, z2_excel = z2_rows[var]
        order_differs = _values_differ(z1_order, z2_order)
        excel_differs = _values_differ(z1_excel, z2_excel)
        if not (order_differs or excel_differs):
            continue
        report.append(f"\nVariable: {var}")
        if order_differs:
            report.append(f"  Order: {z1name} = {z1_order}, {z2name} = {z2_order}")
        if excel_differs:
            report.append(f"  Excel: {z1name} = {z1_excel}, {z2name} = {z2_excel}")

    return "\n".join(report)


In [3]:
# Example metadata snapshots for two dates. Each frame has one row per
# variable with its position ('Order') and spreadsheet column letter ('Excel').
# The second snapshot differs from the first in some variable names, in the
# ordering, and by an extra 'timestamp' entry.
penguins_meta_20240621 = pd.DataFrame(
    {
        'Variables': [
            'species',
            'sex',
            'island',
            'bill_length_mm',
            'bill_depth_mm',
            'flipper_length_mm',
            'body_mass_g',
        ],
        'Order': list(range(1, 8)),   # 1..7
        'Excel': list('ABCDEFG'),     # 'A'..'G'
    }
)

penguins_meta_20240623 = pd.DataFrame(
    {
        'Variables': [
            'species',
            'island',
            'billlength',
            'billdepth',
            'flipper_length_mm',
            'body_mass_g',
            'sex',
            'timestamp',
        ],
        'Order': list(range(1, 9)),   # 1..8
        'Excel': list('ABCDEFGH'),    # 'A'..'H'
    }
)

In [4]:
# Compare the two example metadata snapshots and show the text report
report = generate_report(
    z1=penguins_meta_20240621,
    z2=penguins_meta_20240623,
)
print(report)

Unique Variables in penguins_meta_20240621:
bill_depth_mm
bill_length_mm

Unique Variables in penguins_meta_20240623:
timestamp
billdepth
billlength

Differences in common variables:

Variable: island
  Order: penguins_meta_20240621 = 3, penguins_meta_20240623 = 2
  Excel: penguins_meta_20240621 = C, penguins_meta_20240623 = B

Variable: flipper_length_mm
  Order: penguins_meta_20240621 = 6, penguins_meta_20240623 = 5
  Excel: penguins_meta_20240621 = F, penguins_meta_20240623 = E

Variable: sex
  Order: penguins_meta_20240621 = 2, penguins_meta_20240623 = 7
  Excel: penguins_meta_20240621 = B, penguins_meta_20240623 = G

Variable: body_mass_g
  Order: penguins_meta_20240621 = 7, penguins_meta_20240623 = 6
  Excel: penguins_meta_20240621 = G, penguins_meta_20240623 = F
