In [None]:
import IPython
import os
import pandas as pd
import pathlib
import sys

# Add the directory two levels above the working directory to the import
# search path (once), presumably so the `data_pipeline` imports below resolve.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names

# Data directory: the "data" folder that sits next to the working directory.
DATA_DIR = pathlib.Path.cwd().parent.joinpath("data")

In [None]:
# Load
# Both score CSVs are read with the tract ID column forced to the pandas
# "string" dtype instead of letting pandas infer a type for it.
path_to_score_file_1 = DATA_DIR / "compare_two_score_csvs/usa (pre 970).csv"
path_to_score_file_2 = DATA_DIR / "compare_two_score_csvs/usa (post 970).csv"


def _read_score_csv(csv_path):
    """Read one score CSV, keeping the tract ID column as "string" dtype."""
    return pd.read_csv(
        csv_path,
        dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: "string"},
    )


score_1_df = _read_score_csv(path_to_score_file_1)
score_2_df = _read_score_csv(path_to_score_file_2)

# Preview the first rows of the second file.
score_2_df.head()

In [None]:
# List columns in one but not the other.
# A symmetric difference reports columns that are unique to EITHER file; a
# plain `difference` would only show columns that file 2 has and file 1 lacks,
# silently hiding columns that exist only in file 1.
score_2_df.columns.symmetric_difference(score_1_df.columns)

In [None]:
# List rows in one but not the other

# Pull out the tract ID column of each file once.
tract_ids_1 = score_1_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]
tract_ids_2 = score_2_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]

# Flag a mismatch in the number of rows.
if len(tract_ids_2) != len(tract_ids_1):
    print("Different lengths!")

# Print the tract IDs that appear in exactly one of the two files.
print("Difference in tract IDs:")
print(set(tract_ids_2).symmetric_difference(set(tract_ids_1)))

In [None]:
# Join
# Outer-join the two score frames on the tract ID so tracts found in only one
# file are kept. Column names shared by both frames get "_1" (first file) and
# "_2" (second file) appended.
merged_df = pd.merge(
    left=score_1_df,
    right=score_2_df,
    on=ExtractTransformLoad.GEOID_TRACT_FIELD_NAME,
    how="outer",
    suffixes=("_1", "_2"),
)
merged_df

In [None]:
# Check each duplicate column.
# Columns present in both files come out of the merge as "<name>_1" and
# "<name>_2". Only names that END in "_1" and have a matching "_2" column are
# treated as duplicates: a substring test would also match names that merely
# contain "_1" somewhere, and stripping their last two characters would produce
# a column name that does not exist (KeyError below).
suffix_1, suffix_2 = "_1", "_2"
merged_column_names = set(merged_df.columns)
duplicate_columns = [
    column[: -len(suffix_1)]
    for column in merged_df.columns
    if column.endswith(suffix_1)
    and f"{column[: -len(suffix_1)]}{suffix_2}" in merged_column_names
]

# Columns that are skipped by the comparison below.
columns_to_exclude_from_duplicates_check = ["Total threshold criteria exceeded"]

columns_to_check = [
    column
    for column in duplicate_columns
    if column not in columns_to_exclude_from_duplicates_check
]

# Compare every pair; report all mismatches before failing so that a single
# run lists every differing column rather than only the first one.
any_errors_found = False
for column_to_check in columns_to_check:
    print(f"Checking duplicate column {column_to_check}")
    values_1 = merged_df[f"{column_to_check}{suffix_1}"]
    values_2 = merged_df[f"{column_to_check}{suffix_2}"]
    if not values_1.equals(values_2):
        print(f"Error! Different values in {column_to_check}")
        # Show only the rows where the two versions disagree.
        print(values_1.compare(values_2))
        any_errors_found = True

if any_errors_found:
    raise ValueError("Error! Different values in one or more columns.")