In [5]:
import IPython
import pandas as pd
import os, sys, pathlib

# Resolve the directory two levels above the current working directory and
# put it on the import search path exactly once (re-running this cell adds
# nothing). Presumably this is what lets the `data_pipeline` imports below
# resolve -- confirm against the repository layout.
module_path = os.path.abspath("../..")
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names

# Data directory: a "data" folder that is a sibling of the current working directory.
DATA_DIR = pathlib.Path.cwd().parent.joinpath("data")

In [7]:
# Load the two full-score CSV files that will be compared.
score_csv_dir = DATA_DIR / "score" / "csv" / "full"
path_to_score_file_1 = score_csv_dir / "usa1.csv"
path_to_score_file_2 = score_csv_dir / "usa2.csv"

# Force the tract ID column to pandas' "string" dtype in both frames rather
# than letting read_csv infer a type for it.
tract_id_dtype = {ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: "string"}

score_1_df = pd.read_csv(path_to_score_file_1, dtype=tract_id_dtype)
score_2_df = pd.read_csv(path_to_score_file_2, dtype=tract_id_dtype)

# Preview the first rows of the second file.
score_2_df.head()

In [None]:
# List columns that appear in exactly one of the two score files.
# `Index.difference` is one-sided (it would only report columns that are in
# score_2_df but missing from score_1_df); `symmetric_difference` reports
# columns unique to either frame.
score_2_df.columns.symmetric_difference(score_1_df.columns)

In [None]:
# List tract IDs that appear in one score file but not the other.
tract_ids_1 = score_1_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]
tract_ids_2 = score_2_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]

# Row counts can differ even when the ID sets match, so this is
# reported separately from the set comparison below.
lengths_match = len(tract_ids_2) == len(tract_ids_1)
if not lengths_match:
    print("Different lengths!")

# Symmetric difference: IDs present in exactly one of the two files.
print("Difference in tract IDs:")
print(set(tract_ids_2) ^ set(tract_ids_1))


In [None]:
# Outer-join the two score frames on the tract ID column. Non-key columns
# that exist in both frames get "_1" (first file) / "_2" (second file)
# appended to their names.
merged_df = pd.merge(
    score_1_df,
    score_2_df,
    how="outer",
    on=ExtractTransformLoad.GEOID_TRACT_FIELD_NAME,
    suffixes=("_1", "_2"),
)
merged_df

In [None]:
# Check each duplicate column:
# Columns shared by both score files carry a "_1" / "_2" suffix in merged_df.
# Match the suffix only at the END of the column name: a substring test would
# also pick up names that merely contain "_1" (e.g. "..._10" or "..._1_x"),
# and stripping their last two characters would yield a bogus base name.
# Also require the "_2" counterpart to exist, so a column whose own name
# happens to end in "_1" is skipped instead of raising a KeyError.
duplicate_columns = [
    column[: -len("_1")]
    for column in merged_df.columns
    if column.endswith("_1")
    and f"{column[: -len('_1')]}_2" in merged_df.columns
]

for duplicate_column in duplicate_columns:
    print(f"Checking duplicate column {duplicate_column}")
    values_1 = merged_df[f"{duplicate_column}_1"]
    values_2 = merged_df[f"{duplicate_column}_2"]
    if not values_1.equals(values_2):
        # Show only the rows where the two files disagree, then stop at the
        # first mismatching column.
        print(values_1.compare(values_2))
        raise ValueError(f"Error! Different values in {duplicate_column}")