In [None]:
from stratigraphy import DATAPATH, PROJECT_ROOT
import pandas as pd
import numpy as np

In [None]:
# Run IDs can be looked up in the MLflow UI, served locally at http://127.0.0.1:5000/
# NOTE: a later cell reassigns both names from the most recent run directories on disk.
run_id_1, run_id_2 = (
    "ee380f63e170484ba416fda95f860a4c",  # old run
    "9b27ba02f14745e1b43f138f57df8455",  # new run
)

In [None]:
# List the ten most recent run directories in `path`, newest first.
path = PROJECT_ROOT / "mlartifacts" / "856911286406481746"

# Filter *before* truncating to ten, so that stray entries such as .DS_Store
# neither show up in the result nor take one of the ten slots.
run_dirs = [d for d in path.iterdir() if d.is_dir() and "DS_Store" not in d.name]
run_dirs.sort(key=lambda d: d.stat().st_ctime, reverse=True)

# Path.name is the last path component on every platform, so no "/" splitting is needed.
dirs = [d.name for d in run_dirs[:10]]
dirs

In [None]:
# Replace the hard-coded IDs with the two most recent runs found on disk:
# dirs[0] is the newest run (run 2), dirs[1] the one before it (run 1).
run_id_1, run_id_2 = dirs[1], dirs[0]

In [None]:
import json

# Both runs store their artifacts under the same parent directory
# (presumably the MLflow experiment id — confirm): <parent>/<run id>/artifacts.
experiment_dir = PROJECT_ROOT / "mlartifacts" / "856911286406481746"
run_id_1_artifacts = experiment_dir / run_id_1 / "artifacts"
run_id_2_artifacts = experiment_dir / run_id_2 / "artifacts"

# Per-document metrics written by each run.
run_id_1_path = run_id_1_artifacts / "document_level_metrics.csv"
run_id_2_path = run_id_2_artifacts / "document_level_metrics.csv"

# Directories holding the page images (PNG) produced by each run.
run_id_1_page_dir = run_id_1_artifacts / "pages"
run_id_2_page_dir = run_id_2_artifacts / "pages"

# Load the ground truth (GT). Swap the active line to use the other dataset.
# filename = PROJECT_ROOT / "data" / "geoquat_ground_truth.json"
filename = PROJECT_ROOT / "data" / "zurich_ground_truth.json"

# Read as UTF-8 explicitly: without `encoding`, open() falls back to the
# platform's locale encoding, which can garble non-ASCII text in the JSON.
with open(filename, "r", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# Only the document name and its groundwater_depth metric are needed for the comparison.
columns_to_compare = ["document_name", "groundwater_depth"]

# "Unnamed: 0" is dropped before selecting
# (presumably the index column written by DataFrame.to_csv — confirm).
metrics_run_id_1_df = pd.read_csv(run_id_1_path).drop(columns=["Unnamed: 0"])[columns_to_compare]
metrics_run_id_2_df = pd.read_csv(run_id_2_path).drop(columns=["Unnamed: 0"])[columns_to_compare]

In [None]:
# Peek at the first five rows of run 1's metrics.
metrics_run_id_1_df.head(n=5)

In [None]:
# Sanity check: show which columns the run 1 metrics frame currently has.
metrics_run_id_1_df.columns

In [None]:
# Encode missing values (no prediction) as 2. NaN != NaN evaluates to True, so without
# this step a document with no prediction in both runs would be reported as changed.
metrics_run_id_1_df, metrics_run_id_2_df = (
    frame.fillna(2) for frame in (metrics_run_id_1_df, metrics_run_id_2_df)
)

# groundwater_depth can now take three values: 0 (false), 1 (true), 2 (no prediction).

Find the cases where the prediction changed

In [None]:
# Boolean mask: True where the groundwater_depth value differs between the two runs.
# NOTE(review): the comparison is made row by row on the index, so it assumes both
# metrics files list the same documents in the same order — confirm.
changed_documents = (
    metrics_run_id_1_df["groundwater_depth"] != metrics_run_id_2_df["groundwater_depth"]
)
changed_documents

In [None]:
# Pair the two runs by document_name and keep the documents whose value differs.
# Joining on the name first (instead of pre-filtering both frames with a positional
# mask) stays correct even if the two metrics files order their rows differently.
# Suffixes name the run each column comes from: the left frame is run 1, the right run 2.
merged = pd.merge(
    metrics_run_id_1_df,
    metrics_run_id_2_df,
    how="inner",
    on="document_name",
    suffixes=("_run_1", "_run_2"),
)
merged = merged[
    merged["groundwater_depth_run_1"] != merged["groundwater_depth_run_2"]
].reset_index(drop=True)

In [None]:
# Show the documents whose groundwater_depth value differs between the two runs.
merged

In [None]:
# Draw the changes head to head: for every changed document, show its
# `_page1.png` image from run 1 (left) next to the one from run 2 (right).

import matplotlib.pyplot as plt

for i in range(len(merged)):
    # Look the name up once instead of re-indexing the frame for every use.
    document_name = merged.iloc[i].document_name
    print(document_name)

    # Print the ground truth. A document missing from the ground-truth file is
    # reported and does not abort the remaining comparisons.
    try:
        gt_single_file_data = data[document_name]["groundwater"]
    except KeyError:
        gt_single_file_data = None
        print(f"No groundwater ground truth found for {document_name}")
    else:
        print(gt_single_file_data)

    # A page image missing in either run skips this document only.
    page_name = f"{document_name}_page1.png"
    try:
        img_1 = plt.imread(run_id_1_page_dir / page_name)
        img_2 = plt.imread(run_id_2_page_dir / page_name)
    except FileNotFoundError as error:
        print(f"Skipping {document_name}: {error}")
        continue

    fig, ax = plt.subplots(1, 2, figsize=(20, 20))
    for axis, image, title in zip(ax, (img_1, img_2), ("Run 1", "Run 2")):
        axis.imshow(image)
        axis.axis("off")
        axis.set_title(title)
    plt.show()
    # Release the figure so memory does not grow with the number of documents.
    plt.close(fig)


In [None]:
# Reload the page images of the last changed document.
# The document is looked up explicitly rather than through a loop index left over
# from another cell, and nothing is loaded when no document changed.
# NOTE(review): nothing in the visible notebook reads img_1 afterwards — this cell
# may be a removable leftover; confirm.
if len(merged) > 0:
    last_changed_document = merged.iloc[-1].document_name
    img_1 = plt.imread(run_id_1_page_dir / f"{last_changed_document}_page1.png")
    img_2 = plt.imread(run_id_2_page_dir / f"{last_changed_document}_page1.png")

# Export the mislabeled data 

The goal here is to export the mislabeled files, ignoring the ones that are already known to be unfixable for various reasons.

In [None]:
# Start again from the full metrics file of run 2 and keep only the documents
# whose groundwater_depth metric is 0.0.
metrics_run_id_2_df = (
    pd.read_csv(run_id_2_path)
    .drop(columns=["Unnamed: 0"])
    .loc[lambda frame: frame["groundwater_depth"] == 0.0]
)
metrics_run_id_2_df

In [None]:
# Documents excluded from the export, grouped by the reason they are ignored.
# A file may be listed under several reasons, so the flattened list contains duplicates.
# NOTE(review): "UCSC" is presumably USCS (Unified Soil Classification System) — confirm.
ignored_filenames_by_reason = {
    "confusion between depth and GT -> need to extract elevation": [
        "267125029-bp.pdf",
        "267123083-bp.pdf",
        "268125539-bp.pdf",
        "269126084-bp.pdf",
        "269126085-bp.pdf",
        "677250019-bp.pdf",
        "681249002-bp.pdf",
    ],
    "confusion between UCSC Class GW and the keyword": [
        "267125086-bp.pdf",
        "673251002-bp.pdf",
        "699240002-bp.pdf",
    ],
    "text extraction issue": [
        "267124070-bp.pdf",
        "267123080-bp.pdf",
        "267125469-bp.pdf",
        "675246002-bp.pdf",
    ],
    "groundwater keyword is in column": [
        "267122001-bp.pdf",
    ],
    "logo groundwater only": [
        "267123094-bp.pdf",
        "267125308-bp.pdf",
        "678249001-bp.pdf",
        "699240002-bp.pdf",
        "267125310-bp.pdf",
    ],
    "several groundwater measurements": [
        "267123089-bp.pdf",
        "267123078-bp.pdf",
        "269126084-bp.pdf",
        "269126085-bp.pdf",
        "680244002-bp.pdf",
        "678249001-bp.pdf",
    ],
    "gt is wrong": [
        "267125223-bp.pdf",
        "267125339-bp.pdf",
        "267125338-bp.pdf",
        "267125334-bp.pdf",
        "268124635-bp.pdf",
        "269124200-bp.pdf",
        "675246002-bp.pdf",
        "268124569-bp.pdf",
    ],
}

# Flatten in declaration order into the single list used for filtering.
filenames_to_ignore = [
    filename_to_ignore
    for reason_filenames in ignored_filenames_by_reason.values()
    for filename_to_ignore in reason_filenames
]

# Drop every document that appears on the ignore list, then show what is left.
is_ignored = metrics_run_id_2_df["document_name"].isin(filenames_to_ignore)
metrics_run_id_2_df = metrics_run_id_2_df.loc[~is_ignored]
metrics_run_id_2_df

In [None]:
import matplotlib.pyplot as plt

# For every remaining document, print its ground truth and show the
# `_page1.png` image produced by run 2.
for i in range(len(metrics_run_id_2_df)):
    # Look the name up once instead of re-indexing the frame for every use.
    document_name = metrics_run_id_2_df.iloc[i].document_name
    print(document_name)

    # Print the ground truth. A document missing from the ground-truth file is
    # reported and does not abort the remaining documents.
    try:
        gt_single_file_data = data[document_name]["groundwater"]
    except KeyError:
        gt_single_file_data = None
        print(f"No groundwater ground truth found for {document_name}")
    else:
        print(gt_single_file_data)

    # A missing page image skips this document only.
    try:
        img_2 = plt.imread(run_id_2_page_dir / f"{document_name}_page1.png")
    except FileNotFoundError as error:
        print(f"Skipping {document_name}: {error}")
        continue

    fig, ax = plt.subplots(figsize=(20, 20))
    ax.imshow(img_2)
    ax.axis("off")
    plt.show()
    # Release the figure so memory does not grow with the number of documents.
    plt.close(fig)