In [60]:
import warnings
import os
from pathlib import Path
from datetime import date, datetime
from random import sample
from zoneinfo import ZoneInfo
import numpy as np
import pandas as pd

from google.cloud.bigquery.client import Client as BigQueryClient

from utils import *

warnings.filterwarnings("ignore", category=UserWarning)

big_query_client = BigQueryClient()


In [61]:
n_samples = 100
seed = 0
# For our sample date, we take the date halfway between the
# two latest expert sample dates to ensure separation and currency
subsystem_sample_date = date(2024, 10, 23)


expert_sample_dates = [
    (2024, 6, 4),
    (2024, 6, 10),
    (2024, 7, 4),
    (2024, 7, 8),
    (2024, 7, 15),
    (2024, 8, 16),
    (2024, 8, 17),
    (2024, 8, 18),
    (2024, 8, 19),
    (2024, 9, 9),
    (2024, 9, 16),
    (2024, 9, 23),
    (2024, 9, 26),
    (2024, 10, 17),
    (2024, 10, 29),
]

expert_sample_dates = [date(*sample_date) for sample_date in expert_sample_dates]



### Get all irids on subsystem sample date

### Add case id to each irid

In [62]:
# Get irid-to-case-id table
case_to_irid_query = f"""
        SELECT DISTINCT
            app_case.id AS case_id,
            attachment.informationRequestId AS request_id,
            attachment.filename AS image_file_name
        FROM `noimos-data-lake-exp.prod.int_app_attachment` AS attachment
        INNER JOIN `noimos-data-lake-exp.prod.int_app_case` AS app_case
        ON attachment.informationrequestid = app_case.defaultinformationrequestid 
        WHERE NOT ENDS_WITH(attachment.filename, ".pdf") AND NOT ENDS_WITH(attachment.filename, ".xml");
"""

case_to_irid_table = query_bigquery(query=case_to_irid_query, big_query_client=big_query_client)
case_to_irid_table = case_to_irid_table[["case_id", "request_id"]].drop_duplicates()



In [63]:
irids_by_date = query_bigquery(
    big_query_client=big_query_client,
    query="SELECT informationRequestId, timestamp FROM `noimos-core-axach-prod.mod.workflow_output`"
)

timezone = ZoneInfo('UTC')
start_of_day = datetime.combine(subsystem_sample_date, datetime.min.time()).replace(tzinfo=timezone)
end_of_day = datetime.combine(subsystem_sample_date, datetime.max.time()).replace(tzinfo=timezone)
irids_on_subsystem_sample_date = irids_by_date[(irids_by_date['timestamp'] >= start_of_day) & (irids_by_date['timestamp'] <= end_of_day)]

# Join case id info to sampled irids
sampled_irids = pd.merge(irids_on_subsystem_sample_date, case_to_irid_table, left_on="informationRequestId", right_on="request_id", how="inner")
sampled_irids = sampled_irids[["informationRequestId", "timestamp", "case_id"]]


# Blacklisting

In [64]:
blacklist_irids = pd.DataFrame(columns=["informationRequestId", "timestamp"])

### Get case ids on expert sample dates for blacklisting

In [65]:
for date in expert_sample_dates:
    start_of_day = datetime.combine(date, datetime.min.time()).replace(tzinfo=timezone)
    end_of_day = datetime.combine(date, datetime.max.time()).replace(tzinfo=timezone)
    irids_on_date = irids_by_date[(irids_by_date['timestamp'] >= start_of_day) & (irids_by_date['timestamp'] <= end_of_day)]
    blacklist_irids = pd.concat([blacklist_irids, irids_on_date], axis=0)

  blacklist_irids = pd.concat([blacklist_irids, irids_on_date], axis=0)


### Get case ids on mod_gt for blacklisting

### Get case ids on mod_train/mod_test for black listing

In [66]:
# Get mod_train irids
mod_train_version = "0.7.0"

query = f"""
    SELECT
        *
    FROM
        `noimos-core-axach-development.mod.train` AS train
"""

query_job = big_query_client.query(query)
mod_train_table = query_job.to_dataframe()

mod_train_irids = mod_train_table[mod_train_table["version"] == mod_train_version]["informationRequestId"].unique()

# Get mod_test irids
mod_test_version = "0.7.0"

query = f"""
    SELECT
        *
    FROM
        `noimos-core-axach-development.mod.test` AS test
"""

query_job = big_query_client.query(query)
mod_test_table = query_job.to_dataframe()

mod_test_irids = mod_test_table[mod_test_table["version"] == mod_test_version]["informationRequestId"].unique()

mod_train_test_irids = np.concatenate((mod_train_irids, mod_test_irids))
mod_train_test_irids = pd.DataFrame({"informationRequestId": mod_train_test_irids, "timestamp": [np.nan] * len(mod_train_test_irids)})

blacklist_irids = pd.concat([blacklist_irids, mod_train_test_irids], axis=0)

pass

  blacklist_irids = pd.concat([blacklist_irids, mod_train_test_irids], axis=0)


In [67]:
"""mod_train_test_irids["on_expert_sample_date"] = None

for irid in mod_train_test_irids["informationRequestId"]:
    mod_train_test_irids["on_expert_sample_date"] = irid in blacklist_irids["informationRequestId"]
    
pass"""

'mod_train_test_irids["on_expert_sample_date"] = None\n\nfor irid in mod_train_test_irids["informationRequestId"]:\n    mod_train_test_irids["on_expert_sample_date"] = irid in blacklist_irids["informationRequestId"]\n    \npass'

In [68]:
# Add case id to blacklist irids
blacklist_irids = pd.merge(blacklist_irids, case_to_irid_table, how="left", left_on="informationRequestId", right_on="request_id")
blacklist_irids = blacklist_irids[["informationRequestId", "timestamp", "case_id"]]

In [69]:
sampled_irids["case_blacklisted"] = None
for i, row in sampled_irids.iterrows():
    sampled_irids.loc[i, "case_blacklisted"] = row["case_blacklisted"] in blacklist_irids["case_id"]
    
pass

In [70]:
sampled_irids = sampled_irids.sort_values(by="timestamp", ascending=True)


### Get heuristic/llm/xml, language, kva provider

In [71]:
line_item_query = f"""
    SELECT DISTINCT
        informationRequestId, kva.language, kva.provider
    FROM
      `noimos-core-axach-prod.mod.line_item_classification`,
      UNNEST(classifiedKva) as kva
"""

line_item_table = query_bigquery(query=line_item_query, big_query_client=big_query_client)

kva_query = f"""
    SELECT informationRequestId, sourceFilePath FROM `noimos-core-axach-prod.mod.kva`
"""

kva_table = query_bigquery(query=kva_query, big_query_client=big_query_client)

workflow_output_query = f"""
    SELECT informationRequestId, llmPath FROM `noimos-core-axach-prod.mod.workflow_output`
"""

workflow_output_table = query_bigquery(query=workflow_output_query, big_query_client=big_query_client)

def get_file_type(filename):
    if filename.endswith('.pdf'):
        return 'pdf'
    elif filename.endswith('.xml'):
        return 'xml'
    else:
        return 'unknown'

kva_table["kva_file_type"] = kva_table['sourceFilePath'].apply(get_file_type)

sampled_irids = pd.merge(left=sampled_irids, right=line_item_table, on="informationRequestId", how="left")
sampled_irids = pd.merge(left=sampled_irids, right=workflow_output_table, on="informationRequestId", how="left")
sampled_irids = pd.merge(left=sampled_irids, right=kva_table, on="informationRequestId", how="left")


### Sample

In [72]:
sampled_irids = sampled_irids[sampled_irids["case_blacklisted"] == False]
sampled_irids = sampled_irids.drop_duplicates(subset="case_id", keep="first")
final_sampled_irids = sampled_irids.sample(n=n_samples, random_state=1)

### Get llm path info for sampled irids

In [None]:
final_sampled_irids["llm_path"] = None




for i, row in final_sampled_irids.iterrows():
    pass

### Calculate global dataset statistics

In [74]:
statistics_global = {
    "language": {
        "de": (line_item_table["language"] == "de").sum() / len(line_item_table),
        "fr": (line_item_table["language"] == "fr").sum() / len(line_item_table),
        None: (line_item_table["language"].isna()).sum() / len(line_item_table),
    },
    "provider": {
        "autoidat": (line_item_table["provider"] == "autoidat").sum() / len(line_item_table),
        "audatex": (line_item_table["provider"] == "audatex").sum() / len(line_item_table),
        "other": ((line_item_table["provider"] != "audatex") & (line_item_table["provider"] != "autoidat")).sum() / len(line_item_table),
    },
    "kva_file_type": {
        "pdf": (kva_table["kva_file_type"] == "pdf").sum() / len(kva_table),
        "xml": (kva_table["kva_file_type"] == "xml").sum() / len(kva_table),
        "other": (kva_table["kva_file_type"] == "unknown").sum() / len(kva_table),
    }
}

### Calculate sampled dataset statistics

In [75]:
statistics_sampled = {
    "language": {
        "de": (final_sampled_irids["language"] == "de").sum() / len(final_sampled_irids),
        "fr": (final_sampled_irids["language"] == "fr").sum() / len(final_sampled_irids),
        None: (final_sampled_irids["language"].isna()).sum() / len(final_sampled_irids),
    },
    "provider": {
        "autoidat": (final_sampled_irids["provider"] == "autoidat").sum() / len(final_sampled_irids),
        "audatex": (final_sampled_irids["provider"] == "audatex").sum() / len(final_sampled_irids),
        "other": ((final_sampled_irids["provider"] != "audatex") & (final_sampled_irids["provider"] != "autoidat")).sum() / len(final_sampled_irids),
    },
    "kva_file_type": {
        "pdf": (final_sampled_irids["kva_file_type"] == "pdf").sum() / len(final_sampled_irids),
        "xml": (final_sampled_irids["kva_file_type"] == "xml").sum() / len(final_sampled_irids),
        "other": (final_sampled_irids["kva_file_type"] == "unknown").sum() / len(final_sampled_irids),
    }
}