In [1]:
import pandas as pd
import polars as pl

In [2]:
# Read the three input tables: the sheet that was handed out for manual
# annotation, and the two annotation files (tab-separated / comma-separated).
pre_annotated_df = pl.read_csv("../data/to_manually_annotate.csv")
uog_df = pl.read_csv("../data/undina_annotations.tsv", separator="\t")
mnz_df = pl.read_csv("../data/michael_annotations.csv")

# Place the michael_annotations columns next to the pre-annotation sheet.
# A horizontal concat pairs rows purely by position, so the two colliding
# column names are prefixed with "_" first and the pairing is checked below.
mnz_renamed = mnz_df.rename({"title": "_title", "text": "_text"})
mnz_df = pl.concat([pre_annotated_df, mnz_renamed], how="horizontal")

# Row-alignment guard: no row may have set_id different from _title.
# NOTE(review): this compares set_id with the *renamed title* column, and rows
# where either side is null are not counted as mismatches -- confirm that this
# is the intended check.
misaligned = mnz_df.filter(pl.col("set_id") != pl.col("_title"))
assert misaligned.height == 0

mnz_df = mnz_df.select(
    ["set_id", "label_id", "spl_version", "title", "section", "label", "annotation"]
)

# Pair both annotation sets per label section (default join type), keep only
# the "DI" sections, and flatten newlines in the free-text columns.
join_keys = ["set_id", "label_id", "spl_version", "section"]
merged_df = (
    uog_df
    .join(mnz_df, on=join_keys)
    .filter(pl.col("section") == "DI")
    .with_columns(pl.col("label", "title").str.replace_all("\n", " "))
    .select(
        "set_id",
        "label_id",
        "spl_version",
        "title",
        "section",
        "label",
        "uog_annotation",
        pl.col("annotation").alias("mnz_annotation"),
    )
)

merged_df.head(2)

set_id,label_id,spl_version,title,section,label,uog_annotation,mnz_annotation
str,str,i64,str,str,str,str,str
"""908691b4-7950-4f3e-bbea-ea568f…","""e6f0f0dd-940a-490f-a404-56dd56…",1,"""Isoxsuprine Hydrochloride Tabl…","""DI""",,,
"""5a0ba417-8a4a-4d7f-b85a-1839ee…","""8e64b577-1ecb-46f2-a7c8-3577a1…",5,"""These highlights do not includ…","""DI""",,,


In [3]:
# Compare the two annotators' comma-separated annotation strings.
#
# Steps:
#   1. Split each annotation string on "," and normalise every token
#      (lowercase, drop periods, hyphens -> spaces, trim whitespace), then
#      de-duplicate the tokens within each list.
#   2. uog_diff / mnz_diff: tokens present for one annotator but not the other.
#   3. Empty diff lists are turned into null so "no disagreement" can be
#      detected with is_null()/is_not_null().
#   4. consensus: tokens present for both annotators.
diff_df = (
    merged_df
    .with_columns(
        pl.col("uog_annotation", "mnz_annotation")
            .str.split(",")
            .list.eval(
                pl.element()
                .str.to_lowercase()
                # replace_all (not replace): str.replace only substitutes the
                # FIRST match, which left later periods/hyphens in a token
                # untouched and produced spurious disagreements.
                .str.replace_all(".", "", literal=True)
                .str.replace_all("-", " ", literal=True)
                # Trim last, so whitespace exposed by removing a leading or
                # trailing "." / "-" is stripped as well.
                .str.strip_chars()
            )
            .list.unique()
    )
    .with_columns(
        uog_diff=pl.col("uog_annotation").list.set_difference(pl.col("mnz_annotation")),
        mnz_diff=pl.col("mnz_annotation").list.set_difference(pl.col("uog_annotation")),
    )
    # Replace empty list with null (a `when` without `otherwise` yields null)
    .with_columns(
        uog_diff=pl.when(pl.col("uog_diff").list.len() > 0).then(pl.col("uog_diff")),
        mnz_diff=pl.when(pl.col("mnz_diff").list.len() > 0).then(pl.col("mnz_diff")),
    )
    .with_columns(consensus=pl.col("uog_annotation").list.set_intersection(pl.col("mnz_annotation")))
)

diff_df.head(2)

set_id,label_id,spl_version,title,section,label,uog_annotation,mnz_annotation,uog_diff,mnz_diff,consensus
str,str,i64,str,str,str,list[str],list[str],list[str],list[str],list[str]
"""908691b4-7950-4f3e-bbea-ea568f…","""e6f0f0dd-940a-490f-a404-56dd56…",1,"""Isoxsuprine Hydrochloride Tabl…","""DI""",,,,,,
"""5a0ba417-8a4a-4d7f-b85a-1839ee…","""8e64b577-1ecb-46f2-a7c8-3577a1…",5,"""These highlights do not includ…","""DI""",,,,,,


In [9]:
# Persist the full comparison table. CSV has no list type, so every
# list-of-strings column is collapsed into a single ", "-separated string.
lists_as_text = pl.col(pl.List(pl.Utf8)).list.join(", ")
diff_df.with_columns(lists_as_text).write_csv("../data/initial_diff.csv")

In [8]:
# Show only the rows where at least one annotator has a token the other lacks
# (i.e. not both diff columns are null), with the list columns rendered as
# ","-separated strings for display.
both_diffs_empty = pl.col("uog_diff").is_null() & pl.col("mnz_diff").is_null()
diff_df.filter(~both_diffs_empty).with_columns(
    pl.col(pl.List(pl.Utf8)).list.join(",")
)

set_id,label_id,spl_version,title,section,label,uog_annotation,mnz_annotation,uog_diff,mnz_diff,consensus
str,str,i64,str,str,str,str,str,str,str,str
"""e45cc371-9ebc-4904-12bc-65cb4e…","""ccb063c5-2695-4ed5-a281-c75f6e…",17,"""These highlights do not includ…","""DI""","""7 DRUG INTERACTIONS There hav…","""sulfonylureas,st john's wort,s…","""sulfonylureas,sulfonamides,gri…","""tetracycline""","""tetracyclines""","""sulfonylureas,st john's wort,s…"
"""b8eb330a-a75b-46cb-b694-5be932…","""3860168c-32e9-4151-b08e-25d4bb…",5,"""DOPRAM Injection(doxapram hydr…","""DI""","""Drug Interactions Administrat…","""doxopram,oxidase inhibiting dr…","""theophylline,monoamine oxidase…","""doxopram,oxidase inhibiting dr…","""sympathomimetic drugs,monoamin…","""neuromuscular blocking agents,…"
"""f6927e91-bb46-47b3-ab66-f9d8a9…","""8bdc1b76-3c74-4a6a-8a82-86a6a0…",2,"""These highlights do not includ…","""DI""","""7 DRUG INTERACTIONS 7.1 C…","""raxibacumab,anthrax vaccine ad…","""scava,raxibacumab,anthrax vacc…","""anthrax vaccine adsorbed (ava)""","""scava,anthrax vaccine""","""raxibacumab,ciprofloxacin"""
"""6a95effd-32a5-46b6-a30e-7f4f9b…","""168e1482-e6a5-416d-9711-2c8d9c…",7,"""These highlights do not includ…","""DI""","""7 DRUG INTERACTIONS •Co…","""calcifediol,voriconazole,rayal…","""cytochrome p450 inhibitors,ind…","""compounds that stimulate micro…","""drugs stimulating microsomal h…","""calcifediol,voriconazole,rayal…"
"""c00d1607-ac36-457b-a34b-75ad74…","""a9982b9f-49dc-474b-8179-da9189…",32,"""These highlights do not includ…","""DI""","""7 DRUG INTERACTIONS Drug int…","""protease inhibitors,nifedipine…","""anti infectives,pimozide,triam…","""protease inhibitors,benzodiaze…","""antipsychotics,antiarrhythmics…","""nifedipine,mesoridazine,phenob…"
…,…,…,…,…,…,…,…,…,…,…
"""cdfbe0cd-eb15-45a1-ac17-531bcd…","""1d6c9e9d-e17d-4609-91fa-75e5bd…",11,"""These highlights do not includ…","""DI""","""7 DRUG INTERACTIONS Warf…","""ocaliva,colestipol,bile acid b…","""colesevelam,cyp1a2 substrates,…","""ocaliva,obeticholic acid""",,"""colestipol,bile acid binding r…"
"""dac9637f-3326-4f25-b7b9-f9f54b…","""2a704323-0860-4d2f-8743-f2b4d0…",5,"""Ergomar® Sublingual Tablets, 2…","""DI""","""Drug Interactions CYP 3A…","""vasoconstrictors,cyp3a4 inhibi…","""sympathomimetics,macrolide ant…","""ergotamine tartrate sublingual…",,"""vasoconstrictors,cyp3a4 inhibi…"
"""f11c21f8-f725-445e-b38e-1e4c5b…","""b3b84922-3dcf-4c02-91ea-856e48…",5,"""These highlights do not includ…","""DI""","""7 DRUG INTERACTIONS CYP3A…","""xermelo,midazolam,telotristat,…","""midazolam,cyp3a4 substrates,sh…","""xermelo,telotristat ethyl,telo…",,"""midazolam,short acting octreot…"
"""03880372-2c68-45c6-a53a-f420c4…","""0823bccc-f79a-400b-a8bf-8e0988…",21,"""These highlights do not includ…","""DI""","""7 DRUG INTERACTIONS [see Dos…","""nelfinavir,norethindrone,didan…","""macrolide antibiotics,hiv anti…","""edurant,norethindrone,sildenaf…","""macrolide antibiotics,non nucl…","""nelfinavir,didanosine,delavird…"
