In [1]:
import polars as pl
import os

In [2]:
os.chdir("./Notebooks")

# Sample from the Union Dataframe
* sample 25 items from the top Union, non-true predictions to validate
* concatenate the first prediction by individual approaches
* merge in node names

In [3]:
sampled = {
    "CHEBI:77590": "armodafinil",
    "CHEBI:2968": "bacampicillin",
    "CHEBI:3128": "bisoprolol fumarate",
    "CHEBI:48811": "chloroquine",
    "CHEBI:453011": "ciclopirox",
    "CHEBI:3758": "Clonidine hydrochloride",
    "CHEBI:64198": "dimercaprol",
    "CHEBI:28748": "doxorubicin",
    "CHEBI:4814": "eprosartan",
    "CHEBI:5165": "Fosphenytoin",
    "CHEBI:5775": "hydralazine",
    "CHEBI:66913": "ingenol mebutate",
    "IKEY:COCFEDIXXNGUNL-RFKWWTKHSA-N": "insulin glargine",
    "CHEBI:6775": "mesalamine",
    "CHEBI:64354": "metolazone",
    "CHEBI:6904": "metoprolol",
    "CHEBI:31854": "Milnacipran hydrochloride",
    "CHEBI:83766": "olaparib",
    "CHEBI:7826": "oxiconazole nitrate",
    "CHEBI:7944": "Paxil",
    "CHEBI:8481": "Propantheline",
    "CHEBI:32087": "Ramatroban",
    "CHEBI:92246": "ranitidine",
    "CHEBI:9207": "sotalol hydrochloride",
    "CHEBI:32184": "tazarotene",
}

In [4]:
sampled_25 = (
    pl.read_parquet("./data_output/union_method_combinations.parquet")
    .filter(
        pl.col("is_answer") == False,
        pl.col("method_size") == 7,
        pl.col("h").is_in(sampled.keys()),
    )
    .sort("filt_rank", descending=False)
    .group_by("h", maintain_order=True)
    .agg(["answers", "filt_rank"])
    .with_columns(
        pl.col("answers").list.first().alias("first_ans"),
        pl.col("filt_rank").list.first().alias("first_rank"),
    )
)

sampled_25

h,answers,filt_rank,first_ans,first_rank
str,list[str],list[u32],str,u32
"""CHEBI:9207""","[""DOID:10763"", ""DOID:2841"", … ""DOID:12995""]","[1, 3, … 174]","""DOID:10763""",1
"""CHEBI:77590""","[""DOID:3312"", ""DOID:5419"", … ""MESH:C531617""]","[1, 2, … 190]","""DOID:3312""",1
"""CHEBI:31854""","[""DOID:1470"", ""DOID:3312"", … ""DOID:9296""]","[1, 2, … 164]","""DOID:1470""",1
"""CHEBI:32087""","[""DOID:2841"", ""DOID:1205"", … ""DOID:5614""]","[1, 2, … 153]","""DOID:2841""",1
"""CHEBI:83766""","[""DOID:10283"", ""DOID:4556"", … ""MONDO:0011420""]","[1, 4, … 185]","""DOID:10283""",1
…,…,…,…,…
"""CHEBI:3128""","[""DOID:6000"", ""DOID:5844"", … ""DOID:1924""]","[3, 4, … 173]","""DOID:6000""",3
"""CHEBI:5165""","[""DOID:12382"", ""DOID:891"", … ""DOID:820""]","[3, 4, … 148]","""DOID:12382""",3
"""CHEBI:3758""","[""DOID:9282"", ""DOID:1826"", … ""DOID:0060883""]","[3, 4, … 172]","""DOID:9282""",3
"""CHEBI:4814""","[""DOID:6000"", ""DOID:9654"", … ""CHEBI:17858""]","[4, 5, … 154]","""DOID:6000""",4


Original code to generate samples

```python
sampled_25 = (
    pl.read_parquet("./data_output/union_method_combinations.parquet")
    .filter(pl.col("is_answer") == False, pl.col("method_size") == 7)
    .group_by("h", maintain_order=True)
    .agg(["answers", "filt_rank"])
    .with_columns(
        pl.col("answers").list.first().alias("first_ans"),
        pl.col("filt_rank").list.first().alias("first_rank"),
    )
    .sample(n=25, seed=20240620)
)
```

## Add first non-true predictions from individual approaches

### import the results dataframe

In [5]:
results_df = pl.read_parquet("./data_output/results_df.parquet")
results_df.head(2)

h,method,predicted_answers,answers,rank
str,str,list[str],str,i64
"""CHEBI:135735""","""CBR""","[""DOID:3393"", ""DOID:6000"", … ""DOID:0060343""]","""DOID:10763""",3
"""CHEBI:135735""","""CBR""","[""DOID:3393"", ""DOID:6000"", … ""DOID:0060343""]","""DOID:10591""",12


In [6]:
result_df = results_df.group_by(pl.all().exclude(["answers", "rank"])).agg(
    ["answers", "rank"]
)

result_df.head(2)

h,method,predicted_answers,answers,rank
str,str,list[str],list[str],list[i64]
"""CHEBI:50727""","""CBR""","[""DOID:1612"", ""DOID:3459"", … ""DOID:552""]","[""DOID:10283"", ""DOID:9119"", … ""DOID:5821""]","[7, 55, … null]"
"""CHEBI:28329""","""Rephetio""","[""DOID:14330"", ""DOID:1826"", … ""DOID:3890""]","[""DOID:12306""]",[48]


### Get first non-true answer

In [7]:
(
    result_df.filter(pl.col("h").is_in(sampled_25["h"]))
    .with_columns(
        pl.col("predicted_answers")
        .list.set_symmetric_difference(pl.col("answers"))
        .list.first()
        .alias("first_ans")
    )
    .select(["h", "method", "first_ans"])
)

h,method,first_ans
str,str,str
"""CHEBI:92246""","""Rephetio""","""DOID:12849"""
"""CHEBI:83766""","""RotatE""","""DOID:4451"""
"""CHEBI:2968""","""ComplEx""","""WD:Q21109048"""
"""CHEBI:31854""","""pCBR""","""DOID:1470"""
"""CHEBI:3758""","""CBR""","""DOID:1826"""
…,…,…
"""CHEBI:48811""","""pCBR""","""DOID:8577"""
"""CHEBI:6904""","""DistMult""","""UMLS:C0221155"""
"""CHEBI:83766""","""TransE""","""DOID:4903"""
"""CHEBI:3128""","""pCBR""","""DOID:2349"""


## combine sampled dataframes together

In [8]:
results_df_concat = pl.concat(
    [
        sampled_25.with_columns(method=pl.lit("Union")).select(
            ["h", "method", "first_ans"]
        ),
        (
            result_df.filter(pl.col("h").is_in(sampled_25["h"]))
            .with_columns(
                pl.col("predicted_answers")
                .list.set_symmetric_difference(pl.col("answers"))
                .list.first()
                .alias("first_ans")
            )
            .select(["h", "method", "first_ans"])
        ),
    ]
)

In [9]:
assert results_df_concat.shape[0] == 200, "results should have 200 rows"

### Get names of the compound and predicted diseases

In [10]:
nodes = pl.read_csv("../../../MRN_dataset/nodes_biolink.csv").select(
    ["id", "name", "label"]
)
nodes.head(2)

id,name,label
str,str,str
"""UBERON:0000002""","""cervix""","""AnatomicalEntity"""
"""UBERON:0000004""","""human nose""","""AnatomicalEntity"""


### Merging node names

#### look at how many individual predictions aren't disease targets

In [11]:
results_df_concat.join(nodes, left_on="h", right_on="id", how="left").join(
    nodes, left_on="first_ans", right_on="id", how="left"
).rename(
    {
        "name": "drug_name",
        "label": "drug_label",
        "name_right": "disease_name",
        "label_right": "disease_label",
    }
).filter(
    pl.col("disease_label") != "Disease"
)

h,method,first_ans,drug_name,drug_label,disease_name,disease_label
str,str,str,str,str,str,str
"""CHEBI:32087""","""pCBR""","""NCBIGene:367""","""Ramatroban""","""ChemicalSubstance""","""AR""","""MacromolecularMachine"""
"""CHEBI:32184""","""DistMult""","""MESH:D011763""","""tazarotene""","""ChemicalSubstance""","""Pyrrolizidine Alkaloids""","""ChemicalSubstance"""
"""CHEBI:64198""","""DistMult""","""NCBIGene:5819""","""dimercaprol""","""ChemicalSubstance""","""NECTIN2""","""MacromolecularMachine"""
"""CHEBI:5165""","""TransE""","""HP:0002069""","""Fosphenytoin""","""ChemicalSubstance""","""Generalized tonic-clonic seizu…","""PhenotypicFeature"""
"""CHEBI:64198""","""ComplEx""","""REACT:R-HSA-5357769""","""dimercaprol""","""ChemicalSubstance""","""Caspase activation via extrins…","""Pathway"""
…,…,…,…,…,…,…
"""CHEBI:8481""","""RotatE""","""NCBITaxon:5825""","""Propantheline""","""ChemicalSubstance""","""Plasmodium chabaudi""","""OrganismTaxon"""
"""CHEBI:48811""","""TransE""","""KEGG:hsa05215""","""chloroquine""","""ChemicalSubstance""","""Prostate cancer""","""Pathway"""
"""CHEBI:64198""","""RotatE""","""NCBITaxon:272561""","""dimercaprol""","""ChemicalSubstance""","""Chlamydia trachomatis D/UW-3/C…","""OrganismTaxon"""
"""CHEBI:7944""","""ComplEx""","""HP:0002155""","""Paxil""","""ChemicalSubstance""","""Hypertriglyceridemia""","""PhenotypicFeature"""


#### Export the individual union first predictions

In [12]:
individual_preds = (
    results_df_concat.join(nodes, left_on="h", right_on="id", how="left")
    .join(nodes, left_on="first_ans", right_on="id", how="left")
    .rename(
        {
            "h": "drug_id",
            "name": "drug_name",
            "label": "drug_label",
            "first_ans": "disease_id",
            "name_right": "disease_name",
            "label_right": "disease_label",
        }
    )
)

individual_preds

drug_id,method,disease_id,drug_name,drug_label,disease_name,disease_label
str,str,str,str,str,str,str
"""CHEBI:9207""","""Union""","""DOID:10763""","""sotalol hydrochloride""","""ChemicalSubstance""","""hypertension""","""Disease"""
"""CHEBI:77590""","""Union""","""DOID:3312""","""armodafinil""","""ChemicalSubstance""","""bipolar disorder""","""Disease"""
"""CHEBI:31854""","""Union""","""DOID:1470""","""Milnacipran hydrochloride""","""ChemicalSubstance""","""major depressive disorder""","""Disease"""
"""CHEBI:32087""","""Union""","""DOID:2841""","""Ramatroban""","""ChemicalSubstance""","""asthma""","""Disease"""
"""CHEBI:83766""","""Union""","""DOID:10283""","""olaparib""","""ChemicalSubstance""","""prostate cancer""","""Disease"""
…,…,…,…,…,…,…
"""CHEBI:48811""","""pCBR""","""DOID:8577""","""chloroquine""","""ChemicalSubstance""","""ulcerative colitis""","""Disease"""
"""CHEBI:6904""","""DistMult""","""UMLS:C0221155""","""metoprolol""","""ChemicalSubstance""","""Systolic hypertension""","""Disease"""
"""CHEBI:83766""","""TransE""","""DOID:4903""","""olaparib""","""ChemicalSubstance""","""granular cell carcinoma""","""Disease"""
"""CHEBI:3128""","""pCBR""","""DOID:2349""","""bisoprolol fumarate""","""ChemicalSubstance""","""arteriosclerosis""","""Disease"""


In [13]:
individual_preds.write_excel("./data_output/union_random_25_individual_curation.xlsx")
individual_preds.write_parquet(
    "./data_output/union_random_25_individual_curation2.parquet"
)

### get the individual ranks that make up the union approach

In [40]:
sampled_25_ranks = (
    results_df.filter(pl.col("h").is_in(sampled_25["h"]))
    .group_by(["h", "method", "predicted_answers"], maintain_order=True)
    .agg(["answers", "rank"])
    .explode("predicted_answers")
    .filter(~pl.col("predicted_answers").is_in(pl.col("answers")))
    .group_by(pl.all().exclude("predicted_answers"), maintain_order=True)
    .agg("predicted_answers")
    .with_columns(pl.col("predicted_answers").list.unique(maintain_order=True))
    .with_columns(
        pred_rank=pl.int_ranges(1, pl.col("predicted_answers").list.len() + 1)
    )
    .explode(["predicted_answers", "pred_rank"])
    .select(["h", "method", "predicted_answers", "pred_rank"])
)

In [90]:
# Add missing ranks to the dataframe and separate them into their own columns
sampled_25_with_ind_ranks = (
    sampled_25.join(  # get ranks for each prediction
        sampled_25_ranks.sort(["method", "pred_rank"])
        .group_by(["h", "predicted_answers"], maintain_order=True)
        .agg(["method", "pred_rank"]),
        left_on=["h", "first_ans"],
        right_on=["h", "predicted_answers"],
        how="left",
    )
    .with_columns(  # get the missing methods
        pl.when(pl.col("method").list.len() < 7)
        .then(
            pl.col("method").list.concat(
                pl.col("method").list.set_symmetric_difference(
                    [
                        "CBR",
                        "pCBR",
                        "Rephetio",
                        "TransE",
                        "DistMult",
                        "ComplEx",
                        "RotatE",
                    ]
                )
            )
        )
        .otherwise(pl.col("method")),
        (7 - pl.col("pred_rank").list.len()).alias(
            "pred_rank_len"
        ),  # calculate the number of missing ranks
        pl.lit(None).alias("nones"),  # add a none column
    )
    .with_columns(  # padd missing ranks
        pl.col("nones").cast(pl.Utf8).repeat_by(pl.col("pred_rank_len")).alias("nones")
    )
    .with_columns(
        pl.col("pred_rank").list.concat(pl.col("nones")).alias("pred_rank")
    )  # concatenate missing rank pad
    .select(pl.all().exclude(["pred_rank_len", "nones"]))  # hide padding columns
    .explode(["method", "pred_rank"])  # expand to sort
    .sort(["h", "method"])
    .with_columns(pl.struct("method", "pred_rank").alias("method_rank"))
    .group_by(
        pl.all().exclude(["method", "pred_rank", "method_rank"]), maintain_order=True
    )
    .agg("method_rank")
    .with_columns(  # create a struct list and expand as a paired dict rather than paired lists
        pl.col("method_rank").list.to_struct(
            n_field_strategy="max_width",
            fields=[
                "CBR",
                "ComplEx",
                "DistMult",
                "Rephetio",
                "RotatE",
                "TransE",
                "pCBR",
            ],
        )
    )
    .unnest("method_rank")  # Unnest the Struct List
    .with_columns(  # Unnest the Struct Object
        pl.col("CBR").struct.field("pred_rank").alias("CBR"),
        pl.col("ComplEx").struct.field("pred_rank").alias("ComplEx"),
        pl.col("DistMult").struct.field("pred_rank").alias("DistMult"),
        pl.col("Rephetio").struct.field("pred_rank").alias("Rephetio"),
        pl.col("RotatE").struct.field("pred_rank").alias("RotatE"),
        pl.col("TransE").struct.field("pred_rank").alias("TransE"),
        pl.col("pCBR").struct.field("pred_rank").alias("pCBR"),
    )
    .join(nodes, left_on="h", right_on="id", how="left")
    .join(nodes, left_on="first_ans", right_on="id", how="left")
    .rename(
        {
            "name": "drug_name",
            "label": "drug_label",
            "h": "drug_id",
            "first_ans": "disease_id",
            "name_right": "disease_name",
            "label_right": "disease_label",
        }
    )
    .select(
        [
            "drug_id",
            "drug_name",
            "drug_label",
            "disease_id",
            "disease_name",
            "disease_label",
            "CBR",
            "pCBR",
            "Rephetio",
            "TransE",
            "DistMult",
            "ComplEx",
            "RotatE",
        ]
    )
)

sampled_25_with_ind_ranks

drug_id,drug_name,drug_label,disease_id,disease_name,disease_label,CBR,pCBR,Rephetio,TransE,DistMult,ComplEx,RotatE
str,str,str,str,str,str,str,str,str,str,str,str,str
"""CHEBI:28748""","""doxorubicin""","""ChemicalSubstance""","""DOID:10534""","""stomach cancer""","""Disease""","""10""",,"""1""","""50""","""283""","""233""","""18"""
"""CHEBI:2968""","""bacampicillin""","""ChemicalSubstance""","""DOID:874""","""bacterial pneumonia""","""Disease""",,,"""672""","""6""","""4""","""2""","""8"""
"""CHEBI:3128""","""bisoprolol fumarate""","""ChemicalSubstance""","""DOID:6000""","""congestive heart failure""","""Disease""","""8""","""11""",,"""2""","""38""","""5""","""4"""
"""CHEBI:31854""","""Milnacipran hydrochloride""","""ChemicalSubstance""","""DOID:1470""","""major depressive disorder""","""Disease""","""30""","""1""","""14""","""1""","""52""","""1""","""71"""
"""CHEBI:32087""","""Ramatroban""","""ChemicalSubstance""","""DOID:2841""","""asthma""","""Disease""","""3""","""37""","""1""","""26""","""5""","""10""","""151"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""CHEBI:83766""","""olaparib""","""ChemicalSubstance""","""DOID:10283""","""prostate cancer""","""Disease""","""1""","""1""","""6""","""9""","""130""","""173""","""7"""
"""CHEBI:8481""","""Propantheline""","""ChemicalSubstance""","""DOID:2841""","""asthma""","""Disease""","""39""","""1""","""7""","""108""","""561""","""270""","""42"""
"""CHEBI:9207""","""sotalol hydrochloride""","""ChemicalSubstance""","""DOID:10763""","""hypertension""","""Disease""","""3""","""3""","""1""","""28""","""86""","""27""","""1"""
"""CHEBI:92246""","""ranitidine""","""ChemicalSubstance""","""DOID:750""","""peptic ulcer disease""","""Disease""","""92""","""48""","""40""","""1""","""222""","""50""","""22"""


In [93]:
sampled_25_with_ind_ranks.write_parquet(
    "./data_output/union_random_25_curation.parquet"
)
sampled_25_with_ind_ranks.write_excel("./data_output/union_random_25_curation.xlsx")

<xlsxwriter.workbook.Workbook at 0x7f8be317b690>