In [15]:
import polars as pl
import pandas as pd

In [52]:
longqt = (
    pl.read_csv('../data/longQT.csv',
                new_columns=['drug_name', 'drug_class', 'thera_use', 'pubmed'])
    .with_columns(
        pl.col('drug_name').str.to_lowercase())
    .filter(pl.col('drug_name').is_in(['papaverine hcl (intra-coronary)', 'cesium chloride', 'quizartinib']))
    .with_columns(
        pl.when(pl.col('drug_name') == 'papaverine hcl (intra-coronary)')
        .then(pl.lit('papaverine'))
        .otherwise(pl.col('drug_name')).alias('drug_name')
    )
    )
longqt.head()

drug_name,drug_class,thera_use,pubmed,literal
str,str,str,str,str
"""papaverine hcl…","""Vasodilator, C…","""Diagnostic adj…","""LINK""","""papaverine"""
"""cesium chlorid…","""toxin""","""Alternative th…","""LINK""","""cesium chlorid…"
"""quizartinib""","""Tyrosine kinas…","""Acute myeloid …","""LINK""","""quizartinib"""


In [8]:
labels_w_ADRs = pl.read_csv("../data/labels_w_ADRs.csv")
print(labels_w_ADRs.shape)
labels_w_ADRs.head()

(2109, 16)


file,set_id,spl_version,title,ingredient_rx_cui,ingredient_name,pt_meddra_id,pt_meddra_term,cohort_id,condition_name,positive_controls,label,cohort_id_right,drug_concept_id,affect,reference
str,str,i64,str,i64,str,i64,str,i64,str,i64,i64,i64,i64,i64,i64
"""../data/2024_l…","""4e8516dc-a3c6-…",1,"""These highligh…",6064,"""isotretinoin""",10019851,"""hepatotoxicity…",500000301,"""acute liver in…",28,1,,,,
"""../data/2024_l…","""0e98593a-8424-…",9,"""These highligh…",20352,"""carvedilol""",10017955,"""gastrointestin…",500001001,"""gi bleed""",21,1,,,,
"""../data/2024_l…","""5c8cebcd-699f-…",100,"""These highligh…",8754,"""propafenone""",10060795,"""hepatic enzyme…",500000301,"""acute liver in…",14,1,,,,
"""../data/2024_l…","""ab8616a5-ea72-…",10,"""These highligh…",40254,"""valproate""",10018830,"""haematemesis""",500001001,"""gi bleed""",11,1,,,,
"""../data/2024_l…","""10a3fbc9-00c6-…",1,"""These highligh…",7646,"""omeprazole""",10019692,"""hepatic necros…",500000301,"""acute liver in…",15,1,,,,


In [17]:
adr_section = (
pl.read_csv('/Users/undinagisladottir/Documents/Columbia/Tatonetti_Lab/20231113_onsides/adverse_reactions.csv',
                        dtypes={
                        'pt_meddra_id': pl.Int64,
                        'pt_meddra_term': pl.Utf8,
                        'num_ingredients':  pl.Int64,
                        'ingredients_rxcuis': pl.Utf8,
                        'ingredients_names': pl.Utf8
                        })
                        .filter(pl.col('num_ingredients') == 1)
                        .select(
                            pl.col('pt_meddra_id'),
                            pl.col('pt_meddra_term').str.to_lowercase(),
                            pl.col('ingredients_rxcuis').cast(pl.Int64).alias('ingredient_rx_cui'),
                            pl.col('ingredients_names').str.to_lowercase().alias('ingredient_name')
                            )
)

box_warnings = (pl.from_pandas(pd.read_csv('/Users/undinagisladottir/Documents/Columbia/Tatonetti_Lab/20231113_onsides/boxed_warnings.csv'))
                .filter(pl.col('num_ingredients') == 1)
                .select(
                    pl.col('pt_meddra_id'),
                    pl.col('pt_meddra_term').str.to_lowercase(),
                    pl.col('ingredients_rxcuis').cast(pl.Int64).alias('ingredient_rx_cui'),
                    pl.col('ingredients_names').str.to_lowercase().alias('ingredient_name')
                    )                
                )

adr_section = pl.concat([adr_section, box_warnings])

In [18]:
adr_section.head()

pt_meddra_id,pt_meddra_term,ingredient_rx_cui,ingredient_name
i64,str,i64,str
10000059,"""abdominal disc…",6916,"""metolazone"""
10000060,"""abdominal dist…",6916,"""metolazone"""
10000081,"""abdominal pain…",6916,"""metolazone"""
10001507,"""agranulocytosi…",6916,"""metolazone"""
10001682,"""alkalosis hypo…",6916,"""metolazone"""


In [41]:
# get the qt meddra pt terms
qt_meddra_pt = (longqt
 .join(adr_section, left_on='drug_name', right_on='ingredient_name', how='inner')
 .filter(pl.col('pt_meddra_term').str.contains('(?i)qt'))
 .select(pl.col('pt_meddra_term'), pl.col('pt_meddra_id')).unique()
 )
qt_meddra_pt

pt_meddra_term,pt_meddra_id
str,i64
"""electrocardiog…",10014387
"""long qt syndro…",10024803


In [49]:
missed_drugs = (longqt
 .join(adr_section, left_on='drug_name', right_on='ingredient_name', how='left')
 .filter(pl.col('pt_meddra_id').is_null())
 .select(pl.col('drug_name'), pl.col('drug_class'), 'thera_use')
)
print(missed_drugs)

shape: (3, 3)
┌─────────────────────────────────┬───────────────────────────┬────────────────────────────┐
│ drug_name                       ┆ drug_class                ┆ thera_use                  │
│ ---                             ┆ ---                       ┆ ---                        │
│ str                             ┆ str                       ┆ str                        │
╞═════════════════════════════════╪═══════════════════════════╪════════════════════════════╡
│ papaverine hcl (intra-coronary) ┆ Vasodilator, Coronary     ┆ Diagnostic adjunct         │
│ cesium chloride                 ┆ toxin                     ┆ Alternative therapy cancer │
│ quizartinib                     ┆ Tyrosine kinase inhibitor ┆ Acute myeloid leukemia     │
└─────────────────────────────────┴───────────────────────────┴────────────────────────────┘
