# Mappings

In [31]:
#imports 

import re 

import feather
import numpy as np
import pandas as pd
import scipy as sc

# Base data directory. Paths are built below by plain string concatenation
# (ROOT + "..."), so the value must end with a path separator.
ROOT = "/Users/payalchandak/Not On Dropbox/Data/Sex Demographics/" 

In [26]:
# Map ATC function

def build_ATC_map(ROOT, concept_ids):
    """Build the drug -> ATC ancestor table and write it to ``ROOT + "ATC_MAPPING.csv"``.

    For each unique drug concept ID, the ancestors listed in the OHDSI
    ``CONCEPT_ANCESTOR.csv`` table are looked up in ``CONCEPT.csv``; those
    belonging to the ``ATC`` vocabulary are recorded (ID and name) in the
    column pair matching their ``concept_class_id`` ("ATC 1st" ... "ATC 5th").
    Drugs without an ATC ancestor at some level keep NaN in those columns.

    Parameters
    ----------
    ROOT : str
        Directory holding ``CONCEPT_ANCESTOR.csv`` and ``CONCEPT.csv``
        (tab-delimited). It is concatenated directly with the file names, so
        it must end with a path separator. The output file is written here too.
    concept_ids : array-like of int
        Drug concept IDs (pandas Series, NumPy array or list); duplicates are
        collapsed.

    Returns
    -------
    None
        The result is written to disk with pandas' default ``to_csv`` settings,
        i.e. the row index is written as an unnamed first column.

    Notes
    -----
    pandas may emit a low-memory / mixed-dtype warning while reading the
    vocabulary files; it can be ignored.
    """
    # reads ATC data provided by OHDSI
    ATC_ancestor = pd.read_csv(ROOT + "CONCEPT_ANCESTOR.csv", delimiter="\t")
    ATC_concept = pd.read_csv(ROOT + "CONCEPT.csv", delimiter="\t")

    # Wrapping in a Series lets NumPy arrays and lists be passed as well; a
    # bare ndarray has no .unique() method.
    drug_ids = pd.Series(concept_ids).unique()

    # set up ATC mapping dataframe; labels mirror the vocabulary's concept_class_id
    columns = [
        'ATC 1st id', 'ATC 1st name',
        'ATC 2nd id', 'ATC 2nd name',
        'ATC 3rd id', 'ATC 3rd name',
        'ATC 4th id', 'ATC 4th name',
        'ATC 5th id', 'ATC 5th name']
    ATC_mapping = pd.DataFrame(columns=columns, index=drug_ids)

    # The vocabulary filter does not depend on the drug, so apply it once.
    atc_concepts = ATC_concept[ATC_concept['vocabulary_id'] == "ATC"]

    # add values to ATC mapping
    for drugID in drug_ids:
        is_descendant = ATC_ancestor['descendant_concept_id'] == drugID
        ancestors = ATC_ancestor.loc[is_descendant, 'ancestor_concept_id']
        matches = atc_concepts[atc_concepts['concept_id'].isin(ancestors)]
        for classid, concept_id, concept_name in zip(
                matches['concept_class_id'],
                matches['concept_id'],
                matches['concept_name']):
            idCol = str(classid) + ' id'
            nameCol = str(classid) + ' name'
            if idCol not in ATC_mapping.columns:
                # An unexpected class would silently add a column and break
                # the positional rename below, so ignore it.
                continue
            ATC_mapping.at[drugID, idCol] = concept_id
            ATC_mapping.at[drugID, nameCol] = concept_name

    # reformat: same columns, positionally renamed to identifier-friendly labels
    ATC_mapping.columns = [
        'ATC_1_id', 'ATC_1_name',
        'ATC_2_id', 'ATC_2_name',
        'ATC_3_id', 'ATC_3_name',
        'ATC_4_id', 'ATC_4_name',
        'ATC_5_id', 'ATC_5_name']
    ATC_mapping = ATC_mapping.rename_axis('drug_concept_id').reset_index()

    # write to file
    ATC_mapping.to_csv(ROOT + "ATC_MAPPING.csv")

In [None]:
# NOTE: this rebinds the notebook-wide ROOT to the ATC vocabulary folder; the
# next cell reads ROOT + "ATC_MAPPING.csv" and relies on that.
ROOT = "ATC Vocabulary/"
# Drug concept IDs found in AEOLUS; these are the drugs that get an ATC mapping.
aeolus_ID = pd.read_csv("Reference Data/AEOLUS_DRUG_IDNAME.csv")['drug_concept_id']

# Pass the IDs loaded above (the previous `concept_ids` name was never defined).
build_ATC_map(ROOT, aeolus_ID)

In [None]:
#ATC handling functions     

#files needed by functions:
# Both tables are module-level globals that the functions below read by name.
# ATC_MAPPING.csv is the file written by build_ATC_map, so ROOT must still
# point at the folder that was passed to it.
ATC_mapping = pd.read_csv(ROOT + "ATC_MAPPING.csv")
# AEOLUS table with drug_concept_id / drug_concept_name columns (both are
# queried by the functions below).
aeolus_IDNAME = pd.read_csv("Reference Data/AEOLUS_DRUG_IDNAME.csv")

def assignATC(df, ATC_level="3", mapping=None, id_names=None):
    """Return a copy of ``df`` with an ``ATC_<level>_name`` column added.

    Each row's drug name is translated to a drug concept ID and then to the
    ATC class name at the requested level.

    Parameters
    ----------
    df : pandas.DataFrame
        Either indexed by drug name (index named ``drug_concept_name``) or
        carrying a ``drug_concept_name`` column.
    ATC_level : str or int, default "3"
        ATC level (1-5) selecting the ``ATC_<level>_name`` mapping column.
    mapping : pandas.DataFrame, optional
        Table with ``drug_concept_id`` and ``ATC_<level>_name`` columns.
        Defaults to the module-level ``ATC_mapping``.
    id_names : pandas.DataFrame, optional
        Table with ``drug_concept_name`` and ``drug_concept_id`` columns.
        Defaults to the module-level ``aeolus_IDNAME``.

    Returns
    -------
    pandas.DataFrame
        New frame; ``df`` itself is not modified. Drugs that cannot be mapped
        get NaN in the new column instead of raising.
    """
    ATC_col = "ATC_" + str(ATC_level) + "_name"
    if mapping is None:
        mapping = ATC_mapping
    if id_names is None:
        id_names = aeolus_IDNAME

    # Build one name -> ATC lookup up front instead of scanning both tables
    # for every row. The first occurrence wins when a key is duplicated.
    name_to_id = (id_names.drop_duplicates('drug_concept_name')
                  .set_index('drug_concept_name')['drug_concept_id'])
    id_to_atc = (mapping.drop_duplicates('drug_concept_id')
                 .set_index('drug_concept_id')[ATC_col])
    name_to_atc = name_to_id.map(id_to_atc)

    if df.index.name == 'drug_concept_name':
        drug_names = df.index.to_series()
    else:
        drug_names = df['drug_concept_name']

    # Assign positionally so a non-unique index cannot misalign the values.
    return df.assign(**{ATC_col: drug_names.map(name_to_atc).to_numpy()})

def groupbyATC(df, ATC_level="3", minimum_count=0, secondary=""):
    """Count the rows of ``df`` per ATC class (optionally per class and a second key).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise. If it lacks the ``ATC_<level>_name`` column, the
        column is added first via ``assignATC``.
    ATC_level : str or int, default "3"
        ATC level selecting the grouping column ``ATC_<level>_name``.
    minimum_count : int, default 0
        When non-zero, keep only groups with strictly more than this many rows.
    secondary : str, default ""
        Optional second column to group by together with the ATC column.

    Returns
    -------
    pandas.DataFrame
        One ``count`` column, indexed by the grouping key(s). Rows whose
        grouping key is NaN are left out (pandas' groupby default).
    """
    ATC_level = str(ATC_level)
    ATC_col = "ATC_" + ATC_level + "_name"
    if ATC_col not in df.columns:
        df = assignATC(df, ATC_level=ATC_level)

    keys = [ATC_col, secondary] if secondary else ATC_col
    # size() counts rows per group. count() on "the first other column" would
    # undercount when that column holds NaN and fails when no other column exists.
    df_grouped = df.groupby(keys).size().to_frame("count")

    if minimum_count:
        df_grouped = df_grouped[df_grouped['count'] > minimum_count]

    return df_grouped

def getdataforATCgroup(df, ATCgroup, ATC_level="3", mapping=None, id_names=None):
    """Return the rows of ``df`` whose drug belongs to the ATC class ``ATCgroup``.

    Parameters
    ----------
    df : pandas.DataFrame
        Either indexed by drug name (index named ``drug_concept_name``) or
        carrying a ``drug_concept_name`` column.
    ATCgroup : str
        ATC class name to keep, compared against ``ATC_<level>_name``.
    ATC_level : str or int, default "3"
        ATC level selecting the ``ATC_<level>_name`` mapping column.
    mapping : pandas.DataFrame, optional
        Table with ``drug_concept_id`` and ``ATC_<level>_name`` columns.
        Defaults to the module-level ``ATC_mapping``.
    id_names : pandas.DataFrame, optional
        Table with ``drug_concept_name`` and ``drug_concept_id`` columns.
        Defaults to the module-level ``aeolus_IDNAME``.

    Returns
    -------
    pandas.DataFrame
        Matching rows in their original order, each exactly once. Drugs that
        cannot be mapped never match.
    """
    ATC_col = "ATC_" + str(ATC_level) + "_name"
    if mapping is None:
        mapping = ATC_mapping
    if id_names is None:
        id_names = aeolus_IDNAME

    # One name -> ATC lookup instead of two table scans per row; the first
    # occurrence wins when a key is duplicated.
    name_to_id = (id_names.drop_duplicates('drug_concept_name')
                  .set_index('drug_concept_name')['drug_concept_id'])
    id_to_atc = (mapping.drop_duplicates('drug_concept_id')
                 .set_index('drug_concept_id')[ATC_col])
    name_to_atc = name_to_id.map(id_to_atc)

    if df.index.name == 'drug_concept_name':
        drug_names = df.index.to_series()
    else:
        drug_names = df['drug_concept_name']

    # A positional boolean mask avoids label lookups, which return a row once
    # per repeated label when the index is not unique.
    keep = (drug_names.map(name_to_atc) == ATCgroup).to_numpy()
    return df.loc[keep, :]

In [3]:
# Load the two "significant" result tables from the Stages folder under ROOT.
df_genderAsso_sig = pd.read_feather(ROOT+"Stages/Significant Gender Associations.feather")
# Indexed by drug name: the ATC helpers check for an index named
# 'drug_concept_name' and then read drug names from the index.
df_drugRisks_sig = pd.read_feather(ROOT+"Stages/Significant Drug Risks.feather").set_index('drug_concept_name')

In [2]:
#ADDING MANUALLY MAPPED RANKS FOR MISSING DATA

def addDictRanks(df, rankDict, rankData):
    """Fill in missing outcome ranks in ``df`` from a manually mapped dictionary.

    A rank counts as missing when ``outcome_rank`` is -1 or 0. For such rows
    the outcome name is translated to its crowdsourced name via ``rankDict``,
    and that name's rank score is looked up in ``rankData``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``outcome_concept_name`` and ``outcome_rank`` columns.
        Modified in place.
    rankDict : pandas.DataFrame
        Columns ``outcome_concept_name`` and ``crowdsourced_name``.
    rankData : pandas.DataFrame
        Columns ``Name`` and ``Rank score``.

    Returns
    -------
    None
        Rows without a dictionary entry or without a rank score are left
        unchanged.
    """
    # The first occurrence wins when a name is listed more than once.
    name_lookup = (rankDict.drop_duplicates('outcome_concept_name')
                   .set_index('outcome_concept_name')['crowdsourced_name'])
    # Read ranks from the rankData argument rather than a notebook global.
    rank_lookup = (rankData.drop_duplicates('Name')
                   .set_index('Name')['Rank score'])

    new_ranks = df['outcome_concept_name'].map(name_lookup).map(rank_lookup)
    needs_rank = df['outcome_rank'].isin([-1, 0])
    fill = (needs_rank & new_ranks.notna()).to_numpy()
    if fill.any():
        # Positional mask, so a non-unique index cannot touch other rows.
        df.loc[fill, 'outcome_rank'] = new_ranks.to_numpy()[fill]


# Severity ranking table (columns used here: 'Name', 'Rank score').
outRank = pd.read_excel(ROOT + 'Outcome Severity Mapping/Ranked ADRs.xlsx', sheet_name='Ranked ADRs')
# Manually built dictionary: outcome_concept_name -> crowdsourced_name.
df_rankDict = pd.read_feather(ROOT + "Outcome Severity Mapping/Outcomes Dictionary.feather")
# Needs df_genderAsso_sig to be loaded first; otherwise this raises the
# NameError recorded in this cell's output.
addDictRanks(df_genderAsso_sig, rankDict=df_rankDict, rankData=outRank)

NameError: name 'df_genderAsso_sig' is not defined

In [6]:
# Preview the first rows of the severity ranking table.
outRank.head()

Unnamed: 0,Name,Rank score,Rank Stdev (% out 2929)
0,CARDIAC ARREST,1.0,0.897036
1,BONE CANCER METASTATIC,0.982921,0.204848
2,LEFT VENTRICULAR FAILURE,0.973702,0.839303
3,HIV INFECTION,0.971544,0.809613
4,ANAL CANCER,0.956513,0.156455


In [7]:
# Preview the first rows of the outcome-name dictionary.
df_rankDict.head()

Unnamed: 0,outcome_concept_name,crowdsourced_name
0,Urinary.tract.infectious.disease,URINARY TRACT INFECTION
1,Impaired.renal.function.disorder,RENAL IMPAIRMENT
2,Headache.disorder,HEADACHE
3,Thrombocytopenic.disorder,THROMBOCYTOPENIC PURPURA
4,Hemorrhage.AND/OR.hematoma.complicating.procedure,HAEMORRHAGE


In [13]:
#Check if outcome ranks are missing and if yes, set up regEx search

outRank = pd.read_excel(ROOT + 'Outcome Severity Mapping/Ranked ADRs.xlsx', sheet_name='Ranked ADRs')
df_rankDict = pd.read_feather(ROOT +"Outcome Severity Mapping/Outcomes Dictionary.feather")
# Sorted unique ranked names; the regex search cell iterates over these.
rankNames = np.sort(outRank.Name.unique())

# Outcomes still lacking a rank. Hard-coded for now; the full list would be
# df_genderAsso_sig.query('outcome_rank==0').reset_index()['outcome_concept_name'].unique()
toDo = ['URINE ITCHING']
# Outcomes that already have a dictionary entry.
done = df_rankDict['outcome_concept_name']

# Drop everything already mapped. Converting to an array first matters:
# comparing a plain list with a string is a single False, so nothing would
# ever be removed.
toDo = np.asarray(toDo)
toDo = toDo[~np.isin(toDo, done.values)]

if (len(toDo) == 0):
    print("All Done :)")
else:
    # `search` is the term used by the regex search cell.
    search = toDo[0]
    print(search)

URINE ITCHING


In [14]:
#RegEx search of crowdsourced dictionary

# Print the closest candidates: every ranked name that contains at least one
# informative keyword of `search` (substring match, case-sensitive on the
# upper-cased keywords).

# Filler words that would match far too many names.
ignore = [
    'ABNORMAL', 'SYNDROME', 'DISORDERS', 'BY', 'ITS', 'REACTION', 'AND/OR',
    'RAISED', 'DECREASED', 'INCREASED', 'OF', 'AND', 'ON', 'DUE', "THE", 'TO',
    'IN', 'FROM', 'WITH', 'DISORDER', 'DISEASE'
]
# Outcome names use "." as the word separator.
words = search.upper().replace(".", " ").split()
newWords = [word for word in words if word not in ignore]
# Escape each keyword so punctuation in a name is matched literally instead of
# being interpreted as regex syntax.
reg = '|'.join(re.escape(word) for word in newWords)
if reg:
    pattern = re.compile(reg)
    for name in rankNames:
        if pattern.search(name):
            print(name)
else:
    # An empty pattern would match every ranked name, which is not a useful hit list.
    print("No searchable keywords in: " + str(search))

BLOOD URINE
CELLS IN URINE
MUSCLE TWITCHING
PROTEIN URINE ABSENT
SPECIFIC GRAVITY URINE INCREASED
URINE ABNORMALITY
URINE ANALYSIS ABNORMAL
URINE COLOUR ABNORMAL
URINE FLOW DECREASED
URINE ODOUR ABNORMAL
URINE OUTPUT
URINE OUTPUT INCREASED
WHITE BLOOD CELLS URINE POSITIVE


In [15]:
#ATC handling functions     

#files needed by functions:
# Rebinds the module-level tables read by the functions below, this time from
# feather files under ROOT.
ATC_mapping = pd.read_feather(ROOT+"ATC Vocabulary/ATC Mapping.feather")
aeolus_IDNAME = pd.read_feather(ROOT+"AEOLUS/drugIDdrugName_unique.feather")

def assignATC(df, ATC_level="3", mapping=None, id_names=None):
    """Return a copy of ``df`` with an ``ATC_<level>_name`` column added.

    Each row's drug name is translated to a drug concept ID and then to the
    ATC class name at the requested level.

    Parameters
    ----------
    df : pandas.DataFrame
        Either indexed by drug name (index named ``drug_concept_name``) or
        carrying a ``drug_concept_name`` column.
    ATC_level : str or int, default "3"
        ATC level (1-5) selecting the ``ATC_<level>_name`` mapping column.
    mapping : pandas.DataFrame, optional
        Table with ``drug_concept_id`` and ``ATC_<level>_name`` columns.
        Defaults to the module-level ``ATC_mapping``.
    id_names : pandas.DataFrame, optional
        Table with ``drug_concept_name`` and ``drug_concept_id`` columns.
        Defaults to the module-level ``aeolus_IDNAME``.

    Returns
    -------
    pandas.DataFrame
        New frame; ``df`` itself is not modified. Drugs that cannot be mapped
        get NaN in the new column instead of raising.
    """
    ATC_col = "ATC_" + str(ATC_level) + "_name"
    if mapping is None:
        mapping = ATC_mapping
    if id_names is None:
        id_names = aeolus_IDNAME

    # Build one name -> ATC lookup up front instead of scanning both tables
    # for every row. The first occurrence wins when a key is duplicated.
    name_to_id = (id_names.drop_duplicates('drug_concept_name')
                  .set_index('drug_concept_name')['drug_concept_id'])
    id_to_atc = (mapping.drop_duplicates('drug_concept_id')
                 .set_index('drug_concept_id')[ATC_col])
    name_to_atc = name_to_id.map(id_to_atc)

    if df.index.name == 'drug_concept_name':
        drug_names = df.index.to_series()
    else:
        drug_names = df['drug_concept_name']

    # Assign positionally so a non-unique index cannot misalign the values.
    return df.assign(**{ATC_col: drug_names.map(name_to_atc).to_numpy()})

def groupbyATC(df, ATC_level="3", minimum_count=0, secondary=""):
    """Count the rows of ``df`` per ATC class (optionally per class and a second key).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise. If it lacks the ``ATC_<level>_name`` column, the
        column is added first via ``assignATC``.
    ATC_level : str or int, default "3"
        ATC level selecting the grouping column ``ATC_<level>_name``.
    minimum_count : int, default 0
        When non-zero, keep only groups with strictly more than this many rows.
    secondary : str, default ""
        Optional second column to group by together with the ATC column.

    Returns
    -------
    pandas.DataFrame
        One ``count`` column, indexed by the grouping key(s). Rows whose
        grouping key is NaN are left out (pandas' groupby default).
    """
    ATC_level = str(ATC_level)
    ATC_col = "ATC_" + ATC_level + "_name"
    if ATC_col not in df.columns:
        df = assignATC(df, ATC_level=ATC_level)

    keys = [ATC_col, secondary] if secondary else ATC_col
    # size() counts rows per group. count() on "the first other column" would
    # undercount when that column holds NaN and fails when no other column exists.
    df_grouped = df.groupby(keys).size().to_frame("count")

    if minimum_count:
        df_grouped = df_grouped[df_grouped['count'] > minimum_count]

    return df_grouped

def getdataforATCgroup(df, ATCgroup, ATC_level="3", mapping=None, id_names=None):
    """Return the rows of ``df`` whose drug belongs to the ATC class ``ATCgroup``.

    Parameters
    ----------
    df : pandas.DataFrame
        Either indexed by drug name (index named ``drug_concept_name``) or
        carrying a ``drug_concept_name`` column.
    ATCgroup : str
        ATC class name to keep, compared against ``ATC_<level>_name``.
    ATC_level : str or int, default "3"
        ATC level selecting the ``ATC_<level>_name`` mapping column.
    mapping : pandas.DataFrame, optional
        Table with ``drug_concept_id`` and ``ATC_<level>_name`` columns.
        Defaults to the module-level ``ATC_mapping``.
    id_names : pandas.DataFrame, optional
        Table with ``drug_concept_name`` and ``drug_concept_id`` columns.
        Defaults to the module-level ``aeolus_IDNAME``.

    Returns
    -------
    pandas.DataFrame
        Matching rows in their original order, each exactly once. Drugs that
        cannot be mapped never match.
    """
    ATC_col = "ATC_" + str(ATC_level) + "_name"
    if mapping is None:
        mapping = ATC_mapping
    if id_names is None:
        id_names = aeolus_IDNAME

    # One name -> ATC lookup instead of two table scans per row; the first
    # occurrence wins when a key is duplicated.
    name_to_id = (id_names.drop_duplicates('drug_concept_name')
                  .set_index('drug_concept_name')['drug_concept_id'])
    id_to_atc = (mapping.drop_duplicates('drug_concept_id')
                 .set_index('drug_concept_id')[ATC_col])
    name_to_atc = name_to_id.map(id_to_atc)

    if df.index.name == 'drug_concept_name':
        drug_names = df.index.to_series()
    else:
        drug_names = df['drug_concept_name']

    # A positional boolean mask avoids label lookups, which return a row once
    # per repeated label when the index is not unique.
    keep = (drug_names.map(name_to_atc) == ATCgroup).to_numpy()
    return df.loc[keep, :]