In [75]:
import pandas as pd
import requests

In [2]:
def downloadCurrentShowcaseFiles():
    '''Download showcase and coding files from ukbb website in case they are out of date. Stored in ./dataFiles.

    The ./dataFiles directory is created when it does not exist yet.

    Raises:
    -------
    requests.HTTPError
        if the server answers with an error status (nothing is written for that file)
    requests.Timeout
        if the server does not answer within the timeout
    '''
    import os

    downloads = {
        "https://biobank.ctsu.ox.ac.uk/~bbdatan/Data_Dictionary_Showcase.csv": 'dataFiles/showcase.csv',
        "https://biobank.ctsu.ox.ac.uk/~bbdatan/Codings_Showcase.csv": 'dataFiles/codings.csv',
    }

    os.makedirs('dataFiles', exist_ok=True)
    for url, target in downloads.items():
        # Without a timeout requests waits forever on a stalled connection.
        response = requests.get(url, timeout=60)
        # Refuse to store an HTML error page under a .csv name.
        response.raise_for_status()
        # Write the payload byte-for-byte: no dependence on the platform's default
        # text encoding and no newline translation of the CSV content.
        with open(target, 'wb') as f:
            f.write(response.content)

In [68]:
def construct_search_df(pathToShowcase: str, pathToCoding: str, pathToReadcode: str) -> pd.DataFrame:
    '''Constructs one large dataframe from showcase.csv, codings.csv (downloaded through downloadCurrentShowcaseFiles), and readcodes.csv (provided)

    Keyword arguments:
    ------------------
    pathToShowcase: str
        location of showcase.csv (needs the columns Field, FieldID, Coding)
    pathToCoding: str
        location of codings.csv (joined to the showcase on its Coding column)
    pathToReadcode: str
        location of readcodes.csv (needs the columns type, code, description;
        a saved index column "Unnamed: 0" is dropped when present)

    Returns:
    --------
    searchDf: pd.DataFrame
        searchable dataframe with the columns [Field, FieldID, Coding, Value, Meaning,]
        and a unique 0..n-1 index. Coding and FieldID are cast to str, so missing
        entries in these two columns appear as the text 'nan'.

    '''
    showcase = pd.read_csv(pathToShowcase, dtype=str)
    codings = pd.read_csv(pathToCoding, dtype=str)
    readcodes = pd.read_csv(pathToReadcode, dtype=str)
    showcase_excerpt = showcase[['Field', 'FieldID', 'Coding']]

    # Bring the read codes into the same column layout as the showcase/coding join.
    readcodes = readcodes.rename(columns={"type": "Coding", "code": "Value", "description": "Meaning"})
    readcodes['Field'] = 'gp_clinical, ' + readcodes.Coding
    readcodes['FieldID'] = readcodes.Coding
    # The saved index column only exists when the csv was written with its index;
    # do not fail when it is absent.
    readcodes = readcodes.drop(columns=["Unnamed: 0"], errors="ignore")

    searchDf = showcase_excerpt.merge(codings, how='outer', on="Coding")
    # ignore_index gives every row its own label; otherwise the labels of both
    # frames overlap and label-based lookups return several rows.
    searchDf = pd.concat([searchDf, readcodes], ignore_index=True)

    searchDf.Coding = searchDf.Coding.astype('str')
    searchDf.FieldID = searchDf.FieldID.astype('str')

    return searchDf


In [69]:
def construct_candidate_df(searchDf: pd.DataFrame, searchTerms: list) -> pd.DataFrame:
    '''Search dataframe searchDf for conditions containing searchTerms

    A row is kept when its Field or its Meaning contains at least one of the
    search terms. Matching is case-insensitive and literal: characters such as
    "(", "+" or "|" inside a term are searched for as-is, not read as regex syntax.

    Keyword arguments:
    ------------------
    searchDf: pd.DataFrame
        searchable dataframe with the columns [Field, FieldId, Coding, Value, Meaning]
    searchTerms: list(str)
        conditions to include in the search. A single string is treated as one
        term; blank terms are ignored. Without any usable term nothing matches.

    Returns:
    --------
    searchDf: pd.DataFrame
        filtererd dataframe with the columns [Field, FieldID, Coding, Value, Meaning],
        rows in their original order with a fresh 0..n-1 index

    '''
    import re

    if isinstance(searchTerms, str):
        # A bare string would otherwise be iterated character by character.
        searchTerms = [searchTerms]
    # An empty term would match every row, so blank entries are dropped.
    terms = [term.lower() for term in searchTerms if term]
    if not terms:
        return searchDf.iloc[0:0].reset_index(drop=True)

    # Escape each term so that it is matched literally inside the alternation.
    pattern = '|'.join(re.escape(term) for term in terms)

    fields = searchDf.Field.str.lower().str.contains(pattern, na=False)
    meanings = searchDf.Meaning.str.lower().str.contains(pattern, na=False)
    # Combine the masks positionally: every source row appears at most once,
    # even when searchDf carries duplicate rows or duplicate index labels.
    keep = fields.to_numpy(dtype=bool) | meanings.to_numpy(dtype=bool)
    return searchDf[keep].reset_index(drop=True)

In [73]:
def select_conditions(searchDf: pd.DataFrame, searchLogic: str = "or", promptFn=None) -> dict:
    '''Interactively choose which fields and codes of searchDf to include in a search.

    Every distinct (Field, FieldID) pair is offered once, in order of first
    appearance. Answering "a" includes the whole field, "c" walks through the
    codes of that field one by one, anything else skips the field.
    Answers are case-insensitive.

    Keyword arguments:
    ------------------
    searchDf: pd.DataFrame
        searchable dataframe with the columns [Field, FieldID, Coding, Value, Meaning]
    searchLogic: str: ["or"], "and", "not"
        key under which the chosen conditions are stored
    promptFn: callable, optional
        function taking the prompt text and returning the answer as str.
        Defaults to the builtin input(); pass your own to drive the selection
        from a script.

    Returns:
    --------
    searchDict: dict
          dictionary with the single key searchLogic.
          its entry holds a list of tuples (FieldID, Value or 'any') to be included in the search query.
    '''
    ask = input if promptFn is None else promptFn

    codeColors = {
        "default": '\033[0m',
        "bold": '\033[1m'
    }

    print(codeColors["bold"] + 'The following fields have potentially relevant values. Please choose if you want to include all patients who have any value in this field [a], none [hit enter], or if you would like to choose specific values [c].' + codeColors["default"])
    choice = []
    # dict.fromkeys de-duplicates while keeping first-seen order, so the prompts
    # come in a reproducible order; iterating the columns directly also works
    # when the index holds duplicate labels.
    fieldPairs = list(dict.fromkeys(zip(searchDf.Field, searchDf.FieldID)))
    for fieldName, fieldCode in fieldPairs:
        include = ask("Include {}? [a/c/_] ".format(fieldName)).strip().lower()
        if include == "a":
            choice.append((fieldCode, 'any'))
        elif include == "c":
            print(codeColors["bold"] + "Please choose which codes to include [i] or skip entry [hit enter], skip rest of field [s]." + codeColors["default"])
            # Restrict to this exact field; two fields sharing a name must not
            # offer each other's codes.
            inField = ((searchDf.Field == fieldName) & (searchDf.FieldID == fieldCode)).to_numpy(dtype=bool)
            fieldDf = searchDf[inField]
            for meaning, value in dict.fromkeys(zip(fieldDf.Meaning, fieldDf.Value)):
                include = ask('    Include {}? [i/_/s] '.format(meaning)).strip().lower()
                if include == 'i':
                    choice.append((fieldCode, value))
                elif include == 's':
                    break
    return {searchLogic: choice}

In [74]:
def update_inclusion_logic(searchDict: dict, searchDf: pd.DataFrame, promptFn=None) -> dict:
    '''Interactively sort the chosen conditions into mandatory, optional and undesired ones.

    Each entry of searchDict is shown with its field name and meaning and is
    filed according to the answer: "m" -> "and", "o" -> "or", "e" -> "not"
    (case-insensitive). Entries answered with anything else are left out of
    the result.

    Keyword arguments:
    ------------------
    searchDict: dict
          dictionary mapping a logic key to a list of tuples (FieldID, Value or 'any').
          All entries are re-classified, whatever key they are stored under.
    searchDf: pd.DataFrame
        searchable dataframe with the columns [Field, FieldID, Coding, Value, Meaning]
    promptFn: callable, optional
        function taking the prompt text and returning the answer as str.
        Defaults to the builtin input().

    Returns:
    --------
    searchDict: dict
          dictionary with three keys: "and", "or", and "not".
          each entry holds a list of tuples (FieldID, Value or 'any') to be included in the search query.
    '''
    ask = input if promptFn is None else promptFn

    codeColors = {
        "default": '\033[0m',
        "bold": '\033[1m'
    }
    # Reset the bold escape at the end so later output is not printed bold.
    print(codeColors['bold'] + 'Please choose if the following conditions are mandatory (each patient in your cohort will have this condition) [m], optional (all patients in your cohort will have one or more of these conditions) [o], or undesired (none of the patients in your cohort will have this condition) [e]' + codeColors['default'])
    returnSearchDict = {"and": [], "or": [], "not": []}
    keyForAnswer = {"m": "and", "o": "or", "e": "not"}

    for entries in searchDict.values():
        for entry in entries:
            field, value = entry[0], entry[1]
            # Plain comparisons instead of a formatted query string: codes that
            # contain quotes cannot break the lookup.
            matches = (searchDf.FieldID == str(field)).to_numpy(dtype=bool)
            if value != 'any':
                matches = matches & (searchDf.Value == str(value)).to_numpy(dtype=bool)
            rows = searchDf[matches]

            if rows.empty:
                # Entry is not part of searchDf: show the raw codes instead of failing.
                fieldDescription, valueDescription = field, value
            else:
                dfRow = rows.iloc[0]
                fieldDescription = dfRow['Field']
                valueDescription = 'any' if value == 'any' else dfRow['Meaning']

            choice = ask("{}, {}".format(fieldDescription, valueDescription)).strip().lower()
            logicKey = keyForAnswer.get(choice)
            if logicKey is not None:
                returnSearchDict[logicKey].append(entry)

    return returnSearchDict

In [63]:
# End-to-end run: build the full lookup table, narrow it to the rows mentioning
# the search term, then pick conditions and their and/or/not logic interactively.
# The builder is called by its current name, construct_search_df.
searchDf = construct_search_df(pathToShowcase='dataFiles/showcase.csv', pathToCoding='dataFiles/codings.csv', pathToReadcode='dataFiles/readcodes.csv')
searchDf = construct_candidate_df(searchDf, ['Borderline glaucoma'])
searchDict = select_conditions(searchDf)
searchDict = update_inclusion_logic(searchDict, searchDf)

[1mThe following fields have potentially relevant values. Please choose if you want to include all patients who have any value in this field [a], none [hit enter], or if you would like to choose specific values [c].[0m
('Diagnoses - ICD9', '41271')


Include Diagnoses - ICD9? [a/c/_]  


('Diagnoses - main ICD9', '41203')


Include Diagnoses - main ICD9? [a/c/_]  


('Type of cancer: ICD9', '40013')


Include Type of cancer: ICD9? [a/c/_]  


('gp_clinicalread_3', 'read_3')


Include gp_clinicalread_3? [a/c/_]  c


[1mPlease choose which codes to include [i] or skip entry [hit enter], skip rest of field [s].[0m
gp_clinicalread_3
5
[('Borderline glaucoma', 'F450.'), ('Borderline glaucoma steroid responder', 'F4503'), ('Borderline glaucoma NOS', 'F450z'), ('(Borderline glaucoma) or (ocular hypertension) or (increased intra-ocular pressure)', 'XE18p'), ('Borderline glaucoma', 'XE18p')]


    Include Borderline glaucoma? [i/_/s]  i
    Include Borderline glaucoma? [i/_/s]  i
    Include Borderline glaucoma NOS? [i/_/s]  
    Include (Borderline glaucoma) or (ocular hypertension) or (increased intra-ocular pressure)? [i/_/s]  
    Include Borderline glaucoma steroid responder? [i/_/s]  


('Diagnoses - secondary ICD9', '41205')


Include Diagnoses - secondary ICD9? [a/c/_]  a


('gp_clinicalread_2', 'read_2')


Include gp_clinicalread_2? [a/c/_]  a


## TODO

* migrate

In [80]:
# Scratch check: lower-case every entry of a list of strings.
a = ['ASD', 'asdD', 'ewf']
list(map(str.lower, a))

['asd', 'asdd', 'ewf']

## Radio button test

In [77]:
import ipywidgets
from ipywidgets import RadioButtons
from IPython.display import display
from ipywidgets import HBox, VBox, Label

import functools

In [399]:
def filterSearchDf(searchDf: pd.DataFrame, interactiveMode: str = "notebook") -> pd.DataFrame:
    '''Interactive filtering of searchDf

    Displays one radio-button group per field and one per row of that field.
    Every selection is written straight into the `include` column of searchDf,
    so the returned dataframe (the same object that was passed in) reflects
    the choices as they are made. Choosing anything but "some" on a field
    applies that choice to all rows of the field; "some" leaves the rows
    untouched so they can be set one by one.

    Keyword arguments:
    ------------------
    searchDf: pd.DataFrame
        searchable dataframe with the columns [Field, Coding, Value, Meaning, include].
        A missing `include` column is created, and entries that are not one of
        the row options are reset to 'irrelevant'. Modified in place.
    interactiveMode: str ['notebook', 'terminal']
        notebook - displays ipyWidgets to allow for the construction of a cohort
        terminal - uses input() to allow for the construction of a cohort
        NOTE: this argument is not consulted yet; widgets are always displayed.

    Returns:
    --------
    searchDf: pd.DataFrame
        filtererd dataframe with the columns [Field, Coding, Value, Meaning, include]
        where `include` is in {'irrelevant', 'all', 'any', 'none'}
    '''
    fieldOptions = ["all", "any", "none", "irrelevant", "some"]
    rowOptions = ["all", "any", "none", "irrelevant"]

    if 'include' not in searchDf.columns:
        searchDf['include'] = 'irrelevant'
    includeColumn = searchDf.columns.get_loc('include')

    def storeChoice(change, positions):
        # Write by position into searchDf itself; assigning to a filtered copy
        # would never reach the dataframe handed back to the caller.
        searchDf.iloc[positions, includeColumn] = change['new']

    def applyFieldChoice(change, positions, rowButtons):
        # "some" only exists on the field level and means "decide per row".
        if change['new'] == 'some':
            return
        storeChoice(change, positions)
        for button in rowButtons:
            button.value = change['new']

    # Row positions per field, in order of first appearance. Positions are used
    # instead of index labels so duplicate labels cannot select several rows.
    positionsByField = dict()
    for position, field in enumerate(searchDf.Field):
        positionsByField.setdefault(field, []).append(position)

    groups = []
    for field, positions in positionsByField.items():
        rowButtons = []
        rowBoxes = []
        for position in positions:
            current = searchDf.iloc[position, includeColumn]
            if current not in rowOptions:
                # RadioButtons rejects a value that is not one of its options.
                current = 'irrelevant'
                searchDf.iloc[position, includeColumn] = current
            rowChoice = RadioButtons(options=rowOptions, value=current)
            rowChoice.observe(functools.partial(storeChoice, positions=[position]), names="value")
            rowButtons.append(rowChoice)
            rowBoxes.append(VBox([Label(str(searchDf.Meaning.iloc[position])), rowChoice]))

        fieldChoice = RadioButtons(options=fieldOptions, value="irrelevant")
        # An observer replaces the former dlink: a dlink overwrote each row's
        # initial value and pushed "some" into row widgets that lack that option.
        fieldChoice.observe(functools.partial(applyFieldChoice, positions=positions, rowButtons=rowButtons), names="value")
        groups.append(VBox([VBox([Label(str(field)), fieldChoice]), HBox(rowBoxes)]))

    display(VBox(groups))
    return searchDf
    
            

def on_field_update(r, rows):
    '''Set `include` on rows to the newly selected value, unless that value is "some".

    Keyword arguments:
    ------------------
    r: dict-like
        change notification; only its 'new' entry is read
    rows: object supporting item assignment
        receives the new value under the key 'include'; only the object passed
        in is modified
    '''
    selected = r['new']
    if selected == 'some':
        # "some" leaves the rows as they are.
        return
    rows['include'] = selected

def on_meaning_update(r, row):
    '''Set `include` on row to the newly selected value.

    Keyword arguments:
    ------------------
    r: dict-like
        change notification; only its 'new' entry is read
    row: object supporting item assignment
        receives the new value under the key 'include'; only the object passed
        in is modified
    '''
    row['include'] = r['new']