In [189]:
import pandas as pd
from fuzzywuzzy import process, fuzz

def queryHospitalData(pathToMainDataset: str, searchCodeDict: dict) -> list:
    """ Search main dataframe for hospital reported conditions.

    This function uses the following columns of the main dataset:
    **41270**	Diagnoses - ICD10
    **41280**	Date of first in-patient diagnosis - ICD10
    **41271**	Diagnoses - ICD9
    **41281**	Date of first in-patient diagnosis - ICD9
    **eid**
    Returns a list of 'eid' values that can subsequently be used to retrieve the genetic data of our cohort.

    Keyword arguments:
    ------------------
    pathToMainDataset: str
        path to main dataset (csv)
    searchCodeDict: dict
        dictionary that was created using createCodingDict function

    Returns:
    --------
    hospital_eids: list(str)
    """

    # read dataset

    main_dataset = get_columns(["41270", "41280","41271", "41281", "eid"], pathToMainDataset)

    # create lists for diagnoses and dates for icd9 and icd10. then extract relevant columns from main dataframe

    icd9_diag_cols = ['eid']
    icd9_date_cols = ['eid']
    icd10_diag_cols = ['eid']
    icd10_date_cols = ['eid']

    icd9_columns = ['eid']
    for i in main_dataset.columns:
        cstart = str(i).split('-')[0]
        if (cstart == '41271'):
            icd9_columns.append(i)
            icd9_diag_cols.append(i)
        elif (cstart == '41281'):
            icd9_columns.append(i)
            icd9_date_cols.append(i)

    hospital_records_icd9 = main_dataset[icd9_columns]

    icd10_columns = ['eid']
    for i in main_dataset.columns:
        cstart = str(i).split('-')[0]
        if (cstart == '41270'):
            icd10_columns.append(i)
            icd10_diag_cols.append(i)
        elif (cstart == '41280'):
            icd10_columns.append(i)
            icd10_date_cols.append(i)

    hospital_records_icd10 = main_dataset[icd10_columns]

    (icd9_diag_rename, icd9_numbers) = makeRenamingDict(icd9_diag_cols, '.')
    (icd9_date_rename, _) = makeRenamingDict(icd9_date_cols, '.')
    (icd10_diag_rename, icd10_numbers) = makeRenamingDict(icd10_diag_cols, '.')
    (icd10_date_rename, _) = makeRenamingDict(icd10_date_cols, '.')


    # steps for icd9 and icd10:
    # 1. split dataset into diagnoses and dates
    # 2. rename columns to everythong after '.' (works for 412xx columns only, might have to be after '-' for other data columns)
    # 3. melt each subset to make data tidy
    # 4. set hierarchical index (eid, visit)
    # 5. join on these indices
    # 6. convert diagnosis code to string and date to datetime object

    icd9_diag = hospital_records_icd9[icd9_diag_cols].rename(columns=icd9_diag_rename).melt(id_vars='eid', value_vars=icd9_numbers, value_name='diagnosis', var_name='visit').set_index(['eid', 'visit'])
    icd9_date = hospital_records_icd9[icd9_date_cols].rename(columns=icd9_date_rename).melt(id_vars='eid', value_vars=icd9_numbers, value_name='diagnosisDate', var_name='visit').set_index(['eid', 'visit'])
    icd9 = icd9_diag.join(icd9_date)
    icd9 = icd9.dropna(subset=['diagnosis', 'diagnosisDate'])
    icd9.diagnosis = icd9.diagnosis.astype('str')
    icd9.diagnosisDate = pd.to_datetime(icd9.diagnosisDate)

    icd10_diag = hospital_records_icd10[icd10_diag_cols].rename(columns=icd10_diag_rename).melt(id_vars='eid', value_vars=icd10_numbers, value_name='diagnosis', var_name='visit').set_index(['eid', 'visit'])
    icd10_date = hospital_records_icd10[icd10_date_cols].rename(columns=icd10_date_rename).melt(id_vars='eid', value_vars=icd10_numbers, value_name='diagnosisDate', var_name='visit').set_index(['eid', 'visit'])
    icd10 = icd10_diag.join(icd10_date)
    icd10 = icd10.dropna(subset=['diagnosis', 'diagnosisDate'])
    icd10.diagnosis = icd10.diagnosis.astype('str')
    icd10.diagnosisDate = pd.to_datetime(icd10.diagnosisDate)

    icd9Query = createPandasQueryString(searchCodeDict, 'icd9', columnName = 'diagnosis')
    icd10Query = createPandasQueryString(searchCodeDict, 'icd10', columnName = 'diagnosis')

    hospital_eids = list(set(icd9.query(icd9Query).append(icd10.query(icd10Query)).reset_index()[['eid']]['eid']))

    return hospital_eids


def querySelfreportedData(pathToMainDataset: str, pathToCodingFile: str, searchCodeDict: dict) -> list:
    """ Search main dataframe for hospital reported conditions.

    This function uses the following columns of the main dataset:
    **20002**	Condition - node_id
    **20008**	Year of reported condition
    **20009**	Age of patient when condition reported
    **eid**
    Returns a list of 'eid' values that can subsequently be used to retrieve the genetic data of our cohort.

    Keyword arguments:
    ------------------
    pathToMainDataset: str
        path to main dataset (csv)
    pathToCodingFile: str
        path to translation from node_id to ICD10 (tsv)
    searchCodeDict: dict
        dictionary that was created using createCodingDict function

    Returns:
    --------
    selfreported_eids: list(str)
    """

    # read dataset

    main_dataset = pd.read_csv(pathToMainDataset)
    coding_dataset = pd.read_csv(pathToCodingFile, delimiter="\t")


    # create lists for condition, year, and age. then extract relevant columns from main dataframe
    diag_cols = main_dataset.filter(regex="20002-*").columns.tolist()
    diag_cols.append('eid')

    year_cols = main_dataset.filter(regex="20008-*").columns.tolist()
    year_cols.append('eid')

    age_cols = main_dataset.filter(regex="20009-*").columns.tolist()
    age_cols.append('eid')

    (diag_rename, diag_numbers) = makeRenamingDict(diag_cols, '-')
    (year_rename, _) = makeRenamingDict(year_cols, '-')
    (age_rename, _) = makeRenamingDict(age_cols, '-')


    diag = main_dataset[diag_cols].rename(columns=diag_rename).melt(id_vars='eid', value_vars=diag_numbers, value_name='diagnosis', var_name='visits').set_index(['eid', 'visits'])


    # steps for icd9 and icd10:
    # 1. split dataset into diagnoses and dates
    # 2. rename columns to everythong after '.' (works for 412xx columns only, might have to be after '-' for other data columns)
    # 3. melt each subset to make data tidy
    # 4. set hierarchical index (eid, visit)
    # 5. join on these indices
    # 6. convert diagnosis code to string and date to datetime object

    diag = main_dataset[diag_cols].rename(columns=diag_rename).melt(id_vars='eid', value_vars=diag_numbers, value_name='diagnosis', var_name='visits').set_index(['eid', 'visits'])
    year = main_dataset[year_cols].rename(columns=year_rename).melt(id_vars='eid', value_vars=diag_numbers, value_name='diagnosisYear', var_name='visits').set_index(['eid', 'visits'])
    age = main_dataset[age_cols].rename(columns=age_rename).melt(id_vars='eid', value_vars=diag_numbers, value_name='diagnosisAge', var_name='visits').set_index(['eid', 'visits'])

    diag_year = diag.join(year)
    diag_year_age = diag_year.join(age)
    filtered = diag_year_age.dropna(subset=['diagnosis', 'diagnosisYear', 'diagnosisAge'])

    icd10Query = createPandasQueryString(searchCodeDict, 'icd10', columnName = 'diagnosis')

    selfreported_eids = list(set(filtered.query(icd10Query).reset_index()[['eid']]['eid']))

    return selfreported_eids

def createPandasQueryString(searchCodeDict: dict, codingType: str, columnName: str) -> str:
    """ Create search query for dataframe given a searchCodeDict.

    Keyword arguments:
    ------------------
    searchCodeDict: dict
        dictionary that was created using createCodingDict function
    codingType: str
        any of the keys in the searchCodeDict
        Most likely one of {'read_2', 'read_3', 'icd9', 'icd10'}

    Returns:
    --------
    queryString: str
    """

    queryString = ""
    for j, item in enumerate(searchCodeDict[codingType]):
        if j != 0:
            queryString += ' or '
        queryString += "{} == '{}'".format(columnName, item)
    return queryString


def createGpClinicalQueryString(searchCodeDict: dict) -> str:
    """ Create search query for UKBB table `gp_clinical` given a searchCodeDict.

    Keyword arguments:
    ------------------
    searchCodeDict: dict
        dictionary that was created using createCodingDict function

    Returns:
    --------
    queryString: str

    The queryString then needs to be copied to the UKBB data website to download a list of relevant eids
    """

    queryString = 'SELECT distinct eid FROM gp_clinical WHERE '
    for i, el in enumerate(['read_2', 'read_3']):
        if i != 0:
            queryString += ' OR '
        queryString += el + ' IN ' + "("
        for j, item in enumerate(searchCodeDict[el]):
            if j != 0:
                queryString += ','
            queryString += "'" + item + "'"
        queryString +=  ")"

    return queryString


def createCodingDict(pathToLookup: str, searchTerm: str, fuzzy: bool = False, fuzzyNumber: int = 100, fuzzyMatchBetterThan: int = 90) -> (dict, list):
    """ Creates searchCodeDict, a dictionary containing search codes for particular searhc term.

    Keyword arguments:
    ------------------
    pathToLookup: str
        path to dataframe with columns ['type', 'code', 'description']
    searchTerm: str
        search term of disease of interest
    fuzzy: {[False], True}
        if False: only descriptions containing exact matches are returned
        if True: the first _fuzzyNumber_ matches of a fuzzy search are returned
    fuzzyNumber: int
        number of matches if fuzzy == True
    fuzzyMatchBetterThan: int
        number between 0 and [100], where 100 = exact match
        cutoff criterion for matches if fuzzy == True
        default value: 90

    Returns:
    --------
    (searchCodeDict, searhCodeArray)
    searchCodeDict: dict
        used in other functions
    searchCodeArray: list((type: str, code:str, description:str))
        not used elsewhere and only returned for sanity check
    """

    lookupDataframe = pd.read_csv(pathToLookup)

    if fuzzy == False:
        mask = lookupDataframe['description'].str.lower().str.contains(searchTerm, na=False)
        codes = lookupDataframe[mask]

    if fuzzy == True:
        choices = (lookupDataframe['description'].astype("str").str.lower())
        matches = process.extract(searchTerm, choices, scorer=fuzz.partial_ratio, limit=fuzzyNumber)
        filteredMatches = [m[0] for m in matches if m[1] > fuzzyMatchBetterThan]
        codes = lookupDataframe[lookupDataframe['description'].astype("str").str.lower().isin(filteredMatches)]

    searchCodeArray = [(i['type'], i['code'], i['description']) for i in codes.iloc()]
    searchCodeDict = arrayToDict(searchCodeArray) 


    return searchCodeDict, searchCodeArray


def makeRenamingDict(columnNames: list, delimiter: str) -> (dict, list):
    """
    Creates dictionary for renaming. Used internally only.
    """
    out = dict()
    numbers = []
    for c in columnNames:
        if c == 'eid':
            out[c] = 'eid'
        else:
            out[c] = c.split(delimiter)[-1]
            numbers.append(c.split(delimiter)[-1])
    return out, numbers


import argparse
import pandas as pd
import pickle

def _getQuery(datafields:list):
    '''
    Generate a regex string for a pd.DataFrame.filter function.
    :param datafields: any datafield collected from the UKBB Data Showcase website e.g 41270
    :return: regex string
    :rtype: str'''
    start = "eid|"
    end = ''.join("{}-*|".format(i) for i in datafields)
    final = start + end
    return final[:-1]


def get_columns(datafields: list, maincsv: str, outfile: str="", nowrite:bool=True, write_datafields:bool=False) -> pd.DataFrame():
    '''
    Generate a dataframe by selecting all relevant columns based on the given datafield(s).
    Optionally write the dataframe to a csv file.
    :param datafields: any datafield collected from the UKBB Data Showcase website e.g 41270
    :param maincsv: the UKBB main dataset in csv format e.g main_ukbbxxxx.csv
    :param outfile: name of the csv file to write the new dataframe to
    :param nowrite: boolean that determines whether to write the new dataframe to a csv file
    :param write_datafields: boolean that determines whether to the relevant columns to a binary file
    :return: dataframe
    :rtype: pd.DataFrame'''
    maindf = pd.read_csv(maincsv, nrows=1, dtype=str)
    datafields_query = _getQuery(datafields)
    col_list = maindf.filter(regex=datafields_query).columns.tolist()
    print("Number of selected columns: {}".format(len(col_list)))
    main = pd.read_csv(maincsv, usecols=col_list, dtype=str)
    if (nowrite) or (outfile=="") :
#         print("Not writing updated main.csv file")
        return main
    else:
        print("Writing file")
        main.to_csv(outfile, index=False)
        return main
    if write_datafields:
        with open("datafields.data", 'wb') as file:
            pickle.dump(col_list)
            return main

In [190]:
pathToLookup = '/Users/kiko/IBM/GEN/modellingScripts/isabell/cohortPipeline/lookupCodeDescriptions.csv'
pathToCoding = '/Users/kiko/IBM/GEN/modellingScripts/isabell/cohortPipeline/coding19.tsv'
pathToMain = '/Users/kiko/IBM/GEN/modellingScripts/isabell/cohortPipeline/ukb41268_head100.csv'
searchTerm = 'glaucoma'

In [191]:
searchCodeDict, searchCodeArray = createCodingDict(pathToLookup=pathToLookup, searchTerm=searchTerm, fuzzy=True)

In [4]:
get_columns(['20002', '20001'], pathToMain)

Number of selected columns: 161


Unnamed: 0,eid,20001-0.0,20001-0.1,20001-0.2,20001-0.3,20001-0.4,20001-0.5,20001-1.0,20001-1.1,20001-1.2,...,20002-3.24,20002-3.25,20002-3.26,20002-3.27,20002-3.28,20002-3.29,20002-3.30,20002-3.31,20002-3.32,20002-3.33
0,1000015,,,,,,,,,,...,,,,,,,,,,
1,1000027,,,,,,,,,,...,,,,,,,,,,
2,1000039,,,,,,,,,,...,,,,,,,,,,
3,1000040,,,,,,,,,,...,,,,,,,,,,
4,1000053,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,1000955,,,,,,,,,,...,,,,,,,,,,
95,1000969,,,,,,,,,,...,,,,,,,,,,
96,1000978,,,,,,,,,,...,,,,,,,,,,
97,1000981,,,,,,,,,,...,,,,,,,,,,


In [35]:
queryString = createGpClinicalQueryString(searchCodeDict)

In [5]:
queryHospitalData(pathToMainDataset= pathToMain, searchCodeDict= searchCodeDict)

Number of selected columns: 521


[]

___

# Selenium web crawling

follow instructions to get the driver here: 
https://duo.com/decipher/driving-headless-chrome-with-python

1. download canary: https://www.google.com/chrome/canary/
1. driver download: https://chromedriver.storage.googleapis.com/index.html?path=83.0.4103.14/
1. unzip that file
1. move driver to a directory and add it that directory to the path (`export PATH=$PATH:~/IBM/GEN/ukbb-cohort/prototype_notebooks/going_headless`)
1. execure driver once to make your computer trust it

In [24]:
# !mkdir going_headless
!mv /Users/kiko/Downloads/chromedriver /Users/kiko/IBM/GEN/ukbb-cohort/prototype_notebooks/going_headless/

In [177]:
pathToDriver = "/Users/kiko/IBM/GEN/ukbb-cohort/prototype_notebooks/going_headless/chromedriver"

In [178]:
applicationId = '51064'
userName = 'isabell.kiral@au11.ibm.com'
password = '9NYccunFNB4d'

In [179]:
import credentials

* add credentials.py file to gitignore

In [180]:
import credentials

def getPayload(queryString, applicationId, username, password):
    import os  
    from selenium import webdriver  
    from selenium.webdriver.common.keys import Keys  
    from selenium.webdriver.chrome.options import Options  
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait

    chrome_options = Options()  
    chrome_options.add_argument("--headless")  
    chrome_options.binary_location = '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary'
    driver = webdriver.Chrome(executable_path=os.path.abspath(pathToDriver), options=chrome_options)  

    driver.get("https://bbams.ndph.ox.ac.uk/ams/resProjects/dataDownToShowcase?appn_id={}".format(applicationId))



    usernameField = driver.find_element_by_id('id_username')
    passwordField = driver.find_element_by_id('id_password')

    usernameField.send_keys(username)
    passwordField.send_keys(password)

    loginButton = driver.find_element_by_id('id_login')
    loginButton.click()

    dataPortal = driver.find_element_by_link_text("1 Data Portal")
    dataPortal.click()

    connect = driver.find_element_by_class_name("btn_glow")
    connect.click()

    sqlField = driver.find_element_by_id('sq0')
    sqlField.send_keys(queryString)

    fetch = driver.find_element_by_class_name("btn_glow")
    fetch.click()

    try:
        element_present = EC.presence_of_element_located((By.NAME, 'sr'))
        WebDriverWait(driver, 10).until(element_present)
    except TimeoutException:
        print("Timed out waiting for page to load")

    hiddenElement = driver.find_element_by_name('sr')
    value = hiddenElement.get_property('value')
    driver.close()
    payload = "sr=" + value
    return payload

In [181]:
def downloadData(payload: str):
    import requests
    headers= {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
    "cache-control": "max-age=0",
    "content-type": "application/x-www-form-urlencoded",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-site",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1"
    }
    data = payload
    url = "https://biota.ndph.ox.ac.uk/regserv.cgi"
    response = requests.post(url=url, data=data, headers=headers)
    return response

In [182]:
def downloadData(payload: str):
    import requests
    headers= {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
    "cache-control": "max-age=0",
    "content-type": "application/x-www-form-urlencoded",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-site",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1"
    }
    data = payload
    url = "https://biota.ndph.ox.ac.uk/regserv.cgi"
    response = requests.post(url=url, data=data, headers=headers)
    return response

def extractIds(response: dict):
    import io
    data = io.StringIO(response.text)
    df = pd.read_csv(data, sep=",")
    return list(df['eid'])

In [183]:
payload = getPayload(queryString, credentials.applicationId, credentials.userName, credentials.password)
response = downloadData(payload)
ids = extractIds(response)

In [163]:
ids[0:10]

[3256435,
 3407919,
 4816917,
 2496558,
 5207925,
 4059356,
 5292704,
 5869589,
 1219683,
 4076733]

___

# Interactive filtering

In [203]:
from ukbb-cohort import query

In [243]:
searchCodeDict, searchCodeArray = query.createCodingDict(pathToLookup, 'glaucoma')

In [285]:
searchCodeArray = [("read_2", "a code", "not glaucoma"),("read_3", "abc", "definitely not glaucoma"),("read_3", "A.53482", "early glaucoma"),("icd9", "COOODE10", "what is this even listed?"),("icd10","code green","glaucoma")]

def interactiveFilter(searchCodeArray: list) -> (dict, list):
    filteredArray = searchCodeArray.copy()
    codeColors = {
        "read_2": '\033[93m',
        "read_3": '\033[92m',
        "icd9": '\033[94m',
        "icd10": '\033[95m',
        "default": '\033[0m', 
        "underline": '\033[4m', 
        "bold": '\033[1m'
    }
    for el in searchCodeArray: 
        keep = input("Keep '{}{}{}' ({}{}{})? [y/n]: ".format(codeColors['bold'], el[2], codeColors['default'],  codeColors[el[0]], el[0], codeColors['default']))
        if keep == 'n': 
            filteredArray.remove(el)
    filteredDict = _searchArrayToDict(filteredArray)
    return filteredDict, filteredArray

def _searchArrayToDict(searchCodeArray: list) -> dict:
    searchCodeDict = dict()
    for i in searchCodeArray:
        if i[0] in searchCodeDict.keys():
            searchCodeDict[i[0]].append('{}'.format(i[1]))
        else:
            searchCodeDict[i[0]] = ['{}'.format(i[1])]
    return searchCodeDict

newcodedict, newcodearray = interactiveFilter(searchCodeArray)
print("new code array: {}".format(newcodearray))

Keep '[1mnot glaucoma[0m' ([93mread_2[0m)? [y/n]:  n
Keep '[1mdefinitely not glaucoma[0m' ([92mread_3[0m)? [y/n]:  n
Keep '[1mearly glaucoma[0m' ([92mread_3[0m)? [y/n]:  n
Keep '[1mwhat is this even listed?[0m' ([94micd9[0m)? [y/n]:  n
Keep '[1mglaucoma[0m' ([95micd10[0m)? [y/n]:  n


new code array: []


# allow for more search terms 
https://github.ibm.com/isabeki/ukbb-cohort/issues/8

In [298]:

def createCodingDict(pathToLookup: str, searchTerm: str, fuzzy: bool = False, fuzzyNumber: int = 100, fuzzyMatchBetterThan: int = 90) -> (dict, list):
    """ Creates searchCodeDict, a dictionary containing search codes for particular searhc term.

    Keyword arguments:
    ------------------
    pathToLookup: str
        path to dataframe with columns ['type', 'code', 'description']
    searchTerm: str
        search term of disease of interest
    fuzzy: {[False], True}
        if False: only descriptions containing exact matches are returned
        if True: the first _fuzzyNumber_ matches of a fuzzy search are returned
    fuzzyNumber: int
        number of matches if fuzzy == True
    fuzzyMatchBetterThan: int
        number between 0 and [100], where 100 = exact match
        cutoff criterion for matches if fuzzy == True
        default value: 90

    Returns:
    --------
    (searchCodeDict, searhCodeArray)
    searchCodeDict: dict
    searchCodeArray: list((type: str, code:str, description:str))
    """

    lookupDataframe = pd.read_csv(pathToLookup)

    if fuzzy == False:
        mask = lookupDataframe['description'].str.lower().str.contains('|'.join(searchTerms), na=False)
        codes = lookupDataframe[mask]

    if fuzzy == True:
        matches = []
        choices = (lookupDataframe['description'].astype("str").str.lower())
        for searchTerm in searchTerms:
            matches.extend(process.extract(searchTerm, choices, scorer=fuzz.partial_ratio, limit=fuzzyNumber))
        filteredMatches = [m[0] for m in matches if m[1] > fuzzyMatchBetterThan]
        codes = lookupDataframe[lookupDataframe['description'].astype("str").str.lower().isin(filteredMatches)]

    searchCodeArray = [(i['type'], i['code'], i['description']) for i in codes.iloc()]
    searchCodeDict = _searchArrayToDict(searchCodeArray)
    return searchCodeDict, searchCodeArray


In [299]:
searchTerms = ['glaucoma', 'psychosis']
scd, sca = createCodingDict(pathToLookup, searchTerms, fuzzy=True)

In [300]:
sca

[('read_2', '115D.', 'No H/O: Glaucoma'),
 ('read_2', '1221', 'No FH: Glaucoma'),
 ('read_2', '12A1.', 'FH: Glaucoma'),
 ('read_2', '146H.', 'H/O: psychosis'),
 ('read_2', '1482', 'H/O: glaucoma'),
 ('read_2', '1JF..', 'Suspected glaucoma'),
 ('read_2',
  '212T.',
  'Psychosis, schizophrenia and bipolar affective disorder resolved'),
 ('read_2', '212X.', 'Psychosis resolved'),
 ('read_2', '2229', 'O/E - senility - no psychosis'),
 ('read_2', '66T1.', 'Glaucoma monitoring'),
 ('read_2', '68A2.', 'Glaucoma screen'),
 ('read_2', '7259', 'Operations following glaucoma surgery'),
 ('read_2', '72590', 'Needling of bleb following glaucoma surgery'),
 ('read_2', '72591', 'Injection of bleb following glaucoma surgery'),
 ('read_2', '72592', 'Revision of bleb NEC following glaucoma surgery'),
 ('read_2',
  '72593',
  'Removal of releasable suture following glaucoma surgery'),
 ('read_2', '72594', 'Laser suture lysis following glaucoma surgery'),
 ('read_2', '7259y', 'Other specified operations f

# Exclusive search terms

In [15]:
import ukbb-cohort as uk
import ukbb-cohort.preprocessing as preprocessing

In [280]:
def querySelfreportedData(pathToMainDataset: str, pathToCodingFile: str, searchCodeDict: dict, includeCodes: bool = True) -> list:
    """ Search main dataframe for hospital reported conditions.

    This function uses the following columns of the main dataset:
    **20002**	Condition - node_id
    **20008**	Year of reported condition
    **20009**	Age of patient when condition reported
    **eid**
    Returns a list of 'eid' values that can subsequently be used to retrieve the genetic data of our cohort.

    Keyword arguments:
    ------------------
    pathToMainDataset: str
        path to main dataset (csv)
    pathToCodingFile: str
        path to translation from node_id to ICD10 (tsv)
    searchCodeDict: dict
        dictionary that was created using createCodingDict function
    includeCodes: bool [True]
        set to False to get the inverse set
    Returns:
    --------
    selfreported_eids: list(str)
    """

    # read dataset

#     main_dataset = pd.read_csv(pathToMainDataset)
    main_dataset = preprocessing.get_columns(["20002", "20008","20009", "eid"], pathToMainDataset)

    coding_dataset = pd.read_csv(pathToCodingFile, delimiter="\t", dtype=str)


    # create lists for condition, year, and age. then extract relevant columns from main dataframe
    diag_cols = main_dataset.filter(regex="20002-*").columns.tolist()
    diag_cols.append('eid')

    year_cols = main_dataset.filter(regex="20008-*").columns.tolist()
    year_cols.append('eid')

    age_cols = main_dataset.filter(regex="20009-*").columns.tolist()
    age_cols.append('eid')

    (diag_rename, diag_numbers) = _construct_renaming_dict(diag_cols, '-')
    (year_rename, _) = _construct_renaming_dict(year_cols, '-')
    (age_rename, _) = _construct_renaming_dict(age_cols, '-')


    diag = main_dataset[diag_cols].rename(columns=diag_rename).melt(id_vars='eid', value_vars=diag_numbers, value_name='diagnosis', var_name='visits').set_index(['eid', 'visits'])


    # steps for icd9 and icd10:
    # 1. split dataset into diagnoses and dates
    # 2. rename columns to everythong after '.' (works for 412xx columns only, might have to be after '-' for other data columns)
    # 3. melt each subset to make data tidy
    # 4. set hierarchical index (eid, visit)
    # 5. join on these indices
    # 6. convert diagnosis code to string and date to datetime object

    diag = main_dataset[diag_cols].rename(columns=diag_rename).melt(id_vars='eid', value_vars=diag_numbers, value_name='diagnosis', var_name='visits').set_index(['eid', 'visits'])
    year = main_dataset[year_cols].rename(columns=year_rename).melt(id_vars='eid', value_vars=diag_numbers, value_name='diagnosisYear', var_name='visits').set_index(['eid', 'visits'])
    age = main_dataset[age_cols].rename(columns=age_rename).melt(id_vars='eid', value_vars=diag_numbers, value_name='diagnosisAge', var_name='visits').set_index(['eid', 'visits'])

    diag_year = diag.join(year)
    diag_year_age = diag_year.join(age)
    filtered = diag_year_age.dropna(subset=['diagnosis', 'diagnosisYear', 'diagnosisAge'])

    all_codes = [item for sublist in [*searchCodeDict.values()] for item in sublist]
    translatedCodingDic = _searchCoding19(coding_dataset, all_codes, searchCodeDict)

    if not translatedCodingDic['node_ids']:
        print("No matches found in self-reported data. Returning empty list")
        selfreported_eids = []
    else:
        icd9Query = _createPandasQueryString(translatedCodingDic, 'node_ids', columnName = 'diagnosis')
        selfreported_eids = list(set(filtered.query(icd9Query).reset_index()[['eid']]['eid']))
    
    if not includeCodes: 
        selfreported_eids = list(set(main_dataset.query('eid not in {}'.format(selfreported_eids))['eid']))
    
    return selfreported_eids

def queryGpClinicalData(searchCodeDict: dict, pathToCredentials: str, pathToDriver: str,  driverType: str, includeCodes: bool = True) -> list:
    """ Queries UKBB database given a searchCodeDict and returns Eids of matching candidates.

    Keyword arguments:
    ------------------
    searchCodeDict: dict
        dictionary that was created using createCodingDict function
    pathToCredentials: str
    path to a .py file containing the variables:
    applicationId: str
        ID of the project with UKBB
    username: str
        UKBB user name
    password: str
        UKBB password
    pathToDriver: str
        path to the driver `chromedriver` used by selenium
    driverType: str
        driverType for selenium e.g chrome or firefox
    includeCodes: bool [True]
        set to False to get the inverse set
        
    Returns:
    --------
    eids: list
        List of eids matching the search criterion of the searchCodeDict
    """
    supported_drivers = ['chrome', 'firefox']
    driverType = driverType.lower()

    if not path.exists(pathToCredentials):
        sys.exit("Credentials file not found")

    if driverType not in supported_drivers:
        raise Exception("Program only suports {} drivers, you provided {}. Please install relevant driver and browser. Instructions in README.md".format(supported_drivers, driverType))

    from importlib import import_module
    cred = import_module("credentials", package=pathToCredentials)

    queryString = _createGpClinicalQueryString(searchCodeDict)
    payload =  _get_payload(queryString, cred.applicationId, cred.userName, cred.password, pathToDriver, driverType)
    response = _download_data(payload)
    eids = _extract_eids(response)
    
    if not includeCodes:
        queryString = 'SELECT distinct eid FROM gp_clinical'
        payload =  _get_payload(queryString, cred.applicationId, cred.userName, cred.password, pathToDriver, driverType)
        response = _download_data(payload)
        alleids = _extract_eids(response)
        eids = list(set(alleids) - set(eids)) 

    return eids


In [267]:
def queryHospitalData(pathToMainDataset: str, searchCodeDict: dict, includeCodes: bool = True) -> list:
    """ Search main dataframe for hospital reported conditions.

    This function uses the following columns of the main dataset:
    **41270**	Diagnoses - ICD10
    **41280**	Date of first in-patient diagnosis - ICD10
    **41271**	Diagnoses - ICD9
    **41281**	Date of first in-patient diagnosis - ICD9
    **eid**
    Returns a list of 'eid' values that can subsequently be used to retrieve the genetic data of our cohort.

    Keyword arguments:
    ------------------
    pathToMainDataset: str
        path to main dataset (csv)
    searchCodeDict: dict
        dictionary that was created using createCodingDict function
    includeCodes: bool [True]
        set to False to get the inverse set

    Returns:
    --------
    hospital_eids: list(str)
    """

    # read dataset

    main_dataset = preprocessing.get_columns(["41270", "41280","41271", "41281", "eid"], pathToMainDataset)


    # create lists for diagnoses and dates for icd9 and icd10. then extract relevant columns from main dataframe

    icd9_diag_cols = ['eid']
    icd9_date_cols = ['eid']
    icd10_diag_cols = ['eid']
    icd10_date_cols = ['eid']

    icd9_columns = ['eid']
    for i in main_dataset.columns:
        cstart = str(i).split('-')[0]
        if (cstart == '41271'):
            icd9_columns.append(i)
            icd9_diag_cols.append(i)
        elif (cstart == '41281'):
            icd9_columns.append(i)
            icd9_date_cols.append(i)

    hospital_records_icd9 = main_dataset[icd9_columns]

    icd10_columns = ['eid']
    for i in main_dataset.columns:
        cstart = str(i).split('-')[0]
        if (cstart == '41270'):
            icd10_columns.append(i)
            icd10_diag_cols.append(i)
        elif (cstart == '41280'):
            icd10_columns.append(i)
            icd10_date_cols.append(i)

    hospital_records_icd10 = main_dataset[icd10_columns]

    (icd9_diag_rename, icd9_numbers) = _construct_renaming_dict(icd9_diag_cols, '.')
    (icd9_date_rename, _) = _construct_renaming_dict(icd9_date_cols, '.')
    (icd10_diag_rename, icd10_numbers) = _construct_renaming_dict(icd10_diag_cols, '.')
    (icd10_date_rename, _) = _construct_renaming_dict(icd10_date_cols, '.')


    # steps for icd9 and icd10:
    # 1. split dataset into diagnoses and dates
    # 2. rename columns to everythong after '.' (works for 412xx columns only, might have to be after '-' for other data columns)
    # 3. melt each subset to make data tidy
    # 4. set hierarchical index (eid, visit)
    # 5. join on these indices
    # 6. convert diagnosis code to string and date to datetime object

    icd9_diag = hospital_records_icd9[icd9_diag_cols].rename(columns=icd9_diag_rename).melt(id_vars='eid', value_vars=icd9_numbers, value_name='diagnosis', var_name='visit').set_index(['eid', 'visit'])
    icd9_date = hospital_records_icd9[icd9_date_cols].rename(columns=icd9_date_rename).melt(id_vars='eid', value_vars=icd9_numbers, value_name='diagnosisDate', var_name='visit').set_index(['eid', 'visit'])
    icd9 = icd9_diag.join(icd9_date)
    icd9 = icd9.dropna(subset=['diagnosis', 'diagnosisDate'])
    icd9.diagnosis = icd9.diagnosis.astype('str')
    icd9.diagnosisDate = pd.to_datetime(icd9.diagnosisDate)

    icd10_diag = hospital_records_icd10[icd10_diag_cols].rename(columns=icd10_diag_rename).melt(id_vars='eid', value_vars=icd10_numbers, value_name='diagnosis', var_name='visit').set_index(['eid', 'visit'])
    icd10_date = hospital_records_icd10[icd10_date_cols].rename(columns=icd10_date_rename).melt(id_vars='eid', value_vars=icd10_numbers, value_name='diagnosisDate', var_name='visit').set_index(['eid', 'visit'])
    icd10 = icd10_diag.join(icd10_date)
    icd10 = icd10.dropna(subset=['diagnosis', 'diagnosisDate'])
    icd10.diagnosis = icd10.diagnosis.astype('str')
    icd10.diagnosisDate = pd.to_datetime(icd10.diagnosisDate)

    icd9Query = _createPandasQueryString(searchCodeDict, 'icd9', columnName = 'diagnosis')
    icd10Query = _createPandasQueryString(searchCodeDict, 'icd10', columnName = 'diagnosis')
    
    icd9_df = icd9.query(icd9Query)
    icd10_df = icd10.query(icd10Query)

    hospital_eids = list(set(icd9_df.append(icd10_df).reset_index()[['eid']]['eid']))
    
    if not includeCodes: 
        hospital_eids = list(set(main_dataset.query('eid not in {}'.format(hospital_eids))['eid']))

    return hospital_eids

In [276]:
pathToLookup = '/Users/kiko/IBM/GEN/modellingScripts/isabell/cohortPipeline/lookupCodeDescriptions.csv'
pathToCoding = '/Users/kiko/IBM/GEN/modellingScripts/isabell/cohortPipeline/coding19.tsv'
pathToMainDataset = '/Users/kiko/IBM/GEN/toy-data/ukb41268_datafields_nrows10000.csv'
searchTerms = ['glaucoma']

pathToCredentials = '.'
pathToDriver = "going_headless/chromedriver"

In [310]:
searchCodeDict, searchCodeArray = uk.query.createCodingDict(pathToLookup=pathToLookup, searchTerms=searchTerms)

In [270]:
eids = queryGpClinicalData(searchDict,pathToCredentials, pathToDriver, 'chrome', includeCodes=True)

In [281]:
noneids = queryGpClinicalData(searchDict,pathToCredentials, pathToDriver, 'chrome', includeCodes=False)

In [282]:
print(len(eids))
print(len(noneids))

9770
220326


In [283]:
len(list(set(eids) & set(noneids)))

0

In [284]:
len(eids) + len(noneids)

230096

___

In [329]:
def _searchCoding19(pathToCoding: str, icd10_codes):
    """ Translate icd10 codes to node_ids for selfreported data.
    
    Keyword arguments:
    ------------------
    pathToCoding: str
        dictionary that was created using createCodingDict function
    icd10_codes: list
        list of icd10 codes that need to be translated

    Returns:
    --------
    node_ids: list
        list of node_ids 

    """
    codingDf = pd.read_csv(pathToCoding, sep='\t')
    codingDf['icd10'] = codingDf['meaning'].apply(lambda x: x.split(' ')[0])
    code19Dict = dict(zip(codingDf.icd10, codingDf.node_id))
    node_ids = []
    for i in icd10_codes:
        try:
            node_ids.append(code19Dict[i])
        except:
            pass
    return node_ids


In [392]:
main_dataset = preprocessing.get_columns(["20002", "20008","20009", "eid"], pathToMainDataset)

# create lists for condition, year, and age. then extract relevant columns from main dataframe
diag_cols = main_dataset.filter(regex="20002-*").columns.tolist()
diag_cols.append('eid')

year_cols = main_dataset.filter(regex="20008-*").columns.tolist()
year_cols.append('eid')

age_cols = main_dataset.filter(regex="20009-*").columns.tolist()
age_cols.append('eid')

(diag_rename, diag_numbers) = _construct_renaming_dict(diag_cols, '-')
(year_rename, _) = _construct_renaming_dict(year_cols, '-')
(age_rename, _) = _construct_renaming_dict(age_cols, '-')


diag = main_dataset[diag_cols].rename(columns=diag_rename).melt(id_vars='eid', value_vars=diag_numbers, value_name='diagnosis', var_name='visits').set_index(['eid', 'visits'])


# steps for icd9 and icd10:
# 1. split dataset into diagnoses and dates
# 2. rename columns to everythong after '.' (works for 412xx columns only, might have to be after '-' for other data columns)
# 3. melt each subset to make data tidy
# 4. set hierarchical index (eid, visit)
# 5. join on these indices
# 6. convert diagnosis code to string and date to datetime object

diag = main_dataset[diag_cols].rename(columns=diag_rename).melt(id_vars='eid', value_vars=diag_numbers, value_name='diagnosis', var_name='visits').set_index(['eid', 'visits'])
year = main_dataset[year_cols].rename(columns=year_rename).melt(id_vars='eid', value_vars=diag_numbers, value_name='diagnosisYear', var_name='visits').set_index(['eid', 'visits'])
age = main_dataset[age_cols].rename(columns=age_rename).melt(id_vars='eid', value_vars=diag_numbers, value_name='diagnosisAge', var_name='visits').set_index(['eid', 'visits'])

diag_year = diag.join(year)
diag_year_age = diag_year.join(age)
filtered = diag_year_age.dropna(subset=['diagnosis', 'diagnosisYear', 'diagnosisAge'])

#     translate from node_ids to icd10 codes and overwrite diagnosis column with new value if found, with "not found" otherwise 
codingDf = pd.read_csv(pathToCodingFile, sep='\t', dtype=str)
codingDf['icd10'] = codingDf['meaning'].apply(lambda x: x.split(' ')[0])
code19Dict = dict(zip(codingDf.node_id, codingDf.icd10))


filtered['diagnosis'] = filtered['diagnosis'].apply(lambda node_id: code19Dict.get(node_id, "code not found"))

#     if not searchCodeDict['node_ids']:
#         print("No matches found in self-reported data. Returning empty list")
#         selfreported_eids = []
#     else:
print("{} node_ids translated; {} node_ids could not be translated into icd10 codes".format(len(filtered.query('not diagnosis == "code not found"')), len(filtered.query('diagnosis == "code not found"'))))


21841 node_ids translated; 610 node_ids could not be translated into icd10 codes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


___

In [514]:
datafields=['31','20002']

In [521]:
def _getQuery(datafields):
    '''Generate a regex string for a pd.DataFrame.filter function.
    
    :param datafields: any datafield collected from the UKBB Data Showcase website e.g 41270
    :return: regex string
    :rtype: str
    '''
    
    start = r"^eid"
    end = r''.join(r"|^{}$|^{}-".format(i,i) for i in datafields)
    final = start + end
    return final

In [522]:
maindf = pd.read_csv(pathToMainDataset, nrows=1, dtype=str)
datafields_query = _getQuery(datafields)
col_list = maindf.filter(regex=datafields_query).columns.tolist()

In [523]:
datafields_query

'^eid|^31$|^31-|^20002$|^20002-'

In [524]:
maindf.filter(regex=datafields_query)

Unnamed: 0,eid,31-0.0,20002-0.0,20002-0.1,20002-0.2,20002-0.3,20002-0.4,20002-0.5,20002-0.6,20002-0.7,...,20002-3.24,20002-3.25,20002-3.26,20002-3.27,20002-3.28,20002-3.29,20002-3.30,20002-3.31,20002-3.32,20002-3.33
0,1000015,1,1065,,,,,,,,...,,,,,,,,,,


In [503]:
maindf.columns

Index(['eid', '31-0.0', '87-0.31', '87-0.33', '87-1.31', '87-1.33', '87-2.31',
       '87-2.33', '87-3.31', '87-3.33',
       ...
       '131994-0.0', '131995-0.0', '131996-0.0', '131997-0.0', '131998-0.0',
       '131999-0.0', '132031-0.0', '132033-0.0', '132131-0.0', '132133-0.0'],
      dtype='object', length=2904)

## stats

In [None]:
import ukbb

In [525]:
pathToLookup = '/Users/kiko/IBM/GEN/modellingScripts/isabell/cohortPipeline/lookupCodeDescriptions.csv'
pathToCoding = '/Users/kiko/IBM/GEN/modellingScripts/isabell/cohortPipeline/coding19.tsv'
pathToMainDataset = '/Users/kiko/IBM/GEN/toy-data/ukb41268_datafields_nrows10000.csv'
searchTerm = 'glaucoma'

pathToCredentials = '.'
pathToDriver = "prototype_notebooks/going_headless/chromedriver"

In [526]:
maindf = pd.read_csv(pathToMainDataset)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
filtered_df = uk.utils

___

# round 2 

In [3]:
import pandas as pd