In [2]:
from tqdm import tqdm
import multiprocessing
from multiprocessing import Pool
import numpy as np
import pandas as pd

In [None]:
'''Phenotype extraction code based on UKB_coding_mapping_table and self_report_ICD10_mapping_treeRespect.tsv code files from https://github.com/rivas-lab/public-resources/tree/master/uk_biobank/digital_phenotyping'''

In [5]:
## Non-cancer coding table (tab-separated)
non_cancer = pd.read_csv(
    "/work/biobank/ukb_data/UKB_coding_mapping_table.tsv",
    sep="\t",
)

In [6]:
## Cancer coding table (tab-separated)
cancer = pd.read_csv(
    "/work/biobank/ukb_data/self_report_ICD10_mapping_treeRespect.tsv",
    sep="\t",
)

In [8]:
# Column labels follow the "<field>-<x>.<y>" pattern of the ukb49767.csv header
# (presumably UK Biobank field / instance / array naming -- confirm against the data dictionary).
cancer_columns = ["40006-{}.0".format(y) for y in range(13)]
cancer_columns += ["20001-{}.{}".format(x, y) for x in range(4) for y in range(6)]

non_cancer_columns = ["20002-{}.{}".format(x, y) for x in range(4) for y in range(33)]
non_cancer_columns += ["41202-0.{}".format(y) for y in range(66)]

# Only the id column plus the selected phenotype columns are loaded from the CSV.
phenos_cancer = pd.read_csv(
    "/work/biobank/ukb_data/new_data/ukb49767.csv",
    usecols=["eid"] + cancer_columns,
    low_memory=False,
)
phenos_non_cancer = pd.read_csv(
    "/work/biobank/ukb_data/new_data/ukb49767.csv",
    usecols=["eid"] + non_cancer_columns,
    low_memory=False,
)

In [9]:
# Index both phenotype tables by their "eid" column.
# The guards keep this cell re-runnable: once "eid" has become the index it is no
# longer a column, so calling set_index("eid") a second time would raise KeyError.
if phenos_cancer.index.name != "eid":
    phenos_cancer = phenos_cancer.set_index("eid")
if phenos_non_cancer.index.name != "eid":
    phenos_non_cancer = phenos_non_cancer.set_index("eid")

In [18]:
'''Get core functions'''
def initializer(ii):
    """Pool initializer: publish the codes to search for in this process.

    Stores ``ii`` in the module-level global ``illness_ids``, which
    ``fetch_individuals`` reads when it scans a chunk.
    """
    global illness_ids
    illness_ids = ii

def _candidate_codes(codes):
    """Expand raw codes into every value that counts as a match.

    Each usable code is kept as given and, when it can be parsed as a number,
    is also added in float form (so the string "1065.0" matches a stored
    1065.0). Missing codes (None / NaN) are dropped so they can never match
    missing cells.
    """
    candidates = []
    for code in codes:
        # NaN is the only float that is not equal to itself.
        if code is None or (isinstance(code, float) and code != code):
            continue
        candidates.append(code)
        try:
            numeric = float(code)
        except (TypeError, ValueError, OverflowError):
            # Non-numeric codes (e.g. "C50") only have their literal form.
            continue
        if numeric == numeric:  # skip a NaN produced by parsing e.g. "nan"
            candidates.append(numeric)
    return candidates


def fetch_individuals(phenos_cancer_splitted):
    """Flag each row of a phenotype chunk that contains one of the active codes.

    The codes are read from the module-level global ``illness_ids`` (set by
    ``initializer``). A row is a case (1) when any of its cells equals a code
    directly or equals the code converted to float; otherwise it is a
    control (0).

    Parameters
    ----------
    phenos_cancer_splitted : pandas.DataFrame
        Chunk of the phenotype table; one result is produced per row.

    Returns
    -------
    list of int
        0/1 flags in the row order of ``phenos_cancer_splitted``. An empty
        chunk gives ``[]``; an empty code list gives all zeros.

    Notes
    -----
    The global is deliberately left in place after the call: a pool worker can
    receive more than one chunk, and every call needs the codes.
    """
    global illness_ids
    assert illness_ids is not None
    # Loop-invariant: build the match candidates once per chunk, not per row.
    candidates = _candidate_codes(illness_ids)
    result = []
    # Iterating the underlying array walks rows by position, so repeated index
    # labels cannot turn a single-row lookup into a multi-row one.
    for row in phenos_cancer_splitted.values:
        cells = row.tolist()
        result.append(int(any(candidate in cells for candidate in candidates)))
    return result

#return the process result
def process_frame_pheno(phenos_splitted):
    """Worker entry point mapped over the chunks by ``Pool.map``.

    Delegates to ``fetch_individuals`` and returns its result unchanged.
    """
    chunk_flags = fetch_individuals(phenos_splitted)
    return chunk_flags



In [None]:
'''Implementation of a multiprocess phenotype retriever for cancer and non-cancer diseases (handled separately). The phenotypes are mapped to 0 (control) or 1 (case).'''

In [27]:
def retrieval_non_cancer(non_cancer_info,phenos_non_cancer):
    """Build one 0/1 case vector per non-cancer phenotype in ``non_cancer_info``.

    For every row of ``non_cancer_info`` the codes from ``UKB_coding6`` and
    ``UKB_coding19`` are collected, handed to a worker pool through
    ``initializer`` and searched for in each chunk of ``phenos_non_cancer``
    by ``process_frame_pheno``.

    Parameters
    ----------
    non_cancer_info : pandas.DataFrame
        Coding table with the columns ``UKB_coding6`` (a single value or a
        comma-separated string), ``UKB_coding19`` (comma-separated string) and
        ``GBE_short_name``. Missing code cells are ignored.
    phenos_non_cancer : pandas.DataFrame
        Phenotype table, one row per individual.

    Returns
    -------
    tuple of (list, list)
        ``total_results``: one list of 0/1 flags per phenotype, in the row
        order of ``phenos_non_cancer``; ``names``: the matching
        ``GBE_short_name`` values. A phenotype without any usable code gets
        an all-zero vector.
    """
    # cpu_count() - 1 is 0 on a single-core machine, which Pool rejects.
    num_cores = max(1, multiprocessing.cpu_count() - 1)
    n_rows = len(phenos_non_cancer)
    # Contiguous positional slices: concatenating the per-chunk results in
    # order reproduces the row order of the full table.
    bounds = [n_rows * k // num_cores for k in range(num_cores + 1)]
    df_split = [phenos_non_cancer.iloc[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]
    total_results = []
    names = []
    rows = zip(
        non_cancer_info["UKB_coding6"],
        non_cancer_info["UKB_coding19"],
        non_cancer_info["GBE_short_name"],
    )
    for vals_int, vals, name in rows:
        print(name)
        ii = []
        if isinstance(vals_int, str):
            ii += [part.strip() for part in vals_int.split(',') if part.strip()]
        elif pd.notna(vals_int):
            ii.append(vals_int)
        # A missing UKB_coding19 cell is NaN (not a string) and adds nothing.
        if isinstance(vals, str):
            ii += [part.strip() for part in vals.split(',') if part.strip()]
        print(ii)
        if ii:
            # The context manager releases the workers even if map() raises.
            with Pool(processes=num_cores, initializer=initializer, initargs=[ii]) as pool:
                print("start")
                result = pool.map(process_frame_pheno, df_split)  # scatter chunks, gather flags
            final = [flag for chunk_flags in result for flag in chunk_flags]
        else:
            # Nothing to search for: every individual is a control.
            final = [0] * n_rows
        count = final.count(1)
        total_results.append(final)
        names.append(name)
        print("count:" + str(count))
    return total_results, names

In [28]:
def retrieval_cancer(cancer_info,phenos_cancer):
    """Build one 0/1 case vector per cancer phenotype listed in ``cancer_info``.

    For every row of ``cancer_info`` the codes from ``self-reported coding``
    (each suffixed with ".0") and ``ICD-10 Codes`` are collected, handed to a
    worker pool through ``initializer`` and searched for in each chunk of
    ``phenos_cancer`` by ``process_frame_pheno``.

    Parameters
    ----------
    cancer_info : pandas.DataFrame
        Coding table with the columns ``self-reported coding`` and
        ``ICD-10 Codes`` (comma-separated strings) and ``meaning``. Missing
        code cells are ignored.
    phenos_cancer : pandas.DataFrame
        Phenotype table, one row per individual.

    Returns
    -------
    tuple of (list, list)
        ``total_results``: one list of 0/1 flags per phenotype, in the row
        order of ``phenos_cancer``; ``names``: the matching ``meaning``
        values. A phenotype without any usable code gets an all-zero vector.
    """
    # cpu_count() - 1 is 0 on a single-core machine, which Pool rejects.
    num_cores = max(1, multiprocessing.cpu_count() - 1)
    n_rows = len(phenos_cancer)
    # Contiguous positional slices: concatenating the per-chunk results in
    # order reproduces the row order of the full table.
    bounds = [n_rows * k // num_cores for k in range(num_cores + 1)]
    df_split = [phenos_cancer.iloc[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]
    total_results = []
    names = []
    # Read from the table passed in, not from a notebook-level global.
    rows = zip(
        cancer_info["self-reported coding"],
        cancer_info["ICD-10 Codes"],
        cancer_info["meaning"],
    )
    for vals_int, vals, name in rows:
        print(name)
        ii = []
        if isinstance(vals_int, str):
            # The ".0" suffix gives the code the textual form of a float value.
            ii += [num.strip() + ".0" for num in vals_int.split(',') if num.strip()]
        elif pd.notna(vals_int):
            # Already numeric (column parsed as numbers): usable as-is.
            ii.append(vals_int)
        if isinstance(vals, str):
            ii += [part.strip() for part in vals.split(',') if part.strip()]
        print(ii)
        if ii:
            # The context manager releases the workers even if map() raises.
            with Pool(processes=num_cores, initializer=initializer, initargs=[ii]) as pool:
                print("start")
                result = pool.map(process_frame_pheno, df_split)  # scatter chunks, gather flags
            final = [flag for chunk_flags in result for flag in chunk_flags]
        else:
            # Nothing to search for: every individual is a control.
            final = [0] * n_rows
        count = final.count(1)
        total_results.append(final)
        names.append(name)
        print("count:" + str(count))
    return total_results, names

In [None]:
# Run both retrievals; each returns (per-phenotype 0/1 vectors, phenotype names).
total_results_cancer, names_cancer = retrieval_cancer(
    cancer_info=cancer,
    phenos_cancer=phenos_cancer,
)
total_results_non_cancer, names_non_cancer = retrieval_non_cancer(
    non_cancer_info=non_cancer,
    phenos_non_cancer=phenos_non_cancer,
)

In [None]:
# Write one 0/1 column per phenotype, one row per individual (indexed by eid).
# Each export uses the results and names of its own retrieval call; the bare
# names `total_results` / `names` are never defined at notebook level.
pd.DataFrame(
    np.array(total_results_cancer).T,
    columns=names_cancer,
    index=phenos_cancer.index.values,
).to_csv("./cancer_pheno.csv")
pd.DataFrame(
    np.array(total_results_non_cancer).T,
    columns=names_non_cancer,
    index=phenos_non_cancer.index.values,
).to_csv("./non_cancer_pheno.csv")