# **Disease prediction based on genetic and lifestyle information**

## Step 1: Split phenotype information (.csv file) into .npy files

In [None]:
import numpy as np
import os
import csv
import sys
csv.field_size_limit(sys.maxsize)

NUM = 502628 # number of individuals

def save_data(dir_save, names, data):
    """Write one or several data columns to ``.npy`` files in ``dir_save``.

    Args:
        dir_save: Output directory; it is created (including parents) when
            it does not exist yet.
        names: Either a list of item names or a single item name. Each name
            is converted with ``str`` and used as the file name.
        data: When ``names`` is a list, a sequence in which ``data[i]`` is
            the column belonging to ``names[i]``; otherwise the single
            column itself.

    Each column is stored with ``np.save``, which appends the ``.npy``
    suffix to the file name.
    """
    # exist_ok=True replaces the exists()/makedirs() pair, which could fail
    # if the directory appeared between the check and the creation.
    os.makedirs(dir_save, exist_ok=True)
    if isinstance(names, list):
        for i, name in enumerate(names):
            np.save(os.path.join(dir_save, str(name)), data[i])
    else:
        np.save(os.path.join(dir_save, str(names)), data)
    
def get_data(dir_file, names):
    """Read whole columns out of a comma-delimited file.

    Args:
        dir_file: Path of the CSV file to read.
        names: A column position, or a list of column positions. The values
            are used directly to index every parsed row.

    Returns:
        For a single position: a list with that column's value from every
        row. For a list of positions: a list of such lists, in the order of
        ``names``. Every row of the file is included, the first one too.

    A progress counter is printed every 10000 rows.
    """
    many = isinstance(names, list)
    data = [[] for _ in names] if many else []

    # newline='' hands line-ending handling to the csv module, so line breaks
    # inside quoted fields are kept exactly as stored in the file.
    with open(dir_file, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for i, row in enumerate(reader):
            if i % 10000 == 0:
                print(i)
            if many:
                for column, ind in zip(data, names):
                    column.append(row[ind])
            else:
                data.append(row[names])
    return data

def generate_data(dir_file, dir_save, names):
    """Extract and store the items of ``names`` that have no ``.npy`` file yet.

    Args:
        dir_file: Source file handed on to ``get_data``.
        dir_save: Directory checked for existing ``<name>.npy`` files and
            handed on to ``save_data``.
        names: Iterable of item names to make available.

    Returns nothing. When every item already has a file, neither
    ``get_data`` nor ``save_data`` is called.
    """
    def _already_saved(item):
        # Same path layout that save_data/np.save produce.
        return os.path.isfile(os.path.join(dir_save, str(item)) + '.npy')

    pending = [item for item in names if not _already_saved(item)]
    if len(pending) == 0:
        return

    fresh = get_data(dir_file, pending)
    save_data(dir_save, pending, fresh)

# Items (column positions) to extract; left empty here.
names = []
# Directory that receives the per-column .npy files.
# NOTE(review): the cells further down load from
# '/oak/stanford/groups/arend/Eric/UKBB/phenotype_13721/' (without the extra
# 'phenotype/' level) - confirm which location is intended.
dir_save = '/oak/stanford/groups/arend/Eric/UKBB/phenotype/phenotype_13721/'
# Source CSV. The trailing '/' was removed: with it, open() treats the path
# as a directory and cannot read the file.
dir_file = '/scratch/PI/eriking/ukb/app1372/processed/ukb9430.csv'


## Step 2: Identify white British participants

In [None]:
# Load the participant-ID column and the three ethnic-background columns.
# Element 0 of every array is skipped below; the remaining elements are the
# per-individual values.
all_person_id = np.load('/oak/stanford/groups/arend/Eric/UKBB/phenotype_13721/0.npy')
col_6669 = np.load('/oak/stanford/groups/arend/Eric/UKBB/phenotype_13721/6669.npy')
col_6670 = np.load('/oak/stanford/groups/arend/Eric/UKBB/phenotype_13721/6670.npy')
col_6671 = np.load('/oak/stanford/groups/arend/Eric/UKBB/phenotype_13721/6671.npy')

individual = all_person_id[1:]
# ethnics is a slice of col_6669, so filling it in also updates col_6669.
ethnics = col_6669[1:]

# Merge the three columns: keep the first non-empty value in column order
# (6669, then 6670, then 6671). The length is taken from the data instead of
# repeating the hard-coded individual count.
for i in range(len(ethnics)):
    for fallback in (col_6670, col_6671):
        if ethnics[i] == '' and fallback[i + 1] != '':
            ethnics[i] = fallback[i + 1]

# NOTE(review): the section title says "white British", yet four codes are
# accepted here - confirm the list against the UK Biobank ethnic-background
# coding before relying on the output.
selected_codes = ('1001', '2001', '3001', '4001')

# Each selected ID is written twice per line, tab-separated. The with-block
# closes the file even if writing fails part-way.
with open('/oak/stanford/groups/jamesz/eric/extract_ind', 'w') as outfile:
    for person, code in zip(individual, ethnics):
        if code in selected_codes:
            outfile.write(person + '\t' + person + '\n')

## Step 3: Extract individuals using PLINK

**plink --bfile allsamples --remove white_withdraw.ind --make-bed --out merge_white_Britich_clean**

## Step 4: Generate information of self-reported cancers/diseases

All the self-reported cancers/diseases should be first grouped into the second level of the disease tree structure according to [Data-Coding 3](https://biobank.ctsu.ox.ac.uk/crystal/coding.cgi?id=3) (cancers) and [Data-Coding 6](https://biobank.ctsu.ox.ac.uk/crystal/coding.cgi?id=6) (diseases) 

Next, we generated .data and .list files for self-reported cancers and diseases, respectively. This left 21 diseases and 1 cancer whose sample sizes are larger than 6,000.

**1. .list files include two columns: disease id and the number of patients**

**2. .data files include the affection status; each row represents one disease/cancer (in the same order as in .list), where 1: affected; 0: unaffected**



In [None]:
# defaultdict is only imported in a later cell of this file; import it here
# as well so this cell runs on its own.
from collections import defaultdict

# cancer_map and disease_map (code -> disease-group id) are defined outside
# this cell.

diseaseid_exit=[]  # not used in this cell; kept for other cells


def _collect_cases(first_field, last_field, code_map):
    """Group individuals by mapped disease id.

    Loads the column files ``first_field`` .. ``last_field - 1`` and, for
    every entry that is neither '' nor '99999', appends the individual's
    0-based index (array position minus one, position 0 being skipped) to
    the list stored under ``code_map[code]``. Codes that map to '' are
    printed so they can be inspected, and are still recorded under ''.
    """
    cases = defaultdict(list)
    for field in range(first_field, last_field):
        col = np.load('/oak/stanford/groups/arend/Eric/UKBB/phenotype_13721/'+str(field)+'.npy')
        for j in range(1, 502629):
            code = col[j]
            if code != '' and code != '99999':
                group = code_map[code]
                if group == '':
                    print(code)
                cases[group].append(j - 1)
    return cases


def _write_status(cases, data_path, list_path, n_individuals=502628):
    """Write the 0/1 status matrix (.data) and the per-disease counts (.list).

    One row per key of ``cases``: ``n_individuals`` tab-terminated flags in
    the .data file, and ``key<TAB>count`` in the .list file, both in the
    same key order.
    """
    with open(data_path, 'w') as out_data, open(list_path, 'w') as out_list:
        for key, members in cases.items():
            # A set gives constant-time membership; testing against the list
            # rescanned it for every one of the individuals.
            affected = set(members)
            flags = ['1' if i in affected else '0' for i in range(n_individuals)]
            out_data.write(''.join(flag + '\t' for flag in flags) + '\n')
            out_list.write(key + '\t' + str(flags.count('1')) + '\n')


cancer = _collect_cases(4136, 4154, cancer_map)
disease = _collect_cases(4154, 4241, disease_map)
print('finished')

_write_status(
    cancer,
    '/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/self_cancer.data',
    '/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/self_cancer.list',
)
_write_status(
    disease,
    '/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/self_disease.data',
    '/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/self_disease.list',
)

## Step 5: Generate information of the cancers/diseases from hospitalization records

We generate disease_status and disease_info based on ICD 10 codes in hospitalization records. 

### disease_info has three columns: 

**1. ICD 10 code;**

**2. number of patients before baseline;**

**3. number of patients after baseline**

### disease_status is a matrix, each ICD 10 code in disease_info has two rows. 

**1. The odd rows are the diagnosis outcomes for ICD 10 codes (in the same order as disease_info), where -1: before baseline; 1: after baseline; 0: not affected;**

**2. The even rows represent the corresponding diagnosis time.**

In [None]:
import time
# defaultdict is only imported in a later cell of this file; import it here
# as well so this cell runs on its own.
from collections import defaultdict

# disease_status, individual and ind_time are defined outside this cell.
# Each disease_status object is used through its `ind` mapping
# (participant id -> earliest recorded date string).
disease=defaultdict(disease_status)

date_format = "%Y-%m-%d"

# Pass 1: for every (3-character code prefix, participant) pair keep the
# earliest date found in the hospital records.
with open('/scratch/PI/eriking/ukb/app1372/hes/app1372_dbtable_hesin_2017aug22.tsv','r') as hesin_file:
    next(hesin_file, None)  # skip the header line
    for line in hesin_file:
        A=line.strip('\n').split('\t')
        B=A[3]
        if B=='':
            continue
        # Take the first non-empty date among columns 2, 5, 6 and 7.
        originaldate = next((A[k] for k in (2, 5, 6, 7) if A[k] != ''), '')
        if originaldate == '':
            # No date on this record. Previously the date of an earlier row
            # was silently reused here (or the cell crashed on the first
            # record); such records are now skipped.
            continue
        newdate = time.strptime(originaldate, date_format)
        first_seen = disease[B[0:3]].ind
        if A[0] in first_seen:
            if newdate < time.strptime(first_seen[A[0]], date_format):
                first_seen[A[0]] = originaldate
        else:
            first_seen[A[0]] = originaldate
print('finish 1')

# Pass 2: one info line and two status rows per code.
with open('/oak/stanford/groups/arend/Eric/UKBB/phenotype_merge/disease_status','w') as outfile_data, \
        open('/oak/stanford/groups/arend/Eric/UKBB/phenotype_merge/disease_info','w') as outfile_info:
    for key,value in disease.items():
        print(key)
        new_patient=0
        old_patient=0
        X=[]  # status per individual: '1', '-1' or '0'
        Y=[]  # matching date, or the placeholder '0-0-0'
        # NOTE(review): indices 1..502628 are used here. If `individual` is
        # the header-stripped array built in Step 2, this skips the first
        # person and runs one past the end - confirm which array is meant.
        for i in range(1,502629):
            person = individual[i]
            if person in value.ind:
                diagnosed = value.ind[person]
                Y.append(diagnosed)
                # Strictly later than the participant's ind_time date counts
                # as '1'; the same day or earlier counts as '-1'.
                if time.strptime(ind_time[person], date_format) < time.strptime(diagnosed, date_format):
                    X.append('1')
                    new_patient+=1
                else:
                    old_patient+=1
                    X.append('-1')
            else:
                X.append('0')
                Y.append('0-0-0')
        outfile_info.write(key+'\t'+str(new_patient)+'\t'+str(old_patient)+'\n')
        # Every value is followed by a tab, then the row is terminated.
        outfile_data.write(''.join(x+'\t' for x in X)+'\n')
        outfile_data.write(''.join(y+'\t' for y in Y)+'\n')

## Step 6: Integrate self-reported diseases and ICD 10 codes 


### 1. Define the matching table between ICD 10 codes and self-report diseases (see Table S2) 

### 2. Merge the self-reported diseases (self_cancer.data and self_disease.data) with the diseases from hospitalization records (disease_info)

For each disease:

 \*Participants are classified as prevalent cases if they are annotated as '1' in self_cancer.data (self_disease.data) or '-1' in disease_status.

 \*Participants are classified as incident cases if they are annotated as '0' in self_cancer.data (self_disease.data) and '1' in disease_status.


In [None]:
from collections import defaultdict

# Collect the IDs (first .fam column) of everyone whose sixth column is '2'
# (the case value in PLINK .fam files) in any of the per-phenotype .fam
# files. Only the keys of `patient` are used; the stored '0' is a dummy.
patient=defaultdict(str)
phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
for disease in phenotype:
    print(disease)
    for index in ['1','2','3']:
        # with-block: the original opened these files without closing them.
        with open(disease+'_'+index+'.fam') as infile:
            for line in infile:
                A=line.split()
                if A[5]=='2':
                    patient[A[0]]='0'

all_control=0
male_control=0
female_control=0

# Everyone in the merged .fam file who never appeared as a case is written
# to the control list, and additionally to the male list (fifth column '1')
# or the female list (fifth column '2').
with open('merge_white_Britich_clean.fam','r') as infile1, \
        open('health_control','w') as out1, \
        open('health_control_male','w') as out2, \
        open('health_control_female','w') as out3:
    for line in infile1:
        A=line.split()
        if A[0] not in patient:
            out1.write(A[0]+'\n')
            all_control+=1
            if A[4]=='1':
                out2.write(A[0]+'\n')
                male_control+=1
            if A[4]=='2':
                out3.write(A[0]+'\n')
                female_control+=1

print(all_control)
print(male_control)
print(female_control)

## Step 7: Genome wide association study for  