# **Disease prediction based on genetic and lifestyle information**

## Step 1: Split phenotype information (.csv file) into .npy files

In [None]:
import numpy as np
import os
import csv
import sys
# Raise the csv module's per-field size cap as far as the platform allows.
# csv.field_size_limit() raises OverflowError when the requested value does
# not fit the underlying C long (sys.maxsize can exceed it on some
# platforms), so back off by powers of ten until a value is accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 10

NUM = 502628 # number of individuals

def save_data(dir_save, names, data):
    """Save one or several data columns as ``.npy`` files under ``dir_save``.

    Parameters
    ----------
    dir_save : str
        Target directory; created (including parents) when missing.
    names : list or any
        If a list, ``data[i]`` is saved as ``<str(names[i])>.npy`` for each
        entry. Otherwise ``data`` itself is saved as ``<str(names)>.npy``.
    data : sequence
        The values to save (a sequence of sequences when ``names`` is a list).
    """
    # exist_ok avoids the check-then-create race of the original code.
    os.makedirs(dir_save, exist_ok=True)
    if isinstance(names, list):
        for i, name in enumerate(names):
            np.save(os.path.join(dir_save, str(name)), data[i])
    else:
        np.save(os.path.join(dir_save, str(names)), data)
    
def get_data(dir_file, names):
    """Read selected columns of a comma-separated file for every row.

    Parameters
    ----------
    dir_file : str
        Path of the CSV file.
    names : int or list of int
        Column position(s) used to index each parsed row directly.

    Returns
    -------
    list
        When ``names`` is a list: one list per requested column, each holding
        that column's value for every row (the first row of the file
        included). Otherwise a single flat list for the one column.

    Raises
    ------
    IndexError
        If a row has fewer fields than a requested position.
    """
    many = isinstance(names, list)
    data = [[] for _ in names] if many else []

    # newline='' is what the csv module requires so that line breaks embedded
    # in quoted fields are passed through unchanged.
    with open(dir_file, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for i, row in enumerate(reader):
            # progress indicator every 10000 rows
            if i % 10000 == 0:
                print(i)
            if many:
                for column, ind in zip(data, names):
                    column.append(row[ind])
            else:
                data.append(row[names])
    return data

def generate_data(dir_file, dir_save, names):
    """Extract and save the columns in ``names`` that have no ``.npy`` file yet.

    A column counts as already generated when ``<dir_save>/<name>.npy``
    exists. When nothing is pending, the source file is not read at all.
    """
    pending = [
        item
        for item in names
        if not os.path.isfile(os.path.join(dir_save, str(item)) + '.npy')
    ]
    if pending:
        save_data(dir_save, pending, get_data(dir_file, pending))

# Source CSV and output directory for the per-column .npy files.
# NOTE(review): dir_file ends with '/', which open() rejects for a regular
# file - confirm the intended path.
# NOTE(review): later cells load from '.../UKBB/phenotype_13721/', not from
# this dir_save - confirm which directory is the right one.
dir_file = '/scratch/PI/eriking/ukb/app1372/processed/ukb9430.csv/'
dir_save = '/oak/stanford/groups/arend/Eric/UKBB/phenotype/phenotype_13721/'
# Column positions to extract; left empty here, and generate_data() is not
# invoked in this cell.
names = []


## Step2: Identify white british participants

In [None]:
# Load the participant-ID column (0) and the three columns (6669-6671) that
# are merged into a single code per participant.
all_person_id=np.load('/oak/stanford/groups/arend/Eric/UKBB/phenotype_13721/0.npy')
col_6669=np.load('/oak/stanford/groups/arend/Eric/UKBB/phenotype_13721/6669.npy')
col_6670=np.load('/oak/stanford/groups/arend/Eric/UKBB/phenotype_13721/6670.npy')
col_6671=np.load('/oak/stanford/groups/arend/Eric/UKBB/phenotype_13721/6671.npy')

# Number of participants processed (entries after the first array element).
n_individuals = 502628

# Drop the first entry of each column (presumably the CSV header row).
# Slicing yields a view, so the fill-in below also modifies col_6669.
individual=all_person_id[1:]
ethnics=col_6669[1:]

# Fill empty codes from column 6670 first, then from column 6671.
for fallback in (col_6670, col_6671):
    for i in range(n_individuals):
        if ethnics[i]=='' and fallback[i+1]!='':
            ethnics[i]=fallback[i+1]

# NOTE(review): the heading says "white British", yet four codes are kept -
# confirm against the data coding of these columns.
kept_codes = ('1001', '2001', '3001', '4001')

# Write the selected IDs as two identical tab-separated columns; the context
# manager closes the file even if a write fails.
with open('/oak/stanford/groups/jamesz/eric/extract_ind','w') as outfile:
    for i in range(n_individuals):
        if ethnics[i] in kept_codes:
            outfile.write(individual[i]+'\t'+individual[i]+'\n')

## Step3: Extract individuals by using Plink

**plink --bfile allsamples --remove white_withdraw.ind --make-bed --out merge_white_Britich_clean**

where white_withdraw.ind contains the sample IDs in extract_ind together with the participants who decided to withdraw their data

## Step 4: Extract 65 lifestyles and environment features (L&E) from .csv file.

The L&E information was included in two files, Raw_pheno_final.matrix and Raw_pheno_final.info: 

1. Raw_pheno_final.matrix: each row represents one L&E feature.
2. Raw_pheno_final.info: has two columns (feature type (Categorical, Integer or Continuous) and feature name)








## Step 5: Impute missing values of L&E features

Raw_pheno_final_impute.matrix and Raw_pheno_final_impute.info

In [None]:
import numpy as np
# sklearn.preprocessing.Imputer is absent from current scikit-learn releases;
# prefer its replacement and fall back to the old class on old installs.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    SimpleImputer = None
    from sklearn.preprocessing import Imputer

# Placeholder written in place of a missing value before imputation.
MISSING_SENTINEL = 111111
# Features with more missing entries than this are dropped entirely.
MAX_MISSING = 50000


def _parse_feature(fields):
    """Convert one feature row to floats; return (values, number_missing).

    'NAN' entries and negative numbers both count as missing and are
    replaced by MISSING_SENTINEL.
    """
    values = []
    missing = 0
    for d in fields:
        if d == 'NAN' or float(d) < 0:
            missing += 1
            values.append(MISSING_SENTINEL)
        else:
            values.append(float(d))
    return values, missing


def _impute(values, strategy):
    """Fill MISSING_SENTINEL entries using `strategy`; returns an (n, 1) array."""
    if SimpleImputer is not None:
        imputer = SimpleImputer(missing_values=MISSING_SENTINEL, strategy=strategy)
    else:
        imputer = Imputer(missing_values=MISSING_SENTINEL, strategy=strategy,
                          axis=0, verbose=0, copy=True)
    return imputer.fit_transform(np.array(values, dtype=float).reshape(-1, 1))


#input missing value in phenotype data
# Each .info line is "<feature type>\t<feature name>"; line i describes
# row i of the matrix file.
with open('Raw_pheno_final.info','r') as infile2:
    datainfo = list(infile2)
datatype = [line.strip(' \n').split('\t')[0] for line in datainfo]

with open('Raw_pheno_final.matrix','r') as infile1, \
        open('Raw_pheno_final_impute.matrix','w') as outfile1, \
        open('Raw_pheno_final_impute.info','w') as outfile2:
    for index, line in enumerate(infile1):
        print(index)
        A=line.strip(' \n').split(' ')
        print(len(A))
        kind = datatype[index]
        if kind=='Integer' or kind=='Continuous':
            strategy = 'median'
        elif 'Categorical' in kind:
            strategy = 'most_frequent'
        else:
            # Any other feature type is not written to the output.
            continue
        newdata, missing = _parse_feature(A)
        if missing > MAX_MISSING:
            continue
        X = _impute(newdata, strategy)
        # One space-terminated value per individual, one line per feature.
        outfile1.write(''.join(str(x[0])+' ' for x in X))
        outfile1.write('\n')
        outfile2.write(datainfo[index])

## Step 6: Convert Categorical features into dummy variables 

In [None]:
from scipy import stats
import pandas as pd
import numpy as np

# Read "<feature type>\t<feature name>" pairs; line i describes row i of the
# imputed matrix.
description=[]
datatype=[]
with open('Raw_pheno_final_impute.info','r') as infile_info:
    for line in infile_info:
        A=line.strip(' \n').split('\t')
        description.append(A[1])
        datatype.append(A[0])

with open('Raw_pheno_final_impute.matrix','r') as infile_data, \
        open('Raw_pheno_final_impute_dummy.matrix','w') as outfile_result, \
        open('Raw_pheno_final_impute_dummy.info','w') as outfile_label:
    for index, line in enumerate(infile_data):
        print(index)
        A=line.strip(' \n').split(' ')
        print(len(A))
        n_levels = len(set(A))
        # A feature with one distinct value carries no information: drop it.
        if n_levels==1:
            continue
        if 'Categorical' in datatype[index] and n_levels>2:
            # One 0/1 indicator row per distinct level. astype(int) replaces
            # applymap(np.int): the np.int alias no longer exists in NumPy,
            # and get_dummies may return booleans.
            dummies = pd.get_dummies(pd.Series(A)).astype(int)
            print(dummies.shape[1])
            for level, row in zip(dummies.columns, dummies.to_numpy().T):
                outfile_label.write(description[index]+'\t'+str(level)+'\n')
                outfile_result.write(" ".join(map(str,row)))
                outfile_result.write('\n')
        else:
            # Non-categorical features and two-level categorical features are
            # copied through unchanged.
            outfile_label.write(description[index]+'\n')
            outfile_result.write(line)

## Step7: Generate information of self-reported cancers/diseases

All the self-reported cancers/diseases should be first grouped into the second level of the disease tree structure according to [Data-Coding 3](https://biobank.ctsu.ox.ac.uk/crystal/coding.cgi?id=3) (cancers) and [Data-Coding 6](https://biobank.ctsu.ox.ac.uk/crystal/coding.cgi?id=6) (diseases) 

Next, we generated .data and .list files for self-reported cancers and diseases, respectively, keeping only the cancers/diseases whose sample size was larger than 6,000.

1. .list files include two columns: disease id and the number of patients. 

2. .data files include the affection status; each row represents one disease/cancer (in the same order as in .list), where 1: affected; 0: unaffected.



In [None]:
from collections import defaultdict
import numpy as np

# NOTE(review): cancer_map and disease_map (raw code -> group id) are not
# defined in the visible cells; they must exist before this cell runs.
# NOTE(review): the text above mentions keeping only groups with more than
# 6,000 cases; no such filter is applied here.

PHENO_DIR = '/oak/stanford/groups/arend/Eric/UKBB/phenotype_13721/'
OUT_DIR = '/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/'
N_INDIVIDUALS = 502628


def _collect_cases(column_ids, code_map):
    """Map each group id to the 0-based positions of the individuals reporting it.

    Loads every listed column; entry 0 of a column is skipped, and the values
    '' and '99999' are ignored. Codes whose mapped group is '' are printed.
    """
    groups = defaultdict(list)
    for i in column_ids:
        col = np.load(PHENO_DIR + str(i) + '.npy')
        for j in range(1, N_INDIVIDUALS + 1):
            code = col[j]
            if code != '' and code != '99999':
                group = code_map[code]
                if group == '':
                    print(code)
                groups[group].append(j - 1)
    return groups


def _write_status(groups, data_path, list_path):
    """Write one tab-terminated 0/1 row per group plus "<group>\\t<count>" lines."""
    with open(data_path, 'w') as outdata, open(list_path, 'w') as outlist:
        for key, value in groups.items():
            # A set makes each membership test O(1); the original tested
            # membership in a list for every individual.
            members = set(value)
            flags = ['1' if i in members else '0' for i in range(N_INDIVIDUALS)]
            outdata.write(''.join(flag + '\t' for flag in flags))
            outdata.write('\n')
            outlist.write(key + '\t')
            outlist.write(str(flags.count('1')) + '\n')


diseaseid_exit=[]  # unused in the visible code; kept for other cells
cancer = _collect_cases(range(4136, 4154), cancer_map)
disease = _collect_cases(range(4154, 4241), disease_map)
print('finished')

_write_status(cancer, OUT_DIR + 'self_cancer.data', OUT_DIR + 'self_cancer.list')
_write_status(disease, OUT_DIR + 'self_disease.data', OUT_DIR + 'self_disease.list')

## Step 8: Remove the inappropriate participants from the self-reported patients and L&E feature files.


Only keep the participants in merge_white_Britich_clean.fam (generated in **Step 3**)

In [None]:
import numpy as np

GEN_DIR = '/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/'


def _subset_columns(src_path, dst_path, columns, count_path=None):
    """Copy the whitespace-separated fields at `columns` from every line of src.

    Each kept field is written followed by a tab, one output line per input
    line. When `count_path` is given, the integer sum of the kept fields of
    each line is written there, one number per line.
    """
    counts = None
    with open(src_path, 'r') as src, open(dst_path, 'w') as dst:
        try:
            if count_path is not None:
                counts = open(count_path, 'w')
            for line in src:
                A = line.strip().split()
                picked = [A[k] for k in columns]
                dst.write(''.join(value + '\t' for value in picked))
                dst.write('\n')
                if counts is not None:
                    counts.write(str(sum(int(value) for value in picked)) + '\n')
        finally:
            if counts is not None:
                counts.close()


all_person_id=np.load('/oak/stanford/groups/arend/Eric/UKBB/phenotype_13721/0.npy')
# The first array entry is dropped, so positions here are 0-based over
# the remaining entries.
individual=all_person_id[1:].tolist()

# ID -> first position, built once; the original called list.index() for
# every line of the .fam file.
position = {}
for pos, pid in enumerate(individual):
    position.setdefault(pid, pos)

ind_genetics_index=[]
with open('/oak/stanford/groups/jamesz/eric/genotype/merge_white_Britich_clean.fam','r') as infile:
    for line in infile:
        ind=line.split()[0]
        if ind not in position:
            raise ValueError(ind + ' is not in the phenotype ID list')
        ind_genetics_index.append(position[ind])

_subset_columns(GEN_DIR + 'self_cancer.data', GEN_DIR + 'self_cancer_sub.data',
                ind_genetics_index, GEN_DIR + 'cancer_num')
_subset_columns(GEN_DIR + 'self_disease.data', GEN_DIR + 'self_disease_sub.data',
                ind_genetics_index, GEN_DIR + 'disease_num')
_subset_columns('Raw_pheno_final_impute_dummy.matrix',
                'Raw_pheno_final_impute_dummy_sub.matrix', ind_genetics_index)

## Step9: Generate information of the cancers/diseases from hospitalization records

We generate disease_status and disease_info based on ICD 10 codes in hospitalization records. 

### disease_info has three columns: 

>1. ICD 10 code;

>2. number of patients before baseline;

>3. number of patients after baseline.

### disease_status is a matrix, each ICD 10 code in disease_info has two rows. 

>1. The odd rows are the diagnosis outcomes for ICD 10 codes (the same order as disease_info), where -1: before baseline; 1: after baseline; 0: not affected; 

>2. the even rows represent the corresponding diagnosis time.

In [None]:
# Build, for every 3-character code prefix found in the hospital-records
# table, the earliest recorded date per key A[0], then write a status row and
# a date row per prefix.
# NOTE(review): defaultdict, disease_status (an object exposing a dict-like
# `.ind`), individual and ind_time are not defined in this cell - confirm they
# are set up by cells that run before it.
import time
disease=defaultdict(disease_status)
# NOTE(review): `file` is never closed below.
file=open('/scratch/PI/eriking/ukb/app1372/hes/app1372_dbtable_hesin_2017aug22.tsv','r')
outfile_data=open('/oak/stanford/groups/arend/Eric/UKBB/phenotype_merge/disease_status','w')
outfile_info=open('/oak/stanford/groups/arend/Eric/UKBB/phenotype_merge/disease_info','w')
index=0
originaldate=''
for line in file:
    # skip the first line of the table
    if index==0:
        index+=1
        continue
    A=line.strip('\n').split('\t')
    # field 3 holds the code; rows with an empty code are ignored
    B=A[3]
    if B=='':
        continue
    # take the first non-empty date among fields 2, 5, 6 and 7
    # NOTE(review): when all four are empty, newdate/originaldate keep the
    # values of the previously processed row - confirm this is intended.
    if A[2]!='':
        newdate = time.strptime(A[2], "%Y-%m-%d")
        originaldate=A[2]
    elif A[5]!='': 
        newdate = time.strptime(A[5], "%Y-%m-%d")
        originaldate=A[5]
    elif A[6]!='': 
        newdate = time.strptime(A[6], "%Y-%m-%d")
        originaldate=A[6]
    elif A[7]!='': 
        newdate = time.strptime(A[7], "%Y-%m-%d")
        originaldate=A[7]
    # keep the earliest date per (code prefix, A[0]) pair
    if A[0] in disease[B[0:3]].ind.keys():
        olddate = disease[B[0:3]].ind[A[0]]
        if  newdate< time.strptime(olddate, "%Y-%m-%d"):
            disease[B[0:3]].ind[A[0]]=originaldate
    else:
        disease[B[0:3]].ind[A[0]]=originaldate
    index+=1
print('finish 1')
for key,value in disease.items():
    print(key)
    outfile_info.write(key+'\t')
    new_patient=0
    old_patient=0
    # X: status per individual ('1', '-1' or '0'); Y: stored date or '0-0-0'
    X=[]
    Y=[]
    # NOTE(review): indices 1..502628 are used, so `individual` must still
    # contain the leading entry that other cells strip with [1:] - confirm.
    for i in range(1,502629):
        if individual[i] in value.ind.keys():
            Y.append(value.ind[individual[i]])
            # '1' when the ind_time date is strictly earlier than the stored
            # date, otherwise '-1'
            if time.strptime(ind_time[individual[i]], "%Y-%m-%d")<time.strptime(value.ind[individual[i]], "%Y-%m-%d"):
                X.append('1')
                new_patient+=1
            else:
                old_patient+=1
                X.append('-1')
        else:
            X.append('0')
            Y.append('0-0-0')
    # NOTE(review): the count of '1' entries is written before the count of
    # '-1' entries, while the text above lists "before baseline" first.
    outfile_info.write(str(new_patient)+'\t')  
    outfile_info.write(str(old_patient)+'\n')    
    # status row, then date row, each tab-terminated
    for i in range(502628):
        outfile_data.write(X[i]+'\t')
    outfile_data.write('\n')
    for i in range(502628):
        outfile_data.write(Y[i]+'\t')
    outfile_data.write('\n')
    
outfile_info.close()        
outfile_data.close()                

## Step 10: Remove the inappropriate participants from ICD 10 codes file (disease_status).

Only keep the participants in merge_white_Britich_clean.fam (generated in **Step 3**)

In [None]:
import numpy as np
from operator import itemgetter

# NOTE(review): this cell reads '.../Genenvironment/matchICD10/disease_status'
# while the previous step writes '.../phenotype_merge/disease_status' -
# confirm the file is copied between the two locations.
personallid=list(np.load('/oak/stanford/groups/arend/Eric/UKBB/phenotype_13721/0.npy'))
personusedid=np.genfromtxt('/oak/stanford/groups/jamesz/eric/genotype/merge_white_Britich_clean.fam',dtype=str)

# ID -> first position in the full ID array, built once; the original called
# list.index() for each of the 337536 participants.
row_of = {}
for pos, pid in enumerate(personallid):
    row_of.setdefault(str(pid), pos)

index_col=[]
for i in range(337536):
    personid=str(personusedid[i,0])
    if personid not in row_of:
        raise ValueError(personid + ' is not in the phenotype ID list')
    # -1: the status rows have no entry for the first element of the ID array
    index_col.append(row_of[personid]-1)

# Built once instead of once per line.
pick_columns = itemgetter(*index_col)

with open('/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/matchICD10/disease_status') as ICDdata, \
        open('/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/matchICD10/disease_status_337536','w') as ICDdata_new:
    for linenum, line in enumerate(ICDdata):
        # Keep lines 0, 2, 4, ... only; the odd-numbered lines are dropped.
        if linenum%2==0:
            case_one=list(pick_columns(line.strip('\n').rsplit()))
            ICDdata_new.write(' '.join(case_one)+'\n')

## Step 11: Integrate self-reported diseases and ICD 10 codes 


### 1. Define the matching table between ICD 10 codes and self-report diseases ("icd10", see Table S2); 

### 2. Merge the self-reported diseases (related_self_data) with the diseases from hospitalization records (related_icd_data).

For each disease:

> \*participants are classified as prevalent cases if they are annotated as '1' in self_cancer.data (self_disease.data) or '-1' in disease_info

> \*participants are classified as incident cases if they are annotated as '0' in self_cancer.data (self_disease.data) and '1' in disease_info


In [None]:
import numpy as np
phenotype=['1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1473','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']

alldata=np.genfromtxt('self_disease_sub.data',dtype=str)

# First column of self_disease.list: the disease id of each data row.
with open('self_disease.list') as infile2:
    diseaselist=[line.strip('\n').rsplit()[0] for line in infile2]

# Context managers close every file, inputs included (the original left the
# two input files open).
with open('self_cancer_sub.data') as infile1, \
        open('related_self_list','w') as outfile1, \
        open('related_self_data','w') as outfile2:
    # The second line of self_cancer_sub.data is copied verbatim under the
    # label '1002'.
    # NOTE(review): assumes that line is the '1002' row - confirm against
    # self_cancer.list.
    outfile1.write('1002\n')
    for linenum, line in enumerate(infile1):
        if linenum==1:
            outfile2.write(line)

    # Append the row of every selected self-reported disease, space-separated.
    for linenum, index in enumerate(phenotype):
        print(linenum)
        aa=diseaselist.index(index)
        outfile1.write(index+'\n')
        outfile2.write(' '.join(alldata[aa])+'\n')

In [None]:
import numpy as np
phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1473','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
#input selfreport_icd10
from collections import defaultdict

# self-reported id -> list of 3-character ICD-10 codes
selfid=defaultdict(list)
with open('/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/matchICD10/icd10') as infile:
    for line in infile:
        A=line.strip('\n').split('\t')
        if A[0]=='1243':
            # F01 .. F99
            selfid[A[0]]=['F%02d' % k for k in range(1, 100)]
        elif A[0]=='1242':
            # H00 .. H59
            selfid[A[0]]=['H%02d' % k for k in range(60)]
        else:
            selfid[A[0]]=A[2:]

#filterout icd10
# self-reported id -> line numbers of its ICD-10 codes in disease_info
selfid_to_line=defaultdict(list)
with open('disease_info') as infile:
    for linenum, line in enumerate(infile):
        A=line.strip('\n').rsplit()
        for key,values in selfid.items():
            if A[0] in values:
                selfid_to_line[key].append(linenum)

# Row k is indexed with the line number k of disease_info.
icddata=np.atleast_2d(np.genfromtxt('disease_status_337536',dtype=int))

with open('related_icd_data','w') as outfile1, open('related_icd_id','w') as outfile2:
    for index in phenotype:
        lines=selfid_to_line[index]
        if lines:
            # Copy: merging into a view would overwrite icddata and leak into
            # other phenotypes that share the same row.
            sumdata=icddata[lines[0]].copy()
        else:
            # No matching ICD-10 code: an all-zero row keeps the output
            # aligned with `phenotype` instead of emitting a blank line.
            sumdata=np.zeros(icddata.shape[1],dtype=int)
        for line in lines[1:]:
            row=icddata[line]
            # -1 always wins; 1 is recorded only where no -1 is present.
            after=(row==1)&(sumdata!=-1)
            before=row==-1
            sumdata[after]=1
            sumdata[before]=-1
        # Count on the array: counting the substring '1' in the joined text
        # would also count every '-1'.
        n_before=int((sumdata==-1).sum())
        n_after=int((sumdata==1).sum())
        outfile2.write(index+'\t')
        outfile2.write(str(n_before)+'\t'+str(n_after)+'\n')
        ABC=' '.join(map(str,sumdata))
        # Recode 1 -> 2 and -1 -> 0.
        # NOTE(review): the text above lists '-1' as prevalent, yet it is
        # recoded to 0 here - confirm.
        XYZ=ABC.replace('1','2').replace('-2','0')
        outfile1.write(XYZ+'\n')

In [None]:
import numpy as np
phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1473','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
icddata=np.genfromtxt('related_icd_data',dtype=str)
selfdata=np.genfromtxt('related_self_data',dtype=str)
# Column 4 of the .fam file is compared with '1' / '2' below.
sex=np.genfromtxt('/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/fam_disease/white_british/chip/merge_white_Britich_clean.fam',dtype=str)

# merge.data: one line per phenotype with a code per individual
#   '1' self-reported, '2' hospital record only, '0' neither.
# merge.list: phenotype id followed by (man, woman, total) counts for the
#   codes '1', '2' and '0', in that order.
with open('merge.list','w') as out_list, open('merge.data','w') as out_data:
    for m in range(23):
        # code -> [man, woman, total]
        tally={'1':[0,0,0],'2':[0,0,0],'0':[0,0,0]}
        row=[]
        for n in range(337536):
            if selfdata[m,n]=='1':
                code='1'
            elif selfdata[m,n]=='0' and icddata[m,n]=='0':
                code='0'
            elif selfdata[m,n]=='0' and icddata[m,n]=='2':
                code='2'
            else:
                # NOTE(review): any other combination writes nothing, which
                # shifts the remaining entries of this line - confirm it
                # cannot occur.
                continue
            row.append(code)
            tally[code][2]+=1
            if sex[n,4]=='1':
                tally[code][0]+=1
            elif sex[n,4]=='2':
                tally[code][1]+=1
        out_data.write(''.join(code+' ' for code in row))
        out_data.write('\n')
        counts=tally['1']+tally['2']+tally['0']
        out_list.write(phenotype[m]+'\t')
        out_list.write('\t'.join(str(value) for value in counts)+'\n')

## Step 12: generate four fam files (prevalent cases: id_1.fam, id_2.fam, id_3.fam; incident cases: id_4.fam )

In [None]:
import numpy as np
from collections import defaultdict
class indinfo(object):
    """Plain record holding one individual's id and gender code as strings."""

    def __init__(self):
        # Both fields start empty and are filled in by the caller.
        self.id, self.gender = "", ""

from contextlib import ExitStack

FAM_OUT_DIR = '/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/matchICD10/'
FAM_IN_PATH = '/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/fam_disease/white_british/chip/merge_white_Britich_clean.fam'
FEMALE_ONLY = ('1002', '1348')   # phenotypes restricted to gender code '2'
MALE_ONLY = ('1207',)            # phenotype restricted to gender code '1'


def _fam_line(person, status):
    """Format one six-column line: id id 0 0 gender status."""
    return person.id+' '+person.id+' 0 0 '+person.gender+' '+status+'\n'


def _emit(outs, person, slot, status):
    """Write `status` to outs[slot] and '-9' to every other file."""
    for k, out in enumerate(outs):
        out.write(_fam_line(person, status if k == slot else '-9'))


def _case_slot(case_num, case_1, case_2):
    """Return the file index (0, 1 or 2) for the case_num-th self-reported case."""
    if case_num < case_1:
        return 0
    if case_num < case_2:
        return 1
    return 2


def _control_slot(control_num, bounds):
    """Return the file index for the control_num-th control, or None past the last bound.

    NOTE(review): bounds are inclusive (<=) whereas the case bounds are
    exclusive (<); this matches the original behaviour - confirm intended.
    """
    for slot, upper in enumerate(bounds):
        if control_num <= upper:
            return slot
    return None


# Individuals, in .fam order (column 0: id, column 4: gender code).
indlist=[]
with open(FAM_IN_PATH,'r') as infile3:
    for line in infile3:
        A=line.strip('\n').split()
        X=indinfo()
        X.id=A[0]
        X.gender=A[4]
        indlist.append(X)

# merge.list columns after the id: (man, woman, total) for self-reported
# cases, then hospital-record cases, then controls.
phenotype=[]
disease_counts={}
with open('merge.list','r') as infile1:
    for line in infile1:
        A=line.strip('\n').rsplit()
        disease_counts[A[0]]=[int(value) for value in A[1:10]]
        phenotype.append(A[0])

with open('merge.data','r') as infile2:
    for line_index, line in enumerate(infile2):
        name=phenotype[line_index]
        print(name)
        A=line.strip('\n').split()
        (old_man, old_woman, old_all,
         new_man, new_woman, new_all,
         ctl_man, ctl_woman, ctl_all)=disease_counts[name]

        # Choose the counts (and eligible gender codes) for this phenotype.
        if name in FEMALE_ONLY:
            sexlist=['2']
            case_old, case_new, control = old_woman, new_woman, ctl_woman
        elif name in MALE_ONLY:
            sexlist=['1']
            case_old, case_new, control = old_man, new_man, ctl_man
        else:
            sexlist=['1','2']
            case_old, case_new, control = old_all, new_all, ctl_all

        # Self-reported cases are split into three parts by running count.
        case_1=int(case_old/3)
        case_2=case_1*2
        # Controls: three parts of (control - case_new), the rest to file 4.
        controleffect=control-case_new
        control_1=int(controleffect/3)
        control_bounds=(control_1, control_1*2, controleffect, control)

        case_num=0
        control_num=0
        with ExitStack() as stack:
            outs=[stack.enter_context(open(FAM_OUT_DIR+name+'_'+suffix+'.fam','w'))
                  for suffix in ('1','2','3','4')]
            for i, X in enumerate(indlist):
                if X.gender not in sexlist:
                    # Not eligible for this phenotype: '-9' in all four files.
                    _emit(outs, X, None, '-9')
                    continue
                status=A[i]
                if status=='1':
                    _emit(outs, X, _case_slot(case_num, case_1, case_2), '2')
                    case_num+=1
                elif status=='2':
                    # Hospital-record-only cases always go to file 4.
                    _emit(outs, X, 3, '2')
                elif status=='0':
                    slot=_control_slot(control_num, control_bounds)
                    if slot is not None:
                        _emit(outs, X, slot, '1')
                    control_num+=1

## Step 13: Generate control pool 

In [None]:
from collections import defaultdict

# IDs that carry status '2' in any of the four .fam files of any phenotype.
patient=set()
# NOTE(review): '1473' appears in the phenotype lists of other cells but not
# here - confirm the omission is deliberate.
phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
for disease in phenotype:
    print(disease)
    for index in ['1','2','3','4']:
        # Closed after each file; the original never closed these handles.
        with open(disease+'_'+index+'.fam') as infile:
            for line in infile:
                A=line.rsplit()
                if A[5]=='2':
                    patient.add(A[0])

all_control=0
male_control=0
female_control=0

# Everyone never marked '2' goes to the overall pool, and additionally to
# the pool matching column 4 of the .fam file ('1' or '2').
with open('merge_white_Britich_clean.fam','r') as infile1, \
        open('health_control','w') as out1, \
        open('health_control_male','w') as out2, \
        open('health_control_female','w') as out3:
    for line in infile1:
        A=line.rsplit()
        if A[0] not in patient:
            out1.write(A[0]+'\n')
            all_control+=1
            if A[4]=='1':
                out2.write(A[0]+'\n')
                male_control+=1
            if A[4]=='2':
                out3.write(A[0]+'\n')
                female_control+=1
print(all_control)
print(male_control)
print(female_control)

## Step 14: Generate training, validation and test sets (id_train.fam,id_valid.fam,id_test.fam)

In [None]:
from collections import defaultdict
import random
import multiprocessing
def _read_ids(path):
    """Return the lines of `path` with their trailing newline removed."""
    with open(path,'r') as handle:
        return [line.strip('\n') for line in handle]


# all_fam: the first five .fam columns, space-separated with a trailing
# space; indid: the first column only.
all_fam=[]
indid=[]
with open('merge_white_Britich_clean.fam','r') as infile1:
    for line in infile1:
        A=line.rsplit()
        all_fam.append(A[0]+' '+A[1]+' '+A[2]+' '+A[3]+' '+A[4]+' ')
        indid.append(A[0])

# Control pools, one ID per line in each file.
controllist=_read_ids('health_control')
control_male=_read_ids('health_control_male')
control_female=_read_ids('health_control_female')
phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1473','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
def generate(disease):
    """Write ``<disease>_train.fam``, ``<disease>_valid.fam`` and ``<disease>_test.fam``.

    Cases are the individuals whose sixth column is '2' in the per-fold files
    ``<disease>_1.fam`` .. ``<disease>_4.fam``: folds 1 and 2 form the training
    set, fold 3 the validation set and fold 4 the test set.  For each split an
    equally sized random sample of controls is drawn (or the whole pool when
    there are at least as many cases as controls).  Diseases '1002' and '1348'
    draw from the female controls, '1207' from the male controls, every other
    disease from all controls.

    Every individual of the cohort is written to all three files with the
    label '2' (case), '1' (sampled control) or '-9' (not used in that split).

    Reads the module-level ``all_fam``, ``indid``, ``controllist``,
    ``control_male`` and ``control_female``.

    NOTE(review): the controls of the three splits are sampled independently
    from the same pool, so one control can appear in more than one split —
    confirm this is intended.
    """
    print(disease)

    def read_cases(fold):
        # IDs of the individuals labelled as cases in one fold file.
        with open(disease+'_'+fold+'.fam') as infile:
            return [A[0] for A in (line.strip('\n').rsplit() for line in infile) if A[5]=='2']

    case_train=read_cases('1')+read_cases('2')
    case_valid=read_cases('3')
    case_test=read_cases('4')

    # Pool of eligible controls for this disease.
    if disease=='1002' or disease=='1348':
        pool=control_female
    elif disease=='1207':
        pool=control_male
    else:
        pool=controllist

    def draw_controls(cases):
        # As many controls as cases, or the whole pool when it is not larger.
        if len(cases)>=len(pool):
            return pool
        return random.sample(pool,len(cases))

    # Same call order as before (train, valid, test) so a seeded run draws
    # exactly the same samples.
    control_train=draw_controls(case_train)
    control_valid=draw_controls(case_valid)
    control_test=draw_controls(case_test)

    # Sets make each membership test O(1); with lists the labelling loop below
    # was quadratic in the cohort size.
    memberships=(
        (set(case_train),set(control_train)),
        (set(case_valid),set(control_valid)),
        (set(case_test),set(control_test)),
    )

    with open(disease+'_train.fam','w') as out_train, \
            open(disease+'_valid.fam','w') as out_valid, \
            open(disease+'_test.fam','w') as out_test:
        outputs=(out_train,out_valid,out_test)
        for prefix,ind in zip(all_fam,indid):
            for out,(cases,controls) in zip(outputs,memberships):
                if ind in cases:
                    label='2'
                elif ind in controls:
                    label='1'
                else:
                    label='-9'
                out.write(prefix+label+'\n')
# Build the train/valid/test .fam files for every disease code in `phenotype`.
for disease in phenotype:
    generate(disease)                      

## Step 15: Perform genome-wide association studies for 22 diseases
"covar_full.list" includes age, gender and top 10 principal components

In [None]:
import os
import multiprocessing
# Write one shell script per disease (<code>_assoc.sh).  For each of the
# train, valid and test splits the script copies that split's .fam file over
# merge_white_Britich_clean_<code>.fam and runs a PLINK logistic association
# with covar_full.list as covariates.
phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
for x in phenotype:
    print(x)
    covar_opts=" --covar covar_full.list"
    if x in ['1002','1348','1207']:
        # These three diseases leave covariate 2 out of the model
        # (presumably gender — TODO confirm against covar_full.list).
        covar_opts=covar_opts+" --covar-number 1,3-12"
    with open(x+'_assoc.sh','w') as out:
        for split in ('train','valid','test'):
            out.write('cp '+x+'_'+split+'.fam merge_white_Britich_clean_'+x+'.fam'+'\n')
            out.write("plink --bfile merge_white_Britich_clean_"+x+" --geno 0.1 --hwe 0.000001 --maf 0.01 --mind 0.1 --adjust --logistic"+covar_opts+" --ci 0.95 --out "+str(x)+"_assoc_"+split+"_adjust\n")

## Step 16: Pipeline for calculating the polygenic risk score

### Generate SNP list  with p-values < 5E-3, < 5E-4, < 5E-5, < 5E-6

In [None]:
from collections import defaultdict
phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
def generate(index):
    """Write the SNP lists of one disease for four p-value cut-offs.

    Reads ``../<index>_assoc_train_adjust.assoc.logistic.adjusted``, skips its
    header line and, for every remaining row, writes the SNP ID (column 1) to
    ``./SNPselection/<index>_5E3_SNP``, ``_5E4_SNP``, ``_5E5_SNP`` and
    ``_5E6_SNP`` when the value in column 3 is below 0.005, 0.0005, 0.00005
    and 0.000005 respectively.  A summary line
    ``<index> <n_5E3> <n_5E4> <n_5E5> <n_5E6>`` is appended to the module-level
    ``snpvalidate`` file, which must already be open for writing.

    NOTE(review): in a PLINK 1.x ``.adjusted`` report column index 3 is the
    genomic-control corrected p-value (GC), not the unadjusted one — confirm
    that this is the intended column.
    """
    print(index)
    counts=[0,0,0,0]

    # `with` guarantees that all five files are closed even when a row fails
    # to parse.
    with open('../'+index+'_assoc_train_adjust.assoc.logistic.adjusted') as infile1, \
            open('./SNPselection/'+index+'_5E3_SNP','w') as snplist1, \
            open('./SNPselection/'+index+'_5E4_SNP','w') as snplist2, \
            open('./SNPselection/'+index+'_5E5_SNP','w') as snplist3, \
            open('./SNPselection/'+index+'_5E6_SNP','w') as snplist4:
        targets=((0.005,snplist1),(0.0005,snplist2),(0.00005,snplist3),(0.000005,snplist4))
        next(infile1,None)  # skip the header line
        for line in infile1:
            A=line.strip('\n').rsplit()
            pvalue=float(A[3])  # parsed once instead of once per cut-off
            for k,(cutoff,snplist) in enumerate(targets):
                if pvalue<cutoff:
                    snplist.write(A[1]+'\n')
                    counts[k]+=1

    snpvalidate.write(str(index)+' '+str(counts[0])+' '+str(counts[1])+' '+str(counts[2])+' '+str(counts[3])+'\n')
# Per-disease SNP counts are collected in ./SNPselection/stat.  generate()
# writes to the module-level `snpvalidate`; the `with` block makes sure the
# summary is flushed and closed even if one disease fails.
with open('./SNPselection/stat','w') as snpvalidate:
    for index in phenotype:
        print(index)
        generate(index)

### Extract the SNP genotypes from SNP list

In [None]:
import os
phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
def generate(index):
    """Extract the selected SNPs of one disease for every split and cut-off.

    For each of the train, valid and test splits the split's .fam file is
    copied over ``../merge_white_Britich_clean_<index>.fam`` and PLINK is run
    once per cut-off (5E6, 5E5, 5E4, 5E3) to extract the SNPs listed in
    ``./SNPselection/<index>_<cutoff>_SNP`` into
    ``./SNPextraction/<index>_<cutoff>_<split>_extract``.
    """
    print(index)
    merged='../merge_white_Britich_clean_'+index
    for split in ('train','valid','test'):
        # The .fam copy must precede the PLINK runs of the same split.
        os.system('cp ../'+index+'_'+split+'.fam '+merged+'.fam')
        for cutoff in ('5E6','5E5','5E4','5E3'):
            os.system('plink --bfile '+merged+' --extract ./SNPselection/'+index+'_'+cutoff+'_SNP --make-bed --out ./SNPextraction/'+index+'_'+cutoff+'_'+split+'_extract'+'\n')
# Run the SNP extraction for every disease code in `phenotype`.
for index in phenotype:
    generate(index)    
    
    

### Calculate linkage disequilibrium

In [None]:
import os
phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
def generate(index):
    """Run PLINK LD pruning (``--indep-pairwise 50 5 0.5``) for one disease.

    The training extracts are pruned once per cut-off into
    ``./LDpruning/train_<cutoff>_<index>``.  The validation and test extracts
    are pruned as well, each into ``./LDpruning/valid_<index>`` and
    ``./LDpruning/test_<index>``.

    NOTE(review): the four validation runs share one output prefix, as do the
    four test runs, so each run overwrites the previous one and only the 5E3
    result survives.  The output names are kept as they were; confirm whether
    per-cut-off names are wanted.
    """
    print(index)
    cutoffs=('5E6','5E5','5E4','5E3')
    for cutoff in cutoffs:
        os.system('plink --bfile ./SNPextraction/'+index+'_'+cutoff+'_train_extract --indep-pairwise 50 5 0.5 --out ./LDpruning/train_'+cutoff+'_'+index+'\n')

    for split in ('valid','test'):
        for cutoff in cutoffs:
            # The validation 5E3 run previously read the *train* extract
            # (copy-paste slip); every run now reads its own split.
            os.system('plink --bfile ./SNPextraction/'+index+'_'+cutoff+'_'+split+'_extract --indep-pairwise 50 5 0.5 --out ./LDpruning/'+split+'_'+index+'\n')
# Run the LD pruning for every disease code in `phenotype`.
for index in phenotype:
    generate(index)

### Extract independent SNPs 

In [None]:
import os
import multiprocessing
phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
def generate(index):
    """Keep the LD-pruned SNPs of one disease and recode them additively.

    For each split (train, valid, test) and each cut-off (5E6, 5E5, 5E4, 5E3)
    PLINK first extracts the SNPs listed in the *training* prune list
    ``./LDpruning/train_<cutoff>_<index>.prune.in`` into
    ``./final/<index>_<split>_<cutoff>_SNP`` and then exports that data set
    with ``--recodeA`` to ``./final/<index>_<split>_<cutoff>_recodeA``.
    """
    print(index)
    cutoffs=('5E6','5E5','5E4','5E3')
    for split in ('train','valid','test'):
        # All four extractions of a split run before its four recodings.
        for cutoff in cutoffs:
            os.system('plink --bfile ./SNPextraction/'+index+'_'+cutoff+'_'+split+'_extract --extract ./LDpruning/train_'+cutoff+'_'+index+'.prune.in --make-bed --out ./final/'+index+'_'+split+'_'+cutoff+'_SNP')
        for cutoff in cutoffs:
            os.system('plink --bfile ./final/'+index+'_'+split+'_'+cutoff+'_SNP --recodeA --out ./final/'+index+'_'+split+'_'+cutoff+'_recodeA')


# Extract and recode the independent SNPs for every disease code in `phenotype`.
for index in phenotype:
    generate(index)    
    

### Calculate the polygenic risk score

In [None]:
from collections import defaultdict
from sklearn.metrics import roc_auc_score
phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
def cal_GRS(infile,EF):
    """Compute a weighted risk score for every labelled row of a ``--recodeA`` file.

    Parameters
    ----------
    infile : iterable of str
        Lines of a space-separated table.  The first line is a header whose
        columns from index 6 on are named ``<SNP id>_<counted allele>``; on
        the other lines column 5 is the phenotype and the columns from index
        6 on are allele counts ('0', '1', '2'; anything else adds nothing).
    EF : mapping of str to str
        Maps a SNP ID to ``'<allele>_<weight>'``.

    Returns
    -------
    (GRSlist, diseaseid) : tuple of two lists of equal length
        One score and one 0/1 label per row whose phenotype is '2' (label 1)
        or '1' (label 0).  Rows with any other phenotype are skipped.

    When the counted allele of a column equals the allele stored in ``EF`` a
    count of c contributes ``c * weight``; otherwise the count is flipped and
    contributes ``(2 - c) * weight``.
    """
    label_of={'2':1,'1':0}
    weight_tables=[]  # per SNP column: allele count -> contribution
    GRSlist=[]
    diseaseid=[]
    for linenum,line in enumerate(infile):
        A=line.strip('\n').split(' ')
        if linenum==0:
            for column in A[6:]:
                # Split on the LAST underscore only, so SNP IDs that contain
                # underscores themselves are not truncated.
                snp_id,_,counted=column.rpartition('_')
                C=EF[snp_id].split('_')
                weight=float(C[1])
                if counted==C[0]:
                    weight_tables.append({'0':0,'1':weight,'2':2*weight})
                else:
                    weight_tables.append({'2':0,'1':weight,'0':2*weight})
            continue
        # Only rows with a case/control label are scored.  Previously a
        # phenotype other than '1', '2' or '-9' added a score without a label
        # and left the two result lists misaligned.
        if A[5] not in label_of:
            continue
        diseaseid.append(label_of[A[5]])
        GRS=0
        for x,table in enumerate(weight_tables):
            contribution=table.get(A[x+6])
            if contribution is not None:
                GRS=GRS+contribution
        GRSlist.append(GRS)
    return GRSlist,diseaseid


def generate(index):
    """Return the test-set AUC of the risk score for one disease.

    The per-SNP weights are read from the ``ADD`` rows of
    ``../<index>_assoc_train_adjust.assoc.logistic`` (SNP ID from column 1,
    allele from column 3, weight from column 10).  The score is evaluated on
    the four validation files ``./final/<index>_valid_<cutoff>_recodeA.raw``
    (cut-offs 5E3, 5E4, 5E5, 5E6); the cut-off with the highest validation
    AUC (the first one on ties) is then evaluated on the matching test file
    and that AUC is returned.

    NOTE(review): with the ``--ci`` column layout produced in Step 15, column
    index 10 appears to be the test statistic (STAT) rather than a log odds
    ratio — confirm that this is the intended weight.
    """
    EF=defaultdict(str)
    # Every file is opened in a `with` block so nothing stays open when
    # cal_GRS or roc_auc_score raises.
    with open('../'+index+'_assoc_train_adjust.assoc.logistic') as infile1:
        for linenum,line in enumerate(infile1):
            A=line.strip('\n').rsplit()
            if linenum>0 and A[4]=='ADD':
                EF[A[1]]=A[3]+'_'+A[10]

    cutoffs=('5E3','5E4','5E5','5E6')
    score=[]
    for cutoff in cutoffs:
        with open('./final/'+index+'_valid_'+cutoff+'_recodeA.raw') as infile:
            GRS_valid,disease_valid=cal_GRS(infile,EF)
        score.append(roc_auc_score(disease_valid,GRS_valid))

    # list.index returns the first maximum, i.e. the same tie-breaking as before.
    best=cutoffs[score.index(max(score))]
    with open('./final/'+index+'_test_'+best+'_recodeA.raw') as infile:
        GRS_test,disease_test=cal_GRS(infile,EF)
    scoretest=roc_auc_score(disease_test,GRS_test)
    return scoretest

# One line per disease: code, tab, test-set AUC.  The `with` block flushes and
# closes the result file even if a disease fails part-way through.
with open('./GRS_result','w') as result:
    for index in phenotype:
        scoretest=generate(index)
        result.write(index+'\t'+str(scoretest)+'\n')

## Step 17: Reformat genotype data, L&E and covariate information into .npy matrices

In [None]:
from collections import defaultdict
import matplotlib
import re
import numpy as np
import operator
from scipy import stats as sta
import math
import multiprocessing
import operator
import scipy
from itertools import groupby
from operator import itemgetter 
def generate_G_P(path_phenotype,path_data):
    """Load a .fam file and the matching ``--recodeA`` table as numeric matrices.

    Parameters
    ----------
    path_phenotype : anything accepted by ``np.genfromtxt``
        The .fam table, one row per individual and no header; only its first
        column (the ID) is used.
    path_data : anything accepted by ``np.genfromtxt``
        The recoded genotype table: one header row followed by one row per
        individual, phenotype in column 5 and one SNP per column from index 6.
        Both tables are assumed to list the individuals in the same order.

    Returns
    -------
    all_genotype : ndarray
        Allele counts with 'NA' replaced by 0, z-scored per SNP column.
    all_disease : list of int
        Phenotype minus one for every individual.
    ind_id : list
        ID of every individual, taken from the .fam table.
    SNP_id : list of str
        SNP IDs, i.e. the header names without the trailing ``_<allele>``.
    """
    # atleast_2d keeps a one-individual .fam file from collapsing to 1-D.
    phenotype=np.atleast_2d(np.genfromtxt(path_phenotype,dtype=str))
    data=np.genfromtxt(path_data,dtype=str)
    totalind=phenotype.shape[0]
    print(totalind)

    # Strip only the trailing allele suffix, so SNP IDs containing
    # underscores stay intact.
    SNP_id=[name.rsplit('_',1)[0] for name in data[0][6:]]

    # Row 0 of `data` is the header, so the individuals are rows 1..end.  The
    # previous loop over range(totalind) stopped one row early and silently
    # dropped the last individual.
    rows=data[1:]
    genotype=rows[:,6:]
    genotype=np.where(genotype=='NA','0',genotype).astype(int)
    all_disease=[int(status)-1 for status in rows[:,5]]
    ind_id=[phenotype[i][0] for i in range(rows.shape[0])]

    all_genotype=sta.zscore(genotype)
    return all_genotype,all_disease,ind_id,SNP_id


def Cal_score(index):
    """Convert the PLINK output of one phenotype into .npy files.

    For every cut-off (5E3, 5E4, 5E5, 5E6) and every split (train, valid,
    test) the files ``<dir_path><index>_<split>_<cutoff>_SNP.fam`` and
    ``..._recodeA.raw`` are loaded with ``generate_G_P`` and the genotype
    matrix is saved as ``<index>_genotype_<split>_<cutoff>`` in the split's
    output directory.  The SNP IDs of the training set are saved per cut-off
    as ``<index>_snpid_<cutoff>`` in the training directory.  Individual IDs
    (``<index>_indid_<split>``) and disease labels
    (``<index>_disease_<split>``) are saved once, from the 5E3 files.
    Finally the covariate rows of the individuals of each split are saved as
    ``<index>_covar_<split>``.

    Reads the module-level ``dir_path``, ``covlist`` (covariates from column
    2 on) and ``indall`` (IDs in the row order of ``covlist``).

    Raises
    ------
    KeyError
        If an individual of a split is not listed in ``indall``.
    """
    outpath1='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/train/'
    outpath2='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/valid/'
    outpath3='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/test/'
    out_dirs={'train':outpath1,'valid':outpath2,'test':outpath3}
    splits=('train','valid','test')

    # IDs of the 5E3 run are kept for the covariate step.  The previous code
    # reset them to [] before taking their length, so the covariate matrices
    # were always saved empty.
    indid_by_split={}

    for k,cutoff in enumerate(('5E3','5E4','5E5','5E6')):
        for split in splits:
            stem=dir_path+index+'_'+split+'_'+cutoff
            genotype,disease,indid,snpid=generate_G_P(stem+'_SNP.fam',stem+'_recodeA.raw')
            np.save(out_dirs[split]+str(index)+'_genotype_'+split+'_'+cutoff,genotype)
            if split=='train':
                np.save(outpath1+str(index)+'_snpid_'+cutoff,snpid)
            if k==0:
                np.save(out_dirs[split]+str(index)+'_indid_'+split,indid)
                np.save(out_dirs[split]+str(index)+'_disease_'+split,disease)
                indid_by_split[split]=indid
            # Drop the large matrix before the next one is loaded.
            genotype=None

    # Row of every individual in `covlist`; setdefault keeps the first
    # occurrence, like list.index did, but the lookup is O(1).
    row_of={}
    for pos,ind in enumerate(indall):
        row_of.setdefault(ind,pos)

    for split in splits:
        rows=np.asarray([row_of[str(ind)] for ind in indid_by_split[split]],dtype=int)
        # One fancy-indexing step instead of growing the matrix with vstack.
        np.save(out_dirs[split]+str(index)+'_covar_'+split,covlist[rows,2:])

# Covariate table; Cal_score uses its columns from index 2 on.
covlist=np.loadtxt('covar_full.list',dtype=float)

# IDs (first .fam column) of all individuals, used to look up rows of `covlist`.
with open('/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/fam_disease/white_british/chip/merge_white_Britich_clean_train.fam') as infile1:
    indall=[row.strip('\n').split()[0] for row in infile1]

dir_path='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/fam_disease/white_british/chip/threshold_hypertension/final/'
phenotype=['threshold']
for index in phenotype:
    print(index)
    Cal_score(index)


## Step 18: Remove the effects of age and gender from L&E features

In [None]:
from collections import defaultdict
import matplotlib
import re
import numpy as np
import operator
from scipy import stats as sta
import math
from scipy.interpolate import spline
import operator
from itertools import groupby
from sklearn.metrics import roc_auc_score
from operator import itemgetter
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from scipy.stats import norm
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import multiprocessing
import pickle

dir_path1='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/train/'
dir_path2='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/valid/'
dir_path3='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/test/'
#phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
phenotype=['1242']
for index in phenotype:
    print(index)
    Pheno_train=np.load(dir_path1+index+'_phenotype_train.npy')
    Pheno_valid=np.load(dir_path2+index+'_phenotype_valid.npy')
    Pheno_test=np.load(dir_path3+index+'_phenotype_test.npy')

    Cov_train=np.load(dir_path1+index+'_covar_train.npy')
    Cov_valid=np.load(dir_path2+index+'_covar_valid.npy')
    Cov_test=np.load(dir_path3+index+'_covar_test.npy')

    features={'train':Pheno_train,'valid':Pheno_valid,'test':Pheno_test}
    # Only the first two covariate columns are regressed out.
    design={'train':Cov_train[:,0:2],'valid':Cov_valid[:,0:2],'test':Cov_test[:,0:2]}
    columns={'train':[],'valid':[],'test':[]}

    # Each of the first 88 feature columns gets its own linear model, fitted
    # separately on every split.  The stored value is prediction minus
    # observation.
    for i in range(88):
        print(i)
        for split in ('train','valid','test'):
            target=features[split][:,i]
            fitted=LinearRegression().fit(design[split],target)
            columns[split].append(np.expand_dims(fitted.predict(design[split])-target,axis=1))

    # Join the per-feature columns once, in feature order.
    Matrix_residual_train=np.concatenate(columns['train'],axis=1)
    Matrix_residual_valid=np.concatenate(columns['valid'],axis=1)
    Matrix_residual_test=np.concatenate(columns['test'],axis=1)

    np.save('./train/'+index+'_train_residual.npy',Matrix_residual_train)
    np.save('./valid/'+index+'_valid_residual.npy',Matrix_residual_valid)
    np.save('./test/'+index+'_test_residual.npy',Matrix_residual_test)



## Step 19: Disease prediction by genotype only

### Lasso regression

In [None]:
from collections import defaultdict
import matplotlib
import re
import numpy as np
import operator
from scipy import stats as sta
import math
from scipy.interpolate import spline
import operator
from itertools import groupby
from sklearn.metrics import roc_auc_score
from operator import itemgetter
from sklearn.linear_model import LogisticRegression
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from scipy.stats import norm
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import multiprocessing
import pickle

# Directories holding the .npy matrices of the train, validation and test splits.
dir_path1='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/train/'
dir_path2='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/valid/'
dir_path3='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/test/'
# Full list of disease codes, kept for reference; only '1242' is enabled below.
#phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
phenotype=['1242']
def predict(index):
    """Select and evaluate an L1-penalised logistic regression on genotype features.

    Four genotype feature sets (file suffixes ``5E3``, ``5E4``, ``5E5`` and
    ``5E6``) are loaded for the train, validation and test splits, together
    with the disease labels of each split.  Every pairing of feature set and
    ``C`` value is fitted on the training split and scored by ROC AUC on the
    validation split.  The best pairing is refitted on the training split;
    its validation and test class probabilities are saved to
    ``./LR/<index>_valid.npy`` and ``./LR/<index>_test.npy``, and the best
    validation AUC and the test AUC are written to ``./LR/LR_<index>``.

    Args:
        index: Identifier string used as the file-name prefix of every input
            array and output file.

    Returns:
        None; all results are written to disk.
    """
    # presumably the suffix encodes the p-value cut-off used to choose the
    # variants -- TODO confirm against the extraction step
    suffixes = ('5E3', '5E4', '5E5', '5E6')

    print('loading train')
    geno_train = [np.load(dir_path1+index+'_genotype_train_'+tag+'.npy') for tag in suffixes]
    print('loading train')

    print('loading valid')
    geno_valid = [np.load(dir_path2+index+'_genotype_valid_'+tag+'.npy') for tag in suffixes]
    print('loading valid')

    print('loading test')
    geno_test = [np.load(dir_path3+index+'_genotype_test_'+tag+'.npy') for tag in suffixes]
    print('loading test')

    disease_train = np.load(dir_path1+index+'_disease_train.npy')
    disease_valid = np.load(dir_path2+index+'_disease_valid.npy')
    disease_test = np.load(dir_path3+index+'_disease_test.npy')

    AUC_max = 0
    Cx_max = 0
    s_best = 0  # 1-based position of the best feature set; 0 = nothing selected

    for s, (X_train, X_valid) in enumerate(zip(geno_train, geno_valid), start=1):
        print(s)
        for Cx in [0.0001, 0.001, 0.01, 0.1]:
            # liblinear is requested explicitly: the L1 penalty is rejected by
            # lbfgs, which is the default solver from scikit-learn 0.22 on.
            LR1 = LogisticRegression(penalty='l1', C=Cx, max_iter=10000,
                                     solver='liblinear')
            LR1.fit(X_train, disease_train)
            Y1 = LR1.predict_proba(X_valid)
            # column 1 is used as the positive-class score (assumes binary labels)
            AUC_valid = roc_auc_score(disease_valid, Y1[:, 1])
            if AUC_valid > AUC_max:
                AUC_max = AUC_valid
                Cx_max = Cx
                s_best = s

    AUC_test = 0
    if s_best:
        # Refit the winning configuration and score it on the held-out test split.
        best = s_best - 1
        LR2 = LogisticRegression(penalty='l1', C=Cx_max, max_iter=10000,
                                 solver='liblinear')
        LR2.fit(geno_train[best], disease_train)
        Y1 = LR2.predict_proba(geno_valid[best])
        Y2 = LR2.predict_proba(geno_test[best])
        AUC_test = roc_auc_score(disease_test, Y2[:, 1])
        np.save('./LR/'+index+'_valid.npy', Y1)
        np.save('./LR/'+index+'_test.npy', Y2)

    # 'with' guarantees the result file is closed even if the write fails.
    with open('./LR/LR_'+index, 'w') as out:
        out.write(index+'\t'+str(AUC_max)+'\t'+str(AUC_test)+'\n')
# Run model selection and evaluation for every identifier in `phenotype`.
for m in phenotype:
    predict(m)


### Neural Network

In [None]:
from collections import defaultdict
import matplotlib
import re
import numpy as np
import operator
from scipy import stats as sta
import math
from scipy.interpolate import spline
import operator
from itertools import groupby
from sklearn.metrics import roc_auc_score
from operator import itemgetter
from sklearn.linear_model import LogisticRegression
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from scipy.stats import norm
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import multiprocessing
import pickle
from  sklearn.neural_network import MLPClassifier
# Directories from which predict() loads the genotype and disease arrays of
# the train (1), validation (2) and test (3) splits.
dir_path1='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/train/'
dir_path2='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/valid/'
dir_path3='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/test/'
# Identifiers passed one by one to predict().
phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
def predict(index):
    """Select and evaluate a multi-layer perceptron on genotype features.

    Four genotype feature sets (file suffixes ``5E3``, ``5E4``, ``5E5`` and
    ``5E6``) are loaded for the train, validation and test splits, together
    with the disease labels of each split.  Every pairing of feature set and
    hidden-layer layout is fitted on the training split and scored by ROC AUC
    on the validation split.  The best pairing is refitted on the training
    split; its validation and test class probabilities are saved to
    ``./NN/<index>_valid.npy`` and ``./NN/<index>_test.npy``.  The best
    validation AUC and the test AUC go to ``./NN/NN_<index>``; the selected
    feature-set position and layout go to ``./NN/select_structure_<index>``.

    Args:
        index: Identifier string used as the file-name prefix of every input
            array and output file.

    Returns:
        None; all results are written to disk.
    """
    # presumably the suffix encodes the p-value cut-off used to choose the
    # variants -- TODO confirm against the extraction step
    suffixes = ('5E3', '5E4', '5E5', '5E6')

    print('loading train')
    geno_train = [np.load(dir_path1+index+'_genotype_train_'+tag+'.npy') for tag in suffixes]
    print('loading train')

    print('loading valid')
    geno_valid = [np.load(dir_path2+index+'_genotype_valid_'+tag+'.npy') for tag in suffixes]
    print('loading valid')

    print('loading test')
    geno_test = [np.load(dir_path3+index+'_genotype_test_'+tag+'.npy') for tag in suffixes]
    print('loading test')

    disease_train = np.load(dir_path1+index+'_disease_train.npy')
    disease_valid = np.load(dir_path2+index+'_disease_valid.npy')
    disease_test = np.load(dir_path3+index+'_disease_test.npy')

    AUC_max = 0
    Cx_max = 0
    s_best = 0  # 1-based position of the best feature set; 0 = nothing selected

    for s, (X_train, X_valid) in enumerate(zip(geno_train, geno_valid), start=1):
        print(s)
        for Cx in [(20, 20, 20), (30, 30), (10, 10, 10, 10), (30, 20, 10)]:
            LR1 = MLPClassifier(hidden_layer_sizes=Cx, max_iter=1000)
            LR1.fit(X_train, disease_train)
            Y1 = LR1.predict_proba(X_valid)
            # column 1 is used as the positive-class score (assumes binary labels)
            AUC_valid = roc_auc_score(disease_valid, Y1[:, 1])
            if AUC_valid > AUC_max:
                AUC_max = AUC_valid
                Cx_max = Cx
                s_best = s

    AUC_test = 0
    if s_best:
        # NOTE(review): the refit starts from a fresh model with no fixed
        # random_state, so the saved predictions need not reproduce AUC_max.
        best = s_best - 1
        LR2 = MLPClassifier(hidden_layer_sizes=Cx_max, max_iter=1000)
        LR2.fit(geno_train[best], disease_train)
        Y1 = LR2.predict_proba(geno_valid[best])
        Y2 = LR2.predict_proba(geno_test[best])
        AUC_test = roc_auc_score(disease_test, Y2[:, 1])
        np.save('./NN/'+index+'_valid.npy', Y1)
        np.save('./NN/'+index+'_test.npy', Y2)

    # 'with' guarantees both result files are closed even if a write fails.
    with open('./NN/NN_'+index, 'w') as out, \
            open('./NN/select_structure_'+index, 'w') as outstructure:
        out.write(index+'\t'+str(AUC_max)+'\t'+str(AUC_test)+'\n')
        outstructure.write(index+'\t'+str(s_best)+'\t'+str(Cx_max)+'\n')
# Run model selection and evaluation for every identifier in `phenotype`.
for m in phenotype:
    predict(m)


### Random Forest

In [None]:
from collections import defaultdict
import matplotlib
import re
import numpy as np
import operator
from scipy import stats as sta
import math
from scipy.interpolate import spline
import operator
from itertools import groupby
from sklearn.metrics import roc_auc_score
from operator import itemgetter
from sklearn.linear_model import LogisticRegression
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from scipy.stats import norm
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import multiprocessing
import pickle
from sklearn.ensemble import RandomForestClassifier

# Directories from which predict() loads the genotype and disease arrays of
# the train (1), validation (2) and test (3) splits.
dir_path1='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/train/'
dir_path2='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/valid/'
dir_path3='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/test/'
# Identifiers passed one by one to predict().
phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
def predict(index):
    """Select and evaluate a random forest classifier on genotype features.

    Four genotype feature sets (file suffixes ``5E3``, ``5E4``, ``5E5`` and
    ``5E6``) are loaded for the train, validation and test splits, together
    with the disease labels of each split.  Every pairing of feature set and
    ``min_impurity_decrease`` value is fitted on the training split and
    scored by ROC AUC on the validation split.  The best pairing is refitted
    on the training split; its validation and test class probabilities are
    saved to ``./RF/<index>_valid.npy`` and ``./RF/<index>_test.npy``, and the
    best validation AUC and the test AUC are written to ``./RF/RF_<index>``.

    Args:
        index: Identifier string used as the file-name prefix of every input
            array and output file.

    Returns:
        None; all results are written to disk.
    """
    # presumably the suffix encodes the p-value cut-off used to choose the
    # variants -- TODO confirm against the extraction step
    suffixes = ('5E3', '5E4', '5E5', '5E6')

    print('loading train')
    geno_train = [np.load(dir_path1+index+'_genotype_train_'+tag+'.npy') for tag in suffixes]
    print('loading train')

    print('loading valid')
    geno_valid = [np.load(dir_path2+index+'_genotype_valid_'+tag+'.npy') for tag in suffixes]
    print('loading valid')

    print('loading test')
    geno_test = [np.load(dir_path3+index+'_genotype_test_'+tag+'.npy') for tag in suffixes]
    print('loading test')

    disease_train = np.load(dir_path1+index+'_disease_train.npy')
    disease_valid = np.load(dir_path2+index+'_disease_valid.npy')
    disease_test = np.load(dir_path3+index+'_disease_test.npy')

    AUC_max = 0
    Cx_max = 0
    s_best = 0  # 1-based position of the best feature set; 0 = nothing selected

    for s, (X_train, X_valid) in enumerate(zip(geno_train, geno_valid), start=1):
        print(s)
        for Cx in [0.0001, 0.001, 0.01, 0.1]:
            LR1 = RandomForestClassifier(min_impurity_decrease=Cx)
            LR1.fit(X_train, disease_train)
            Y1 = LR1.predict_proba(X_valid)
            # column 1 is used as the positive-class score (assumes binary labels)
            AUC_valid = roc_auc_score(disease_valid, Y1[:, 1])
            if AUC_valid > AUC_max:
                AUC_max = AUC_valid
                Cx_max = Cx
                s_best = s

    AUC_test = 0
    if s_best:
        # NOTE(review): the refit builds a fresh forest with no fixed
        # random_state, so the saved predictions need not reproduce AUC_max.
        best = s_best - 1
        LR2 = RandomForestClassifier(min_impurity_decrease=Cx_max)
        LR2.fit(geno_train[best], disease_train)
        Y1 = LR2.predict_proba(geno_valid[best])
        Y2 = LR2.predict_proba(geno_test[best])
        AUC_test = roc_auc_score(disease_test, Y2[:, 1])
        np.save('./RF/'+index+'_valid.npy', Y1)
        np.save('./RF/'+index+'_test.npy', Y2)

    # 'with' guarantees the result file is closed even if the write fails.
    with open('./RF/RF_'+index, 'w') as out:
        out.write(index+'\t'+str(AUC_max)+'\t'+str(AUC_test)+'\n')
# Run model selection and evaluation for every identifier in `phenotype`.
for m in phenotype:
    predict(m)


### Adaboost

In [None]:
from collections import defaultdict
import matplotlib
import re
import numpy as np
import operator
from scipy import stats as sta
import math
from scipy.interpolate import spline
import operator
from itertools import groupby
from sklearn.metrics import roc_auc_score
from operator import itemgetter
from sklearn.linear_model import LogisticRegression
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from scipy.stats import norm
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import multiprocessing
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
# Directories from which predict() loads the genotype and disease arrays of
# the train (1), validation (2) and test (3) splits.
dir_path1='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/train/'
dir_path2='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/valid/'
dir_path3='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/test/'
# Identifiers passed one by one to predict().
phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
def predict(index):
    """Select and evaluate an AdaBoost classifier on genotype features.

    Four genotype feature sets (file suffixes ``5E3``, ``5E4``, ``5E5`` and
    ``5E6``) are loaded for the train, validation and test splits, together
    with the disease labels of each split.  Every pairing of feature set and
    base estimator (decision tree, logistic regression, extra tree, Gaussian
    naive Bayes) is fitted on the training split and scored by ROC AUC on the
    validation split.  The best pairing is refitted on the training split;
    its validation and test class probabilities are saved to
    ``./ada/<index>_valid.npy`` and ``./ada/<index>_test.npy``, and the best
    validation AUC and the test AUC are written to ``./ada/ada_<index>``.

    Args:
        index: Identifier string used as the file-name prefix of every input
            array and output file.

    Returns:
        None; all results are written to disk.
    """
    # presumably the suffix encodes the p-value cut-off used to choose the
    # variants -- TODO confirm against the extraction step
    suffixes = ('5E3', '5E4', '5E5', '5E6')

    print('loading train')
    geno_train = [np.load(dir_path1+index+'_genotype_train_'+tag+'.npy') for tag in suffixes]
    print('loading train')

    print('loading valid')
    geno_valid = [np.load(dir_path2+index+'_genotype_valid_'+tag+'.npy') for tag in suffixes]
    print('loading valid')

    print('loading test')
    geno_test = [np.load(dir_path3+index+'_genotype_test_'+tag+'.npy') for tag in suffixes]
    print('loading test')

    disease_train = np.load(dir_path1+index+'_disease_train.npy')
    disease_valid = np.load(dir_path2+index+'_disease_valid.npy')
    disease_test = np.load(dir_path3+index+'_disease_test.npy')

    AUC_max = 0
    Cx_max = ''
    s_best = 0  # 1-based position of the best feature set; 0 = nothing selected

    for s, (X_train, X_valid) in enumerate(zip(geno_train, geno_valid), start=1):
        print(s)
        # Fresh, default-configured base estimators are created for each feature set.
        for Cx in [DecisionTreeClassifier(), LogisticRegression(), ExtraTreeClassifier(), GaussianNB()]:
            print(Cx)
            # NOTE(review): newer scikit-learn releases rename `base_estimator`
            # to `estimator`; keep the keyword in step with the installed version.
            LR1 = AdaBoostClassifier(base_estimator=Cx)
            LR1.fit(X_train, disease_train)
            Y1 = LR1.predict_proba(X_valid)
            # column 1 is used as the positive-class score (assumes binary labels)
            AUC_valid = roc_auc_score(disease_valid, Y1[:, 1])
            if AUC_valid > AUC_max:
                AUC_max = AUC_valid
                Cx_max = Cx
                s_best = s

    AUC_test = 0
    if s_best:
        # NOTE(review): the refit uses no fixed random_state, so the saved
        # predictions need not reproduce AUC_max.
        best = s_best - 1
        LR2 = AdaBoostClassifier(base_estimator=Cx_max)
        LR2.fit(geno_train[best], disease_train)
        Y1 = LR2.predict_proba(geno_valid[best])
        Y2 = LR2.predict_proba(geno_test[best])
        AUC_test = roc_auc_score(disease_test, Y2[:, 1])
        np.save('./ada/'+index+'_valid.npy', Y1)
        np.save('./ada/'+index+'_test.npy', Y2)

    # 'with' guarantees the result file is closed even if the write fails.
    with open('./ada/ada_'+index, 'w') as out:
        out.write(index+'\t'+str(AUC_max)+'\t'+str(AUC_test)+'\n')
# Run model selection and evaluation for every identifier in `phenotype`.
for m in phenotype:
    predict(m)

### Gradient Boosting

In [None]:
from collections import defaultdict
import matplotlib
import re
import numpy as np
import operator
from scipy import stats as sta
import math
from scipy.interpolate import spline
import operator
from itertools import groupby
from sklearn.metrics import roc_auc_score
from operator import itemgetter
from sklearn.linear_model import LogisticRegression
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from scipy.stats import norm
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import multiprocessing
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Directories from which predict() loads the genotype and disease arrays of
# the train (1), validation (2) and test (3) splits.
dir_path1='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/train/'
dir_path2='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/valid/'
dir_path3='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/test/'
# Identifiers passed one by one to predict().
phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
def predict(index):
    """Select and evaluate a gradient boosting classifier on genotype features.

    Four genotype feature sets (file suffixes ``5E3``, ``5E4``, ``5E5`` and
    ``5E6``) are loaded for the train, validation and test splits, together
    with the disease labels of each split.  Every pairing of feature set and
    ``min_impurity_decrease`` value is fitted on the training split and
    scored by ROC AUC on the validation split.  The best pairing is refitted
    on the training split; its validation and test class probabilities are
    saved to ``./GB/<index>_valid.npy`` and ``./GB/<index>_test.npy``, and the
    best validation AUC and the test AUC are written to ``./GB/GB_<index>``.

    Args:
        index: Identifier string used as the file-name prefix of every input
            array and output file.

    Returns:
        None; all results are written to disk.
    """
    # presumably the suffix encodes the p-value cut-off used to choose the
    # variants -- TODO confirm against the extraction step
    suffixes = ('5E3', '5E4', '5E5', '5E6')

    print('loading train')
    geno_train = [np.load(dir_path1+index+'_genotype_train_'+tag+'.npy') for tag in suffixes]
    print('loading train')

    print('loading valid')
    geno_valid = [np.load(dir_path2+index+'_genotype_valid_'+tag+'.npy') for tag in suffixes]
    print('loading valid')

    print('loading test')
    geno_test = [np.load(dir_path3+index+'_genotype_test_'+tag+'.npy') for tag in suffixes]
    print('loading test')

    disease_train = np.load(dir_path1+index+'_disease_train.npy')
    disease_valid = np.load(dir_path2+index+'_disease_valid.npy')
    disease_test = np.load(dir_path3+index+'_disease_test.npy')

    AUC_max = 0
    Cx_max = 0
    s_best = 0  # 1-based position of the best feature set; 0 = nothing selected

    for s, (X_train, X_valid) in enumerate(zip(geno_train, geno_valid), start=1):
        print(s)
        for Cx in [0.0001, 0.001, 0.01, 0.1]:
            LR1 = GradientBoostingClassifier(min_impurity_decrease=Cx)
            LR1.fit(X_train, disease_train)
            Y1 = LR1.predict_proba(X_valid)
            # column 1 is used as the positive-class score (assumes binary labels)
            AUC_valid = roc_auc_score(disease_valid, Y1[:, 1])
            if AUC_valid > AUC_max:
                AUC_max = AUC_valid
                Cx_max = Cx
                s_best = s

    AUC_test = 0
    if s_best:
        # Refit the winning configuration and score it on the held-out test split.
        best = s_best - 1
        LR2 = GradientBoostingClassifier(min_impurity_decrease=Cx_max)
        LR2.fit(geno_train[best], disease_train)
        Y1 = LR2.predict_proba(geno_valid[best])
        Y2 = LR2.predict_proba(geno_test[best])
        AUC_test = roc_auc_score(disease_test, Y2[:, 1])
        np.save('./GB/'+index+'_valid.npy', Y1)
        np.save('./GB/'+index+'_test.npy', Y2)

    # 'with' guarantees the result file is closed even if the write fails.
    with open('./GB/GB_'+index, 'w') as out:
        out.write(index+'\t'+str(AUC_max)+'\t'+str(AUC_test)+'\n')
# Run model selection and evaluation for every identifier in `phenotype`.
for m in phenotype:
    predict(m)


## Step 19: Disease prediction by L&E only

### Lasso Regression

In [None]:
from collections import defaultdict
import matplotlib
import re
import numpy as np
import operator
from scipy import stats as sta
import math
from scipy.interpolate import spline
import operator
from itertools import groupby
from sklearn.metrics import roc_auc_score
from operator import itemgetter
from sklearn.linear_model import LogisticRegression
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from scipy.stats import norm
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import multiprocessing
import pickle

# Directories from which predict() loads the disease label arrays of the
# train (1), validation (2) and test (3) splits.
dir_path1='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/train/'
dir_path2='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/valid/'
dir_path3='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/test/'
# Identifiers passed one by one to predict().
phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
def predict(index):
    """Select and evaluate an L1-penalised logistic regression on residual features.

    The residual feature matrices ``./<split>/<index>_<split>_residual.npy``
    and the disease labels of the train, validation and test splits are
    loaded.  Each ``C`` value is fitted on the training split and scored by
    ROC AUC on the validation split.  The best ``C`` is refitted on the
    training split; the validation and test class probabilities of that
    refitted model are saved to ``./LR/<index>_valid.npy`` and
    ``./LR/<index>_test.npy``, and the best validation AUC and the test AUC
    are written to ``./LR/LR_<index>``.

    Args:
        index: Identifier string used as the file-name prefix of every input
            array and output file.

    Returns:
        None; all results are written to disk.
    """
    Pheno_train = np.load('./train/'+index+'_train_residual.npy')
    Pheno_valid = np.load('./valid/'+index+'_valid_residual.npy')
    Pheno_test = np.load('./test/'+index+'_test_residual.npy')

    disease_train = np.load(dir_path1+index+'_disease_train.npy')
    disease_valid = np.load(dir_path2+index+'_disease_valid.npy')
    disease_test = np.load(dir_path3+index+'_disease_test.npy')

    AUC_max = 0
    Cx_max = 0

    for Cx in [0.0001, 0.001, 0.01, 0.1]:
        # liblinear is requested explicitly: the L1 penalty is rejected by
        # lbfgs, which is the default solver from scikit-learn 0.22 on.
        LR1 = LogisticRegression(penalty='l1', C=Cx, max_iter=10000,
                                 solver='liblinear')
        LR1.fit(Pheno_train, disease_train)
        valid_scores = LR1.predict_proba(Pheno_valid)
        # column 1 is used as the positive-class score (assumes binary labels)
        AUC_valid = roc_auc_score(disease_valid, valid_scores[:, 1])
        if AUC_valid > AUC_max:
            AUC_max = AUC_valid
            Cx_max = Cx

    LR2 = LogisticRegression(penalty='l1', C=Cx_max, max_iter=10000,
                             solver='liblinear')
    LR2.fit(Pheno_train, disease_train)
    # The validation predictions that get saved must come from the selected
    # model, not from whichever C value happened to be tried last in the search.
    Y1 = LR2.predict_proba(Pheno_valid)
    Y2 = LR2.predict_proba(Pheno_test)
    AUC_test = roc_auc_score(disease_test, Y2[:, 1])
    np.save('./LR/'+index+'_valid.npy', Y1)
    np.save('./LR/'+index+'_test.npy', Y2)

    # 'with' guarantees the result file is closed even if the write fails.
    with open('./LR/LR_'+index, 'w') as out:
        out.write(index+'\t'+str(AUC_max)+'\t'+str(AUC_test)+'\n')
# Run model selection and evaluation for every identifier in `phenotype`,
# printing each identifier before it is processed.
for m in phenotype:
    print(m)
    predict(m)



### Neural Network

In [None]:
from collections import defaultdict
import matplotlib
import re
import numpy as np
import operator
from scipy import stats as sta
import math
from scipy.interpolate import spline
import operator
from itertools import groupby
from sklearn.metrics import roc_auc_score
from operator import itemgetter
from sklearn.linear_model import LogisticRegression
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from scipy.stats import norm
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import multiprocessing
import pickle
from  sklearn.neural_network import MLPClassifier

# Directories from which predict() loads the disease label arrays of the
# train (1), validation (2) and test (3) splits.
dir_path1='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/train/'
dir_path2='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/valid/'
dir_path3='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/test/'
# Identifiers passed to predict(). The full list is kept commented out; only
# '1242' is processed by this cell as written.
#phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
phenotype=['1242']
def predict(index):
    """Select and evaluate a multi-layer perceptron on residual features.

    The residual feature matrices ``./<split>/<index>_<split>_residual.npy``
    and the disease labels of the train, validation and test splits are
    loaded.  Each hidden-layer layout is fitted on the training split and
    scored by ROC AUC on the validation split.  The best layout is refitted
    on the training split; the validation and test class probabilities of
    that refitted model are saved to ``./NN/<index>_valid.npy`` and
    ``./NN/<index>_test.npy``, and the best validation AUC and the test AUC
    are written to ``./NN/NN_<index>``.

    Args:
        index: Identifier string used as the file-name prefix of every input
            array and output file.

    Returns:
        None; all results are written to disk.
    """
    Pheno_train = np.load('./train/'+index+'_train_residual.npy')
    Pheno_valid = np.load('./valid/'+index+'_valid_residual.npy')
    Pheno_test = np.load('./test/'+index+'_test_residual.npy')

    disease_train = np.load(dir_path1+index+'_disease_train.npy')
    disease_valid = np.load(dir_path2+index+'_disease_valid.npy')
    disease_test = np.load(dir_path3+index+'_disease_test.npy')

    AUC_max = 0
    Cx_max = 0

    for Cx in [(20, 20, 20), (30, 30), (10, 10, 10, 10), (30, 20, 10)]:
        LR1 = MLPClassifier(hidden_layer_sizes=Cx)
        LR1.fit(Pheno_train, disease_train)
        valid_scores = LR1.predict_proba(Pheno_valid)
        # column 1 is used as the positive-class score (assumes binary labels)
        AUC_valid = roc_auc_score(disease_valid, valid_scores[:, 1])
        if AUC_valid > AUC_max:
            AUC_max = AUC_valid
            Cx_max = Cx

    # NOTE(review): the refit starts from a fresh model with no fixed
    # random_state, so its predictions need not reproduce AUC_max exactly.
    LR2 = MLPClassifier(hidden_layer_sizes=Cx_max)
    LR2.fit(Pheno_train, disease_train)
    # The validation predictions that get saved must come from the selected
    # model, not from whichever layout happened to be tried last in the search.
    Y1 = LR2.predict_proba(Pheno_valid)
    Y2 = LR2.predict_proba(Pheno_test)
    AUC_test = roc_auc_score(disease_test, Y2[:, 1])
    np.save('./NN/'+index+'_valid.npy', Y1)
    np.save('./NN/'+index+'_test.npy', Y2)

    # 'with' guarantees the result file is closed even if the write fails.
    with open('./NN/NN_'+index, 'w') as out:
        out.write(index+'\t'+str(AUC_max)+'\t'+str(AUC_test)+'\n')
# Run model selection and evaluation for every identifier in `phenotype`.
for m in phenotype:
    predict(m)



### Random Forest

In [None]:
from collections import defaultdict
import matplotlib
import re
import numpy as np
import operator
from scipy import stats as sta
import math
from scipy.interpolate import spline
import operator
from itertools import groupby
from sklearn.metrics import roc_auc_score
from operator import itemgetter
from sklearn.linear_model import LogisticRegression
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from scipy.stats import norm
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import multiprocessing
import pickle
from sklearn.ensemble import RandomForestClassifier
# Directories from which predict() loads the disease label arrays of the
# train (1), validation (2) and test (3) splits.
dir_path1='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/train/'
dir_path2='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/valid/'
dir_path3='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/test/'
# Identifiers passed one by one to predict().
phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
def predict(index):
    """Select and evaluate a random forest classifier on residual features.

    The residual feature matrices ``./<split>/<index>_<split>_residual.npy``
    and the disease labels of the train, validation and test splits are
    loaded.  Each ``min_impurity_decrease`` value is fitted on the training
    split and scored by ROC AUC on the validation split.  The best value is
    refitted on the training split; the validation and test class
    probabilities of that refitted model are saved to
    ``./RF/<index>_valid.npy`` and ``./RF/<index>_test.npy``, and the best
    validation AUC and the test AUC are written to ``./RF/RF_<index>``.

    Args:
        index: Identifier string used as the file-name prefix of every input
            array and output file.

    Returns:
        None; all results are written to disk.
    """
    Pheno_train = np.load('./train/'+index+'_train_residual.npy')
    Pheno_valid = np.load('./valid/'+index+'_valid_residual.npy')
    Pheno_test = np.load('./test/'+index+'_test_residual.npy')

    disease_train = np.load(dir_path1+index+'_disease_train.npy')
    disease_valid = np.load(dir_path2+index+'_disease_valid.npy')
    disease_test = np.load(dir_path3+index+'_disease_test.npy')

    AUC_max = 0
    Cx_max = 0

    for Cx in [0.0001, 0.001, 0.01, 0.1]:
        LR1 = RandomForestClassifier(min_impurity_decrease=Cx)
        LR1.fit(Pheno_train, disease_train)
        valid_scores = LR1.predict_proba(Pheno_valid)
        # column 1 is used as the positive-class score (assumes binary labels)
        AUC_valid = roc_auc_score(disease_valid, valid_scores[:, 1])
        if AUC_valid > AUC_max:
            AUC_max = AUC_valid
            Cx_max = Cx

    # NOTE(review): the refit builds a fresh forest with no fixed
    # random_state, so its predictions need not reproduce AUC_max exactly.
    LR2 = RandomForestClassifier(min_impurity_decrease=Cx_max)
    LR2.fit(Pheno_train, disease_train)
    # The validation predictions that get saved must come from the selected
    # model, not from whichever value happened to be tried last in the search.
    Y1 = LR2.predict_proba(Pheno_valid)
    Y2 = LR2.predict_proba(Pheno_test)
    AUC_test = roc_auc_score(disease_test, Y2[:, 1])
    np.save('./RF/'+index+'_valid.npy', Y1)
    np.save('./RF/'+index+'_test.npy', Y2)

    # 'with' guarantees the result file is closed even if the write fails.
    with open('./RF/RF_'+index, 'w') as out:
        out.write(index+'\t'+str(AUC_max)+'\t'+str(AUC_test)+'\n')
# Run model selection and evaluation for every identifier in `phenotype`.
for m in phenotype:
    predict(m)


### Adaboost

In [None]:
from collections import defaultdict
import matplotlib
import re
import numpy as np
import operator
from scipy import stats as sta
import math
from scipy.interpolate import spline
import operator
from itertools import groupby
from sklearn.metrics import roc_auc_score
from operator import itemgetter
from sklearn.linear_model import LogisticRegression
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from scipy.stats import norm
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import multiprocessing
import pickle
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
# Directories holding the disease labels for the train / validation / test splits.
dir_path1, dir_path2, dir_path3 = (
    '/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/train/',
    '/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/valid/',
    '/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/test/',
)

# Phenotype codes for which a model is trained and evaluated (used as file-name prefixes).
phenotype = [
    '1002', '1348', '1066', '1111', '1134', '1154',
    '1220', '1242', '1265', '1294', '1297', '1374',
    '1065', '1068', '1113', '1136', '1207', '1224',
    '1243', '1293', '1295', '1452',
]
def predict(index):
    """Select an AdaBoost base estimator and evaluate it for one phenotype code.

    Loads '<index>_{train,valid,test}_residual.npy' from ./train, ./valid and
    ./test as features and '<index>_disease_{train,valid,test}.npy' from
    dir_path1 / dir_path2 / dir_path3 as labels.  One AdaBoostClassifier is
    fitted per candidate base estimator (decision tree, logistic regression,
    extra tree, Gaussian naive Bayes) and the fit with the highest validation
    AUC is kept.

    Outputs (all under ./ada/):
      * '<index>_valid.npy' - predict_proba output of the selected model on the validation set
      * '<index>_test.npy'  - predict_proba output of the same model on the test set
      * 'ada_<index>'       - one line: index, best validation AUC, test AUC (tab separated)

    Parameters
    ----------
    index : str
        Phenotype code used as the file-name prefix.
    """
    Pheno_train = np.load('./train/'+index+'_train_residual.npy')
    Pheno_valid = np.load('./valid/'+index+'_valid_residual.npy')
    Pheno_test = np.load('./test/'+index+'_test_residual.npy')

    disease_train = np.load(dir_path1+index+'_disease_train.npy')
    disease_valid = np.load(dir_path2+index+'_disease_valid.npy')
    disease_test = np.load(dir_path3+index+'_disease_test.npy')

    # Start below any attainable AUC so the first candidate is always recorded
    # (a '' placeholder for the base estimator could otherwise reach the final model).
    AUC_max = float('-inf')
    best_model = None
    Y1_best = None

    for Cx in [DecisionTreeClassifier(), LogisticRegression(), ExtraTreeClassifier(), GaussianNB()]:
        # The base estimator is passed positionally: it is the first parameter
        # both where the keyword is `base_estimator` and where it is `estimator`.
        LR1 = AdaBoostClassifier(Cx)
        LR1.fit(Pheno_train, disease_train)
        Y1 = LR1.predict_proba(Pheno_valid)
        AUC_valid = roc_auc_score(disease_valid, Y1[:, 1])
        if AUC_valid > AUC_max:
            AUC_max = AUC_valid
            # Keep the fitted model and its validation predictions: saving the
            # loop variable after the loop would store the LAST candidate's
            # predictions rather than the selected one's.
            best_model = LR1
            Y1_best = Y1

    # Reuse the selected fit so validation and test predictions come from the
    # same model (no unseeded refit).
    Y2 = best_model.predict_proba(Pheno_test)
    AUC_test = roc_auc_score(disease_test, Y2[:, 1])

    np.save('./ada/'+index+'_valid.npy', Y1_best)
    np.save('./ada/'+index+'_test.npy', Y2)
    with open('./ada/ada_'+index, 'w') as out:
        out.write(index+'\t'+str(AUC_max)+'\t'+str(AUC_test)+'\n')
# Run the AdaBoost selection/evaluation for every phenotype code in turn;
# each call writes its own result files under ./ada/.
for m in phenotype:
    predict(m)


### Gradient boosting

In [None]:
from collections import defaultdict
import matplotlib
import re
import numpy as np
import operator
from scipy import stats as sta
import math
from scipy.interpolate import spline
import operator
from itertools import groupby
from sklearn.metrics import roc_auc_score
from operator import itemgetter
from sklearn.linear_model import LogisticRegression
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from scipy.stats import norm
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import multiprocessing
import pickle
from sklearn.ensemble import GradientBoostingClassifier
# Directories holding the disease labels for the train / validation / test splits.
dir_path1, dir_path2, dir_path3 = (
    '/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/train/',
    '/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/valid/',
    '/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/test/',
)

# Phenotype codes for which a model is trained and evaluated (used as file-name prefixes).
phenotype = [
    '1002', '1348', '1066', '1111', '1134', '1154',
    '1220', '1242', '1265', '1294', '1297', '1374',
    '1065', '1068', '1113', '1136', '1207', '1224',
    '1243', '1293', '1295', '1452',
]
def predict(index):
    """Tune and evaluate a gradient-boosting classifier for one phenotype code.

    Loads '<index>_{train,valid,test}_residual.npy' from ./train, ./valid and
    ./test as features and '<index>_disease_{train,valid,test}.npy' from
    dir_path1 / dir_path2 / dir_path3 as labels.  One
    GradientBoostingClassifier is fitted per candidate min_impurity_decrease
    and the fit with the highest validation AUC is kept.

    Outputs (all under ./GB/):
      * '<index>_valid.npy' - predict_proba output of the selected model on the validation set
      * '<index>_test.npy'  - predict_proba output of the same model on the test set
      * 'GB_<index>'        - one line: index, best validation AUC, test AUC (tab separated)

    Parameters
    ----------
    index : str
        Phenotype code used as the file-name prefix.
    """
    Pheno_train = np.load('./train/'+index+'_train_residual.npy')
    Pheno_valid = np.load('./valid/'+index+'_valid_residual.npy')
    Pheno_test = np.load('./test/'+index+'_test_residual.npy')

    disease_train = np.load(dir_path1+index+'_disease_train.npy')
    disease_valid = np.load(dir_path2+index+'_disease_valid.npy')
    disease_test = np.load(dir_path3+index+'_disease_test.npy')

    # Start below any attainable AUC so the first candidate is always recorded.
    AUC_max = float('-inf')
    best_model = None
    Y1_best = None

    for Cx in [0.0001, 0.001, 0.01, 0.1]:
        LR1 = GradientBoostingClassifier(min_impurity_decrease=Cx)
        LR1.fit(Pheno_train, disease_train)
        Y1 = LR1.predict_proba(Pheno_valid)
        AUC_valid = roc_auc_score(disease_valid, Y1[:, 1])
        if AUC_valid > AUC_max:
            AUC_max = AUC_valid
            # Keep the fitted model and its validation predictions: saving the
            # loop variable after the loop would store the LAST candidate's
            # predictions rather than the selected one's.
            best_model = LR1
            Y1_best = Y1

    # Reuse the selected fit so validation and test predictions come from the
    # same model, and one redundant training run is avoided.
    Y2 = best_model.predict_proba(Pheno_test)
    AUC_test = roc_auc_score(disease_test, Y2[:, 1])

    np.save('./GB/'+index+'_valid.npy', Y1_best)
    np.save('./GB/'+index+'_test.npy', Y2)
    with open('./GB/GB_'+index, 'w') as out:
        out.write(index+'\t'+str(AUC_max)+'\t'+str(AUC_test)+'\n')
# Run the gradient-boosting tuning/evaluation for every phenotype code in turn,
# printing the code first as a progress marker; results are written under ./GB/.
for m in phenotype:
    print(m)
    predict(m)



## Step 20: Disease prediction by joint model (score aggregation)

In [None]:
from collections import defaultdict
import matplotlib
import re
import statsmodels.api as sm
from sklearn.decomposition import PCA
import numpy as np
import operator
from scipy import stats as sta
import math
from scipy.interpolate import spline
import multiprocessing
import operator
import scipy
from itertools import groupby
from sklearn.metrics import roc_auc_score
from operator import itemgetter
from sklearn.linear_model import LogisticRegression
from scipy.stats.stats import pearsonr
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
def generate(index,out_test):
    """Combine genotype- and phenotype-based scores for one phenotype code.

    For each model family (LR, NN, RF, GB, ada) the saved genotype and
    phenotype prediction arrays are concatenated column-wise, an L1-penalised
    logistic regression is fitted on the validation split and applied to both
    splits.  The validation and test probabilities are saved to
    './<model>/<index>_valid.npy' and './<model>/<index>_test.npy', and one
    tab-separated line (index, then test AUCs in the order LR, RF, NN, ada, GB,
    rounded to 3 decimals) is appended to out_test.

    Parameters
    ----------
    index : str
        Phenotype code used as the file-name prefix.
    out_test : writable text file object
        Open handle that receives the summary line; it is not closed here.

    Notes
    -----
    The saved validation probabilities are in-sample, because the combining
    model is fitted on the validation split.
    The input arrays are presumably the two-column predict_proba outputs saved
    by the per-model steps - TODO confirm against the files on disk.
    """
    print(index)
    dir_data0='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/valid/'
    dir_data1='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/meta_analysis/data_extraction/test/'
    dir_data2='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/prediction_final/prediction/final/ML_genotype/'
    dir_data3='/oak/stanford/groups/arend/Eric/UKBB/Genenvironment/prediction_final/prediction/final/ML_phenotype/remove_age_effect/'
    models=['LR','NN','RF','GB','ada']

    disease_valid=np.load(dir_data0+index+'_disease_valid.npy')
    disease_test=np.load(dir_data1+index+'_disease_test.npy')

    # Load and merge every input up front so a missing file aborts the run
    # before any output is written.
    merged={}
    for model in models:
        merged[model]=tuple(
            np.concatenate(
                (np.load(dir_data2+'/'+model+'/'+index+'_'+split+'.npy'),
                 np.load(dir_data3+'/'+model+'/'+index+'_'+split+'.npy')),
                axis=1)
            for split in ('valid','test'))

    AUC={}
    for model in models:
        merge_valid,merge_test=merged[model]
        # liblinear is named explicitly: the L1 penalty is not supported by the
        # lbfgs solver that newer scikit-learn releases use by default.
        combiner=LogisticRegression(penalty='l1', solver='liblinear', C=10000, max_iter=10000)
        combiner.fit(merge_valid,disease_valid)
        Y_valid=combiner.predict_proba(merge_valid)
        Y_test=combiner.predict_proba(merge_test)
        AUC[model]=roc_auc_score(disease_test,Y_test[:,1])
        # The *_valid file gets validation-set predictions (previously the
        # test-set predictions were written to both files).
        np.save('./'+model+'/'+index+'_valid.npy',Y_valid)
        np.save('./'+model+'/'+index+'_test.npy',Y_test)

    # Column order of the summary line is fixed: LR, RF, NN, ada, GB.
    out_test.write(str(index)+'\t'+'\t'.join(str(round(AUC[name],3)) for name in ['LR','RF','NN','ada','GB'])+'\n')
# Phenotype codes whose genotype/phenotype scores are aggregated.
phenotype=['1002','1348','1066','1111','1134','1154','1220','1242','1265','1294','1297','1374','1065','1068','1113','1136','1207','1224','1243','1293','1295','1452']
# The context manager guarantees the summary file is flushed and closed even if
# generate() raises part-way through the list.
with open('all_mergescore','w') as out_test:
    for index in phenotype:
        print(index)
        generate(index,out_test)
