In [1]:
import sys
import hail as hl
import pandas as pd

In [2]:
def remove_unnamed(df: pd.DataFrame) -> pd.DataFrame:
    """Return `df` without the columns whose label starts with 'Unnamed'.

    Such columns typically appear when a CSV/TSV that was saved with its
    index is read back by pandas (e.g. 'Unnamed: 0').

    Labels are converted with `str()` before the prefix check, so frames
    with non-string column labels (ints, mixed types) are handled instead of
    raising from the `.str` accessor.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The columns of `df` whose label does not start with 'Unnamed',
        in their original order.
    """
    # Boolean mask (one entry per column) selected positionally by .loc,
    # which also keeps duplicate column labels working.
    keep = [not str(label).startswith('Unnamed') for label in df.columns]
    return df.loc[:, keep]

## Make SAMPLE QC TABLE

DEFINE INPUT AND OUTPUT

In [23]:
# Both paths share one stem: the .tsv is the raw sample-QC input that is
# imported below, the .kt is the Hail table this section writes out.
_SAMPLE_QC_STEM = "/mnt/i/UKB_DATA/imputed_UKB/qc_sphericity_index"
SAMPLE_QC_FILE = _SAMPLE_QC_STEM + ".tsv"
SAMPLE_QC_TABLE = _SAMPLE_QC_STEM + ".kt"

READ SAMPLE QC

In [24]:
# Load the sample-QC TSV as a Hail Table. No `types`/`impute` argument is
# given, so every field is read as str (see the log line below: 64 str fields).
df = hl.import_table(SAMPLE_QC_FILE)

2022-05-16 01:10:38 Hail: INFO: Loading 64 fields. Counts by type:
  str: 64


In [25]:
# All fields were imported as str, so numeric covariates are cast to float64
# and the coded QC columns are turned into booleans by comparing against
# their string codes.
pc_fields = {f"PC{i}": hl.float64(df[f"PC{i}"]) for i in range(1, 11)}

df = df.annotate(
    **pc_fields,
    age_at_MRI=hl.float64(df['age_at_MRI']),
    eid=df['eid'],
    # True where the `ethnicity` column holds the code "1.0".
    in_white_British_ancestry_subset=(df['ethnicity'] == "1.0"),
    used_in_pca_calculation=(df['used.in.pca.calculation'] == "1.0"),
    # Note the inverted code: the flag is True when excess.relatives is "0.0".
    no_excess_relatives=(df['excess.relatives'] == "0.0"),
    putative_sex_chromosome_aneuploidy=(df['putative.sex.chromosome.aneuploidy'] == "1.0"),
    # True where Inferred.Gender is coded '0.0'.
    isFemale=(df['Inferred.Gender'] == '0.0'),
)

In [26]:
# Keep only rows whose in_white_British_ancestry_subset flag is True
# (the flag is already boolean, so it is used directly as the predicate).
df = df.filter(df.in_white_British_ancestry_subset)

In [27]:
# Keep only rows whose no_excess_relatives flag is True
# (the flag is already boolean, so it is used directly as the predicate).
df = df.filter(df.no_excess_relatives)

In [8]:
# NOTE(review): the used_in_pca_calculation filter is disabled, so the written
# table is NOT restricted to samples used in the PCA calculation. Confirm this
# is intentional; otherwise re-enable the line or delete this cell.
#df = df.filter(df.used_in_pca_calculation==True)

In [28]:
# Reduce the table to the sample id plus the covariates that are written out:
# sex flag, age at MRI and the ten principal components PC1..PC10.
df = df.select(
    'eid',
    'isFemale',
    'age_at_MRI',
    *(f'PC{i}' for i in range(1, 11)),
)

In [29]:
# Persist the filtered covariate table to SAMPLE_QC_TABLE; overwrite=True
# replaces any table already present at that path.
df.write(SAMPLE_QC_TABLE,overwrite=True)

2022-05-16 01:10:58 Hail: INFO: wrote table with 22590 rows in 1 partition to /mnt/i/UKB_DATA/imputed_UKB/qc_sphericity_index.kt


In [30]:
# Number of rows remaining after the filters above (displayed as the cell output).
df.count()

22590

## BUILD PIPELINES

In [31]:
import sys
import hail as hl
import pandas as pd

In [64]:
# Phenotype to attach to the sample-QC covariates; it selects both the input
# TSV and the name of the output table.
pheno = "BSA_length"

# --- covariates -------------------------------------------------------------
SAMPLE_QC_TABLE = '/mnt/i/UKB_DATA/imputed_UKB/qc_sphericity_index.kt'
kt = hl.read_table(SAMPLE_QC_TABLE)
n_samples = kt.count()

# --- phenotype values ---------------------------------------------------------
PHESANT_FILE = f"/mnt/i/UKB_DATA/tsv_pheno/{pheno}.tsv"
traits = hl.import_table(PHESANT_FILE, impute=True)
# Type imputation reads `idx` as int32 (see the log below) while `eid` in the
# covariate table is a str, so `idx` is converted to str to make the keys match.
traits = traits.annotate(
    idx=hl.str(traits.idx),
    pheno=hl.float64(traits.pheno),
).key_by('idx')

# --- join and write -----------------------------------------------------------
# `kt` is re-keyed first and rebound, so that `kt.eid` in the lookup below
# refers to the re-keyed table.
kt = kt.key_by('eid')
# NOTE(review): this is a per-row lookup into `traits`; samples without a
# matching `idx` presumably end up with a missing `pheno` rather than being
# dropped (the written table still has all 22590 rows) - confirm this is wanted.
kt = kt.annotate(pheno=traits[kt.eid].pheno)

pipeline_table_path = f"/mnt/i/UKB_DATA/pipelines/{pheno}.kt"
kt.write(pipeline_table_path, overwrite=True)

22590


2022-05-16 01:29:29 Hail: INFO: Reading table to impute column types
2022-05-16 01:29:30 Hail: INFO: Finished type imputation
  Loading field '' as type int32 (imputed)
  Loading field 'Unnamed: 0' as type int32 (imputed)
  Loading field 'idx' as type int32 (imputed)
  Loading field 'pheno' as type float64 (imputed)
2022-05-16 01:29:30 Hail: INFO: Coerced sorted dataset
2022-05-16 01:29:30 Hail: INFO: Coerced sorted dataset
2022-05-16 01:29:31 Hail: INFO: wrote table with 22590 rows in 1 partition to /mnt/i/UKB_DATA/pipelines/BSA_length.kt
2022-05-16 01:43:26 Hail: INFO: Reading table to impute column types
2022-05-16 01:43:28 Hail: INFO: Finished type imputation
  Loading field '' as type int32 (imputed)
  Loading field 'Unnamed: 0' as type int32 (imputed)
  Loading field 'idx' as type int32 (imputed)
  Loading field 'pheno' as type float64 (imputed)
2022-05-16 01:43:29 Hail: INFO: Coerced sorted dataset
2022-05-16 01:43:29 Hail: INFO: Coerced sorted dataset
2022-05-16 01:43:30 Hail: 

In [63]:
# Preview the first 10 rows of the covariate table, including the joined `pheno` field.
kt.show(10)

2022-05-16 01:29:18 Hail: INFO: Coerced sorted dataset
2022-05-16 01:29:19 Hail: INFO: Coerced sorted dataset


eid,isFemale,age_at_MRI,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,pheno
str,bool,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
"""1000158""",False,44.0,-11.6,2.74,-1.26,1.58,-7.08,-3.52,0.97,-3.36,1.05,1.33,43.3
"""1000243""",False,59.0,-13.9,4.2,-3.87,0.394,1.53,-1.22,-0.483,-2.18,1.72,-0.786,50.9
"""1000262""",True,58.0,-14.6,5.0,-1.88,4.13,-0.57,0.544,-1.3,1.67,-4.42,1.21,44.4
"""1000489""",False,66.0,-11.3,4.54,-3.7,2.77,-8.5,-1.19,-0.19,-1.82,8.88,5.65,52.3
"""1000563""",True,49.0,-13.6,3.44,-1.51,2.02,5.62,-3.65,2.92,-0.856,-26.0,4.15,44.8
"""1000670""",False,68.0,-12.2,4.78,-1.68,-0.0945,-5.13,1.35,-1.17,-2.67,-3.86,0.803,46.2
"""1001023""",False,58.0,-12.6,4.62,-0.956,0.922,1.01,3.02,-0.558,0.269,-8.24,0.787,49.5
"""1001525""",False,62.0,-12.8,5.11,-1.49,4.08,9.27,2.14,-0.0849,0.208,2.99,-0.63,49.8
"""1001614""",False,58.0,-13.2,2.34,0.0218,3.12,-1.06,-0.326,0.317,-0.828,5.16,-2.01,41.6
"""1001620""",False,62.0,-12.2,0.521,-0.495,-2.0,-7.73,1.52,2.84,-1.15,2.28,-2.41,48.4


In [59]:
# Preview the first 10 rows of the phenotype table (keyed by the str `idx`).
traits.show(10)

Unnamed: 0_level_0,Unnamed: 0,idx,pheno
int32,int32,str,float64
0,0,"""1000158""",43.3
1,1,"""1000243""",50.9
2,2,"""1000262""",44.4
3,3,"""1000369""",52.3
4,4,"""1000489""",52.3
5,5,"""1000490""",52.1
6,6,"""1000542""",37.6
7,7,"""1000563""",44.8
8,8,"""1000670""",46.2
9,9,"""1000816""",49.8


In [46]:
# Row count of `kt` (displayed as the cell output).
kt.count()

22590

In [43]:
# NOTE(review): `table_joined` is not defined in any visible cell, so this
# cell raises NameError on Restart & Run All. Judging by the output it looks
# like a join of `traits` with the covariate table - restore its definition
# or delete this cell.
table_joined.show(10)

2022-05-16 01:19:52 Hail: INFO: Coerced sorted dataset
2022-05-16 01:19:52 Hail: INFO: Coerced sorted dataset


idx,Unnamed: 1_level_0,Unnamed: 0,pheno,isFemale,age_at_MRI,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
str,int32,int32,float64,bool,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
"""1000158""",0,0,43.3,False,44.0,-11.6,2.74,-1.26,1.58,-7.08,-3.52,0.97,-3.36,1.05,1.33
"""1000243""",1,1,50.9,False,59.0,-13.9,4.2,-3.87,0.394,1.53,-1.22,-0.483,-2.18,1.72,-0.786
"""1000262""",2,2,44.4,True,58.0,-14.6,5.0,-1.88,4.13,-0.57,0.544,-1.3,1.67,-4.42,1.21
"""1000489""",4,4,52.3,False,66.0,-11.3,4.54,-3.7,2.77,-8.5,-1.19,-0.19,-1.82,8.88,5.65
"""1000563""",7,7,44.8,True,49.0,-13.6,3.44,-1.51,2.02,5.62,-3.65,2.92,-0.856,-26.0,4.15
"""1000670""",8,8,46.2,False,68.0,-12.2,4.78,-1.68,-0.0945,-5.13,1.35,-1.17,-2.67,-3.86,0.803
"""1001023""",10,10,49.5,False,58.0,-12.6,4.62,-0.956,0.922,1.01,3.02,-0.558,0.269,-8.24,0.787
"""1001525""",15,15,49.8,False,62.0,-12.8,5.11,-1.49,4.08,9.27,2.14,-0.0849,0.208,2.99,-0.63
"""1001614""",16,16,41.6,False,58.0,-13.2,2.34,0.0218,3.12,-1.06,-0.326,0.317,-0.828,5.16,-2.01
"""1001620""",17,17,48.4,False,62.0,-12.2,0.521,-0.495,-2.0,-7.73,1.52,2.84,-1.15,2.28,-2.41
