In [1]:
import hail as hl

In [3]:
table = hl.import_table('data/1kg_annotations.txt', impute=True).key_by('Sample')

mt = hl.read_matrix_table('data/1kg.mt')
mt = mt.annotate_cols(pheno = table[mt.s])
mt = hl.sample_qc(mt)
mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4) & (mt.sample_qc.call_rate >= 0.97))
ab = mt.AD[1] / hl.sum(mt.AD)
filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1)) |
                        (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)) |
                        (mt.GT.is_hom_var() & (ab >= 0.9)))
mt = mt.filter_entries(filter_condition_ab)
mt = hl.variant_qc(mt)
mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)

eigenvalues, pcs, _ = hl.hwe_normalized_pca(mt.GT)

mt = mt.annotate_cols(scores = pcs[mt.s].scores)
gwas = hl.linear_regression_rows(
    y=mt.pheno.CaffeineConsumption,
    x=mt.GT.n_alt_alleles(),
    covariates=[1.0, mt.pheno.isFemale, mt.scores[0], mt.scores[1], mt.scores[2]])

Initializing Hail with default parameters...
Running on Apache Spark version 3.1.2
SparkUI available at http://fc320f200603:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.74-0c3a74d12093
LOGGING: writing to /workspace/UK_Biobank_GWAS/imputed-v2-gwas/hail-20210912-0326-0.2.74-0c3a74d12093.log
2021-09-12 03:26:23 Hail: INFO: Reading table to impute column types
2021-09-12 03:26:24 Hail: INFO: Finished type imputation
  Loading field 'Sample' as type str (imputed)
  Loading field 'Population' as type str (imputed)
  Loading field 'SuperPopulation' as type str (imputed)
  Loading field 'isFemale' as type bool (imputed)
  Loading field 'PurpleHair' as type bool (imputed)
  Loading field 'CaffeineConsumption' as type int32 (imputed)
2021-09-12 03:26:43 Hail: INFO: hwe_normalized_pca: running PCA using 9087 variants.
2021-09-12 03:26:44 Hail: INFO: pca: running PCA with 10 components...
2021-09-12 03:26:51 Hail: INFO: linear_regr

In [4]:
from bokeh.io import output_file, save
from bokeh.layouts import gridplot

In [5]:
p = hl.plot.manhattan(gwas.p_value)
output_file("man.html")
save(p)

p = hl.plot.qq(gwas.p_value)
output_file("qq_plot.html")
save(p)

2021-09-12 03:27:11 Hail: INFO: Ordering unsorted dataset with network shuffle


'/workspace/UK_Biobank_GWAS/imputed-v2-gwas/qq_plot.html'

In [3]:
pip = hl.read_table("pipeline.kt")

Initializing Hail with default parameters...
Running on Apache Spark version 3.1.2
SparkUI available at http://fc320f200603:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.74-0c3a74d12093
LOGGING: writing to /workspace/UK_Biobank_GWAS/imputed-v2-gwas/hail-20210912-1023-0.2.74-0c3a74d12093.log


In [5]:
pip.count()

20982

In [6]:
import pandas as pd

In [None]:
pd.read_csv("/data/NAS/UKBB_data/")

In [8]:
pd.read_csv("hwpheno.tsv")

Unnamed: 0,\tidx\thw
0,0\t1000012\t1.8028148864390081
1,1\t1000158\t1.9981274638633375
2,3\t1000262\t1.6039497124755615
3,4\t1000369\t1.655383891336973
4,5\t1000489\t1.8030013882610332
...,...
31712,36263\t6024290\t1.9076921986708153
31713,36264\t6024323\t1.8359100394696672
31714,36265\t6024624\t1.9466454526215264
31715,36266\t6024638\t1.358076329850924


In [7]:
%ls

[0m[01;32m1_merge_mfi.sh[0m*
2_fam_sqc_merge.R
3_make_sample_qc_table.py
4_build_pipelines.py
5_make_variant_annotation_vds.py
6_filter_gwas_variants.py
7_run_linreg3.py
8_export_results.py
GwasTutorial.ipynb
[01;34mHRC.mt[0m/
HRC.r1-1.GRCh37.wgs.mac5.sites.tab
HRC.r1-1.GRCh37.wgs.mac5.sites.vcf
HRC.vcf.bgz
Manhattan_plot.R
PracticeHail.ipynb
QQ_plot.R
README.md
[01;34mall_variants.vds[0m/
chin_traits.csv
collect.py
[01;34mdata[0m/
file.vcf.bgz
[01;32mgwas.rc[0m*
[01;32mgwas.rc~[0m*
hail-20210912-0306-0.2.74-6106cbc36a0b.log
hail-20210912-0307-0.2.74-6106cbc36a0b.log
hail-20210912-0310-0.2.74-0c3a74d12093.log
hail-20210912-0311-0.2.74-6106cbc36a0b.log
hail-20210912-0312-0.2.74-6106cbc36a0b.log
hail-20210912-0316-0.2.74-6106cbc36a0b.log
hail-20210912-0317-0.2.74-6106cbc36a0b.log
hail-20210912-0320-0.2.74-0c3a74d12093.log
hail-20210912-0321-0.2.74-6106cbc36a0b.log
hail-20210912-0326-0.2.74-0c3a74d12093.log
hail-20210912-0332-0.2.74-6106cbc36

In [1]:
import hail as hl

In [2]:
BGEN_FILES = '/workspace/UKBB_data/imputed/ukb22828_c1_b0_v3.bgen/'
SAMPLE_FILE = "/workspace/UKBB_data/imputed/joined.sample"
MFI_TABLE ='mfi_joined.kt'

In [3]:
bgen = hl.import_bgen(path = BGEN_FILES,
                      sample_file= SAMPLE_FILE, entry_fields=['GT', 'GP','dosage'])

Initializing Hail with default parameters...
Running on Apache Spark version 3.1.2
SparkUI available at http://fc320f200603:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.74-0c3a74d12093
LOGGING: writing to /workspace/UK_Biobank_GWAS/imputed-v2-gwas/hail-20210914-0131-0.2.74-0c3a74d12093.log
2021-09-14 01:31:46 Hail: INFO: Number of BGEN files parsed: 1
2021-09-14 01:31:46 Hail: INFO: Number of samples in BGEN files: 487409
2021-09-14 01:31:46 Hail: INFO: Number of variants across all BGEN files: 7402791


In [4]:
bgen.count()

(7402791, 487409)

In [5]:
mfi_table = hl.read_table(MFI_TABLE).select('rsid','info')

In [6]:
mfi_table.count()

52900288

In [7]:
data=bgen.annotate_rows(mfi=mfi_table[bgen.row.rsid])


In [8]:
data=data.filter_rows(data.mfi.info > 0.8) 


In [9]:
data.count()

KeyboardInterrupt: 