## SNPs matrix

### RA_control_white1

In [1]:
%env DATASET_58404439_VCF_DIR=gs://fc-secure-28df46b0-6f9d-4443-ae5f-cb0492e90c24/genomic-extractions/0268f8aa-fa72-418c-ad7b-6ea510b443d4/vcfs

env: DATASET_58404439_VCF_DIR=gs://fc-secure-28df46b0-6f9d-4443-ae5f-cb0492e90c24/genomic-extractions/0268f8aa-fa72-418c-ad7b-6ea510b443d4/vcfs


In [2]:
import os
import subprocess

# The extraction workflow writes manifest.txt into the VCF directory once it
# has finished, so a zero exit status from `gsutil stat` on that object is
# treated as "extraction complete".
manifest_file = os.environ['DATASET_58404439_VCF_DIR'] + '/manifest.txt'

banner = "!" * 100
not_ready_message = (
    banner + "\n\n"
    + "VCF extraction has not completed.\n"
    + "Please monitor the extraction sidepanel for completion before continuing.\n\n"
    + banner
)

gsutil_stat = subprocess.run(['gsutil', '-q', 'stat', manifest_file])
assert gsutil_stat.returncode == 0, not_ready_message

print("VCF extraction has completed, continuing")


VCF extraction has completed, continuing


In [3]:
# Confirm Spark is installed; if it is not, print a hint about the required
# compute type (the Hail import below is still attempted).
try:
    import pyspark
except ModuleNotFoundError:
    print("!" * 100 + "\n\n"
          "In the Researcher Workbench, Hail can only be used on a Dataproc cluster.\n"
          "Please use the 'Cloud Analysis Environment' side panel to update your runtime compute type.\n\n" +
          "!" * 100)

# Initialize Hail
import hail as hl
import os
from hail.plot import show

# Passing default_reference to hl.init() is deprecated (Hail prints a warning
# about it at start-up), so the reference genome is set in a separate call.
# idempotent=True makes re-running this cell a no-op if Hail is already up.
hl.init(idempotent=True)
hl.default_reference('GRCh38')
hl.plot.output_notebook()


Using hl.init with a default_reference argument is deprecated. To set a default reference genome after initializing hail, call `hl.default_reference` with an argument to set the default reference genome.


Reading spark-defaults.conf to determine GCS requester pays configuration. This is deprecated. Please use `hailctl config set gcs_requester_pays/project` and `hailctl config set gcs_requester_pays/buckets`.

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.3.0
SparkUI available at http://all-of-us-22602-m.us-central1-c.c.terra-vpc-sc-39ac9e8b.internal:45053
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.130.post1-c69cd67afb8b
LOGGING: writing to /home/jupyter/workspaces/multimodeltestzhiyu/hail-20250214-1626-0.2.130.post1-c69cd67afb8b.log


In [4]:
# Locations for this cohort's Hail matrix table:
#   vcf_dir               - directory of the extracted VCFs
#   hail_matrix_table_gcs - matrix table path under WORKSPACE_BUCKET
workspace_bucket = os.environ["WORKSPACE_BUCKET"]
vcf_dir = os.environ["DATASET_58404439_VCF_DIR"]
hail_matrix_table_gcs = workspace_bucket + "/dataset_58404439.mt"

In [5]:
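# One-time import of the extracted VCFs into a Hail matrix table.
# This can take a few hours for a dataset with hundreds of participants.
# The call is commented out here; uncomment it to (re)build the matrix table
# that is read back in the next cell.
#hl.import_vcf(f'{vcf_dir}/*.vcf.gz', force_bgz=True, array_elements_required=False).write(hail_matrix_table_gcs)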

2025-02-07 04:04:18.683 Hail: INFO: scanning VCF for sortedness...0 + 3) / 2013]
2025-02-07 04:34:09.818 Hail: INFO: Coerced sorted VCF - no additional import work to do
2025-02-07 12:40:39.994 Hail: INFO: wrote matrix table with 72178139 rows and 4361 columns in 2050 partitions to gs://fc-secure-28df46b0-6f9d-4443-ae5f-cb0492e90c24/dataset_58404439.mt


In [5]:
# Open the Hail matrix table stored at hail_matrix_table_gcs.
mt = hl.read_matrix_table(hail_matrix_table_gcs)

In [6]:
# Variants of interest, written as "<contig without 'chr'>:<position>:<ref>:<alt>".
snp_ids = [
    "14:104920174:G:A", "6:159082054:A:G", "14:68287978:G:A",
    "6:36414159:G:GA", "13:39781776:T:C", "12:45976333:C:G",
    "12:111446804:T:C", "9:34710263:G:A", "5:143224856:A:G",
    "1:116738074:C:T"
]

# Row-level identifier in the same format, so rows can be matched against
# snp_ids. Only the first alternate allele (alleles[1]) is encoded.
mt = mt.annotate_rows(snp_id=mt.locus.contig.replace("chr", "") + ":" + hl.str(mt.locus.position) + ":" + mt.alleles[0] + ":" + mt.alleles[1])

# Narrow the table to the target positions by locus interval before applying
# the string predicate. filter_intervals works on the row key, which allows
# Hail to skip data outside the intervals instead of evaluating snp_id on
# every row of the full table.
target_intervals = []
for snp in snp_ids:
    contig, position = snp.split(":")[:2]
    start = int(position)
    # Half-open interval [start, start + 1) covers exactly one position.
    target_intervals.append(
        hl.locus_interval(f"chr{contig}", start, start + 1, reference_genome="GRCh38")
    )
mt_at_loci = hl.filter_intervals(mt, target_intervals)

# Exact match on position and alleles: keeps the same rows as filtering the
# full table on snp_id.
snp_set = hl.set(snp_ids)
mt_filtered = mt_at_loci.filter_rows(snp_set.contains(mt_at_loci.snp_id))

# NOTE(review): the checkpoint path says "ra_case" although this section
# processes a control cohort, and the same path is overwritten by every
# section of this notebook. Consider a cohort-specific path.
mt_filtered = mt_filtered.checkpoint("ra_case_filtered_mt_checkpoint.mt", overwrite=True)

2025-02-14 17:09:49.310 Hail: INFO: wrote matrix table with 10 rows and 4361 columns in 2050 partitions to ra_case_filtered_mt_checkpoint.mt


In [7]:
# Display the row (variant-level) fields of the filtered matrix table.
mt_filtered.rows().show()



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,info,info,info,info,info,info,info,Unnamed: 12_level_0
locus,alleles,rsid,qual,filters,AC,AF,AN,AS_QUALapprox,AS_VQSLOD,AS_YNG,QUALapprox,snp_id
locus<GRCh38>,array<str>,str,float64,set<str>,array<int32>,array<float64>,int32,str,array<str>,array<str>,int32,str
chr1:116738074,"[""C"",""T""]",,-10.0,,[6026],[6.91e-01],8718,"""0|576337""","[""20.0147""]","[""Y""]",163,"""1:116738074:C:T"""
chr5:143224856,"[""A"",""G""]",,-10.0,,[1206],[1.38e-01],8722,"""0|110507""","[""20.3526""]","[""Y""]",85,"""5:143224856:A:G"""
chr6:36414159,"[""G"",""GA""]",,-10.0,,[1182],[1.36e-01],8722,"""0|113787""","[""19.5238""]","[""Y""]",183,"""6:36414159:G:GA"""
chr6:159082054,"[""A"",""G""]",,-10.0,,[1536],[1.76e-01],8722,"""0|144762""","[""16.2122""]","[""Y""]",144,"""6:159082054:A:G"""
chr9:34710263,"[""G"",""A""]",,-10.0,,[3995],[4.58e-01],8722,"""0|386966""","[""20.6603""]","[""Y""]",166,"""9:34710263:G:A"""
chr12:45976333,"[""C"",""G""]",,-10.0,,[2526],[2.90e-01],8722,"""0|251734""","[""20.4946""]","[""Y""]",190,"""12:45976333:C:G"""
chr12:111446804,"[""T"",""C""]",,-10.0,,[2644],[3.03e-01],8722,"""0|255167""","[""20.503""]","[""Y""]",85,"""12:111446804:T:C"""
chr13:39781776,"[""T"",""C""]",,-10.0,,[706],[8.10e-02],8720,"""0|67083""","[""19.9256""]","[""Y""]",85,"""13:39781776:T:C"""
chr14:68287978,"[""G"",""A""]",,-10.0,,[177],[2.00e-02],8722,"""0|16296""","[""20.1817""]","[""Y""]",85,"""14:68287978:G:A"""
chr14:104920174,"[""G"",""A""]",,-10.0,,[5211],[5.97e-01],8722,"""0|553334""","[""19.5663""]","[""G""]",211,"""14:104920174:G:A"""


In [8]:
# Add an entry field `allele_count` holding the number of alternate alleles
# carried by each individual at each variant:
#   homozygous reference -> 0, heterozygous -> 1, homozygous alternate -> 2.
# Genotypes that match none of the three branches are left missing.
gt = mt_filtered.GT
allele_count_expr = (
    hl.case()
    .when(gt.is_hom_ref(), 0)
    .when(gt.is_het(), 1)
    .when(gt.is_hom_var(), 2)
    .or_missing()
)
mt_snps = mt_filtered.annotate_entries(allele_count=allele_count_expr)

In [9]:
# Long format: one record per (variant, sample) entry, unkeyed and reduced to
# the three fields needed for the matrix.
table = mt_snps.entries().key_by().select('s', 'snp_id', 'allele_count')

# Wide format: one row per individual (s), one column per SNP.
entries_df = table.to_pandas()
snp_matrix = entries_df.pivot(index="s", columns="snp_id", values="allele_count")

2025-02-14 17:30:26.472 Hail: WARN: entries(): Resulting entries table is sorted by '(row_key, col_key)'.
    To preserve row-major matrix table order, first unkey columns with 'key_cols_by()'

In [10]:
# Turn the sample index "s" into a regular "person_id" column.
# The guard keeps this cell idempotent: without it, running the cell a second
# time would call reset_index() on the already-reset frame and add a spurious
# "index" column that would end up in the exported CSV.
if "person_id" not in snp_matrix.columns:
    snp_matrix = snp_matrix.reset_index().rename(columns={"s": "person_id"})
# Store person_id as integers.
snp_matrix["person_id"] = snp_matrix["person_id"].astype(int)

In [11]:
# Count missing values per column (person_id plus one column per SNP).
snp_matrix.isna().sum()

snp_id
person_id           0
12:111446804:T:C    0
12:45976333:C:G     0
13:39781776:T:C     1
14:104920174:G:A    0
14:68287978:G:A     0
1:116738074:C:T     2
5:143224856:A:G     0
6:159082054:A:G     0
6:36414159:G:GA     0
9:34710263:G:A      0
dtype: int64

In [12]:
# Write the person-by-SNP matrix to a CSV file in the current working
# directory; the DataFrame index is not written.
snp_matrix.to_csv("ra_control_white1_selected_snp_matrix.csv", index=False)

### RA_control_white2

In [2]:
%env DATASET_47502446_VCF_DIR=gs://fc-secure-28df46b0-6f9d-4443-ae5f-cb0492e90c24/genomic-extractions/f2b7294d-ed6d-459c-b7d9-025910fce7ba/vcfs

env: DATASET_47502446_VCF_DIR=gs://fc-secure-28df46b0-6f9d-4443-ae5f-cb0492e90c24/genomic-extractions/f2b7294d-ed6d-459c-b7d9-025910fce7ba/vcfs


In [3]:
import os
import subprocess

# The extraction workflow writes manifest.txt into the VCF directory once it
# has finished, so a zero exit status from `gsutil stat` on that object is
# treated as "extraction complete".
manifest_file = os.environ['DATASET_47502446_VCF_DIR'] + '/manifest.txt'

banner = "!" * 100
not_ready_message = (
    banner + "\n\n"
    + "VCF extraction has not completed.\n"
    + "Please monitor the extraction sidepanel for completion before continuing.\n\n"
    + banner
)

gsutil_stat = subprocess.run(['gsutil', '-q', 'stat', manifest_file])
assert gsutil_stat.returncode == 0, not_ready_message

print("VCF extraction has completed, continuing")


VCF extraction has completed, continuing


In [4]:
# Initialize Hail
import hail as hl
import os
from hail.plot import show

# Initialise explicitly so this section does not depend on Hail having been
# started by an earlier cell. idempotent=True makes the call a no-op when Hail
# is already running in this kernel. The reference genome is set separately
# because passing default_reference to hl.init() is deprecated.
hl.init(idempotent=True)
hl.default_reference('GRCh38')
hl.plot.output_notebook()


Using hl.init with a default_reference argument is deprecated. To set a default reference genome after initializing hail, call `hl.default_reference` with an argument to set the default reference genome.


Reading spark-defaults.conf to determine GCS requester pays configuration. This is deprecated. Please use `hailctl config set gcs_requester_pays/project` and `hailctl config set gcs_requester_pays/buckets`.

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.3.0
SparkUI available at http://all-of-us-22602-m.us-central1-c.c.terra-vpc-sc-39ac9e8b.internal:35359
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.130.post1-c69cd67afb8b
LOGGING: writing to /home/jupyter/workspaces/multimodeltestzhiyu/hail-20250214-2114-0.2.130.post1-c69cd67afb8b.log


In [5]:
# Locations for this cohort's Hail matrix table:
#   vcf_dir               - directory of the extracted VCFs
#   hail_matrix_table_gcs - matrix table path under WORKSPACE_BUCKET
# (Creating the matrix table can take a few hours for a dataset with hundreds
# of participants.)
workspace_bucket = os.environ["WORKSPACE_BUCKET"]
vcf_dir = os.environ["DATASET_47502446_VCF_DIR"]
hail_matrix_table_gcs = workspace_bucket + "/dataset_47502446.mt"

In [None]:
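# One-time import of the extracted VCFs into a Hail matrix table (commented
# out here); uncomment to (re)build the table that is read in the next cell.
#hl.import_vcf(f'{vcf_dir}/*.vcf.gz', force_bgz=True, array_elements_required=False).write(hail_matrix_table_gcs)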

2025-02-07 15:42:53.960 Hail: INFO: scanning VCF for sortedness...0 + 3) / 2013]
2025-02-07 16:03:04.328 Hail: INFO: Coerced sorted VCF - no additional import work to do

In [6]:
# Open the Hail matrix table stored at hail_matrix_table_gcs.
mt = hl.read_matrix_table(hail_matrix_table_gcs)

In [7]:
# Variants of interest, written as "<contig without 'chr'>:<position>:<ref>:<alt>".
snp_ids = [
    "14:104920174:G:A", "6:159082054:A:G", "14:68287978:G:A",
    "6:36414159:G:GA", "13:39781776:T:C", "12:45976333:C:G",
    "12:111446804:T:C", "9:34710263:G:A", "5:143224856:A:G",
    "1:116738074:C:T"
]

# Row-level identifier in the same format, so rows can be matched against
# snp_ids. Only the first alternate allele (alleles[1]) is encoded.
mt = mt.annotate_rows(snp_id=mt.locus.contig.replace("chr", "") + ":" + hl.str(mt.locus.position) + ":" + mt.alleles[0] + ":" + mt.alleles[1])

# Narrow the table to the target positions by locus interval before applying
# the string predicate. filter_intervals works on the row key, which allows
# Hail to skip data outside the intervals instead of evaluating snp_id on
# every row of the full table.
target_intervals = []
for snp in snp_ids:
    contig, position = snp.split(":")[:2]
    start = int(position)
    # Half-open interval [start, start + 1) covers exactly one position.
    target_intervals.append(
        hl.locus_interval(f"chr{contig}", start, start + 1, reference_genome="GRCh38")
    )
mt_at_loci = hl.filter_intervals(mt, target_intervals)

# Exact match on position and alleles: keeps the same rows as filtering the
# full table on snp_id.
snp_set = hl.set(snp_ids)
mt_filtered = mt_at_loci.filter_rows(snp_set.contains(mt_at_loci.snp_id))

# NOTE(review): the checkpoint path says "ra_case" although this section
# processes a control cohort, and the same path is overwritten by every
# section of this notebook. Consider a cohort-specific path.
mt_filtered = mt_filtered.checkpoint("ra_case_filtered_mt_checkpoint.mt", overwrite=True)

2025-02-14 21:41:24.693 Hail: INFO: wrote matrix table with 10 rows and 2964 columns in 2024 partitions to ra_case_filtered_mt_checkpoint.mt


In [8]:
# Add an entry field `allele_count` holding the number of alternate alleles
# carried by each individual at each variant:
#   homozygous reference -> 0, heterozygous -> 1, homozygous alternate -> 2.
# Genotypes that match none of the three branches are left missing.
gt = mt_filtered.GT
allele_count_expr = (
    hl.case()
    .when(gt.is_hom_ref(), 0)
    .when(gt.is_het(), 1)
    .when(gt.is_hom_var(), 2)
    .or_missing()
)
mt_snps = mt_filtered.annotate_entries(allele_count=allele_count_expr)

# Long format: one record per (variant, sample) entry, unkeyed and reduced to
# the three fields needed for the matrix.
table = mt_snps.entries().key_by().select('s', 'snp_id', 'allele_count')

# Wide format: one row per individual (s), one column per SNP.
entries_df = table.to_pandas()
snp_matrix = entries_df.pivot(index="s", columns="snp_id", values="allele_count")

2025-02-14 21:51:33.797 Hail: WARN: entries(): Resulting entries table is sorted by '(row_key, col_key)'.
    To preserve row-major matrix table order, first unkey columns with 'key_cols_by()'

In [9]:
# Turn the sample index "s" into a regular "person_id" column.
# The guard keeps this cell idempotent: without it, running the cell a second
# time would call reset_index() on the already-reset frame and add a spurious
# "index" column that would end up in the exported CSV.
if "person_id" not in snp_matrix.columns:
    snp_matrix = snp_matrix.reset_index().rename(columns={"s": "person_id"})
# Store person_id as integers.
snp_matrix["person_id"] = snp_matrix["person_id"].astype(int)

In [10]:
# Count missing values per column (person_id plus one column per SNP).
snp_matrix.isna().sum()

snp_id
person_id           0
12:111446804:T:C    0
12:45976333:C:G     0
13:39781776:T:C     1
14:104920174:G:A    0
14:68287978:G:A     0
1:116738074:C:T     1
5:143224856:A:G     0
6:159082054:A:G     0
6:36414159:G:GA     0
9:34710263:G:A      0
dtype: int64

In [11]:
# Write the person-by-SNP matrix to a CSV file in the current working
# directory; the DataFrame index is not written.
snp_matrix.to_csv("ra_control_white2_selected_snp_matrix.csv", index=False)

### RA_control_nonwhite1

In [12]:
%env DATASET_72301771_VCF_DIR=gs://fc-secure-28df46b0-6f9d-4443-ae5f-cb0492e90c24/genomic-extractions/0089eb69-463d-453b-bef3-29c70fd81fa4/vcfs

env: DATASET_72301771_VCF_DIR=gs://fc-secure-28df46b0-6f9d-4443-ae5f-cb0492e90c24/genomic-extractions/0089eb69-463d-453b-bef3-29c70fd81fa4/vcfs


In [13]:
import os
import subprocess

# The extraction workflow writes manifest.txt into the VCF directory once it
# has finished, so a zero exit status from `gsutil stat` on that object is
# treated as "extraction complete".
manifest_file = os.environ['DATASET_72301771_VCF_DIR'] + '/manifest.txt'

banner = "!" * 100
not_ready_message = (
    banner + "\n\n"
    + "VCF extraction has not completed.\n"
    + "Please monitor the extraction sidepanel for completion before continuing.\n\n"
    + banner
)

gsutil_stat = subprocess.run(['gsutil', '-q', 'stat', manifest_file])
assert gsutil_stat.returncode == 0, not_ready_message

print("VCF extraction has completed, continuing")


VCF extraction has completed, continuing


In [14]:
# Initialize Hail
import hail as hl
import os
from hail.plot import show

# Initialise explicitly so this section does not depend on Hail having been
# started by an earlier cell. idempotent=True makes the call a no-op when Hail
# is already running in this kernel. The reference genome is set separately
# because passing default_reference to hl.init() is deprecated.
hl.init(idempotent=True)
hl.default_reference('GRCh38')
hl.plot.output_notebook()

In [15]:
# Locations for this cohort's Hail matrix table:
#   vcf_dir               - directory of the extracted VCFs
#   hail_matrix_table_gcs - matrix table path under WORKSPACE_BUCKET
# (Creating the matrix table can take a few hours for a dataset with hundreds
# of participants.)
workspace_bucket = os.environ["WORKSPACE_BUCKET"]
vcf_dir = os.environ["DATASET_72301771_VCF_DIR"]
hail_matrix_table_gcs = workspace_bucket + "/dataset_72301771.mt"

In [None]:
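# One-time import of the extracted VCFs into a Hail matrix table (commented
# out here); uncomment to (re)build the table that is read in the next cell.
#hl.import_vcf(f'{vcf_dir}/*.vcf.gz', force_bgz=True, array_elements_required=False).write(hail_matrix_table_gcs)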

2025-02-09 16:29:15.966 Hail: INFO: scanning VCF for sortedness...9 + 4) / 2013]
2025-02-09 17:11:08.446 Hail: INFO: Coerced sorted VCF - no additional import work to do
[Stage 3:===>                                                  (129 + 8) / 2113]

In [16]:
# Open the Hail matrix table stored at hail_matrix_table_gcs.
mt = hl.read_matrix_table(hail_matrix_table_gcs)

In [17]:
# Variants of interest, written as "<contig without 'chr'>:<position>:<ref>:<alt>".
snp_ids = [
    "14:104920174:G:A", "6:159082054:A:G", "14:68287978:G:A",
    "6:36414159:G:GA", "13:39781776:T:C", "12:45976333:C:G",
    "12:111446804:T:C", "9:34710263:G:A", "5:143224856:A:G",
    "1:116738074:C:T"
]

# Row-level identifier in the same format, so rows can be matched against
# snp_ids. Only the first alternate allele (alleles[1]) is encoded.
mt = mt.annotate_rows(snp_id=mt.locus.contig.replace("chr", "") + ":" + hl.str(mt.locus.position) + ":" + mt.alleles[0] + ":" + mt.alleles[1])

# Narrow the table to the target positions by locus interval before applying
# the string predicate. filter_intervals works on the row key, which allows
# Hail to skip data outside the intervals instead of evaluating snp_id on
# every row of the full table.
target_intervals = []
for snp in snp_ids:
    contig, position = snp.split(":")[:2]
    start = int(position)
    # Half-open interval [start, start + 1) covers exactly one position.
    target_intervals.append(
        hl.locus_interval(f"chr{contig}", start, start + 1, reference_genome="GRCh38")
    )
mt_at_loci = hl.filter_intervals(mt, target_intervals)

# Exact match on position and alleles: keeps the same rows as filtering the
# full table on snp_id.
snp_set = hl.set(snp_ids)
mt_filtered = mt_at_loci.filter_rows(snp_set.contains(mt_at_loci.snp_id))

# NOTE(review): the checkpoint path says "ra_case" although this section
# processes a control cohort, and the same path is overwritten by every
# section of this notebook. Consider a cohort-specific path.
mt_filtered = mt_filtered.checkpoint("ra_case_filtered_mt_checkpoint.mt", overwrite=True)

2025-02-14 22:56:04.039 Hail: INFO: wrote matrix table with 10 rows and 4393 columns in 2113 partitions to ra_case_filtered_mt_checkpoint.mt


In [18]:
# Add an entry field `allele_count` holding the number of alternate alleles
# carried by each individual at each variant:
#   homozygous reference -> 0, heterozygous -> 1, homozygous alternate -> 2.
# Genotypes that match none of the three branches are left missing.
gt = mt_filtered.GT
allele_count_expr = (
    hl.case()
    .when(gt.is_hom_ref(), 0)
    .when(gt.is_het(), 1)
    .when(gt.is_hom_var(), 2)
    .or_missing()
)
mt_snps = mt_filtered.annotate_entries(allele_count=allele_count_expr)

# Long format: one record per (variant, sample) entry, unkeyed and reduced to
# the three fields needed for the matrix.
table = mt_snps.entries().key_by().select('s', 'snp_id', 'allele_count')

# Wide format: one row per individual (s), one column per SNP.
entries_df = table.to_pandas()
snp_matrix = entries_df.pivot(index="s", columns="snp_id", values="allele_count")



In [19]:
# Turn the sample index "s" into a regular "person_id" column.
# The guard keeps this cell idempotent: without it, running the cell a second
# time would call reset_index() on the already-reset frame and add a spurious
# "index" column that would end up in the exported CSV.
if "person_id" not in snp_matrix.columns:
    snp_matrix = snp_matrix.reset_index().rename(columns={"s": "person_id"})
# Store person_id as integers.
snp_matrix["person_id"] = snp_matrix["person_id"].astype(int)

In [20]:
# Count missing values per column (person_id plus one column per SNP).
snp_matrix.isna().sum()

snp_id
person_id           0
12:111446804:T:C    1
12:45976333:C:G     0
13:39781776:T:C     0
14:104920174:G:A    0
14:68287978:G:A     0
1:116738074:C:T     1
5:143224856:A:G     0
6:159082054:A:G     0
6:36414159:G:GA     0
9:34710263:G:A      0
dtype: int64

In [21]:
# Write the person-by-SNP matrix to a CSV file in the current working
# directory; the DataFrame index is not written.
snp_matrix.to_csv("ra_control_nonwhite1_selected_snp_matrix.csv", index=False)

### RA_control_nonwhite2

In [22]:
%env DATASET_99921087_VCF_DIR=gs://fc-secure-28df46b0-6f9d-4443-ae5f-cb0492e90c24/genomic-extractions/a1955e08-d316-4126-81f4-91cdd10bcce1/vcfs

env: DATASET_99921087_VCF_DIR=gs://fc-secure-28df46b0-6f9d-4443-ae5f-cb0492e90c24/genomic-extractions/a1955e08-d316-4126-81f4-91cdd10bcce1/vcfs


In [23]:
import os
import subprocess

# The extraction workflow writes manifest.txt into the VCF directory once it
# has finished, so a zero exit status from `gsutil stat` on that object is
# treated as "extraction complete".
manifest_file = os.environ['DATASET_99921087_VCF_DIR'] + '/manifest.txt'

banner = "!" * 100
not_ready_message = (
    banner + "\n\n"
    + "VCF extraction has not completed.\n"
    + "Please monitor the extraction sidepanel for completion before continuing.\n\n"
    + banner
)

gsutil_stat = subprocess.run(['gsutil', '-q', 'stat', manifest_file])
assert gsutil_stat.returncode == 0, not_ready_message

print("VCF extraction has completed, continuing")


VCF extraction has completed, continuing


In [24]:
# Initialize Hail
import hail as hl
import os
from hail.plot import show

# Initialise explicitly so this section does not depend on Hail having been
# started by an earlier cell. idempotent=True makes the call a no-op when Hail
# is already running in this kernel. The reference genome is set separately
# because passing default_reference to hl.init() is deprecated.
hl.init(idempotent=True)
hl.default_reference('GRCh38')
hl.plot.output_notebook()

In [25]:
# Locations for this cohort's Hail matrix table:
#   vcf_dir               - directory of the extracted VCFs
#   hail_matrix_table_gcs - matrix table path under WORKSPACE_BUCKET
# (Creating the matrix table can take a few hours for a dataset with hundreds
# of participants.)
workspace_bucket = os.environ["WORKSPACE_BUCKET"]
vcf_dir = os.environ["DATASET_99921087_VCF_DIR"]
hail_matrix_table_gcs = workspace_bucket + "/dataset_99921087.mt"

In [None]:
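# One-time import of the extracted VCFs into a Hail matrix table (commented
# out here); uncomment to (re)build the table that is read in the next cell.
#hl.import_vcf(f'{vcf_dir}/*.vcf.gz', force_bgz=True, array_elements_required=False).write(hail_matrix_table_gcs)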

2025-02-11 16:00:17.822 Hail: INFO: scanning VCF for sortedness...2 + 1) / 2013]
2025-02-11 16:31:25.390 Hail: INFO: Coerced sorted VCF - no additional import work to do

In [26]:
# Open the Hail matrix table stored at hail_matrix_table_gcs.
mt = hl.read_matrix_table(hail_matrix_table_gcs)

In [27]:
# Variants of interest, written as "<contig without 'chr'>:<position>:<ref>:<alt>".
snp_ids = [
    "14:104920174:G:A", "6:159082054:A:G", "14:68287978:G:A",
    "6:36414159:G:GA", "13:39781776:T:C", "12:45976333:C:G",
    "12:111446804:T:C", "9:34710263:G:A", "5:143224856:A:G",
    "1:116738074:C:T"
]

# Row-level identifier in the same format, so rows can be matched against
# snp_ids. Only the first alternate allele (alleles[1]) is encoded.
mt = mt.annotate_rows(snp_id=mt.locus.contig.replace("chr", "") + ":" + hl.str(mt.locus.position) + ":" + mt.alleles[0] + ":" + mt.alleles[1])

# Narrow the table to the target positions by locus interval before applying
# the string predicate. filter_intervals works on the row key, which allows
# Hail to skip data outside the intervals instead of evaluating snp_id on
# every row of the full table.
target_intervals = []
for snp in snp_ids:
    contig, position = snp.split(":")[:2]
    start = int(position)
    # Half-open interval [start, start + 1) covers exactly one position.
    target_intervals.append(
        hl.locus_interval(f"chr{contig}", start, start + 1, reference_genome="GRCh38")
    )
mt_at_loci = hl.filter_intervals(mt, target_intervals)

# Exact match on position and alleles: keeps the same rows as filtering the
# full table on snp_id.
snp_set = hl.set(snp_ids)
mt_filtered = mt_at_loci.filter_rows(snp_set.contains(mt_at_loci.snp_id))

# NOTE(review): the checkpoint path says "ra_case" although this section
# processes a control cohort, and the same path is overwritten by every
# section of this notebook. Consider a cohort-specific path.
mt_filtered = mt_filtered.checkpoint("ra_case_filtered_mt_checkpoint.mt", overwrite=True)

2025-02-15 01:18:15.043 Hail: INFO: wrote matrix table with 10 rows and 3225 columns in 2045 partitions to ra_case_filtered_mt_checkpoint.mt


In [28]:
# Add an entry field `allele_count` holding the number of alternate alleles
# carried by each individual at each variant:
#   homozygous reference -> 0, heterozygous -> 1, homozygous alternate -> 2.
# Genotypes that match none of the three branches are left missing.
gt = mt_filtered.GT
allele_count_expr = (
    hl.case()
    .when(gt.is_hom_ref(), 0)
    .when(gt.is_het(), 1)
    .when(gt.is_hom_var(), 2)
    .or_missing()
)
mt_snps = mt_filtered.annotate_entries(allele_count=allele_count_expr)

# Long format: one record per (variant, sample) entry, unkeyed and reduced to
# the three fields needed for the matrix.
table = mt_snps.entries().key_by().select('s', 'snp_id', 'allele_count')

# Wide format: one row per individual (s), one column per SNP.
entries_df = table.to_pandas()
snp_matrix = entries_df.pivot(index="s", columns="snp_id", values="allele_count")



In [29]:
# Turn the sample index "s" into a regular "person_id" column.
# The guard keeps this cell idempotent: without it, running the cell a second
# time would call reset_index() on the already-reset frame and add a spurious
# "index" column that would end up in the exported CSV.
if "person_id" not in snp_matrix.columns:
    snp_matrix = snp_matrix.reset_index().rename(columns={"s": "person_id"})
# Store person_id as integers.
snp_matrix["person_id"] = snp_matrix["person_id"].astype(int)

In [30]:
# Count missing values per column (person_id plus one column per SNP).
snp_matrix.isna().sum()

snp_id
person_id           0
12:111446804:T:C    0
12:45976333:C:G     0
13:39781776:T:C     0
14:104920174:G:A    0
14:68287978:G:A     0
1:116738074:C:T     0
5:143224856:A:G     0
6:159082054:A:G     0
6:36414159:G:GA     0
9:34710263:G:A      0
dtype: int64

In [31]:
# Write the person-by-SNP matrix to a CSV file in the current working
# directory; the DataFrame index is not written.
snp_matrix.to_csv("ra_control_nonwhite2_selected_snp_matrix.csv", index=False)