In [1]:
import hail as hl
import json
import time

In [2]:
# Source Hail Table: the gnomAD v4.0 exomes sites release.
# ht_url may be either a gs:// path or a file:// local path.
ht_version = "4.0.0"
ht_url = "gs://gcp-public-data--gnomad/release/4.0/ht/exomes/gnomad.exomes.v4.0.sites.ht"

# Echo the location first so the cell output records which table was read.
print(ht_url)
ht = hl.read_table(ht_url)


Initializing Hail with default parameters...


gs://gcp-public-data--gnomad/release/4.0/ht/exomes/gnomad.exomes.v4.0.sites.ht


SLF4J: No SLF4J providers were found.
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See https://www.slf4j.org/codes.html#noProviders for further details.
SLF4J: Class path contains SLF4J bindings targeting slf4j-api versions 1.7.x or earlier.
SLF4J: Ignoring binding found at [jar:file:/Users/kferrite/dev/gnomad_methods/venv/lib/python3.11/site-packages/pyspark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See https://www.slf4j.org/codes.html#ignoredBindings for an explanation.
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.3.2
SparkUI available at http://192.168.1.7:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.125-6e6f46797aed
LOGGING: writing to /Users/kferrite/dev/gnomad_methods/notebooks/hail-20240207-1303-0.2.125-6e6f46797aed.log


In [3]:
from gnomad.resources.grch38.gnomad import gnomad_gks

# Variant of interest, in gnomAD notation: 1-55051215-G-A
# The reference genome is passed explicitly so the locus agrees with the GRCh38
# interval built below instead of inheriting Hail's session-wide default.
locus = hl.Locus("chr1", 55051215, reference_genome="GRCh38")
alleles = ["G", "A"]  # ref/alt of the variant above; not used by the query below

# One-base window starting at the variant position (end = position + 1).
# NOTE(review): relies on hl.locus_interval's default bounds (start included,
# end excluded) so that only this position is covered — confirm against Hail docs.
ivl = hl.locus_interval(
    locus.contig,
    locus.position,
    locus.position + 1,
    reference_genome="GRCh38",
)

# Pre-filter the sites table to the window; it is handed to gnomad_gks as custom_ht.
ht_filtered = hl.filter_intervals(ht, [ivl])

gks_records = gnomad_gks(
    ivl,
    version=ht_version,
    data_type="exomes",
    custom_ht=ht_filtered,
    skip_checkpoint=True,
)


Schema version http://json-schema.org/draft-07/schema not recognized. Some keywords and features may not be supported.



In [4]:
def pprint(o):
    """Print ``o`` serialized as JSON with a two-space indent."""
    rendered = json.dumps(o, indent=2)
    print(rendered)

# Example: pretty-print the first record in gks_records.
pprint(gks_records[0])


{
  "locus": {
    "contig": "chr1",
    "position": 55051215,
    "reference_genome": "GRCh38"
  },
  "alleles": [
    "G",
    "A"
  ],
  "gks_vrs_variant": {
    "_id": "ga4gh:VA.x1scufKNK1m96pjYYsKo2qFk59MbS5c1",
    "type": "Allele",
    "location": {
      "type": "SequenceLocation",
      "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
      "interval": {
        "start": {
          "type": "Number",
          "value": 55051214
        },
        "end": {
          "type": "Number",
          "value": 55051215
        },
        "type": "SequenceInterval"
      },
      "_id": "ga4gh:VSL.LrIJ26u0udG_h_cLLfZyCHJWTujlB65R"
    },
    "state": {
      "type": "LiteralSequenceExpression",
      "sequence": "A"
    }
  },
  "gks_va_freq": {
    "id": "gnomAD-4.0.0-chr1-55051215-G-A",
    "type": "CohortAlleleFrequency",
    "label": "Overall Cohort Allele Frequency for chr1-55051215-G-A",
    "derivedFrom": {
      "id": "gnomAD4.0.0",
      "type": "DataSet",
      "lab

In [5]:
import json
import requests
import jsonschema

def get_json_http(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the body parsed as JSON.

    Args:
        url: Address to request.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``
            argument (seconds). Without a timeout, requests can wait
            indefinitely on an unresponsive server and hang the notebook.

    Returns:
        The response body decoded as UTF-8 and parsed with ``json.loads``.

    Raises:
        RuntimeError: If the response status code is not 200.
    """
    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        raise RuntimeError(f"Request failed:\n{r.status_code} {r.content}")
    return json.loads(r.content.decode("utf-8"))

# Download the CohortAlleleFrequency JSON schema from the va-spec 1.0-alpha branch.
schema = get_json_http(
    "https://raw.githubusercontent.com/ga4gh/va-spec/1.0-alpha/schema/cohortAlleleFreq.json"
)

# Check the "gks_va_freq" payload of every record against that schema;
# a non-conforming record surfaces as an exception from jsonschema.validate.
for gks_record in gks_records:
    jsonschema.validate(instance=gks_record["gks_va_freq"], schema=schema)