# Installing Hail and the Google Cloud Storage filesystem connector on a GCP Vertex AI Workbench instance

## Install the latest version of Hail

In [12]:
!pip install hail



## Install the Google Cloud Storage filesystem connector

The connector authenticates with a service account JSON key file, which must first be generated in the GCP console (https://console.cloud.google.com/iam-admin/serviceaccounts/).

In [4]:
!curl https://raw.githubusercontent.com/broadinstitute/install-gcs-connector/master/install_gcs_connector.py > ../install_gcs_connector.py

curl: /opt/conda/lib/libcurl.so.4: no version information available (required by curl)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 10989  100 10989    0     0   145k      0 --:--:-- --:--:-- --:--:--  147k


In [6]:
!python3 ../install_gcs_connector.py -h

usage: install_gcs_connector.py [-h] [-a AUTH_TYPE] [-k KEY_FILE_PATH]
                                [--gcs-requester-pays-project GCS_REQUESTER_PAYS_PROJECT]

options:
  -h, --help            show this help message and exit
  -a AUTH_TYPE, --auth-type AUTH_TYPE
                        How to authenticate. For Spark <3.5.0, this option
                        must be unspecified. For Spark >=3.5.0, use --auth-
                        type APPLICATION_DEFAULT for your laptop. Use --auth-
                        type COMPUTE_ENGINE for GCE VMs. Use
                        SERVICE_ACCOUNT_JSON_KEYFILE with --key-file-path for
                        an explicit key file location. For Spark >=3.5.0, we
                        default to APPLICATION_DEFAULT.
  -k KEY_FILE_PATH, --key-file-path KEY_FILE_PATH
                        Required for Spark <3.5.0. Service account key .json
                        path. This path is just added to the spark config
                        file. The

In [9]:
!python3 ../install_gcs_connector.py -k ../psychic-rhythm-198903-23c401baac7b.json

2024-04-01 01:27:02,219 INFO     Downloading https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop2-2.2.21/gcs-connector-hadoop2-2.2.21-shaded.jar
2024-04-01 01:27:02,219 INFO        to /opt/conda/lib/python3.10/site-packages/pyspark/jars/gcs-connector-hadoop2-2.2.21-shaded.jar
2024-04-01 01:27:02,501 INFO     Updating /opt/conda/lib/python3.10/site-packages/pyspark/conf/spark-defaults.conf json.keyfile
2024-04-01 01:27:02,501 INFO     Setting json.keyfile = ../psychic-rhythm-198903-23c401baac7b.json


## Ensure that Java is installed

In [14]:
# Hail runs on Apache Spark, which needs a Java runtime on the PATH;
# print the installed Java version to confirm one is available.
!java -version

openjdk version "11.0.22" 2024-01-16
OpenJDK Runtime Environment (build 11.0.22+7-post-Debian-1deb11u1)
OpenJDK 64-Bit Server VM (build 11.0.22+7-post-Debian-1deb11u1, mixed mode, sharing)


## Test connection to gnomAD v4.0

In [13]:
import hail as hl

# Start the Hail session (backed by a local Spark context).
# idempotent=True makes re-running this cell a no-op when Hail is already
# initialized, instead of attempting a second initialization.
hl.init(idempotent=True)

24/04/01 01:47:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.3.4
SparkUI available at http://loic-powersfs-v2-motebook.us-central1-a.c.psychic-rhythm-198903.internal:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.128-eead8100a1c1
LOGGING: writing to /home/jupyter/powerSFS/hail-20240401-0147-0.2.128-eead8100a1c1.log


In [15]:
# Public gnomAD v4.0 exomes sites table; reading it exercises the GCS connector.
GNOMAD_V4_EXOMES_SITES_HT = 'gs://gcp-public-data--gnomad/release/4.0/ht/exomes/gnomad.exomes.v4.0.sites.ht'

ht = hl.read_table(GNOMAD_V4_EXOMES_SITES_HT)

# Print the table schema to confirm the table is readable.
ht.describe()

----------------------------------------
Global fields:
    'freq_meta': array<dict<str, str>> 
    'freq_index_dict': dict<str, int32> 
    'freq_meta_sample_count': array<int32> 
    'faf_meta': array<dict<str, str>> 
    'faf_index_dict': dict<str, int32> 
    'joint_freq_meta': array<dict<str, str>> 
    'joint_freq_index_dict': dict<str, int32> 
    'joint_freq_meta_sample_count': array<int32> 
    'joint_faf_meta': array<dict<str, str>> 
    'joint_faf_index_dict': dict<str, int32> 
    'age_distribution': struct {
        bin_edges: array<float64>, 
        bin_freq: array<int32>, 
        n_smaller: int32, 
        n_larger: int32
    } 
    'downsamplings': dict<str, array<int32>> 
    'filtering_model': struct {
        filter_name: str, 
        score_name: str, 
        snv_cutoff: struct {
            bin: int32, 
            min_score: float64
        }, 
        indel_cutoff: struct {
            bin: int32, 
            min_score: float64
        }, 
        snv_trainin

In [16]:
# Stop the running Hail session that was started by hl.init().
hl.stop()