# Before starting

In [None]:
import hail as hl
import pandas as pd
import numpy as np

# Hail & gnomAD exomes v2.1

## Option 1 : parser les synonymes dans une liste d'intervalles

In [None]:
# Location of the gnomAD exomes r2.1.1 sites table (Hail Table format).
gs = "/gstock/biolo_datasets/variation/gnomAD/r2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.ht/"
data = hl.read_table(gs)

# Interval strings to restrict the table to; extend this list as needed.
test = ['X:149826276-X:149826529']

# Parse every interval string against GRCh37 and keep only rows inside them.
intervals_raw = [hl.parse_locus_interval(e, reference_genome='GRCh37') for e in test]
data_lite = hl.filter_intervals(data, intervals_raw)

# Keep synonymous variants *within the selected intervals*. The filter must be
# applied to the interval-restricted table (data_lite), not to the full table
# (data); otherwise the interval selection above is silently thrown away.
data_lite = data_lite.filter(data_lite.vep.most_severe_consequence == 'synonymous_variant')

data_lite.head(20).show()

## Option 2 : parser les synonymes dans tout l'exome

In [None]:
# Read the gnomAD exomes r2.1.1 sites table and keep, across the whole table,
# the rows whose VEP most severe consequence is 'synonymous_variant'.
gs = "/gstock/biolo_datasets/variation/gnomAD/r2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.ht/"
data = hl.read_table(gs)

is_synonymous = data.vep.most_severe_consequence == 'synonymous_variant'
data_lite = data.filter(is_synonymous)

# Preview the first 20 rows.
data_lite.head(20).show()

# Pandas ClinVar

## Exemple avec ClinVar 01/21 en filtrant sur les synonymous Benign & Likely benign

In [None]:
# Load the ClinVar (2021-01-23) lite table.
df = pd.read_parquet('/gstock/EXOTIC/data/VARIATIONS/clinvar_20210123_lite_table.parquet')

# Row selection: Status is exactly 'Benign' and the MC column mentions 'synonymous'.
# NOTE(review): only 'Benign' is tested on Status; confirm whether 'Likely benign'
# entries are expected to be covered by this value.
benign_mask = df['Status'] == 'Benign'
synonymous_mask = df['MC'].str.contains('synonymous')

# Keep only the columns needed to build the Hail table afterwards.
kept_columns = ['CHROM', 'POS', 'REF', 'ALT', 'Real_Status']
df = df.loc[benign_mask & synonymous_mask]
df = df[kept_columns]
df

# Convert pandas to hail, add key/index

In [None]:
# Turn the pandas ClinVar frame into a Hail Table.
t = hl.Table.from_pandas(df)

# Build a GRCh37 locus from CHROM/POS (POS cast to int32) and use it as the key.
# The table is keyed by locus only: REF/ALT are not part of the key, so any
# lookup on this table matches on position, not on the specific allele.
clinvar_locus = hl.locus(t.CHROM, hl.int32(t.POS), reference_genome='GRCh37')
t = t.key_by(locus=clinvar_locus)

# Preview the first 10 rows.
t.head(10).show()

# Annotate & show

In [None]:
# Look up each gnomAD row's locus in the ClinVar table `t` and attach the
# matching Real_Status as a new `clinvar` field.
clinvar_match = t[data_lite.locus]
data_lite_annotated = data_lite.annotate(clinvar=clinvar_match.Real_Status)

# Preview the first 20 annotated rows.
data_lite_annotated.head(20).show()

## Filtrer ceux qui ont forcément une conséquence Benign dans ClinVar

In [None]:
# Keep rows that carry a ClinVar status. Two cases are dropped:
#   - rows whose `clinvar` field is missing (Hail's filter removes rows whose
#     predicate evaluates to missing, which is what a comparison against a
#     missing value yields);
#   - rows whose `clinvar` field is the literal string 'NA'.
clinvar_field = data_lite_annotated.clinvar
has_clinvar_status = hl.is_defined(clinvar_field) & (clinvar_field != 'NA')
data_lite_annotated = data_lite_annotated.filter(has_clinvar_status)

# Preview the first 10 remaining rows.
data_lite_annotated.head(10).show()