# Imports and set up

In [1]:
import pandas as pd
import os

In [2]:
TSV_FILE_PATH = '/home/projects/genomics/summary_stats/34662886-GCST90077560-EFO_0004326.h.tsv.gz'
PATH_TO_VEP = '/home/ext_sofia_buyanova_gmail_com/vep_data'
VEP_INPUT_FILE = 'vep_input_all'

# Get the data

In [3]:
df = pd.read_csv(TSV_FILE_PATH, sep='\t', compression='gzip')
df.head()

Unnamed: 0,hm_variant_id,hm_rsid,hm_chrom,hm_pos,hm_other_allele,hm_effect_allele,hm_beta,hm_odds_ratio,hm_ci_lower,hm_ci_upper,...,other_allele,effect_allele,beta,ci_lower,ci_upper,p_value,effect_allele_frequency,standard_error,odds_ratio,variant_id
0,1_930158_C_T,rs1381099827,1,930158,C,T,-0.192142,,-0.905992,0.521708,...,C,T,-0.192142,-0.905992,0.521708,0.597811,9e-06,0.999991,,rs1381099827
1,1_930165_G_A,rs201186828,1,930165,G,A,-0.021754,,-0.33229,0.288782,...,G,A,-0.021754,-0.33229,0.288782,0.890793,4.5e-05,0.999955,,rs201186828
2,1_930204_G_A,rs148711625,1,930204,G,A,0.105264,,-0.210775,0.421303,...,G,A,0.105264,-0.210775,0.421303,0.513879,4.4e-05,0.999956,,rs148711625
3,1_930215_A_G,rs903331232,1,930215,A,G,0.081907,,-0.422869,0.586683,...,A,G,0.081907,-0.422869,0.586683,0.750462,1.7e-05,0.999983,,rs903331232
4,1_930245_G_A,rs146327803,1,930245,G,A,0.252748,,-0.192428,0.697924,...,G,A,0.252748,-0.192428,0.697924,0.265808,2.2e-05,0.999978,,rs146327803


In [4]:
cols_for_annotation = ['chromosome', 'base_pair_location', 'variant_id', 'other_allele', 'effect_allele']
df_to_vep = df[cols_for_annotation].copy()
df_to_vep.head()

Unnamed: 0,chromosome,base_pair_location,variant_id,other_allele,effect_allele
0,1,930158,rs1381099827,C,T
1,1,930165,rs201186828,G,A
2,1,930204,rs148711625,G,A
3,1,930215,rs903331232,A,G
4,1,930245,rs146327803,G,A


In [5]:
df_to_vep.rename({
    'chromosome': '# CHROM',
    'variant_id': 'POS',
    'base_pair_location': 'POS',
    'other_allele': 'REF',
    'effect_allele': 'ALT',
}, axis='columns', inplace=True)
df_to_vep.head()

Unnamed: 0,# CHROM,POS,POS.1,REF,ALT
0,1,930158,rs1381099827,C,T
1,1,930165,rs201186828,G,A
2,1,930204,rs148711625,G,A
3,1,930215,rs903331232,A,G
4,1,930245,rs146327803,G,A


In [9]:
df_to_vep.to_csv(os.path.join(PATH_TO_VEP, 'input', VEP_INPUT_FILE + '.vcf'), sep='\t', index=False)

# Run VEP

1) Make sure that all is OK with permissions (`sudo chmod 777` to `vep_data` and `docker.soc`)
2) Plugins:
   1) CADD: download file (All possible SNVs of GRCh38/hg38 OR All gnomad release 3.0 SNVs - not all variants, without very rare mutations)
   2) GO: not offline
   3) EVE: also need to download smth

In [10]:
! tmux new-session -d -s "vep_all" docker run -t -v {PATH_TO_VEP}:/opt/vep/.vep ensemblorg/ensembl-vep \
    ./vep --cache --format vcf --tab --force_overwrite --dir_cache /opt/vep/.vep/ --dir_plugins /opt/vep/.vep/Plugins/ \
        --assembly GRCh38 \
            --plugin Blosum62 \
            --plugin CADD,/opt/vep/.vep/custom/cadd.gnomad.genomes.r3.0.snv.tsv.gz \
            --plugin GO \
        --custom /opt/vep/.vep/custom/clinvar.vcf.gz,ClinVar,vcf,exact,0,CLNSIG,CLNREVSTAT,CLNDN \
        --input_file /opt/vep/.vep/input/{VEP_INPUT_FILE}.vcf \
        --output_file /opt/vep/.vep/output/{VEP_INPUT_FILE}.vep