# Library

## Tools:
#### - BCFtools
#### - Genome Analysis Toolkit (GATK) framework

##### could be installed via conda

## Python libraries

In [5]:
import pandas as pd
import numpy as np

# ClinVar

##### get information about P/LP variants and annotated diseases from NCBI ClinVar database

In [1]:
! mkdir data

## 1.1. data download

In [36]:
! wget -O - https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20230326.vcf.gz | gzip -d > ./data/clinvar_grch38_20230326.vcf

--2023-04-02 08:33:07--  https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20230326.vcf.gz
Распознаётся ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)… 130.14.250.7, 165.112.9.230, 2607:f220:41e:250::10, ...
Подключение к ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.7|:443... соединение установлено.
HTTP-запрос отправлен. Ожидание ответа… 200 OK
Длина: 77919370 (74M) [application/x-gzip]
Сохранение в: «STDOUT»


/2023-04-02 08:33:54 (1,61 MB/s) - записан в stdout [77919370/77919370]



## 1.2. select variants for further analysis

In [37]:
# select only pathogenic and likely pathogenic variants
! bcftools view -i 'INFO/CLNSIG ~ "Pathogenic"' ./data/clinvar_grch38_20230326.vcf > ./data/clinvar_grch38_20230326_flt.vcf

[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '2' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '3' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '4' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '5' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '6' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '7' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '8' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '9' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '10' is not defined in the header. (Quick workaroun

In [38]:
! awk '{print "chr"$0}' ./data/clinvar_grch38_20230326_flt.vcf >  ./data/clinvar_grch38_20230326_PLP.vcf
! sed -i 's/chr#/#/g' ./data/clinvar_grch38_20230326_PLP.vcf
! rm ./data/clinvar_grch38_20230326_flt.vcf

## 1.3. parse .vcf

In [3]:
# extract basic fields + INFO(Gene symbol, associated diseases, identifiers of disease in databases) .vcf 
! gatk VariantsToTable -V ./data/clinvar_grch38_20230326_PLP.vcf -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F FILTER -F GENEINFO -F CLNSIG -F MC -F CLNDISDB -F CLNDN -O ./cvtable_grch38_20230326_PLP.txt

In [40]:
# save only gene symbols
! awk '{sub (/:/, OFS)} 1' OFS="\t" ./cvtable_grch38_20230326_PLP.txt > ./cvtable.txt


! cat cvtable.txt | cut -f1,2,3,4,5,6,7,8,10,11,12,12 | sed '1d' > cvtable_cut.txt
! echo -e "CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tGENE\tCLNSIG\tMC\tCLNDISDB" | cat - cvtable_cut.txt >  cvtable_PLP.txt

! rm cvtable.txt cvtable_cut.txt

In [41]:
# save lines which include OMIM identifier of associated diseases
! grep OMIM cvtable_PLP.txt > cv_dis.txt

# save header
! grep CHROM cvtable_PLP.txt > header.txt

# merge header and lines
! cat header.txt cv_dis.txt > cv_ids.txt


! rm header.txt  cv_dis.txt cvtable_PLP.txt

In [42]:
# rm all identifiers of diseases except OMIM

! sed -i 's/SO:[0-9]*|//g' cv_ids.txt
! sed -i 's/MONDO:MONDO:[0-9]*\,//g' cv_ids.txt
! sed -i 's/MONDO:MONDO:[0-9]*//g' cv_ids.txt
! sed -i 's/MedGen:[A-Z]*[0-9]*\,//g' cv_ids.txt
! sed -i 's/MedGen:[A-Z]*[0-9]*//g' cv_ids.txt
! sed -i 's/SO:[0-9]*|//g' cv_ids.txt
! sed -i 's/MeSH:[A-Z]*[0-9]*\,//g' cv_ids.txt
! sed -i 's/MeSH:[A-Z]*[0-9]*//g' cv_ids.txt
! sed -i 's/SNOMED_CT:[0-9]*\,//g' cv_ids.txt
! sed -i 's/SNOMED_CT:[0-9]*//g' cv_ids.txt
! sed -i 's/\,Orphanet:ORPHA[0-9]*\,//g' cv_ids.txt
! sed -i 's/Orphanet:ORPHA[0-9]*\,//g' cv_ids.txt
! sed -i 's/\,Orphanet:ORPHA[0-9]*//g' cv_ids.txt
! sed -i 's/Orphanet:ORPHA[0-9]*//g' cv_ids.txt
! sed -i 's/Human_Phenotype_Ontology:HP:[0-9]*\,//g' cv_ids.txt
! sed -i 's/Human_Phenotype_Ontology:HP:[0-9]*//g' cv_ids.txt
! sed -i 's/|OMIM:PS[0-9]*//g' cv_ids.txt
! sed -i 's/OMIM:PS[0-9]*//g' cv_ids.txt

In [43]:
! cat cv_ids.txt | cut -f1,2,3,4,5,6,7,8,9,10,11 > cvtable.txt

! rm cv_ids.txt

! cat cvtable.txt | cut -f11 | sed 's/|/\n/g' | sed 's/\,/\n/g' | grep OMIM | sort | uniq > cvtable_MIM.txt
! wc -l cvtable_MIM.txt

! echo -e "OMIM" | cat - cvtable_MIM.txt> cvtable_mims.txt

! rm cvtable_MIM.txt

5935 cvtable_MIM.txt


In [44]:
# for every OMIM identifier save line with annotated variant

MIM = pd.read_csv('./cvtable_mims.txt', sep='\t')
cvtable = pd.read_csv('./cvtable.txt', sep='\t')

MIM['OMIM'] = MIM['OMIM'].astype('str')
cvtable['CLNDISDB'] = cvtable['CLNDISDB'].astype('str')

cvtable["OMIM"] = cvtable["CLNDISDB"].apply(lambda x: [y for y in MIM["OMIM"] if y in x])
cvtable= cvtable.explode("OMIM")
output = cvtable.merge(MIM)

output = output.drop(columns=['CLNDISDB'])

output.to_csv('ClinVarTable_grch38_20230326_PLP_OMIM.txt', sep='\t', index=False)

#print(output.shape)
#print(output.head(2))

In [45]:
! rm cvtable_mims.txt cvtable.txt cvtable_grch38_20230326_PLP.txt

# Human Phenotype Ontology (HPO)

#### get information about gene - monogenic disease(s) assosiations and inheritane pattern

## 2.1. Download annotation

In [None]:
! wget

## 2.2. Subset HPO annotation

#### remove provisional gene-disease associations and  diseases with non-monogenic inheritance pattern

In [1]:
# subset gene-MIM associations (rm provisional gene-disease associations)

! cat ./data/genes_to_phenotype.txt | cut -f2,7,9 | grep OMIM | grep -v question | cut -f1,3 | sort | uniq > hpo_gene_mim.txt

! echo -e "GENE\tOMIM" | cat - hpo_gene_mim.txt > hpo_gene_omim.txt

 ***HPO terms related to inheritance:*** 

- HP:0000006  
- HP:0000007  
- HP:0010984  
- HP:0010982  
- HP:0001426  
- HP:0001417  
- HP:0001423  
- HP:0001419  
- HP:0001450  
- HP:0001428  
- HP:0001442  
- HP:0034345  

In [2]:
# collect inheritance info

! for i in $(cat ./HP_InheritanceTerms.txt) ; do grep $i ./data/phenotype_to_genes.txt ; done > ./genes_mim_inh.txt
! cat genes_mim_inh.txt | cut -f1,2,4,7 | grep OMIM  > genes_inh_mim.txt

! echo -e  "HPO-id\tHPO label\tGENE\tOMIM" | cat - genes_inh_mim.txt > hpo_genes_inh_mim.txt

***List of inheritance patterns to exclude (somatic and polygenic):***  

- HP:0010984  
- HP:0010982  
- HP:0001426  
- HP:0001428  
- HP:0001442  

In [3]:
# collect genes with non-monogenic inheritance pattern

! for i in $(cat ./rm_inh_patterns.txt) ; do grep $i ./data/phenotype_to_genes.txt ; done > ./rm.txt
! cat rm.txt | cut -f4 | sort| uniq > genes_to_remove.txt

! echo -e "GENE" | cat - genes_to_remove.txt > genes_to_rm.txt

In [6]:
# rm genes with non-monogenic inheritance pattern

gene_mims = pd.read_csv('./hpo_gene_omim.txt', sep='\t')
genes_to_remove = pd.read_csv('./genes_to_rm.txt', sep='\t')

gene_mims_flt = gene_mims.merge(genes_to_remove, on=['GENE'], how='left', indicator=True)
gene_mims_flt = gene_mims_flt[gene_mims_flt['_merge'] == 'left_only']
gene_mims_flt = gene_mims_flt.drop(columns=['_merge'], axis=1)
gene_mims_flt.shape

(5497, 2)

In [7]:
gene_mims_flt.to_csv('./hpo_gene_mim_flt.txt', sep='\t', index=False)

In [15]:
genes_mim_inh = pd.read_csv('./hpo_genes_inh_mim.txt', sep='\t')
genes_mim_inh.head(2)

Unnamed: 0,HPO-id,HPO label,GENE,OMIM
0,HP:0000006,Autosomal dominant inheritance,PRIMPOL,OMIM:615420
1,HP:0000006,Autosomal dominant inheritance,SPINK1,OMIM:608189


In [16]:
inh = pd.merge(gene_mims_flt, genes_mim_inh, on=['GENE', 'OMIM'], how='inner')
inh

Unnamed: 0,GENE,OMIM,HPO-id,HPO label
0,A4GALT,OMIM:111400,HP:0000006,Autosomal dominant inheritance
1,A4GALT,OMIM:111400,HP:0034345,Mendelian inheritance
2,AAAS,OMIM:231550,HP:0000007,Autosomal recessive inheritance
3,AAAS,OMIM:231550,HP:0034345,Mendelian inheritance
4,AAGAB,OMIM:148600,HP:0000006,Autosomal dominant inheritance
...,...,...,...,...
11254,ZSWIM6,OMIM:603671,HP:0034345,Mendelian inheritance
11255,ZSWIM6,OMIM:617865,HP:0000006,Autosomal dominant inheritance
11256,ZSWIM6,OMIM:617865,HP:0034345,Mendelian inheritance
11257,ZSWIM7,OMIM:619831,HP:0000007,Autosomal recessive inheritance


In [17]:
inh = inh[inh['HPO-id'] != 'HP:0034345']
inh = inh[inh['OMIM'] != 'OMIM:268000']
inh.shape

(5803, 4)

In [18]:
inh_count = inh.groupby(['GENE', 'OMIM']).size().reset_index(name='count_inhpattern')
inh = inh.merge(inh_count, on=['GENE', 'OMIM'])
inh.head()

Unnamed: 0,GENE,OMIM,HPO-id,HPO label,count_inhpattern
0,A4GALT,OMIM:111400,HP:0000006,Autosomal dominant inheritance,1
1,AAAS,OMIM:231550,HP:0000007,Autosomal recessive inheritance,1
2,AAGAB,OMIM:148600,HP:0000006,Autosomal dominant inheritance,1
3,AARS1,OMIM:613287,HP:0000006,Autosomal dominant inheritance,1
4,AARS1,OMIM:616339,HP:0000007,Autosomal recessive inheritance,1


In [19]:
# define inhertance pattern for each gene-disease association
def define_inheritance(df):
    
    if (df['count_inhpattern'] == 1) & (df['HPO-id'] == 'HP:0000006'):
        return 'AD'
    
    elif (df['count_inhpattern'] == 1) & (df['HPO-id'] == 'HP:0000007'):
        return 'AR'
    
    elif (df['HPO-id'] == 'HP:0001423') | (df['HPO-id'] == 'HP:0001419') | (df['HPO-id'] == 'HP:0001417'):
        return 'XL'
    
    elif df['HPO-id'] == 'HP:0001450':
        return 'YL'
    
    else:
        return 'ADAR'


In [20]:
inh['Inheritance'] = inh.apply(define_inheritance, axis=1)
inh.tail(20)

Unnamed: 0,GENE,OMIM,HPO-id,HPO label,count_inhpattern,Inheritance
5783,ZNF341,OMIM:618282,HP:0000007,Autosomal recessive inheritance,1,AR
5784,ZNF407,OMIM:619557,HP:0000007,Autosomal recessive inheritance,1,AR
5785,ZNF408,OMIM:616469,HP:0000007,Autosomal recessive inheritance,1,AR
5786,ZNF423,OMIM:614844,HP:0000006,Autosomal dominant inheritance,2,ADAR
5787,ZNF423,OMIM:614844,HP:0000007,Autosomal recessive inheritance,2,ADAR
5788,ZNF462,OMIM:618619,HP:0000006,Autosomal dominant inheritance,1,AD
5789,ZNF469,OMIM:229200,HP:0000007,Autosomal recessive inheritance,1,AR
5790,ZNF526,OMIM:619877,HP:0000007,Autosomal recessive inheritance,1,AR
5791,ZNF644,OMIM:614167,HP:0000006,Autosomal dominant inheritance,1,AD
5792,ZNF687,OMIM:616833,HP:0000006,Autosomal dominant inheritance,1,AD


In [21]:
inh.to_csv('./check_inh.csv', sep='\t', index=False)

In [14]:
inh = inh.drop(columns=['HPO-id', 'HPO label', 'count_inhpattern'])

In [62]:
inh.shape

(5803, 3)

In [63]:
inh = inh.drop_duplicates()
inh.drop_duplicates().shape

(5371, 3)

In [2]:
# add ENSEMBL ids

! wget https://www.omim.org/static/omim/data/mim2gene.txt 
! sed -e '1,4d' mim2gene.txt | cut -f4,5 | awk 'NF' > mimgenes.txt
! sed -i 's/Ensembl Gene ID (Ensembl)/Ensembl/g' mimgenes.txt

In [65]:
ensembl_id = pd.read_csv('./mimgenes.txt', sep='\t')
hpo = inh.merge(ensembl_id, left_on=['GENE'], right_on=['Approved Gene Symbol (HGNC)'], how='inner')
hpo = hpo.drop(columns=['Approved Gene Symbol (HGNC)'])

In [66]:
hpo

Unnamed: 0,GENE,OMIM,Inheritance,Ensembl
0,A4GALT,OMIM:111400,AD,ENSG00000128274
1,AAAS,OMIM:231550,AR,ENSG00000094914
2,AAGAB,OMIM:148600,AD,ENSG00000103591
3,AARS1,OMIM:613287,AD,ENSG00000090861
4,AARS1,OMIM:616339,AR,ENSG00000090861
...,...,...,...,...
5353,ZP2,OMIM:618353,AR,ENSG00000103310
5354,ZP3,OMIM:617712,AD,ENSG00000188372
5355,ZSWIM6,OMIM:603671,AD,ENSG00000130449
5356,ZSWIM6,OMIM:617865,AD,ENSG00000130449


In [67]:
hpo.to_csv('hpo_table.csv', sep='\t', index=False)

In [68]:
! rm genes_inh_mim.txt genes_mim_inh.txt genes_to_remove.txt genes_to_rm.txt hpo_gene_mim.txt hpo_gene_mim_flt.txt hpo_gene_omim.txt hpo_genes_inh_mim.txt mim2gene.txt mimgenes.txt

# Merge ClinVar and HPO data

In [69]:
cv = pd.read_csv('ClinVarTable_grch38_20230326_PLP_OMIM.txt', sep='\t')

print(cv.head(2))
print(cv.shape)

  CHROM      POS       ID REF ALT  QUAL FILTER   GENE      CLNSIG  \
0  chr1  1013983  1028857   G   A -10.0   PASS  ISG15  Pathogenic   
1  chr1  1014143   183381   C   T -10.0   PASS  ISG15  Pathogenic   

                        MC         OMIM  
0  splice_acceptor_variant  OMIM:616126  
1                 nonsense  OMIM:616126  
(120938, 11)


In [70]:
hpo = pd.read_csv('hpo_table.csv', sep='\t')

print(hpo.head(2))
print(hpo.shape)

     GENE         OMIM Inheritance          Ensembl
0  A4GALT  OMIM:111400          AD  ENSG00000128274
1    AAAS  OMIM:231550          AR  ENSG00000094914
(5358, 4)


In [71]:
# final dataframe
df = pd.merge(cv, hpo, on=['GENE', 'OMIM'], how='inner')

In [72]:
# Statistics

print(df.shape)
print('N of variants:', df['ID'].drop_duplicates().shape[0])
print('N of genes:', df['GENE'].drop_duplicates().shape[0])
print('N of gene-MIM associations:', df[['GENE', 'OMIM']].drop_duplicates(keep='first').shape[0])

(87985, 13)
N of variants: 74365
N of genes: 3578
N of gene-MIM associations: 4850
