# **ClinVar VCF Parser**
### Author: Daniel Brock
### Date: 7/1/2023
### Purpose: to select pathogenic and likely pathogenic (PLP) and benign and likely benign (BLB) SNVs from ClinVar after 2020-11-21 (date of MutScore paper cutoff)

In [1]:
# Importing required packages
import io
import os
import pandas as pd

# Working directory
cwd = os.getcwd()
print(cwd)

C:\Users\TooFastDan\OneDrive - Baylor College of Medicine\BCM\Projects\Autosomal Dominant Predictor of IRDs\manuscript\GitHub


In [2]:
def read_vcf(path):
    """
    Function to parse a vcf file as a dataframe.
    Source: https://gist.github.com/dceoy/99d976a2c01e7f0ba1c813778f9db744
    Input: a file path to a VCF file from ClinVar
    Output: a pandas dataframe
    """
    with open(path, 'r') as f:
        lines = [l for l in f if not l.startswith('##')]
    return pd.read_csv(
        io.StringIO(''.join(lines)),
        dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str, 'QUAL': str, 'FILTER': str, 'INFO': str},
        sep='\t'
    ).rename(columns={'#CHROM': 'CHROM'})

In [4]:
# Importing the ClinVar VCF file (as of 2023-06-26)
# Source: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/
clinvar = read_vcf(path=cwd+"/ClinVar_Data/clinvar_20230626.vcf")
display(clinvar.shape)
display(clinvar.head())

(2181372, 8)

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO
0,1,69134,2205837,A,G,.,.,"ALLELEID=2193183;CLNDISDB=MeSH:D030342,MedGen:..."
1,1,69581,2252161,C,G,.,.,"ALLELEID=2238986;CLNDISDB=MeSH:D030342,MedGen:..."
2,1,69682,2396347,G,A,.,.,"ALLELEID=2386655;CLNDISDB=MeSH:D030342,MedGen:..."
3,1,69769,2288999,T,C,.,.,"ALLELEID=2278803;CLNDISDB=MeSH:D030342,MedGen:..."
4,1,69995,2351346,G,C,.,.,"ALLELEID=2333177;CLNDISDB=MeSH:D030342,MedGen:..."


In [5]:
clinvar["INFO"][0]

'ALLELEID=2193183;CLNDISDB=MeSH:D030342,MedGen:C0950123;CLNDN=Inborn_genetic_diseases;CLNHGVS=NC_000001.10:g.69134A>G;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=OR4F5:79501;MC=SO:0001583|missense_variant;ORIGIN=1'

## Adding new columns based on ClinVar's "INFO" column

In [6]:
# ClinVar Allele ID
pattern = r"ALLELEID=(.*?);"
clinvar["ALLELEID"] = clinvar["INFO"].str.extract(pattern)
print("Done with ALLELEID")

# tag-value pairs of disease database name and identifier
pattern = r"CLNDISDB=(.*?);"
clinvar["CLNDISDB"] = clinvar["INFO"].str.extract(pattern)
print("Done with CLNDISDB")

# ClinVar's preferred disease name for the concept specified by disease identifiers in CLNDISDB
pattern = r"CLNDN=(.*?);"
clinvar["CLNDN"] = clinvar["INFO"].str.extract(pattern)
print("Done with CLNDN")

# top-level (primary assembly, alt, or patch) HGVS expression
pattern = r"CLNHGVS=(.*?);"
clinvar["CLNHGVS"] = clinvar["INFO"].str.extract(pattern)
print("Done with CLNHGVS")

# ClinVar's review status for the Variation ID
pattern = r"CLNREVSTAT=(.*?);"
clinvar["CLNREVSTAT"] = clinvar["INFO"].str.extract(pattern)
print("Done with CLNREVSTAT")

# clinical significance for this single variant; multiple values are separated by a vertical bar
pattern = r"CLNSIG=(.*?);"
clinvar["CLNSIG"] = clinvar["INFO"].str.extract(pattern)
print("Done with CLNSIG")

# variant type
pattern = r"CLNVC=(.*?);"
clinvar["CLNVC"] = clinvar["INFO"].str.extract(pattern)
print("Done with CLNVC")

# Sequence Ontology ID for the variant type (www.sequenceontology.org)
pattern = r"CLNVCSO=(.*?);"
clinvar["CLNVCSO"] = clinvar["INFO"].str.extract(pattern)
print("Done with CLNVCSO")

# gene(s) for the variant reported as gene symbol:NCBI GeneID. The gene symbol and ID are delimited by a colon and each pair is delimited by a vertical bar.
pattern = r"GENEINFO=(.*?);"
clinvar["GENEINFO"] = clinvar["INFO"].str.extract(pattern)
clinvar[["Gene", "Gene_ID"]] = clinvar["GENEINFO"].str.split(":", 1, expand=True)
print("Done with GENEINFO")

# comma separated list of molecular consequence in the form of Sequence Ontology ID|molecular_consequence
pattern = r"MC=(.*?);"
clinvar["MC"] = clinvar["INFO"].str.extract(pattern)
print("Done with MC")

# Allele origin reported to ClinVar
# 0 = unknown
# 1 = germline
# 2 = somatic 
# 4 = inherited
# 8 = paternal 
# 16 = maternal 
# 32 = de-novo
# 64 = biparental
# 128 = uniparental 
# 256 = not-tested
# 512 = tested-inconclusive
# 1073741824 = other
pattern = r"ORIGIN=(.*?);"
clinvar["ORIGIN"] = clinvar["INFO"].str.extract(pattern)
print("Done with ORIGIN")

display(clinvar.head())

Done with ALLELEID
Done with CLNDISDB
Done with CLNDN
Done with CLNHGVS
Done with CLNREVSTAT
Done with CLNSIG
Done with CLNVC
Done with CLNVCSO
Done with GENEINFO
Done with MC
Done with ORIGIN


Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,ALLELEID,CLNDISDB,...,CLNHGVS,CLNREVSTAT,CLNSIG,CLNVC,CLNVCSO,GENEINFO,Gene,Gene_ID,MC,ORIGIN
0,1,69134,2205837,A,G,.,.,"ALLELEID=2193183;CLNDISDB=MeSH:D030342,MedGen:...",2193183,"MeSH:D030342,MedGen:C0950123",...,NC_000001.10:g.69134A>G,"criteria_provided,_single_submitter",Likely_benign,single_nucleotide_variant,SO:0001483,OR4F5:79501,OR4F5,79501,SO:0001583|missense_variant,
1,1,69581,2252161,C,G,.,.,"ALLELEID=2238986;CLNDISDB=MeSH:D030342,MedGen:...",2238986,"MeSH:D030342,MedGen:C0950123",...,NC_000001.10:g.69581C>G,"criteria_provided,_single_submitter",Uncertain_significance,single_nucleotide_variant,SO:0001483,OR4F5:79501,OR4F5,79501,SO:0001583|missense_variant,
2,1,69682,2396347,G,A,.,.,"ALLELEID=2386655;CLNDISDB=MeSH:D030342,MedGen:...",2386655,"MeSH:D030342,MedGen:C0950123",...,NC_000001.10:g.69682G>A,"criteria_provided,_single_submitter",Uncertain_significance,single_nucleotide_variant,SO:0001483,OR4F5:79501,OR4F5,79501,SO:0001583|missense_variant,
3,1,69769,2288999,T,C,.,.,"ALLELEID=2278803;CLNDISDB=MeSH:D030342,MedGen:...",2278803,"MeSH:D030342,MedGen:C0950123",...,NC_000001.10:g.69769T>C,"criteria_provided,_single_submitter",Uncertain_significance,single_nucleotide_variant,SO:0001483,OR4F5:79501,OR4F5,79501,SO:0001583|missense_variant,
4,1,69995,2351346,G,C,.,.,"ALLELEID=2333177;CLNDISDB=MeSH:D030342,MedGen:...",2333177,"MeSH:D030342,MedGen:C0950123",...,NC_000001.10:g.69995G>C,"criteria_provided,_single_submitter",Uncertain_significance,single_nucleotide_variant,SO:0001483,OR4F5:79501,OR4F5,79501,SO:0001583|missense_variant,


## Filtering for BLB and PLP for any SNV reported to ClinVar after 2020-11-21 (MutScore training cutoff)

In [7]:
clinvar.columns

Index(['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
       'ALLELEID', 'CLNDISDB', 'CLNDN', 'CLNHGVS', 'CLNREVSTAT', 'CLNSIG',
       'CLNVC', 'CLNVCSO', 'GENEINFO', 'Gene', 'Gene_ID', 'MC', 'ORIGIN'],
      dtype='object')

In [8]:
clinvar["CLNVC"].unique()

array(['single_nucleotide_variant', 'Microsatellite', 'Indel', 'Deletion',
       'Duplication', 'Insertion', 'Variation', 'Inversion'], dtype=object)

In [9]:
clinvar["CLNSIG"].unique()

array(['Likely_benign', 'Uncertain_significance', 'Benign',
       'Conflicting_interpretations_of_pathogenicity', 'Pathogenic',
       'Likely_pathogenic', 'Benign/Likely_benign', 'not_provided',
       'Pathogenic/Likely_pathogenic', nan, 'risk_factor', 'Affects',
       'association', 'Benign|other',
       'Conflicting_interpretations_of_pathogenicity|other',
       'drug_response',
       'Conflicting_interpretations_of_pathogenicity|association',
       'Uncertain_risk_allele', 'other',
       'Uncertain_significance|risk_factor',
       'Likely_pathogenic|risk_factor', 'Likely_benign|association',
       'Likely_risk_allele', 'Pathogenic/Likely_pathogenic|other',
       'Pathogenic|other',
       'Pathogenic/Likely_pathogenic/Pathogenic,_low_penetrance|other',
       'Pathogenic/Likely_pathogenic/Pathogenic,_low_penetrance',
       'protective', 'Pathogenic|risk_factor',
       'Pathogenic/Likely_pathogenic|risk_factor',
       'Benign/Likely_benign|risk_factor',
       'Uncerta

In [10]:
# Filtering for SNVs
clinvar_filt = clinvar[clinvar["CLNVC"]=="single_nucleotide_variant"]
print("Clinvar unfiltered:", clinvar.shape)
print("Filtered for SNVs:", clinvar_filt.shape)

# Filtering out dates AFTER 2020-11-21
clinvar_filt["ID"] = clinvar_filt["ID"].astype(int)
clinvar_filt = clinvar_filt.sort_values(by="ID", ascending=False)
clinvar_filt = clinvar_filt[clinvar_filt["ID"] > 986732]  #if you look up ID 986732 in clinvar, this is the first SNV reported to clinvar after 2020-11-21
print("Filtered for dates after 2020-11-21:", clinvar_filt.shape)

# Filtering for variants with PLP and BLB annotations only 
pathogenecity = ["Benign", "Likely_benign", "Benign/Likely_benign", "Pathogenic", "Likely_pathogenic", "Pathogenic/Likely_pathogenic"]
clinvar_filt = clinvar_filt[clinvar_filt["CLNSIG"].isin(pathogenecity)]
print("Filtered for pathogenicity annotations:", clinvar_filt.shape)

Clinvar unfiltered: (2181372, 21)
Filtered for SNVs: (1991708, 21)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinvar_filt["ID"] = clinvar_filt["ID"].astype(int)


Filtered for dates after 2020-11-21: (1291946, 21)
Filtered for pathogenicity annotations: (573694, 21)


In [11]:
# Cleaning and mapping data 

# Mapping and binarizing pathogenicity
clinvar_filt["PATHOGENICITY"] = clinvar_filt["CLNSIG"].map(
    {"Benign":"BLB", "Likely_benign":"BLB", "Benign/Likely_benign":"BLB", "Pathogenic":"PLP", "Likely_pathogenic":"PLP", "Pathogenic/Likely_pathogenic":"PLP"})
clinvar_filt["y_test"] = clinvar_filt["PATHOGENICITY"].map({"PLP": 1, "BLB": 0})

# Renaming chromosomes for ANNOVAR input
chromosomes = {'1':'chr1', '2':'chr2', '3':'chr3', '4':'chr4', '5':'chr5', 
                '6':'chr6', '7':'chr7', '8':'chr8', '9':'chr9', '10':'chr10', 
                '11':'chr11', '12':'chr12', '13':'chr13', '14':'chr14', '15':'chr15', 
                '16':'chr16', '17':'chr17', '18':'chr18', '19':'chr19', '20':'chr20', 
                '21':'chr21', '22':'chr22', 'X':'chrX', 'Y':'chrY', 'MT':'M'}
clinvar_filt["CHROM"] = clinvar_filt["CHROM"].map(chromosomes)

# Making the end position the same name as the start position for SNVs and reordering/renaming columns for ANNOVAR input
clinvar_filt["END"] = clinvar_filt["POS"]
clinvar_filt = clinvar_filt[['CHROM', 'POS', 'END', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'ID', 'ALLELEID', 'CLNDISDB', 'CLNDN', 'CLNHGVS', 'CLNREVSTAT', 'CLNVC', 'CLNVCSO', 'GENEINFO', 'Gene', 'Gene_ID', 'MC', 'ORIGIN', 'CLNSIG', 'PATHOGENICITY', 'y_test']]
clinvar_filt.columns = ['CHROM', 'START', 'END', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'ID', 'ALLELEID', 'CLNDISDB', 'CLNDN', 'CLNHGVS', 'CLNREVSTAT', 'CLNVC', 'CLNVCSO', 'GENEINFO', 'Gene', 'Gene_ID', 'MC', 'ORIGIN', 'CLNSIG', 'PATHOGENICITY', 'y_test']

# Displaying the filtered df
display(clinvar_filt.shape)
display(clinvar_filt.head())

(573694, 24)

Unnamed: 0,CHROM,START,END,REF,ALT,QUAL,FILTER,INFO,ID,ALLELEID,...,CLNVC,CLNVCSO,GENEINFO,Gene,Gene_ID,MC,ORIGIN,CLNSIG,PATHOGENICITY,y_test
2153503,chrX,107869448,107869448,G,A,.,.,"ALLELEID=35936;CLNDISDB=MONDO:MONDO:0010520,Me...",2506495,35936,...,single_nucleotide_variant,SO:0001483,COL4A5:1287,COL4A5,1287,SO:0001583|missense_variant,1.0,Likely_pathogenic,PLP,1
2153970,chrX,107930842,107930842,C,A,.,.,"ALLELEID=2671452;CLNDISDB=MONDO:MONDO:0010520,...",2506494,2671452,...,single_nucleotide_variant,SO:0001483,COL4A5:1287,COL4A5,1287,SO:0001587|nonsense,,Likely_pathogenic,PLP,1
1081798,chr10,16870957,16870957,G,T,.,.,"ALLELEID=2671451;CLNDISDB=MONDO:MONDO:0030042,...",2506493,2671451,...,single_nucleotide_variant,SO:0001483,CUBN:8029,CUBN,8029,SO:0001587|nonsense,,Likely_pathogenic,PLP,1
382434,chr2,228124576,228124576,G,A,.,.,"ALLELEID=2671449;CLNDISDB=MONDO:MONDO:0008762,...",2506491,2671449,...,single_nucleotide_variant,SO:0001483,COL4A3:1285|MFF-DT:654841,COL4A3,1285|MFF-DT:654841,SO:0001583|missense_variant,,Pathogenic,PLP,1
382921,chr2,228145714,228145714,G,A,.,.,"ALLELEID=2671448;CLNDISDB=MONDO:MONDO:0008762,...",2506490,2671448,...,single_nucleotide_variant,SO:0001483,COL4A3:1285|MFF-DT:654841,COL4A3,1285|MFF-DT:654841,SO:0001583|missense_variant,,Likely_pathogenic,PLP,1


In [12]:
clinvar_filt.columns

Index(['CHROM', 'START', 'END', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'ID',
       'ALLELEID', 'CLNDISDB', 'CLNDN', 'CLNHGVS', 'CLNREVSTAT', 'CLNVC',
       'CLNVCSO', 'GENEINFO', 'Gene', 'Gene_ID', 'MC', 'ORIGIN', 'CLNSIG',
       'PATHOGENICITY', 'y_test'],
      dtype='object')

In [13]:
clinvar_filt["CLNSIG"].value_counts()

Likely_benign                   433231
Benign                           87625
Pathogenic                       24480
Likely_pathogenic                22855
Benign/Likely_benign              3691
Pathogenic/Likely_pathogenic      1812
Name: CLNSIG, dtype: int64

In [14]:
clinvar_filt["PATHOGENICITY"].value_counts()

BLB    524547
PLP     49147
Name: PATHOGENICITY, dtype: int64

In [15]:
clinvar_filt.dtypes

CHROM            object
START             int32
END               int32
REF              object
ALT              object
QUAL             object
FILTER           object
INFO             object
ID                int32
ALLELEID         object
CLNDISDB         object
CLNDN            object
CLNHGVS          object
CLNREVSTAT       object
CLNVC            object
CLNVCSO          object
GENEINFO         object
Gene             object
Gene_ID          object
MC               object
ORIGIN           object
CLNSIG           object
PATHOGENICITY    object
y_test            int64
dtype: object

## Exporting as a txt file with the extention "avinput"

In [16]:
#clinvar_filt.to_csv(cwd+'/annovar_files/clinvar_filt_20201128-20230626.avinput', sep="\t", index=False, header=False)

### Optional filtering using only RetNet genes

In [17]:
# Importing retnet genes
retnet = pd.read_excel(cwd+"/RetNet/RetNet_AD-AR_FINALIZED.xlsx", sheet_name="gene_info")
retnet_genes = retnet["gene"]

# Filtering filtered ClinVar variants to only include RetNet Genes
clinvar_filt_retnet = clinvar_filt[clinvar_filt["Gene"].isin(retnet_genes)]
display(clinvar_filt_retnet.shape)
display(clinvar_filt_retnet.head())

(62696, 24)

Unnamed: 0,CHROM,START,END,REF,ALT,QUAL,FILTER,INFO,ID,ALLELEID,...,CLNVC,CLNVCSO,GENEINFO,Gene,Gene_ID,MC,ORIGIN,CLNSIG,PATHOGENICITY,y_test
1107483,chr10,71142347,71142347,C,A,.,.,"ALLELEID=2671427;CLNDISDB=MONDO:MONDO:0032807,...",2506469,2671427,...,single_nucleotide_variant,SO:0001483,HK1:3098,HK1,3098,SO:0001583|missense_variant,,Likely_pathogenic,PLP,1
1989824,chr20,10654126,10654126,A,C,.,.,"ALLELEID=2671423;CLNDISDB=MONDO:MONDO:0016862,...",2506465,2671423,...,single_nucleotide_variant,SO:0001483,JAG1:182,JAG1,182,SO:0001583|missense_variant,,Likely_pathogenic,PLP,1
749118,chr6,65300662,65300662,T,A,.,.,ALLELEID=2671350;CLNDISDB=Human_Phenotype_Onto...,2506396,2671350,...,single_nucleotide_variant,SO:0001483,EYS:346007,EYS,346007,SO:0001587|nonsense,,Pathogenic,PLP,1
537620,chr4,39255536,39255536,C,T,.,.,"ALLELEID=2671345;CLNDISDB=MONDO:MONDO:0013719,...",2506391,2671345,...,single_nucleotide_variant,SO:0001483,WDR19:57728,WDR19,57728,SO:0001587|nonsense,,Likely_pathogenic,PLP,1
1320901,chr12,48380959,48380959,C,T,.,.,"ALLELEID=2671226;CLNDISDB=MONDO:MONDO:0008702,...",2506270,2671226,...,single_nucleotide_variant,SO:0001483,COL2A1:1280,COL2A1,1280,SO:0001583|missense_variant,,Likely_pathogenic,PLP,1


In [18]:
len(clinvar_filt_retnet["Gene"].unique())

265

## Exporting filtered RetNet SNVs as a txt file with the extention "avinput"

In [19]:
clinvar_filt_retnet.to_csv(cwd+'/annovar_files/clinvar_filt_retnet.avinput', sep="\t", index=False, header=False)