In [31]:
import io
import os
import pandas as pd

In [32]:
# this will convert vcf to dataframe
def read_vcf(path):
    with open(path, 'r') as f:
        lines = [l for l in f if not l.startswith('##')]
    return pd.read_csv(
        io.StringIO(''.join(lines)),
        dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
               'QUAL': str, 'FILTER': str, 'INFO': str},
        sep='\t'
    ).rename(columns={'#CHROM': 'CHROM'})

In [33]:
'''
Example search:
1. Find BRCA1 gene on UCSC Genome Browser and get chr17:43044295-43125364 
2. Paste genome location into Clinvar search without the commas and filter for single nucleotide variation
3. On Clinvar website navigate to "FTP" on Home bar
4. Click on "vcf_GRCh38/"
5. Click on "clinvar.vcf.gz" and it will download VCF
6. Unzip the file and use that path below
'''

# ENTER your path to the vcf file
alz_vcf = read_vcf("C:\\Users\\lawfu\\Documents\\Github Hackathon 2025\\clinvar.vcf")

In [34]:
# show first 5 of dataframe
print(alz_vcf.head())

  CHROM    POS       ID REF ALT QUAL FILTER  \
0     1  66926  3385321  AG   A    .      .   
1     1  69134  2205837   A   G    .      .   
2     1  69314  3205580   T   G    .      .   
3     1  69423  3205581   G   A    .      .   
4     1  69581  2252161   C   G    .      .   

                                                INFO  
0  ALLELEID=3544463;CLNDISDB=Human_Phenotype_Onto...  
1  ALLELEID=2193183;CLNDISDB=MedGen:CN169374;CLND...  
2  ALLELEID=3374047;CLNDISDB=MedGen:CN169374;CLND...  
3  ALLELEID=3374048;CLNDISDB=MedGen:CN169374;CLND...  
4  ALLELEID=2238986;CLNDISDB=MedGen:CN169374;CLND...  


In [35]:
# this will help create a series from ; separators of vcf file
def get_condition(x):
    return x.split(";")[2]

In [36]:
# get only INFO column from dataframe
alz_vcf.INFO.map(get_condition)

0                         CLNDN=Retinitis_pigmentosa
1                                CLNDN=not_specified
2                                CLNDN=not_specified
3                                CLNDN=not_specified
4                                CLNDN=not_specified
                             ...                    
3368280                          CLNDN=not_specified
3368281                          CLNDN=not_specified
3368282                          CLNDN=not_specified
3368283                          CLNDN=not_specified
3368284    CLNDN=ABO_blood_group_system|not_provided
Name: INFO, Length: 3368285, dtype: object

In [37]:
counts_conditions = alz_vcf.INFO.map(lambda x: x.split(";")[2]).value_counts()
# count all items in column in INFO
print(counts_conditions)

INFO
CLNDN=not_provided                               643692
CLNDN=not_specified                              628798
CLNDISDB=MedGen:C3661900                         132026
CLNDN=Inborn_genetic_diseases                    114145
CLNDN=Hereditary_cancer-predisposing_syndrome     44164
                                                  ...  
ALLELEID=1695008                                      1
ALLELEID=522568                                       1
ALLELEID=722465                                       1
ALLELEID=561510                                       1
CLNDN=ABO_blood_group_system|not_provided             1
Name: count, Length: 322229, dtype: int64


In [38]:
counts_conditions = alz_vcf.INFO.map(lambda x: x.split(";")[2] if x.split(";")[2].startswith('CLNDN') else None).value_counts()

# count all items in column in INFO with just "CLNDN"
print(counts_conditions)

INFO
CLNDN=not_provided                                                     643692
CLNDN=not_specified                                                    628798
CLNDN=Inborn_genetic_diseases                                          114145
CLNDN=Hereditary_cancer-predisposing_syndrome                           44164
CLNDN=Cardiovascular_phenotype                                          23146
                                                                        ...  
CLNDN=Cohen_syndrome|not_specified|not_provided                             1
CLNDN=not_provided|VPS13B-related_disorder                                  1
CLNDN=Cohen_syndrome|not_provided|Abnormality_of_the_nervous_system         1
CLNDN=not_provided|Abnormality_of_the_nervous_system|Cohen_syndrome         1
CLNDN=ABO_blood_group_system|not_provided                                   1
Name: count, Length: 100057, dtype: int64


In [39]:
just_conditions = counts_conditions[~counts_conditions.index.isin(["CLNDN=not_provided","CLNDN=not_specified"])]  
# must put .index or it will only filter by the associated value and not the name

print(just_conditions)

INFO
CLNDN=Inborn_genetic_diseases                                          114145
CLNDN=Hereditary_cancer-predisposing_syndrome                           44164
CLNDN=Cardiovascular_phenotype                                          23146
CLNDN=Primary_ciliary_dyskinesia                                        15306
CLNDN=Inborn_genetic_diseases|not_provided                               9715
                                                                        ...  
CLNDN=Cohen_syndrome|not_specified|not_provided                             1
CLNDN=not_provided|VPS13B-related_disorder                                  1
CLNDN=Cohen_syndrome|not_provided|Abnormality_of_the_nervous_system         1
CLNDN=not_provided|Abnormality_of_the_nervous_system|Cohen_syndrome         1
CLNDN=ABO_blood_group_system|not_provided                                   1
Name: count, Length: 100055, dtype: int64


In [40]:
# list(map(print, just_conditions.head(5).index))
for condition in just_conditions.head(5).index:
    print(condition[6:])

Inborn_genetic_diseases
Hereditary_cancer-predisposing_syndrome
Cardiovascular_phenotype
Primary_ciliary_dyskinesia
Inborn_genetic_diseases|not_provided


In [41]:
top_cond = []
for condition in just_conditions.head(5).index:
    top_cond.append(str(condition[6:]))

print(top_cond)

['Inborn_genetic_diseases', 'Hereditary_cancer-predisposing_syndrome', 'Cardiovascular_phenotype', 'Primary_ciliary_dyskinesia', 'Inborn_genetic_diseases|not_provided']


In [42]:
# print(just_conditions.head(5))

top_cond_dict = {str(condition[6:]): just_conditions[condition] for condition in just_conditions.head(5).index}
# Print the dictionary
print(top_cond_dict)

{'Inborn_genetic_diseases': 114145, 'Hereditary_cancer-predisposing_syndrome': 44164, 'Cardiovascular_phenotype': 23146, 'Primary_ciliary_dyskinesia': 15306, 'Inborn_genetic_diseases|not_provided': 9715}
