In [77]:
import io
import os
import pandas as pd

In [78]:
# this will convert vcf to dataframe
def read_vcf(path):
    with open(path, 'r') as f:
        lines = [l for l in f if not l.startswith('##')]
    return pd.read_csv(
        io.StringIO(''.join(lines)),
        dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
               'QUAL': str, 'FILTER': str, 'INFO': str},
        sep='\t'
    ).rename(columns={'#CHROM': 'CHROM'})

In [79]:
'''
Example search:
1. Find BRCA1 gene on UCSC Genome Browser and get chr17:43044295-43125364 
2. Put in parameters of chromosome and start and end location into Defining the range code below
3. On Clinvar website navigate to "FTP" on Home bar
4. Click on "vcf_GRCh38/"
5. Click on "clinvar.vcf.gz" and it will download VCF
6. Unzip the file and use that path below
'''

# ENTER your path to the vcf file
alz_vcf_raw = read_vcf("C:\\Users\\lawfu\\Documents\\Github Hackathon 2025\\clinvar.vcf")

In [80]:
# show first 5 of dataframe
print(alz_vcf_raw.head())

  CHROM    POS       ID REF ALT QUAL FILTER  \
0     1  66926  3385321  AG   A    .      .   
1     1  69134  2205837   A   G    .      .   
2     1  69314  3205580   T   G    .      .   
3     1  69423  3205581   G   A    .      .   
4     1  69581  2252161   C   G    .      .   

                                                INFO  
0  ALLELEID=3544463;CLNDISDB=Human_Phenotype_Onto...  
1  ALLELEID=2193183;CLNDISDB=MedGen:CN169374;CLND...  
2  ALLELEID=3374047;CLNDISDB=MedGen:CN169374;CLND...  
3  ALLELEID=3374048;CLNDISDB=MedGen:CN169374;CLND...  
4  ALLELEID=2238986;CLNDISDB=MedGen:CN169374;CLND...  


In [81]:
# Define the range you want to find for the gene
chrom = '17'
start_pos = 43044295
end_pos = 43125364

# Filter the DataFrame to find the region within the range for chr##
alz_vcf = alz_vcf_raw[(alz_vcf_raw['CHROM'] == chrom) & 
                 (alz_vcf_raw['POS'] >= start_pos) & 
                 (alz_vcf_raw['POS'] <= end_pos) &
                 (alz_vcf_raw["ALT"].str.len() == 1) &
                 (alz_vcf_raw["REF"].str.len() == 1)]

# Print the result
print(alz_vcf)

total_var = len(alz_vcf)
print('Number of single nucleotide variations in chromosome region:', total_var)

        CHROM       POS       ID REF ALT QUAL FILTER  \
2708619    17  43044315   438907   T   A    .      .   
2708620    17  43044320  1336623   T   C    .      .   
2708622    17  43044342  1336422   T   C    .      .   
2708623    17  43044346   209232   C   T    .      .   
2708624    17  43044351   264789   C   T    .      .   
...       ...       ...      ...  ..  ..  ...    ...   
2723077    17  43125350  1194833   C   G    .      .   
2723078    17  43125357  1171499   G   A    .      .   
2723079    17  43125357   920681   G   C    .      .   
2723080    17  43125358  1685584   T   A    .      .   
2723081    17  43125360   323424   T   A    .      .   

                                                      INFO  
2708619  ALLELEID=432889;CLNDISDB=MedGen:CN169374;CLNDN...  
2708620  ALLELEID=1327634;CLNDISDB=MedGen:CN169374;CLND...  
2708622  ALLELEID=1327433;CLNDISDB=MedGen:CN169374;CLND...  
2708623  AF_EXAC=0.00500;AF_TGP=0.00419;ALLELEID=206190...  
2708624  AF_EXAC=0.015

In [82]:
# this will help create a series from ; separators of vcf file
def get_condition(x):
    return x.split(";")[2]

In [83]:
# get only INFO column from dataframe
alz_vcf.INFO.map(get_condition)

2708619                                  CLNDN=not_specified
2708620                                  CLNDN=not_specified
2708622                                  CLNDN=not_specified
2708623                                      ALLELEID=206190
2708624                                      ALLELEID=259533
                                 ...                        
2723077                                   CLNDN=not_provided
2723078    CLNDISDB=MONDO:MONDO:0015356,MeSH:D009386,MedG...
2723079    CLNDN=Hereditary_cancer-predisposing_syndrome|...
2723080                      CLNDN=Familial_cancer_of_breast
2723081      CLNDN=Hereditary_breast_ovarian_cancer_syndrome
Name: INFO, Length: 11046, dtype: object

In [90]:
counts_conditions = alz_vcf.INFO.map(lambda x: x.split(";")[2]).value_counts()
# count all items in column in INFO
print(counts_conditions)

INFO
CLNDN=Breast-ovarian_cancer,_familial,_susceptibility_to,_1                                                                                                                                                                                 2406
CLNDN=Hereditary_cancer-predisposing_syndrome                                                                                                                                                                                               1029
CLNDN=Hereditary_breast_ovarian_cancer_syndrome                                                                                                                                                                                             1006
CLNDN=Hereditary_cancer-predisposing_syndrome|Hereditary_breast_ovarian_cancer_syndrome                                                                                                                                                      692
CLNDN=Hereditary_breast_ovarian

In [98]:
print(counts_conditions[1000:1414])

INFO
CLNDISDB=MedGen:CN169374|MONDO:MONDO:0011450,MedGen:C2676676,OMIM:604370,Orphanet:145|MONDO:MONDO:0015356,MeSH:D009386,MedGen:C0027672,Orphanet:140162|MONDO:MONDO:0003582,MeSH:D061325,MedGen:C0677776,Orphanet:145|MedGen:CN221562|MedGen:C3661900                                                                                    1
CLNDN=Hereditary_cancer-predisposing_syndrome|Malignant_tumor_of_breast|Breast-ovarian_cancer,_familial,_susceptibility_to,_1|Hereditary_breast_ovarian_cancer_syndrome                                                                                                                                                                 1
CLNDISDB=.|MedGen:C3661900|MONDO:MONDO:0013685,MedGen:C3280442,OMIM:614320,Orphanet:1333|MONDO:MONDO:0011450,MedGen:C2676676,OMIM:604370,Orphanet:145|MONDO:MONDO:0054748,MedGen:C4554406,OMIM:617883|MONDO:MONDO:0003582,MeSH:D061325,MedGen:C0677776,Orphanet:145|MONDO:MONDO:0015356,MeSH:D009386,MedGen:C0027672,Orphanet:140162    1
CLNDN

In [85]:
counts_conditions = alz_vcf.INFO.map(lambda x: x.split(";")[2] if x.split(";")[2].startswith('CLNDN') else None).value_counts()

# count all items in column in INFO with just "CLNDN"
print(counts_conditions)

INFO
CLNDN=Breast-ovarian_cancer,_familial,_susceptibility_to,_1                                                                                                                                                                                                                                                                                                                      2406
CLNDN=Hereditary_cancer-predisposing_syndrome                                                                                                                                                                                                                                                                                                                                    1029
CLNDN=Hereditary_breast_ovarian_cancer_syndrome                                                                                                                                                                                                        

In [86]:
just_conditions = counts_conditions[~counts_conditions.index.isin(["CLNDN=not_provided","CLNDN=not_specified"])]  
# must put .index or it will only filter by the associated value and not the name

print(just_conditions)

INFO
CLNDN=Breast-ovarian_cancer,_familial,_susceptibility_to,_1                                                                                                                                                                                                                                                                                                                      2406
CLNDN=Hereditary_cancer-predisposing_syndrome                                                                                                                                                                                                                                                                                                                                    1029
CLNDN=Hereditary_breast_ovarian_cancer_syndrome                                                                                                                                                                                                        

In [87]:
# list(map(print, just_conditions.head(5).index))
for condition in just_conditions.head(5).index:
    print(condition[6:])

Breast-ovarian_cancer,_familial,_susceptibility_to,_1
Hereditary_cancer-predisposing_syndrome
Hereditary_breast_ovarian_cancer_syndrome
Hereditary_cancer-predisposing_syndrome|Hereditary_breast_ovarian_cancer_syndrome
Hereditary_breast_ovarian_cancer_syndrome|Hereditary_cancer-predisposing_syndrome


In [88]:
top_cond = []
for condition in just_conditions.head(5).index:
    top_cond.append(str(condition[6:]))

print(top_cond)

['Breast-ovarian_cancer,_familial,_susceptibility_to,_1', 'Hereditary_cancer-predisposing_syndrome', 'Hereditary_breast_ovarian_cancer_syndrome', 'Hereditary_cancer-predisposing_syndrome|Hereditary_breast_ovarian_cancer_syndrome', 'Hereditary_breast_ovarian_cancer_syndrome|Hereditary_cancer-predisposing_syndrome']


In [89]:
# print(just_conditions.head(5))

top_cond_dict = {str(condition[6:]): just_conditions[condition] for condition in just_conditions.head(5).index}
# Print the dictionary
print(top_cond_dict)

{'Breast-ovarian_cancer,_familial,_susceptibility_to,_1': 2406, 'Hereditary_cancer-predisposing_syndrome': 1029, 'Hereditary_breast_ovarian_cancer_syndrome': 1006, 'Hereditary_cancer-predisposing_syndrome|Hereditary_breast_ovarian_cancer_syndrome': 692, 'Hereditary_breast_ovarian_cancer_syndrome|Hereditary_cancer-predisposing_syndrome': 605}
