In [22]:
import pandas as pd
import pathlib as pl

In [23]:
extended_table_path = '../2nd_run.csv'
extended_table_df = pd.read_csv(extended_table_path, sep=',', low_memory=False)

In [24]:
varicarta_df = extended_table_df[extended_table_df['Varicarta'] == 1]
print(f"varicarta_df shape: {varicarta_df.shape}")

varicarta_df shape: (365351, 119)


In [25]:
# print rows where the chr col is not number between 1-22 or X or Y or MT
suspect_snv = varicarta_df[~varicarta_df['chr'].str.match(r'^(chr)?([1-9]|1[0-9]|2[0-2]|X|Y|MT)$')]
print(f"suspect_snv shape: {suspect_snv.shape}")

# remove suspect_snv from varicarta_df
varicarta_df = varicarta_df[~varicarta_df['chr'].isin(suspect_snv['chr'])]
print(f"varicarta_df shape after removing suspect_snv: {varicarta_df.shape}")

suspect_snv shape: (288, 119)
varicarta_df shape after removing suspect_snv: (365063, 119)


In [26]:
# remove duplicates variant according to base cols 'chr', 'pos', 'ref' and 'alt'
varicarta_df = varicarta_df.drop_duplicates(subset=['chr', 'pos', 'ref', 'alt'])
print(f"varicarta_df shape after removing duplicates: {varicarta_df.shape}")

varicarta_df shape after removing duplicates: (329096, 119)


In [27]:
print(f"varicarta_df columns: {varicarta_df.columns.tolist()}")

varicarta_df columns: ['chr', 'pos', 'ref', 'alt', 'AF', 'AFR_AF', 'AMR_AF', 'APPRIS', 'Allele', 'Amino_acids', 'BIOTYPE', 'BLOSUM62', 'CANONICAL', 'CCDS', 'CDS_position', 'CLIN_SIG', 'Codons', 'Condel', 'Consequence', 'DISTANCE', 'DOMAINS', 'EAS_AF', 'ENSP', 'EUR_AF', 'EXON', 'Enformer_SAD', 'Enformer_SAR', 'Existing_variation', 'FLAGS', 'Feature', 'Feature_type', 'FlagLRG', 'GENE_PHENO', 'Gene', 'HGNC_ID', 'HGVS_OFFSET', 'HGVSc', 'HGVSp', 'IMPACT', 'INTRON', 'LOVD', 'LoFtool', 'Location', 'MANE', 'MANE_PLUS_CLINICAL', 'MANE_SELECT', 'MAX_AF', 'MAX_AF_POPS', 'MINIMISED', 'MaveDB_nt', 'MaveDB_pro', 'MaveDB_score', 'MaveDB_urn', 'NMD', 'PHENO', 'PUBMED', 'PhastCons100', 'PhenotypeOrthologous_Mouse_geneid', 'PhenotypeOrthologous_Mouse_phenotype', 'PhenotypeOrthologous_Rat_geneid', 'PhenotypeOrthologous_Rat_phenotype', 'PolyPhen', 'Protein_position', 'REVEL', 'SAS_AF', 'SIFT', 'SOMATIC', 'STRAND', 'SWISSPROT', 'SYMBOL', 'SYMBOL_SOURCE', 'TREMBL', 'TSL', 'UNIPARC', 'UNIPROT_ISOFORM', 'VARI

In [28]:
print(varicarta_df['VARIANT_CLASS'].value_counts())

VARIANT_CLASS
SNV                    281626
deletion                25758
insertion               16099
substitution             3645
indel                    1912
sequence_alteration        56
Name: count, dtype: int64


In [29]:
snv_df = varicarta_df[varicarta_df['VARIANT_CLASS'] == 'SNV']

In [30]:
print(f"snv_df shape: {snv_df.shape}")

snv_df shape: (281626, 119)


In [31]:
suspect_snv = varicarta_df[(varicarta_df['VARIANT_CLASS'] == 'SNV') & ((varicarta_df['ref'].str.len() != 1) | (varicarta_df['alt'].str.len() != 1))]

In [32]:
print(f"suspect_snv shape: {suspect_snv.shape}")

suspect_snv shape: (54, 119)


In [33]:
print(suspect_snv)

       chr        pos ref  alt      AF  AFR_AF  AMR_AF APPRIS Allele  \
353930   4  113358474   G  A/G     NaN     NaN     NaN     A2      A   
353932   4  113369515   G  A/G     NaN     NaN     NaN     A2      A   
353993   9     429744   G  A/G     NaN     NaN     NaN    NaN      A   
354018   2  232806547   T  G/T     NaN     NaN     NaN     P4      G   
354020   2  232811337   G  C/G     NaN     NaN     NaN     P4      C   
354021   2  232812437   G  A/G     NaN     NaN     NaN     P4      A   
354035   3    4675221   G  A/G     NaN     NaN     NaN    NaN      A   
354084   2  182956552   G  C/G     NaN     NaN     NaN     P4      C   
354085   2  182982926   G  A/G     NaN     NaN     NaN     P4      A   
354086   2  182989037   T  G/T  0.0002  0.0000  0.0000     P4      G   
354087   2  183023873   T  C/T     NaN     NaN     NaN     P4      C   
354088   2  183023885   T  C/T     NaN     NaN     NaN     P4      C   
354096   1  151428188   G  C/G     NaN     NaN     NaN     P3   

In [34]:
snv_df[['Allele','alt']]

Unnamed: 0,Allele,alt
30,T,T
131,T,T
161,C,C
162,C,C
163,T,T
...,...,...
433884,C,C
433885,G,G
433886,A,A
433887,A,A


In [35]:
snv_df['alt'] = snv_df['Allele']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snv_df['alt'] = snv_df['Allele']


In [36]:
print(f"Numbers of snv which 'alt' len != 1: \n{snv_df[snv_df['alt'].str.len() != 1]}")
print(f"snv_df shape after fixing 'alt' column: {snv_df.shape}")


Numbers of snv which 'alt' len != 1: 
Empty DataFrame
Columns: [chr, pos, ref, alt, AF, AFR_AF, AMR_AF, APPRIS, Allele, Amino_acids, BIOTYPE, BLOSUM62, CANONICAL, CCDS, CDS_position, CLIN_SIG, Codons, Condel, Consequence, DISTANCE, DOMAINS, EAS_AF, ENSP, EUR_AF, EXON, Enformer_SAD, Enformer_SAR, Existing_variation, FLAGS, Feature, Feature_type, FlagLRG, GENE_PHENO, Gene, HGNC_ID, HGVS_OFFSET, HGVSc, HGVSp, IMPACT, INTRON, LOVD, LoFtool, Location, MANE, MANE_PLUS_CLINICAL, MANE_SELECT, MAX_AF, MAX_AF_POPS, MINIMISED, MaveDB_nt, MaveDB_pro, MaveDB_score, MaveDB_urn, NMD, PHENO, PUBMED, PhastCons100, PhenotypeOrthologous_Mouse_geneid, PhenotypeOrthologous_Mouse_phenotype, PhenotypeOrthologous_Rat_geneid, PhenotypeOrthologous_Rat_phenotype, PolyPhen, Protein_position, REVEL, SAS_AF, SIFT, SOMATIC, STRAND, SWISSPROT, SYMBOL, SYMBOL_SOURCE, TREMBL, TSL, UNIPARC, UNIPROT_ISOFORM, VARIANT_CLASS, Varicarta, am_class, am_genome, am_pathogenicity, am_protein_variant, am_transcript_id, am_uniprot_

In [37]:
print(f"SNV ratio: {len(snv_df) / len(varicarta_df)}")
print(f"deletion ratio: {len(varicarta_df[varicarta_df['VARIANT_CLASS'] == 'deletion']) / len(varicarta_df)}")
print(f"insertion ratio: {len(varicarta_df[varicarta_df['VARIANT_CLASS'] == 'insertion']) / len(varicarta_df)}")

SNV ratio: 0.8557563750395022
deletion ratio: 0.07826895495539296
insertion ratio: 0.04891885650387729


In [38]:
print(snv_df['BIOTYPE'].value_counts().to_list)

<bound method IndexOpsMixin.tolist of BIOTYPE
protein_coding                        191032
lncRNA                                 49546
processed_pseudogene                    2091
protein_coding_CDS_not_defined          1358
enhancer                                 797
unprocessed_pseudogene                   655
transcribed_unprocessed_pseudogene       652
nonsense_mediated_decay                  370
IG_V_gene                                314
misc_RNA                                 269
snRNA                                    238
protein_coding_LoF                       138
TEC                                      134
retained_intron                          123
miRNA                                     89
snoRNA                                    84
IG_J_gene                                 72
transcribed_unitary_pseudogene            72
transcribed_processed_pseudogene          72
rRNA_pseudogene                           69
open_chromatin_region                     69
IG_C_gene

In [39]:
protein_coding_df = snv_df[snv_df['BIOTYPE'] == 'protein_coding']
print(f"protein_coding_df shape: {protein_coding_df.shape}")

protein_coding_df shape: (191032, 119)


In [40]:
print(f"protein_coding_df shape: {protein_coding_df.shape}")
print(f"protein_coding ratio: {len(protein_coding_df) / len(snv_df)}")
print(f"incRNA ratio: {len(snv_df[snv_df['BIOTYPE'] == 'lncRNA']) / len(snv_df)}")
print(f"processed_pseudogene ratio: {len(snv_df[snv_df['BIOTYPE'] == 'processed_pseudogene']) / len(snv_df)}")
print(f"unclassified ratio: {len(snv_df[snv_df['BIOTYPE'].isna()]) / len(snv_df)}")


protein_coding_df shape: (191032, 119)
protein_coding ratio: 0.6783180530206728
incRNA ratio: 0.17592835888731864
processed_pseudogene ratio: 0.00742474061343768
unclassified ratio: 0.11758857491850895


In [43]:
cur_dir = pl.Path.cwd()
print(f"Current directory: {cur_dir}")
clinvar_path = pl.Path.home() / 'Nave_Oded_Project' / 'resources' / 'DBs' / 'hg38' / 'validation' / 'ClinVar' / 'clinvar_20250715.vcf'
clinvar_path = "/home/alu/aluguest/Nave_Oded_Project/resources/DBs/hg38/validation/ClinVar/clinvar_20250715.vcf"

print(f"Clinvar path: {clinvar_path}")

Current directory: /home/alu/aluguest/Nave_Oded_Project/extended_variants_table/table_analysis
Clinvar path: /home/alu/aluguest/Nave_Oded_Project/resources/DBs/hg38/validation/ClinVar/clinvar_20250715.vcf


In [44]:
# load clinvar vcf file
clinvar_df = pd.read_csv(clinvar_path, sep='\t', comment='#', low_memory=False, header=None,
                         names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO'])
print(f"clinvar_df shape: {clinvar_df.shape}")

clinvar_df shape: (3660218, 8)


In [45]:
print(clinvar_df.head())

  CHROM    POS       ID REF ALT QUAL FILTER  \
0     1  66926  3385321  AG   A    .      .   
1     1  69134  2205837   A   G    .      .   
2     1  69308  3925305   A   G    .      .   
3     1  69314  3205580   T   G    .      .   
4     1  69404  3925306   T   C    .      .   

                                                INFO  
0  ALLELEID=3544463;CLNDISDB=Human_Phenotype_Onto...  
1  ALLELEID=2193183;CLNDISDB=MedGen:CN169374;CLND...  
2  ALLELEID=4039319;CLNDISDB=MedGen:CN169374;CLND...  
3  ALLELEID=3374047;CLNDISDB=MedGen:CN169374;CLND...  
4  ALLELEID=4039320;CLNDISDB=MedGen:CN169374;CLND...  


In [46]:
print(clinvar_df['CHROM'].value_counts().sort_index())

CHROM
1                 325757
10                136149
11                216980
12                166750
13                 74017
14                117025
15                129345
16                192756
17                225065
18                 62632
19                205208
2                 330801
20                 77999
21                 42433
22                 80286
3                 199931
4                 129539
5                 180589
6                 163251
7                 178767
8                 124288
9                 163369
MT                  3102
NT_113889.1            1
NT_187633.1           13
NT_187661.1            8
NT_187693.1           10
NW_009646201.1         1
X                 134033
Y                    113
Name: count, dtype: int64


In [47]:
# rename CHROM to chr
clinvar_df.rename(columns={'CHROM': 'chr', 'POS': 'pos', 'REF': 'ref', 'ALT': 'alt'}, inplace=True)
print(clinvar_df.columns.tolist())

['chr', 'pos', 'ID', 'ref', 'alt', 'QUAL', 'FILTER', 'INFO']


In [48]:

def parse_info_field(info_str):
    """
    Parse a VCF INFO string into a dict.
    Fields without '=' become boolean True entries (flags).
    """
    d = {}
    for item in info_str.split(';'):
        if not item:  # guard against empty items
            continue
        if '=' in item:
            k, v = item.split('=', 1)
            d[k] = v
        else:
            d[item] = True
    return d

# Build a list of dicts (Python list comprehension is quite fast in CPython)
info_dicts = [parse_info_field(s) for s in clinvar_df['INFO'].astype(str)]

# Expand into a DataFrame
info_expanded = pd.DataFrame(info_dicts)

# Merge back with the base columns
clinvar_expanded_df = pd.concat([clinvar_df.drop(columns=['INFO']), info_expanded], axis=1)

print(clinvar_expanded_df.shape)
clinvar_expanded_df.head()


(3660218, 39)


Unnamed: 0,chr,pos,ID,ref,alt,QUAL,FILTER,ALLELEID,CLNDISDB,CLNDN,...,ONCDISDB,ONCDN,ONCREVSTAT,ONCSCV,SCI,SCIREVSTAT,DBVARID,SCIDISDB,SCIDN,SCISCV
0,1,66926,3385321,AG,A,.,.,3544463,"Human_Phenotype_Ontology:HP:0000547,MONDO:MOND...",Retinitis_pigmentosa,...,,,,,,,,,,
1,1,69134,2205837,A,G,.,.,2193183,MedGen:CN169374,not_specified,...,,,,,,,,,,
2,1,69308,3925305,A,G,.,.,4039319,MedGen:CN169374,not_specified,...,,,,,,,,,,
3,1,69314,3205580,T,G,.,.,3374047,MedGen:CN169374,not_specified,...,,,,,,,,,,
4,1,69404,3925306,T,C,.,.,4039320,MedGen:CN169374,not_specified,...,,,,,,,,,,


In [51]:
ariel_rona_path = "/home/alu/davidg/Scripts/Nave_Oded_autism_project/S1.tsv"
ariel_rona_df = pd.read_csv(ariel_rona_path, sep='\t', low_memory=False)
ariel_rona_df

Unnamed: 0,orig_name,location,gene,strand,mutation,mol_conseq,clin_significance,phenotype_list,num_submit,last_date_eval,...,ADAR_motif,NGG_editing_window,MIT,count_DNA_bystander_edits,count_DNA_pathogenic_bystander_edits,improv_new_codon,improv_editing_tech,SIFT_score,liver_expression,brain_expression
0,10|NM_000410.4(HFE):c.187C>G (p.His63Asp),chr6 : 26090951,HFE,+,C>G,missense variant,"Conflicting interpretations of pathogenicity,o...",Hemochromatosis type 1|Microvascular complicat...,29,"Sep 15,2021",...,not relevant,,,0,0,(CAT)GAT>GGT ; (His)Asp>Gly,A-to-G (1),0.24>0.25,no,no
1,100|NM_000355.4(TCN2):c.581-176A>T,chr22 : 30615125,TCN2,+,A>T,intron variant,Pathogenic,Transcolabamin II deficiency,1,"Jun 01,2009",...,not relevant,,,0,0,,,,no,no
2,10000|NM_001278116.2(L1CAM):c.2974C>T (p.Gln99...,chrX : 153864893,L1CAM,-,C>T,nonsense,Pathogenic,"Hydrocephalus,X-linked,with congenital idiopat...",2,"Jan 05,2017",...,not relevant,chrX:153864890-153864894,67,0,0,not relevant,not relevant,not relevant,no,no
3,1000050|NM_001165963.4(SCN1A):c.655A>G (p.Arg2...,chr2 : 166052891,SCN1A,-,A>G,missense variant,Conflicting interpretations of pathogenicity,Early infantile epileptic encephalopathy with ...,2,"Jan 01,2021",...,not relevant,,,0,0,,,,no,yes
4,100009|NM_000709.4(BCKDHA):c.1312T>A (p.Tyr438...,chr19 : 41424582,BCKDHA,+,T>A,missense variant,Pathogenic/Likely pathogenic,Maple syrup urine disease type 1A|not provided...,14,"Oct 26,2020",...,not relevant,,,0,0,,,,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98508,99939|NM_133433.4(NIPBL):c.7301A>G (p.Asn2434Ser),chr5 : 37057223,NIPBL,+,A>G,missense variant,Likely pathogenic,Cornelia de Lange syndrome 1|not provided,2,"Aug 31,2016",...,not relevant,,,0,0,,,,no,no
98509,9994|NM_001278116.2(L1CAM):c.536T>G (p.Ile179Ser),chrX : 153870948,L1CAM,-,T>G,missense variant,Pathogenic,MASA syndrome,1,"Jul 01,1995",...,not relevant,,,0,0,,,,no,no
98510,99940|NM_133433.4(NIPBL):c.7327C>T (p.Gln2443Ter),chr5 : 37057249,NIPBL,+,C>T,nonsense,Pathogenic,Cornelia de Lange syndrome 1|Abnormality of br...,2,"Feb 08,2013",...,not relevant,,,0,0,,,,no,no
98511,9995|NM_001278116.2(L1CAM):c.1108G>A (p.Gly370...,chrX : 153869818,L1CAM,-,G>A,missense variant,Pathogenic,MASA syndrome|not provided|Spastic paraplegia|...,4,"Jul 07,2020",...,yes,chrX:153869818-153869822,83,2,0,,,,no,no


In [52]:
print(f"protein_coding_df shape: {protein_coding_df.shape}")

protein_coding_df shape: (191032, 119)


In [53]:
protein_coding_df.head()

Unnamed: 0,chr,pos,ref,alt,AF,AFR_AF,AMR_AF,APPRIS,Allele,Amino_acids,...,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_REMAINING_AF,gnomADg_SAS_AF,miRNA,pLI_gene_value,is_ADAR_fixable,is_APOBEC_fixable,hg38
30,4,84836960,C,T,,,,P1,T,R/H,...,0.0,0.0,1.5e-05,0.0,0.0,,1.0,True,False,C
131,22,20991686,C,T,,,,P1,T,R/C,...,0.0,0.0,0.0,0.0,0.0,,0.0,False,False,C
161,2,165162582,T,C,,,,P1,C,N/S,...,,,,,,,1.0,False,False,T
162,X,154440078,G,C,,,,P1,C,D/H,...,,,,,,,0.99,False,False,G
163,1,15928377,C,T,,,,P1,T,R/W,...,0.0,0.0,1.5e-05,0.0,0.0,,1.0,False,False,C


In [59]:
ariel_rona_df

Unnamed: 0,orig_name,location,gene,strand,mutation,mol_conseq,clin_significance,phenotype_list,num_submit,last_date_eval,...,MIT,count_DNA_bystander_edits,count_DNA_pathogenic_bystander_edits,improv_new_codon,improv_editing_tech,SIFT_score,liver_expression,brain_expression,chr,pos
0,10|NM_000410.4(HFE):c.187C>G (p.His63Asp),chr6 : 26090951,HFE,+,C>G,missense variant,"Conflicting interpretations of pathogenicity,o...",Hemochromatosis type 1|Microvascular complicat...,29,"Sep 15,2021",...,,0,0,(CAT)GAT>GGT ; (His)Asp>Gly,A-to-G (1),0.24>0.25,no,no,chr6,26090951
1,100|NM_000355.4(TCN2):c.581-176A>T,chr22 : 30615125,TCN2,+,A>T,intron variant,Pathogenic,Transcolabamin II deficiency,1,"Jun 01,2009",...,,0,0,,,,no,no,chr22,30615125
2,10000|NM_001278116.2(L1CAM):c.2974C>T (p.Gln99...,chrX : 153864893,L1CAM,-,C>T,nonsense,Pathogenic,"Hydrocephalus,X-linked,with congenital idiopat...",2,"Jan 05,2017",...,67,0,0,not relevant,not relevant,not relevant,no,no,chrX,153864893
3,1000050|NM_001165963.4(SCN1A):c.655A>G (p.Arg2...,chr2 : 166052891,SCN1A,-,A>G,missense variant,Conflicting interpretations of pathogenicity,Early infantile epileptic encephalopathy with ...,2,"Jan 01,2021",...,,0,0,,,,no,yes,chr2,166052891
4,100009|NM_000709.4(BCKDHA):c.1312T>A (p.Tyr438...,chr19 : 41424582,BCKDHA,+,T>A,missense variant,Pathogenic/Likely pathogenic,Maple syrup urine disease type 1A|not provided...,14,"Oct 26,2020",...,,0,0,,,,no,no,chr19,41424582
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98508,99939|NM_133433.4(NIPBL):c.7301A>G (p.Asn2434Ser),chr5 : 37057223,NIPBL,+,A>G,missense variant,Likely pathogenic,Cornelia de Lange syndrome 1|not provided,2,"Aug 31,2016",...,,0,0,,,,no,no,chr5,37057223
98509,9994|NM_001278116.2(L1CAM):c.536T>G (p.Ile179Ser),chrX : 153870948,L1CAM,-,T>G,missense variant,Pathogenic,MASA syndrome,1,"Jul 01,1995",...,,0,0,,,,no,no,chrX,153870948
98510,99940|NM_133433.4(NIPBL):c.7327C>T (p.Gln2443Ter),chr5 : 37057249,NIPBL,+,C>T,nonsense,Pathogenic,Cornelia de Lange syndrome 1|Abnormality of br...,2,"Feb 08,2013",...,,0,0,,,,no,no,chr5,37057249
98511,9995|NM_001278116.2(L1CAM):c.1108G>A (p.Gly370...,chrX : 153869818,L1CAM,-,G>A,missense variant,Pathogenic,MASA syndrome|not provided|Spastic paraplegia|...,4,"Jul 07,2020",...,83,2,0,,,,no,no,chrX,153869818


In [65]:
ariel_rona_df[['chr', 'pos']] = ariel_rona_df['location'].str.split(':', expand=True)
ariel_rona_df['chr'] = ariel_rona_df['chr'].str.replace('chr', '', regex=False).replace(' ','')
ariel_rona_df['pos'] = ariel_rona_df['pos'].replace(' ','').astype(int)

pd.merge(ariel_rona_df, protein_coding_df, on=['chr', 'pos'])

Unnamed: 0,orig_name,location,gene,strand,mutation,mol_conseq,clin_significance,phenotype_list,num_submit,last_date_eval,...,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_REMAINING_AF,gnomADg_SAS_AF,miRNA,pLI_gene_value,is_ADAR_fixable,is_APOBEC_fixable,hg38


In [54]:
# merge clinvar_expanded_df with protein_coding_df on 'chr', 'pos', 'ref', 'alt'
merged_df = pd.merge(clinvar_expanded_df, protein_coding_df, on=['chr', 'pos', 'ref', 'alt'], how='inner')


In [66]:
merged_df

Unnamed: 0,chr,pos,ID,ref,alt,QUAL,FILTER,ALLELEID,CLNDISDB,CLNDN,...,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_REMAINING_AF,gnomADg_SAS_AF,miRNA,pLI_gene_value,is_ADAR_fixable,is_APOBEC_fixable,hg38
0,1,930274,1672121,C,T,.,.,1647430,MedGen:C3661900,not_provided,...,,,,,,,0.00,False,False,C
1,1,935823,1622526,C,T,.,.,1561373,MedGen:C3661900,not_provided,...,0.000000,0.0,0.000073,0.000000,0.000000,,0.00,False,False,C
2,1,939437,1024192,C,T,.,.,1003029,MedGen:CN169374|MedGen:C3661900,not_specified|not_provided,...,0.000116,0.0,0.000169,0.000000,0.001401,,0.00,True,False,C
3,1,972133,2397724,G,A,.,.,2386707,MedGen:CN169374,not_specified,...,,,,,,,0.00,True,False,G
4,1,1020337,2073398,C,T,.,.,2123486,"MONDO:MONDO:0014052,MedGen:C3808739,OMIM:61512...",Congenital_myasthenic_syndrome_8,...,0.000000,0.0,0.000000,0.000000,0.000000,,0.17,False,False,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9438,X,154462290,3352370,C,T,.,.,3511593,.,PLXNA3-related_disorder,...,0.000000,0.0,0.000056,0.000000,0.000000,,0.98,False,False,C
9439,X,154463439,3030710,G,A,.,.,3194469,.,PLXNA3-related_disorder,...,0.000000,0.0,0.000019,0.000000,0.000403,,0.98,True,False,G
9440,X,154535176,706112,C,G,.,.,694845,"MONDO:MONDO:0010480,MedGen:C2720289,OMIM:30090...","Anemia,_nonspherocytic_hemolytic,_due_to_G6PD_...",...,0.000000,0.0,0.000000,0.000658,0.004977,,0.97,False,False,C
9441,X,154652203,2661856,A,G,.,.,2821752,MedGen:C3661900,not_provided,...,0.000000,0.0,0.000038,0.001313,0.000000,,0.01,False,True,A


In [67]:
print(f"Merged DataFrame shape: {merged_df.shape}")
print(f"variants in ClinVar ratio: {len(merged_df) / len(protein_coding_df)}")

Merged DataFrame shape: (9443, 154)
variants in ClinVar ratio: 0.049431508857154825


In [68]:
print(merged_df['CLNSIG'].value_counts())


CLNSIG
Uncertain_significance                                3690
Likely_benign                                         2271
Conflicting_classifications_of_pathogenicity           977
Pathogenic                                             861
Pathogenic/Likely_pathogenic                           468
Likely_pathogenic                                      408
Benign                                                 374
Benign/Likely_benign                                   304
association                                             44
not_provided                                            29
risk_factor                                              7
no_classification_for_the_single_variant                 3
Conflicting_classifications_of_pathogenicity|other       1
Pathogenic|other                                         1
Benign/Likely_benign|other                               1
other                                                    1
Name: count, dtype: int64


In [69]:
# if 'CLNSIG' value contain 'Pathogenic' or 'pathogenic' but not 'pathogenicity' then set 'CLNSIG' to 'Pathogenic'
merged_df.loc[merged_df['CLNSIG'].str.contains('Pathogenic', case=False, na=False) & ~merged_df['CLNSIG'].str.contains('pathogenicity', case=False, na=False), 'CLNSIG'] = 'Pathogenic'
merged_df.loc[merged_df['CLNSIG'].str.contains('Benign', case=False, na=False), 'CLNSIG'] = 'Benign'
print(merged_df['CLNSIG'].value_counts())


CLNSIG
Uncertain_significance                                3690
Benign                                                2950
Pathogenic                                            1738
Conflicting_classifications_of_pathogenicity           977
association                                             44
not_provided                                            29
risk_factor                                              7
no_classification_for_the_single_variant                 3
Conflicting_classifications_of_pathogenicity|other       1
other                                                    1
Name: count, dtype: int64


In [70]:
# where 'CLNSIG' is "Conflicting_classifications_of_pathogenicity" and 'CLNSIGCONF' contains 'Pathogenic' or 'pathogenic' it should change 'CLNSIG' to 'Pathogenic'
merged_df.loc[merged_df['CLNSIG'] == 'Conflicting_classifications_of_pathogenicity', 'CLNSIG'] = merged_df['CLNSIGCONF'].apply(lambda x: 'Pathogenic' if pd.notna(x) and ('Pathogenic' in x or 'pathogenic' in x) else 'Conflicting_classifications_of_pathogenicity')
print(merged_df['CLNSIG'].value_counts())
print(f"Number of 'patogenic' variants added: {len(merged_df[merged_df['CLNSIG'] == 'Pathogenic']) - 1738}")


CLNSIG
Uncertain_significance                                3690
Benign                                                2950
Pathogenic                                            1996
Conflicting_classifications_of_pathogenicity           719
association                                             44
not_provided                                            29
risk_factor                                              7
no_classification_for_the_single_variant                 3
Conflicting_classifications_of_pathogenicity|other       1
other                                                    1
Name: count, dtype: int64
Number of 'patogenic' variants added: 258


In [71]:
# where 'CLNSIG' is not one of "Uncertain_significance", "Benign", "Pathogenic" it should be "Unclassified"
merged_df.loc[~merged_df['CLNSIG'].isin(['Uncertain_significance', 'Benign', 'Pathogenic']), 'CLNSIG'] = 'Unclassified'
print(merged_df['CLNSIG'].value_counts())

CLNSIG
Uncertain_significance    3690
Benign                    2950
Pathogenic                1996
Unclassified               807
Name: count, dtype: int64


In [72]:
pathogenic_df = merged_df[merged_df['CLNSIG'] == 'Pathogenic']
print(f"Pathogenic variants shape: {pathogenic_df.shape}")

Pathogenic variants shape: (1996, 154)


In [73]:
pathogenic_df['Consequence'].value_counts()

Consequence
stop_gained                                                                 834
missense_variant                                                            659
splice_donor_variant                                                        199
splice_acceptor_variant                                                     143
downstream_gene_variant                                                      39
stop_gained,splice_region_variant                                            32
missense_variant,splice_region_variant                                       29
upstream_gene_variant                                                        20
intron_variant                                                               10
splice_donor_5th_base_variant,intron_variant                                  7
splice_polypyrimidine_tract_variant,intron_variant                            5
start_lost                                                                    4
splice_donor_region_variant,

In [74]:
pathogenic_df.loc[pathogenic_df['Consequence'].str.contains(',', case=False, na=False), 'new_Consequence'] = 'Ambiguous'
pathogenic_df.loc[pathogenic_df['Consequence'].str.contains('splice', case=False, na=False), 'new_Consequence'] = 'Splice'
# if it contains "missense" or "stop_gained" or "stop_lost" or "start_lost" or "synonymous" it should be coding
pathogenic_df.loc[pathogenic_df['Consequence'].str.contains('missense|stop_gained|stop_lost|start_lost|synonymous', case=False, na=False), 'new_Consequence'] = 'Coding'
pathogenic_df.loc[pathogenic_df['Consequence'].str.contains('utr', case=False, na=False), 'new_Consequence'] = 'UTR'

print(pathogenic_df['new_Consequence'].value_counts())
print(f"Pathogenic variants shape after Consequence processing: {pathogenic_df.shape}")

new_Consequence
Coding    1564
Splice     358
UTR          5
Name: count, dtype: int64
Pathogenic variants shape after Consequence processing: (1996, 155)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pathogenic_df.loc[pathogenic_df['Consequence'].str.contains(',', case=False, na=False), 'new_Consequence'] = 'Ambiguous'


In [75]:
coding_df = pathogenic_df[pathogenic_df['Consequence'] == 'Coding']
print(f"coding_df shape: {coding_df.shape}")

coding_df shape: (0, 155)


In [77]:
import numpy as np
# Assume 'pathogenic_df' is your DataFrame and 'Consequence' column exists

conditions = [
    pathogenic_df['Consequence'].str.contains(',', case=False, na=False),
    pathogenic_df['Consequence'].str.contains('splice', case=False, na=False),
    pathogenic_df['Consequence'].str.contains('missense|stop_gained|stop_lost|start_lost|synonymous', case=False, na=False),
    pathogenic_df['Consequence'].str.contains('utr', case=False, na=False)
]

choices = ['Ambiguous', 'Splice', 'Coding', 'UTR']

pathogenic_df['new_Consequence'] = np.select(conditions, choices, default='Other')

# Print the result like in R
print("new_Consequence value counts:")
print(pathogenic_df['new_Consequence'].value_counts(dropna=False))

print(f"\nPathogenic variants shape after Consequence processing: {pathogenic_df.shape}")

new_Consequence value counts:
new_Consequence
Coding       1500
Splice        342
Ambiguous      80
Other          69
UTR             5
Name: count, dtype: int64

Pathogenic variants shape after Consequence processing: (1996, 155)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pathogenic_df['new_Consequence'] = np.select(conditions, choices, default='Other')


In [79]:
pathogenic_df_coding_splice = pathogenic_df[pathogenic_df['new_Consequence'].isin(['Coding', 'Splice'])]
print(f"pathogenic_df_coding_splice shape: {pathogenic_df_coding_splice.shape}")

pathogenic_df_coding_splice shape: (1842, 155)


In [81]:
sfari_genes = "/home/alu/davidg/Scripts/Nave_Oded_autism_project/SFARI-Gene_genes_07-08-2025release_07-17-2025export.csv"
sfari_genes_df = pd.read_csv(sfari_genes, sep=',', low_memory=False)
print(f"sfari_genes_df shape: {sfari_genes_df.shape}")

sfari_genes_df shape: (1238, 10)


In [92]:
pathogenic_df_coding_splice.shape

(1842, 155)

In [93]:
pathogenic_df_coding_splice_sfari = pathogenic_df_coding_splice.merge(sfari_genes_df, left_on='SYMBOL', right_on='gene-symbol', how='inner')


In [96]:
pathogenic_df_coding_splice_sfari[pathogenic_df_coding_splice_sfari['new_Consequence'].isin(['Coding'])].shape
pathogenic_df_coding_splice_sfari[pathogenic_df_coding_splice_sfari['new_Consequence'].isin(['Splice'])].shape

(132, 165)

In [100]:
pathogenic_df_coding_splice_sfari

Unnamed: 0,chr,pos,ID,ref,alt,QUAL,FILTER,ALLELEID,CLNDISDB,CLNDN,...,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports
0,1,8361229,996676,G,A,.,.,984291,"MONDO:MONDO:0005258,MeSH:D000067877,MedGen:C15...",Autism_spectrum_disorder,...,9,RERE,Arginine-glutamic acid dipeptide (RE) repeats,ENSG00000142599,1,"Rare Single Gene Mutation, Syndromic, Genetic ...",1.0,1,6.50,20
1,1,11129789,224083,A,C,.,.,225813,"MONDO:MONDO:0014716,MedGen:C4225259,OMIM:61663...",Macrocephaly-intellectual_disability-neurodeve...,...,9,MTOR,mechanistic target of rapamycin kinase,ENSG00000198793,1,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,,36
2,1,15873105,2391426,C,T,.,.,2384383,"MeSH:D030342,MedGen:C0950123",Inborn_genetic_diseases,...,9,SPEN,spenfamily transcriptional repressor,ENSG00000065526,1,"Rare Single Gene Mutation, Syndromic",2.0,0,,19
3,1,15931632,992608,C,T,.,.,980533,MedGen:C3661900,not_provided,...,9,SPEN,spenfamily transcriptional repressor,ENSG00000065526,1,"Rare Single Gene Mutation, Syndromic",2.0,0,,19
4,1,15933472,2429939,C,A,.,.,2403442,"MedGen:C3661900|MONDO:MONDO:0005258,MeSH:D0000...",not_provided|Autism_spectrum_disorder,...,9,SPEN,spenfamily transcriptional repressor,ENSG00000065526,1,"Rare Single Gene Mutation, Syndromic",2.0,0,,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,X,154031431,11809,G,A,.,.,26848,"MedGen:C3661900|MeSH:D030342,MedGen:C0950123|M...",not_provided|Inborn_genetic_diseases|Rett_synd...,...,9,MECP2,Methyl CpG binding protein 2,ENSG00000169057,X,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,106.65,146
995,X,154032268,11814,G,A,.,.,26853,"MONDO:MONDO:0010342,MedGen:C1845336,OMIM:30049...","Autism,_susceptibility_to,_X-linked_3|Rett_syn...",...,9,MECP2,Methyl CpG binding protein 2,ENSG00000169057,X,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,106.65,146
996,X,154032268,11814,G,A,.,.,26853,"MONDO:MONDO:0010342,MedGen:C1845336,OMIM:30049...","Autism,_susceptibility_to,_X-linked_3|Rett_syn...",...,9,MECP2,Methyl CpG binding protein 2,ENSG00000169057,X,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,106.65,146
997,X,154032405,1066884,C,T,.,.,1056743,"MONDO:MONDO:0010397,MedGen:C1968556,OMIM:30067...",Severe_neonatal-onset_encephalopathy_with_micr...,...,9,MECP2,Methyl CpG binding protein 2,ENSG00000169057,X,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,106.65,146


In [102]:
pathogenic_df_coding_splice_sfari[["gene-score","syndromic"]].value_counts()
pathogenic_df_coding_splice_sfari["gene-score"].value_counts()

gene-score
1.0    750
2.0    154
3.0     43
Name: count, dtype: int64

In [103]:
n_coding = pathogenic_df_coding_splice_sfari[pathogenic_df_coding_splice_sfari['new_Consequence'] == 'Coding'].shape[0]
n_splice = pathogenic_df_coding_splice_sfari[pathogenic_df_coding_splice_sfari['new_Consequence'] == 'Splice'].shape[0]

print(f"🧬 SFARI Coding variants: {n_coding}")
print(f"🧬 SFARI Splice variants: {n_splice}")


🧬 SFARI Coding variants: 867
🧬 SFARI Splice variants: 132


In [104]:
total_sfari = pathogenic_df_coding_splice_sfari.shape[0]
print(f"📊 % Coding: {n_coding / total_sfari:.1%}")
print(f"📊 % Splice: {n_splice / total_sfari:.1%}")


📊 % Coding: 86.8%
📊 % Splice: 13.2%


In [105]:
combo_counts = pathogenic_df_coding_splice_sfari[["gene-score", "syndromic"]].value_counts().reset_index()
combo_counts.columns = ['gene-score', 'syndromic', 'count']
combo_counts = combo_counts.sort_values(by='count', ascending=False)

print("📊 Variant counts by gene-score & syndromic:")
print(combo_counts.to_string(index=False))


📊 Variant counts by gene-score & syndromic:
 gene-score  syndromic  count
        1.0          1    521
        1.0          0    229
        2.0          0    118
        3.0          0     37
        2.0          1     36
        3.0          1      6


In [106]:
# Split into coding and splice
df_coding = pathogenic_df_coding_splice_sfari[pathogenic_df_coding_splice_sfari['new_Consequence'] == 'Coding']
df_splice = pathogenic_df_coding_splice_sfari[pathogenic_df_coding_splice_sfari['new_Consequence'] == 'Splice']

# Count combo of gene-score and syndromic for each
coding_counts = df_coding[["gene-score", "syndromic"]].value_counts().reset_index()
coding_counts.columns = ['gene-score', 'syndromic', 'count']
coding_counts = coding_counts.sort_values(by='count', ascending=False)

splice_counts = df_splice[["gene-score", "syndromic"]].value_counts().reset_index()
splice_counts.columns = ['gene-score', 'syndromic', 'count']
splice_counts = splice_counts.sort_values(by='count', ascending=False)

# Print results
print("🧬 Coding Variants by gene-score & syndromic:")
print(coding_counts.to_string(index=False))

print("\n🧬 Splice Variants by gene-score & syndromic:")
print(splice_counts.to_string(index=False))


🧬 Coding Variants by gene-score & syndromic:
 gene-score  syndromic  count
        1.0          1    456
        1.0          0    203
        2.0          0     92
        3.0          0     34
        2.0          1     30
        3.0          1      6

🧬 Splice Variants by gene-score & syndromic:
 gene-score  syndromic  count
        1.0          1     65
        1.0          0     26
        2.0          0     26
        2.0          1      6
        3.0          0      3


In [107]:
pathogenic_df_coding_splice_sfari.to_csv('/home/alu/davidg/Scripts/Nave_Oded_autism_project/pathogenic_df_coding_splice_sfari.csv', index=False)

In [110]:
pathogenic_df_coding_splice_sfari

Unnamed: 0,chr,pos,ID,ref,alt,QUAL,FILTER,ALLELEID,CLNDISDB,CLNDN,...,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports
0,1,8361229,996676,G,A,.,.,984291,"MONDO:MONDO:0005258,MeSH:D000067877,MedGen:C15...",Autism_spectrum_disorder,...,9,RERE,Arginine-glutamic acid dipeptide (RE) repeats,ENSG00000142599,1,"Rare Single Gene Mutation, Syndromic, Genetic ...",1.0,1,6.50,20
1,1,11129789,224083,A,C,.,.,225813,"MONDO:MONDO:0014716,MedGen:C4225259,OMIM:61663...",Macrocephaly-intellectual_disability-neurodeve...,...,9,MTOR,mechanistic target of rapamycin kinase,ENSG00000198793,1,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,,36
2,1,15873105,2391426,C,T,.,.,2384383,"MeSH:D030342,MedGen:C0950123",Inborn_genetic_diseases,...,9,SPEN,spenfamily transcriptional repressor,ENSG00000065526,1,"Rare Single Gene Mutation, Syndromic",2.0,0,,19
3,1,15931632,992608,C,T,.,.,980533,MedGen:C3661900,not_provided,...,9,SPEN,spenfamily transcriptional repressor,ENSG00000065526,1,"Rare Single Gene Mutation, Syndromic",2.0,0,,19
4,1,15933472,2429939,C,A,.,.,2403442,"MedGen:C3661900|MONDO:MONDO:0005258,MeSH:D0000...",not_provided|Autism_spectrum_disorder,...,9,SPEN,spenfamily transcriptional repressor,ENSG00000065526,1,"Rare Single Gene Mutation, Syndromic",2.0,0,,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,X,154031431,11809,G,A,.,.,26848,"MedGen:C3661900|MeSH:D030342,MedGen:C0950123|M...",not_provided|Inborn_genetic_diseases|Rett_synd...,...,9,MECP2,Methyl CpG binding protein 2,ENSG00000169057,X,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,106.65,146
995,X,154032268,11814,G,A,.,.,26853,"MONDO:MONDO:0010342,MedGen:C1845336,OMIM:30049...","Autism,_susceptibility_to,_X-linked_3|Rett_syn...",...,9,MECP2,Methyl CpG binding protein 2,ENSG00000169057,X,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,106.65,146
996,X,154032268,11814,G,A,.,.,26853,"MONDO:MONDO:0010342,MedGen:C1845336,OMIM:30049...","Autism,_susceptibility_to,_X-linked_3|Rett_syn...",...,9,MECP2,Methyl CpG binding protein 2,ENSG00000169057,X,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,106.65,146
997,X,154032405,1066884,C,T,.,.,1056743,"MONDO:MONDO:0010397,MedGen:C1968556,OMIM:30067...",Severe_neonatal-onset_encephalopathy_with_micr...,...,9,MECP2,Methyl CpG binding protein 2,ENSG00000169057,X,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,106.65,146


In [112]:
# #select important columns
# pathogenic_df_coding_splice_sfari_selected = pathogenic_df_coding_splice_sfari[['chr', 'pos', 'ref', 'alt',"STRAND","Allele","Codons", "EXON","CDS_position","Protein_position","Amino_acids","BLOSUM62", 'SYMBOL', "SWISSPROT",'gene-score', 'syndromic', 'CLNSIG', "Consequence",'new_Consequence', "Existing_variation","MANE_SELECT", "PhastCons100", "PolyPhen", "SIFT", "is_ADAR_fixable", "is_APOBEC_fixable", "genetic-category", "gene-score", "syndromic", "number-of-reports"]]
# pathogenic_df_coding_splice_sfari_selected.head()

selected_columns = [
    # Genomic location
    'chr', 'pos', 'ref', 'alt', 'STRAND', 'Allele',

    # Consequence and position
    'Consequence', 'new_Consequence',
    'Codons', 'EXON', 'CDS_position', 'Protein_position', 'Amino_acids',
    'BLOSUM62', 'PhastCons100',

    # Annotation and metadata
    'SYMBOL', 'SWISSPROT', 'Existing_variation', 'MANE_SELECT',
    'CLNSIG', 'PolyPhen', 'SIFT',

    # Editing predictions
    'is_ADAR_fixable', 'is_APOBEC_fixable',

    # SFARI data
    'genetic-category', 'gene-score', 'syndromic', 'number-of-reports'
]

# Select and preview
pathogenic_df_coding_splice_sfari_selected = pathogenic_df_coding_splice_sfari[selected_columns]
pathogenic_df_coding_splice_sfari_selected.head()

pathogenic_df_coding_splice_sfari_selected.to_csv('/home/alu/davidg/Scripts/Nave_Oded_autism_project/pathogenic_df_coding_splice_sfari_selected.csv', index=False)
