##### Import package

In [92]:
import re
import pandas as pd
from Bio.Seq import Seq
from liftover import get_lifter
# from concurrent.futures import ThreadPoolExecutor

In [115]:
all_mut_default_colnames: list = [
    "disase", "gene", "chrom", "genename", "gdbid", "omimid", "amino", 
    "deletion", "insertion", "codon", "codonAff", "descr", "refseq", "hgvs", 
    "hgvsAll", "dbsnp", "chromosome", "startCoord", "endCoord", 
    "expected_inheritance", "gnomad_AC", "gnomad_AF", "gnomad_AN", "tag", 
    "dmsupport", "rankscore", "mutype", "author", "title", "fullname", 
    "allname", "vol", "page", "year", "pmid", "pmidAll", "reftag", "comments", 
    "acc_num", "new_date", "base", "clinvarID", "clinvar_clnsig"
]
allmut: pd.DataFrame = pd.read_csv(
    'allmut.csv', sep=';', encoding='cp1252', names=all_mut_default_colnames, 
    skiprows=1,low_memory=False)

allmut = allmut[
    ["gene", "genename", "mutype", "clinvar_clnsig", "tag",
     "refseq", "hgvs", "hgvsAll", "chromosome", "startCoord", "endCoord", 
     "amino", "deletion", "insertion", "expected_inheritance", "gnomad_AF"]]

# Drop non-numeric values in 'startCoord'
allmut = allmut.dropna(subset=['startCoord'])

# Drop duplicates in 'chrom', 'startCoord', and 'endCoord'
allmut = allmut.drop_duplicates(subset=['chromosome', 'startCoord', 'endCoord'])

# Extract tag == "DM" from allmut
allmut_dm = allmut[allmut.tag == "DM"]
print(f"A total of {len(allmut_dm)} DM mutations are found in allmut.")

A total of 253018 DM mutations are found in allmut.


In [116]:
# Fillna with empty string in "gnomad_AF" colmun in allmut_dm
# Extratct MAF 0 from allmut_dm
allmut_dm['gnomad_AF'].fillna(0, inplace=True)
allmut_dm_maf0 = allmut_dm[allmut_dm['gnomad_AF'] == 0]
print(f"A total of {len(allmut_dm_maf0)} DM mutations are found in allmut with MAF 0.")

# Extract non-deletion or non-insertion from allmut_dm
allmut_dm_maf0_snv = allmut_dm_maf0[(allmut_dm_maf0['deletion'].isnull()) & (allmut_dm['insertion'].isnull())].copy()
print(f"A total of {len(allmut_dm_maf0_snv)} DM mutations are found in allmut with MAF 0 and non-deletion or non-insertion.")

# Extract the mutation type from the mutype column
splice_mutations = allmut_dm[allmut_dm["mutype"].str.contains("splice")]
non_splice_mutations = allmut_dm[~allmut_dm["mutype"].str.contains("splice")]
print(len(splice_mutations), len(non_splice_mutations))

A total of 231981 DM mutations are found in allmut with MAF 0.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  allmut_dm['gnomad_AF'].fillna(0, inplace=True)
  allmut_dm_maf0_snv = allmut_dm_maf0[(allmut_dm_maf0['deletion'].isnull()) & (allmut_dm['insertion'].isnull())].copy()


A total of 154107 DM mutations are found in allmut with MAF 0 and non-deletion or non-insertion.
23777 229241


In [118]:
splice_mutations

Unnamed: 0,gene,genename,mutype,clinvar_clnsig,tag,refseq,hgvs,hgvsAll,chromosome,startCoord,endCoord,amino,deletion,insertion,expected_inheritance,gnomad_AF
96651,AP3B1,Adaptor related protein complex 3 subunit beta 1,canonical-splice,,DM,NM_003664.5,1168-1G>C,1168minus1GtoC,5,78165673.0,78165673.0,,,,AR,0.0
126648,NF1,Neurofibromin 1,exonic-splice,Pathogenic/Likely_pathogenic,DM,NM_000267.3,1748A>G,1748AtoG | K583R,17,31223470.0,31223470.0,,,,AD,0.0
129700,TSC2,TSC complex subunit 2,exonic-splice,Pathogenic,DM,NM_000548.5,1255C>T,1255CtoT | P419S,16,2062006.0,2062006.0,,,,AD,0.0
130620,COL4A5,Collagen type IV alpha 5 chain,exonic-splice,Likely_pathogenic,DM,NM_000495.5,1856C>T,1856CtoT | P619L,X,108598778.0,108598778.0,,,,XLD,0.0
134411,DMD,Dystrophin,exonic-splice,Pathogenic,DM,NM_004006.3,10279C>T,10279CtoT | Q3427*,X,31173588.0,31173588.0,,,,XLR,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510785,PPT1,Palmitoyl-protein thioesterase 1,canonical-splice,Likely_pathogenic,DM,NM_000310.4,536+1G>A,536plus1GtoA,1,40089409.0,40089409.0,,,,AR,0.0
510786,APC,APC regulator of WNT signaling pathway,splice,Likely_pathogenic,DM,NM_000038.6,1409-6A>G,1409minus6AtoG,5,112827102.0,112827102.0,,,,AD,0.0
510789,UROD,Uroporphyrinogen decarboxylase,canonical-splice,,DM,NM_000374.5,876-1G>C,876minus1GtoC,1,45014939.0,45014939.0,,,,ADAR,0.0
510791,PTS,6-pyruvoyltetrahydropterin synthase,canonical-splice,Pathogenic,DM,NM_000317.3,186+1G>A,186plus1GtoA,11,112230231.0,112230231.0,,,,AR,0.0


In [81]:
print(allmut_dm['mutype'].unique())
print(allmut_dm['gnomad_AF'].unique())

['missense' 'initiation' 'nonsense' 'synonymous' 'gross'
 'canonical-splice' 'exonic-splice' 'nonstop' 'regulatory' 'splice']
[0.]


In [117]:
allmut_dm[allmut_dm.mutype == 'splice']

Unnamed: 0,gene,genename,mutype,clinvar_clnsig,tag,refseq,hgvs,hgvsAll,chromosome,startCoord,endCoord,amino,deletion,insertion,expected_inheritance,gnomad_AF
462252,NF1,Neurofibromin 1,splice,Pathogenic/Likely_pathogenic,DM,NM_000267.3,2410-16A>G,2410minus16AtoG,17,31229009.0,31229009.0,,,,AD,0.000000
462258,NF1,Neurofibromin 1,splice,,DM,NM_000267.3,1393-9T>A,1393minus9TtoA,17,31214442.0,31214442.0,,,,AD,0.000000
462270,NTRK1,Neurotrophic receptor tyrosine kinase 1,splice,Pathogenic,DM,NM_001012331.2,851-33T>A,851minus33TtoA,1,156873600.0,156873600.0,,,,AR,0.000000
462274,SLC25A20,Solute carrier family 25 member 20,splice,Pathogenic,DM,NM_000387.6,199-10T>G,199minus10TtoG,3,48884134.0,48884134.0,,,,AR,0.000065
462325,MAPT,Microtubule associated protein tau,splice,not_provided,DM,NM_005910.6,915+12C>T,915plus12CtoT,17,46010414.0,46010414.0,,,,AD,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510767,SERPING1,Serpin family G member 1,splice,,DM,NM_000062.3,1250-13G>A,1250minus13GtoA,11,57614315.0,57614315.0,,,,AD,0.000000
510773,PYGM,"Glycogen phosphorylase, muscle associated",splice,,DM,NM_005609.4,529-8G>A,529minus8GtoA,11,64757918.0,64757918.0,,,,AR,0.000000
510783,MYBPC3,"Myosin binding protein C, cardiac",splice,Pathogenic,DM,NM_000256.3,927-9G>A,927minus9GtoA,11,47346379.0,47346379.0,,,,AD,0.000032
510786,APC,APC regulator of WNT signaling pathway,splice,Likely_pathogenic,DM,NM_000038.6,1409-6A>G,1409minus6AtoG,5,112827102.0,112827102.0,,,,AD,0.000000


In [45]:
def remove_dot_ver(x):
    if x == '.':
        pass
    else:
        return re.match(r'[a-zA-Z_]+\d+', x).group()

Unnamed: 0,gene,chrom,genename,mutype,clinvarID,clinvar_clnsig,tag,refseq,hgvs,hgvsAll,chromosome,startCoord,endCoord,amino,deletion,insertion,expected_inheritance,gnomad_AF


In [34]:
len(allmut)

510804

In [2]:
def _liftover_to_hg19(chrom, pos):
    converter = get_lifter('hg38', 'hg19')
    result = converter.query(chrom, pos)
    if result:
        return result[0]
    else:
        return None
    
def anno_vars_id(row):
    variant_id = f'{row["CHROM"]}:{row["POS_hg19"]}-{row["REF"]}-{row["ALT"]}'
    return variant_id

def anno_hg19_pos(row):
    converted = _liftover_to_hg19(row['CHROM'], row['POS_hg38'])
    return converted[1]

def remove_dot_ver(x):
    if x == '.':
        pass
    else:
        return re.match(r'[a-zA-Z_]+\d+', x).group()
    
def remove_non_canon(x):
    if x in enst_set:
        return True
    else:
        pass

def create_refalt(row, nt):
    if row['Strand'] == '+':
        return row[nt]
    elif row['Strand'] == '-':
        return str(Seq(row[nt]).complement())
    else:
        pass

In [3]:
canonlist = '/Github/Projects/DeNovo/data/CanonicalTranscripts_COMP/CanonicalTranscripts.exoncount.tsv'
refseq = '/work/CanonicalTrasncripts/gencode.v43lift37.metadata.RefSeq.gz'

df_canon = pd.read_table(canonlist, header=0)
df_canon = df_canon[(df_canon['ENST'] != 'ENST00000649912')
                    & (df_canon['ENST'] != 'ENST00000609375')]
df_enst = df_canon.drop_duplicates(subset='ENST')
enst_set = set(df_enst['ENST'])
df_refseq = pd.read_table(refseq, header=None, 
                          names=['ENST_refseq', 'RefSeq_RNA', 'RefSeq_Pro'])
df_refseq.fillna(value='.', inplace=True)

df_refseq['ENST_refseq'] = df_refseq['ENST_refseq'].apply(remove_dot_ver)
df_refseq['RefSeq_RNA'] = df_refseq['RefSeq_RNA'].apply(remove_dot_ver)
df_refseq['RefSeq_Pro'] = df_refseq['RefSeq_Pro'].apply(remove_dot_ver)

df_refseq['is_Canonical'] =  df_refseq['ENST_refseq'].apply(remove_non_canon)
df_refseq_canon = df_refseq[df_refseq['is_Canonical'] == True].copy()

##### Generate VCF

In [4]:
# Loading HGMD splicing variants list (tsv)
hgmd_file = '/Github/MyProjects/DeNovo/data/ValidationData/Positive/allmut.trim.colfixed.maf0.tsv.txt'
df = pd.read_table(hgmd_file, header=0)

# Pre-processing
df = df.dropna(subset=['startCoord'])
df['startCoord'] = df['startCoord'].astype(int)
df['refseq'] = df['refseq'].apply(remove_dot_ver)

# Extract REF and ALT from HGVS descriptions
sr_alt_nt = df['hgvs'].str[-1:].rename('alt_nt')
sr_ref_nt = df['hgvs'].str[-3:-2].rename('ref_nt')
df = pd.concat([df, sr_ref_nt, sr_alt_nt], axis=1)

# Insert cols as VCF
df.loc[:,'ID'] = '.'
df.loc[:,'QUAL'] = '.'
df.loc[:,'FILTER'] = '.'
df.loc[:,'INFO'] = '.'

# Annotate ENST IDs
df = pd.merge(df, df_enst, how='left', 
              left_on='gene', right_on='GeneSymbol')

# Rename cols for downstream processing
df = df.rename(columns={'CHROM': 'Chr',
                        'chromosome': 'CHROM', 
                        'startCoord': 'POS_hg38'})

# Create REF and ALT columns
df['REF'] = df.apply(create_refalt, nt='ref_nt', axis=1)
df['ALT'] = df.apply(create_refalt, nt='alt_nt', axis=1)

In [5]:
# Liftover to hg19
df['POS_hg19'] = df.apply(anno_hg19_pos, axis=1)

In [None]:
df2 = df.copy()
df2 = df2.astype({'POS_hg19': str})

# Annotate variant IDs
df2['variant_id'] = df2.apply(anno_vars_id, axis=1)

# Extract columns for VCF
df_19 = df2[['CHROM', 'POS_hg19', 
             'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']]
df_38 = df2[['CHROM', 'POS_hg38', 
             'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']]

In [265]:
df_19 = df_19.dropna(subset=['REF'])
df_38 = df_38.dropna(subset=['REF'])

In [266]:
# Output as VCF
df_19.to_csv('./patho.hg19.vcf', sep='\t', index=False, header=False)
df_38.to_csv('./patho.hg38.vcf', sep='\t', index=False, header=False)

##### Concatenate header and variant list

In [267]:
!cat ../header_for_VCF.tsv ./patho.hg19.vcf > patho.hg19.header.vcf
!cat ../header_for_VCF.tsv ./patho.hg38.vcf > patho.hg38.header.vcf

In [263]:
df[['Strand', 'ref_nt', 'REF', 'alt_nt', 'ALT']]

Unnamed: 0,Strand,ref_nt,REF,alt_nt,ALT
0,+,A,A,G,G
1,-,T,A,G,C
2,-,G,C,A,T
3,+,A,A,G,G
4,-,A,T,G,C
...,...,...,...,...,...
10553,,G,,A,
10554,+,G,G,A,A
10555,-,G,C,A,T
10556,-,G,C,A,T


In [277]:
df2 =df2[['gene', 'gdbid', 'omimid', 'amino', 'codon', 'codonAff', 'descr',
       'refseq', 'hgvs', 'CHROM', 'POS_hg38', 'endCoord', 
       'expected_inheritance', 'dmsupport', 'mutype', 'acc_num', 'new_date', 
       'clinvarID', 'clinvar_clnsig', 'Chr', 'Start', 'End', 'Strand', 
       'GeneSymbol', 'HGNC_ID', 'ENSG', 'ENST', 'GeneType', 'Tag', 
       'REF', 'ALT', 'POS_hg19', 'variant_id']]

In [278]:
df2.to_csv('./patho2.tsv', sep='\t', index=False)