##### Import package

In [6]:
import os
import re
import numpy as np
import pandas as pd
from Bio.Seq import Seq
from liftover import get_lifter
from pathlib2 import Path
from pandarallel import pandarallel
from tqdm import tqdm
import gffutils
import pysam
from cyvcf2 import VCF

### Logging setup
from logging import getLogger, config
import yaml
# NOTE(review): '__file__' below is a quoted string literal, not the __file__
# variable, so both dirname() calls return '' and config_path is resolved
# relative to the current working directory. Confirm this is intended.
parent_directory = os.path.dirname(os.path.dirname('__file__'))
config_path: str = os.path.join(parent_directory, '../../../config/logging.yaml')
with open(config_path, 'r') as f:
    config.dictConfig(yaml.safe_load(f))
logger = getLogger(__name__)

########   Initialize and setup pandas methods   ########
# pandarallel provides DataFrame.parallel_apply (3 workers here);
# tqdm.pandas() provides DataFrame.progress_apply.
os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp' 
pandarallel.initialize(nb_workers=3, progress_bar=False, verbose=0, use_memory_fs=False) 
tqdm.pandas()

import sys
# Extend sys.path before importing `libs` below (presumably the project
# package lives three directories up -- verify against the repository layout).
# The bare `__file__` expression only probes whether the name exists: where it
# is undefined it raises NameError and the except branch appends
# <cwd>/../../../ instead.
# NOTE(review): in the try branch dirname('__file__') is again taken from a
# string literal, so '' is appended; in the except branch the first
# Path().resolve() result is discarded.
try: 
    __file__
    sys.path.append(os.path.join(os.path.dirname('__file__')))
except NameError:
    Path().resolve()
    sys.path.append(os.path.join(Path().resolve(), '../../../'))

from libs import utils, preprocess, variantfilter, posparser, splaiparser
# from libs import predeffect, scoring
from libs import anno_spliceai, anno_clinvar
from libs.deco import print_filtering_count

# Open the GENCODE annotation databases (full annotation and intron-only).
# The repository-relative paths are tried first; on ValueError the absolute
# /resources/DBs paths are used instead.
# NOTE(review): presumably gffutils raises ValueError for a missing database
# file -- confirm that this is the failure the fallback is meant to catch.
try:
    db_anno_gencode = '../../../Resources/06_gffutilsdb/gencode.v43lift37.annotation.gtf.db'
    db_anno_intron = '../../../Resources/06_gffutilsdb/gencode.v43lift37.annotation.intron.gtf.db'
    db = gffutils.FeatureDB(db_anno_gencode)
    db_intron = gffutils.FeatureDB(db_anno_intron)
except ValueError:
    db_anno_gencode = '/resources/DBs/gencode.v43lift37.annotation.gtf.db'
    db_anno_intron = '/resources/DBs/gencode.v43lift37.annotation.intron.gtf.db'
    db = gffutils.FeatureDB(db_anno_gencode)
    db_intron = gffutils.FeatureDB(db_anno_intron)

## Thresholds configuration
# Handed to the splaiparser prediction functions through their `thresholds=`
# keyword. The meaning of each key is defined by splaiparser, not here.
thresholds_SpliceAI_parser: dict = {
    'TH_min_sALDL': 0.02, 'TH_max_sALDL': 0.2, 
    'TH_min_sAGDG': 0.01, 'TH_max_sAGDG': 0.05,
    'TH_min_GExon': 25, 'TH_max_GExon': 500,
    'TH_sAG': 0.2, 'TH_sDG': 0.2
    }

In [2]:
## Parse VCF to simple input table

raw_vcf = 'splai_vep_vcfs/hgmd_dm/all_DM_chr1.splai.vep.vcf'

vcf = VCF(raw_vcf)
header = vcf.header_iter()

# Reset both lists before scanning: if this cell is re-run on a VCF whose
# header lacks one of the annotations, field names left in the kernel by an
# earlier run must not be reused silently.
vep_cols_list = None
splai_cols_list = None
for h in header:
    try:
        header_id = h['ID']
    except KeyError:
        # Header records without an 'ID' key carry no INFO definition.
        continue
    if header_id not in ('CSQ', 'SpliceAI'):
        continue
    # The sub-field names are listed after "Format: " in the description,
    # separated by '|'; a trailing double quote is stripped.
    field_names = h['Description'].split('Format: ')[1].rstrip('"').split('|')
    if header_id == 'CSQ':
        vep_cols_list = field_names
    else:
        splai_cols_list = field_names

if vep_cols_list is None or splai_cols_list is None:
    raise ValueError(
        f"{raw_vcf}: the header must define both the CSQ and the SpliceAI INFO fields")

# Field name -> position inside one '|'-separated annotation entry
vepidx: dict = {col: i for i, col in enumerate(vep_cols_list)}
splaidx: dict = {col: i for i, col in enumerate(splai_cols_list)}

# Column layout of the per-variant table built from the VCF records
cols = [
    'CHROM', 'POS', 'REF', 'ALT', 'GeneSymbol', 'SymbolSource', 'HGNC_ID', 
    'ENST', 'HGVSc', 'Consequence', 'EXON', 'INTRON', 'Strand',
    'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL', 'MaxSpliceAI'
]

print(vepidx)

{'Allele': 0, 'Consequence': 1, 'IMPACT': 2, 'SYMBOL': 3, 'Gene': 4, 'Feature_type': 5, 'Feature': 6, 'BIOTYPE': 7, 'EXON': 8, 'INTRON': 9, 'HGVSc': 10, 'HGVSp': 11, 'cDNA_position': 12, 'CDS_position': 13, 'Protein_position': 14, 'Amino_acids': 15, 'Codons': 16, 'Existing_variation': 17, 'DISTANCE': 18, 'STRAND': 19, 'FLAGS': 20, 'VARIANT_CLASS': 21, 'SYMBOL_SOURCE': 22, 'HGNC_ID': 23, 'CANONICAL': 24, 'REFSEQ_MATCH': 25, 'SOURCE': 26, 'REFSEQ_OFFSET': 27, 'HGVS_OFFSET': 28}


In [3]:
def _strand_symbol(strand_code):
    """Convert VEP's STRAND value to a sign: '1' -> '+', anything else -> '-'."""
    return '+' if strand_code == '1' else '-'


def _max_delta_score(scores):
    """Return the largest SpliceAI delta score, in its original string form.

    Scores are compared numerically rather than as strings. When none of them
    parses as a number (placeholders such as '.' or 'NA'), the first
    placeholder is returned unchanged.
    """
    parsed = []
    for score in scores:
        try:
            parsed.append((float(score), score))
        except ValueError:
            continue
    if not parsed:
        return scores[0]
    return max(parsed, key=lambda pair: pair[0])[1]


# Collect one record per variant and build the table once at the end;
# concatenating inside the loop copies the whole table on every iteration.
records: list = []
for v in VCF(raw_vcf):
    csq = v.INFO.get('CSQ')
    if csq is None:
        logger.warning(f"[{v.CHROM}:{v.POS}] CSQ annotation not found; variant skipped")
        continue
    # Only the first CSQ entry is used. Cutting at ',' first keeps its last
    # field from being fused with the next entry; the fields read below are
    # unaffected.
    vep: list = csq.split(',')[0].split('|')

    # Get SpliceAI scores (first entry only); 'NA' placeholders when absent
    splai_raw = v.INFO.get('SpliceAI')
    if splai_raw:
        splai: list = splai_raw.split(',')[0].split('|')
    else:
        splai = ['NA'] * len(splai_cols_list)

    # Get HGVSc from VEP: everything after the first ':' ("NA" if there is none)
    hgvsc_match = re.search('(?<=:).*', vep[vepidx['HGVSc']])
    hgvsc = hgvsc_match[0] if hgvsc_match else "NA"

    # Delta scores stay strings, exactly as they appear in the VCF
    ds_ag: str = splai[splaidx['DS_AG']]
    ds_al: str = splai[splaidx['DS_AL']]
    ds_dg: str = splai[splaidx['DS_DG']]
    ds_dl: str = splai[splaidx['DS_DL']]
    maxsplai: str = _max_delta_score([ds_ag, ds_al, ds_dg, ds_dl])

    records.append([
        v.CHROM, v.POS, v.REF, v.ALT[0],
        vep[vepidx['SYMBOL']], vep[vepidx['SYMBOL_SOURCE']], vep[vepidx['HGNC_ID']],
        vep[vepidx['Feature']], hgvsc, vep[vepidx['Consequence']],
        vep[vepidx['EXON']], vep[vepidx['INTRON']],
        _strand_symbol(vep[vepidx['STRAND']]),
        ds_ag, ds_al, ds_dg, ds_dl,
        splai[splaidx['DP_AG']], splai[splaidx['DP_AL']],
        splai[splaidx['DP_DG']], splai[splaidx['DP_DL']],
        maxsplai
    ])

# dtype=object keeps every value as the Python object read from the VCF,
# matching the dtypes the former row-by-row accumulation yielded.
df: pd.DataFrame = pd.DataFrame(records, columns=cols, dtype=object)

# SpliceAI INFO format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL

df.to_pickle('splai_vep_vcfs/hgmd_dm/all_DM_chr1.splai.vep.pkl')


In [2]:
# Reload the parsed-VCF checkpoint, drop fully identical rows, and report the
# row count before and after.
df = pd.read_pickle('splai_vep_vcfs/hgmd_dm/all_DM_chr1.splai.vep.pkl')
n_rows_loaded = len(df)
df = df.drop_duplicates()
print(n_rows_loaded)
print(len(df))

20701
20687


In [3]:
#### Very slow process ####
# Annotate ENST Full ID for fetching variant information from GENCODE database
# (row-wise lookup against `db`; the captured progress bar shows about nine
# minutes for 20,687 rows, hence the checkpoint written right after).
df['ENST_Full'] = df.progress_apply(posparser.fetch_enst_full, db=db, axis=1)

df.to_pickle('splai_vep_vcfs/hgmd_dm/all_DM_chr1.splai.vep.enst.pkl')

100%|██████████| 20687/20687 [09:03<00:00, 38.07it/s]


In [4]:
# Resume from the ENST_Full checkpoint.
df = pd.read_pickle('splai_vep_vcfs/hgmd_dm/all_DM_chr1.splai.vep.enst.pkl')

logger.info('Classify "Canonical" splice site or "Non-canonical" splice site...')
# classifying_canonical takes the whole frame and returns the frame to continue with.
df = posparser.classifying_canonical(df)

logger.info('Calculate the distance to the nearest splice site in intron variant...')
# Row-wise lookup that uses both the full and the intron-only annotation databases.
df['IntronDist'] = df.progress_apply(
    posparser.signed_distance_to_exon_boundary, 
    db=db, db_intron=db_intron, axis=1)

# Checkpoint: frame with the IntronDist column added.
df.to_pickle('splai_vep_vcfs/hgmd_dm/all_DM_chr1.splai.vep.enst.introndist.pkl')

2024/10/17 05:31:01 [INFO   ] (__main__) - Classify "Canonical" splice site or "Non-canonical" splice site...
2024/10/17 05:31:01 [INFO   ] (__main__) - Calculate the distance to the nearest splice site in intron variant...


100%|██████████| 20687/20687 [01:01<00:00, 337.94it/s] 


## Analysis starts here

In [2]:
# Resume from the IntronDist checkpoint.
df = pd.read_pickle('splai_vep_vcfs/hgmd_dm/all_DM_chr1.splai.vep.enst.introndist.pkl')

#2-1. Exon location: calc_exon_loc yields one string per row that is split on
#     ':' into two columns, renamed to ex_up_dist / ex_down_dist; the
#     temporary exon_loc column is then dropped.
gencode_gff = '../../../Resources/05_GENCODE_v43lift37/gencode.v43lift37.annotation.sort.gff3.gz'
tbx_anno = pysam.TabixFile(gencode_gff)
df['exon_loc'] = df.progress_apply(
    posparser.calc_exon_loc, tabixfile=tbx_anno, enstcolname='ENST', axis=1)
df = pd.concat([df, df['exon_loc'].str.split(':', expand=True)], axis=1)
df.rename(columns={0: 'ex_up_dist', 1: 'ex_down_dist'}, inplace=True)
df.drop(columns=['exon_loc'], inplace=True)

#2-2. Select the smaller of the upstream and downstream distances
df['exon_pos'] = df.parallel_apply(posparser.select_exon_pos, axis=1)
#2-3. Relative exon location
df['prc_exon_loc'] = df.parallel_apply(posparser.calc_prc_exon_loc, axis=1)

#2-4. Flag exonic splice sites (1 nt at the acceptor site or 3 nt at the donor site)
df['exon_splice_site'] = df.parallel_apply(posparser.extract_splicing_region, axis=1)

#3.   Additional splicing information
logger.info('Annotating splicing information...')
#3-1. Annotate splicing type ('Exonic Acceptor' etc.)
df['SpliceType'] = df.parallel_apply(posparser.select_donor_acceptor, axis=1)

#5.   Annotate ClinVar variant interpretations
logger.info('Annotating ClinVar varaints interpretations...')
clinvar_file = '../../../Resources/03_ClinVar/variant_summary.snv.grch37.germline.criteria.sort.bed.gz'
tbx_clinvar = pysam.TabixFile(clinvar_file)
df['clinvar_same_pos'] = df.progress_apply(
    anno_clinvar.anno_same_pos_vars, tabixfile=tbx_clinvar, axis=1)
df['clinvar_same_motif'] = df.progress_apply(
    anno_clinvar.anno_same_motif_vars, tabixfile=tbx_clinvar, axis=1)

# Exon/intron position information per variant, looked up in both annotation databases.
logger.info('Parsing SpliceAI results...')
logger.info('Annotating Exon/Intron position information...')
df['ExInt_INFO'] = df.progress_apply(
    splaiparser.calc_exint_info, db=db, db_intron=db_intron, axis=1)

100%|██████████| 20687/20687 [00:22<00:00, 920.44it/s] 


In [5]:
# Checkpoint: frame with the exon-location, ClinVar and ExInt_INFO columns added.
df.to_pickle('splai_vep_vcfs/hgmd_dm/all_DM_chr1.splai.vep.enst.introndist.exintinfo.pkl')

In [1]:
import os
import re
import numpy as np
import pandas as pd
from Bio.Seq import Seq
from liftover import get_lifter
from pathlib2 import Path
from pandarallel import pandarallel
from tqdm import tqdm
import gffutils
import pysam
from cyvcf2 import VCF

### Logging setup
# This cell repeats the notebook's setup statements so that the analysis can
# be resumed from a pickle checkpoint after a kernel restart.
from logging import getLogger, config
import yaml
# NOTE(review): '__file__' below is a quoted string literal, not the __file__
# variable, so both dirname() calls return '' and config_path is resolved
# relative to the current working directory. Confirm this is intended.
parent_directory = os.path.dirname(os.path.dirname('__file__'))
config_path: str = os.path.join(parent_directory, '../../../config/logging.yaml')
with open(config_path, 'r') as f:
    config.dictConfig(yaml.safe_load(f))
logger = getLogger(__name__)

########   Initialize and setup pandas methods   ########
# pandarallel provides DataFrame.parallel_apply (3 workers here);
# tqdm.pandas() provides DataFrame.progress_apply.
os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp' 
pandarallel.initialize(nb_workers=3, progress_bar=False, verbose=0, use_memory_fs=False) 
tqdm.pandas()

import sys
# Extend sys.path before importing `libs` below. Where `__file__` is
# undefined, the bare expression raises NameError and the except branch
# appends <cwd>/../../../ instead.
# NOTE(review): in the try branch dirname('__file__') is taken from a string
# literal, so '' is appended; in the except branch the first Path().resolve()
# result is discarded.
try: 
    __file__
    sys.path.append(os.path.join(os.path.dirname('__file__')))
except NameError:
    Path().resolve()
    sys.path.append(os.path.join(Path().resolve(), '../../../'))

from libs import utils, preprocess, variantfilter, posparser, splaiparser
# from libs import predeffect, scoring
from libs import anno_spliceai, anno_clinvar
from libs.deco import print_filtering_count

# Open the GENCODE annotation databases; on ValueError from the
# repository-relative paths, fall back to the absolute /resources/DBs paths.
# NOTE(review): presumably ValueError is what gffutils raises for a missing
# database file -- confirm.
try:
    db_anno_gencode = '../../../Resources/06_gffutilsdb/gencode.v43lift37.annotation.gtf.db'
    db_anno_intron = '../../../Resources/06_gffutilsdb/gencode.v43lift37.annotation.intron.gtf.db'
    db = gffutils.FeatureDB(db_anno_gencode)
    db_intron = gffutils.FeatureDB(db_anno_intron)
except ValueError:
    db_anno_gencode = '/resources/DBs/gencode.v43lift37.annotation.gtf.db'
    db_anno_intron = '/resources/DBs/gencode.v43lift37.annotation.intron.gtf.db'
    db = gffutils.FeatureDB(db_anno_gencode)
    db_intron = gffutils.FeatureDB(db_anno_intron)

## Thresholds configuration
# Handed to the splaiparser prediction functions through their `thresholds=`
# keyword. The meaning of each key is defined by splaiparser, not here.
thresholds_SpliceAI_parser: dict = {
    'TH_min_sALDL': 0.02, 'TH_max_sALDL': 0.2, 
    'TH_min_sAGDG': 0.01, 'TH_max_sAGDG': 0.05,
    'TH_min_GExon': 25, 'TH_max_GExon': 500,
    'TH_sAG': 0.2, 'TH_sDG': 0.2
    }

# Resume from the ExInt_INFO checkpoint.
df = pd.read_pickle('splai_vep_vcfs/hgmd_dm/all_DM_chr1.splai.vep.enst.introndist.exintinfo.pkl')

#6-3. Predict splicing effects
# Pseudoexon activation is the only predictor that also needs the intron
# database, so it is run on its own with progress_apply.
df['Pseudoexon'] = df.progress_apply(
    splaiparser.pseudoexon_activation,
    thresholds=thresholds_SpliceAI_parser,
    db_intron=db_intron,
    axis=1)

# The remaining predictors take only the thresholds; each result column is
# filled in turn, in the listed order, with parallel_apply.
threshold_only_predictors = (
    ('Part_IntRet', splaiparser.partial_intron_retention),
    ('Part_ExDel', splaiparser.partial_exon_deletion),
    ('Exon_skipping', splaiparser.exon_skipping),
    ('Int_Retention', splaiparser.intron_retention),
    ('multiexs', splaiparser.multi_exon_skipping),
)
for result_column, predictor in threshold_only_predictors:
    df[result_column] = df.parallel_apply(
        predictor, thresholds=thresholds_SpliceAI_parser, axis=1)

## OK up to here

100%|██████████| 20687/20687 [00:03<00:00, 6537.38it/s]


In [2]:
# Checkpoint: frame with the splicing-effect prediction columns added.
df.to_pickle('splai_vep_vcfs/hgmd_dm/all_DM_chr1.splai.vep.enst.introndist.exintinfo.splicing.pkl')

In [5]:
# Show the variants whose MaxSpliceAI is the '.' placeholder, together with
# DP_AG and the full transcript ID.
df.loc[df['MaxSpliceAI'] == '.', ['MaxSpliceAI', 'DP_AG', 'ENST_Full']]

Unnamed: 0,MaxSpliceAI,DP_AG,ENST_Full
21,.,.,ENST00000379370.7_3
84,.,.,ENST00000379198.5_5
104,.,.,ENST00000378891.9_9
105,.,.,ENST00000378891.9_9
204,.,.,ENST00000378585.7_9
...,...,...,...
20353,.,.,ENST00000366560.4_6
20455,.,.,ENST00000358704.4_6
20553,.,.,ENST00000283179.14_7
20572,.,.,ENST00000336119.8_4


In [4]:


#6.   Parsing SpliceAI results (for the PVS1 decision tree)
logger.info('Parsing SpliceAI results...')
## Thresholds configuration
# NOTE(review): these loose variables hold the same values as the
# thresholds_SpliceAI_parser dict from the setup cell and are not read
# anywhere in this cell -- confirm whether they are still needed.
TH_min_sALDL, TH_max_sALDL = 0.02, 0.2
TH_min_sAGDG, TH_max_sAGDG = 0.01, 0.05
TH_min_GExon, TH_max_GExon = 25, 500
TH_sAG, TH_sDG = 0.2, 0.2

#6-1. Annotate Exon/Intron position information
df['ExInt_INFO'] = df.progress_apply(
    splaiparser.calc_exint_info, db=db, db_intron=db_intron, axis=1)

2024/10/17 05:49:34 [INFO   ] (__main__) - Parsing SpliceAI results...


100%|██████████| 20687/20687 [01:13<00:00, 280.67it/s] 


In [6]:
# NOTE(review): this is the same path that the ENST_Full step writes and that
# the canonical-classification cell reloads, so saving here replaces that
# checkpoint with whatever `df` currently holds (presumably a later-stage
# frame). Confirm the overwrite is intended or use a distinct file name.
df.to_pickle('splai_vep_vcfs/hgmd_dm/all_DM_chr1.splai.vep.enst.pkl')

In [20]:
# Column names assigned to allmut.csv, in file order (the file's own first
# row is skipped when reading).
all_mut_default_colnames: list = [
    "disase", "gene", "chrom", "genename", "gdbid", "omimid",
    "amino", "deletion", "insertion", "codon", "codonAff", "descr",
    "refseq", "hgvs", "hgvsAll", "dbsnp", "chromosome",
    "startCoord", "endCoord", "expected_inheritance",
    "gnomad_AC", "gnomad_AF", "gnomad_AN", "tag", "dmsupport",
    "rankscore", "mutype", "author", "title", "fullname", "allname",
    "vol", "page", "year", "pmid", "pmidAll", "reftag", "comments",
    "acc_num", "new_date", "base", "clinvarID", "clinvar_clnsig",
]

# Columns kept for the analysis
allmut_keep_colnames: list = [
    "gene", "genename", "mutype", "clinvar_clnsig", "tag",
    "refseq", "hgvs", "hgvsAll", "chromosome", "startCoord", "endCoord",
    "amino", "deletion", "insertion", "expected_inheritance", "gnomad_AF",
]

allmut: pd.DataFrame = (
    pd.read_csv(
        'allmut.csv', sep=';', encoding='cp1252',
        names=all_mut_default_colnames, skiprows=1, low_memory=False)
    .loc[:, allmut_keep_colnames]
    # Drop rows whose 'startCoord' is missing
    .dropna(subset=['startCoord'])
    # Keep one row per ('chromosome', 'startCoord', 'endCoord')
    .drop_duplicates(subset=['chromosome', 'startCoord', 'endCoord'])
)

# Keep only the entries tagged "DM"
allmut_dm = allmut.loc[allmut['tag'] == "DM"].copy()
print(f"A total of {len(allmut_dm)} DM mutations are found in allmut.")

A total of 253018 DM mutations are found in allmut.


In [22]:
# Treat a missing gnomAD allele frequency as 0, then keep the MAF 0 entries.
# The column is reassigned instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and leaves the
# frame untouched when pandas Copy-on-Write is active.
allmut_dm['gnomad_AF'] = allmut_dm['gnomad_AF'].fillna(0)
allmut_dm_maf0 = allmut_dm[allmut_dm['gnomad_AF'] == 0].copy()
print(f"A total of {len(allmut_dm_maf0)} DM mutations are found in allmut with MAF 0.")

# Keep entries that are neither a deletion nor an insertion
is_snv = allmut_dm_maf0['deletion'].isnull() & allmut_dm_maf0['insertion'].isnull()
allmut_dm_maf0_snv = allmut_dm_maf0[is_snv]
print(f"A total of {len(allmut_dm_maf0_snv)} DM mutations are found in allmut with MAF 0 and non-deletion or non-insertion.")

# Split all DM entries by whether 'mutype' mentions "splice". na=False counts
# a missing mutype as non-splice, so the mask stays boolean and can be
# inverted safely.
is_splice = allmut_dm["mutype"].str.contains("splice", na=False)
splice_mutations = allmut_dm[is_splice].copy()
non_splice_mutations = allmut_dm[~is_splice]
print(f"Splicing_DM: {len(splice_mutations)}, Non-splicing_DM: {len(non_splice_mutations)}")

A total of 231981 DM mutations are found in allmut with MAF 0.
A total of 154107 DM mutations are found in allmut with MAF 0 and non-deletion or non-insertion.
Splicing_DM: 23777, Non-splicing_DM: 229241


In [23]:
# Sanity check: which mutation types remain, and that only gnomad_AF == 0 is left.
print(allmut_dm_maf0_snv['mutype'].unique())
print(allmut_dm_maf0_snv['gnomad_AF'].unique())

['missense' 'initiation' 'nonsense' 'synonymous' 'gross'
 'canonical-splice' 'exonic-splice' 'nonstop' 'regulatory' 'splice']
[0.]


In [24]:
# Display the DM entries whose 'mutype' contains "splice".
splice_mutations

Unnamed: 0,gene,genename,mutype,clinvar_clnsig,tag,refseq,hgvs,hgvsAll,chromosome,startCoord,endCoord,amino,deletion,insertion,expected_inheritance,gnomad_AF
96651,AP3B1,Adaptor related protein complex 3 subunit beta 1,canonical-splice,,DM,NM_003664.5,1168-1G>C,1168minus1GtoC,5,78165673.0,78165673.0,,,,AR,0.0
126648,NF1,Neurofibromin 1,exonic-splice,Pathogenic/Likely_pathogenic,DM,NM_000267.3,1748A>G,1748AtoG | K583R,17,31223470.0,31223470.0,,,,AD,0.0
129700,TSC2,TSC complex subunit 2,exonic-splice,Pathogenic,DM,NM_000548.5,1255C>T,1255CtoT | P419S,16,2062006.0,2062006.0,,,,AD,0.0
130620,COL4A5,Collagen type IV alpha 5 chain,exonic-splice,Likely_pathogenic,DM,NM_000495.5,1856C>T,1856CtoT | P619L,X,108598778.0,108598778.0,,,,XLD,0.0
134411,DMD,Dystrophin,exonic-splice,Pathogenic,DM,NM_004006.3,10279C>T,10279CtoT | Q3427*,X,31173588.0,31173588.0,,,,XLR,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510785,PPT1,Palmitoyl-protein thioesterase 1,canonical-splice,Likely_pathogenic,DM,NM_000310.4,536+1G>A,536plus1GtoA,1,40089409.0,40089409.0,,,,AR,0.0
510786,APC,APC regulator of WNT signaling pathway,splice,Likely_pathogenic,DM,NM_000038.6,1409-6A>G,1409minus6AtoG,5,112827102.0,112827102.0,,,,AD,0.0
510789,UROD,Uroporphyrinogen decarboxylase,canonical-splice,,DM,NM_000374.5,876-1G>C,876minus1GtoC,1,45014939.0,45014939.0,,,,ADAR,0.0
510791,PTS,6-pyruvoyltetrahydropterin synthase,canonical-splice,Pathogenic,DM,NM_000317.3,186+1G>A,186plus1GtoA,11,112230231.0,112230231.0,,,,AR,0.0


In [45]:
def remove_dot_ver(x):
    """Strip the version suffix from a versioned accession.

    The result is the leading run of letters/underscores followed by digits,
    e.g. 'NM_000267.3' -> 'NM_000267'.

    Args:
        x: Accession string, the '.' placeholder, or a missing value.

    Returns:
        The unversioned accession, or None when ``x`` is the '.' placeholder,
        is not a string (e.g. NaN), or does not start with such a prefix.
    """
    if not isinstance(x, str) or x == '.':
        return None
    matched = re.match(r'[a-zA-Z_]+\d+', x)
    return matched.group() if matched else None

Unnamed: 0,gene,chrom,genename,mutype,clinvarID,clinvar_clnsig,tag,refseq,hgvs,hgvsAll,chromosome,startCoord,endCoord,amino,deletion,insertion,expected_inheritance,gnomad_AF


In [34]:
# Number of rows in allmut.
len(allmut)

510804

In [2]:
# Lifter objects keyed by (source build, target build), created on first use
# so that row-wise callers do not rebuild the converter for every variant.
_LIFTER_CACHE: dict = {}


def _liftover_to_hg19(chrom, pos):
    """Convert an hg38 coordinate to hg19.

    Args:
        chrom: Chromosome name passed to the lifter's ``query``.
        pos: hg38 position passed to the lifter's ``query``.

    Returns:
        The first entry of the liftover result, or None when the query
        returns nothing.
    """
    builds = ('hg38', 'hg19')
    converter = _LIFTER_CACHE.get(builds)
    if converter is None:
        converter = get_lifter(*builds)
        _LIFTER_CACHE[builds] = converter
    result = converter.query(chrom, pos)
    return result[0] if result else None
    
def anno_vars_id(row):
    """Build the variant ID 'CHROM:POS_hg19-REF-ALT' from one table row."""
    chrom, pos, ref, alt = (row[key] for key in ('CHROM', 'POS_hg19', 'REF', 'ALT'))
    return '{}:{}-{}-{}'.format(chrom, pos, ref, alt)

def anno_hg19_pos(row):
    """Return the hg19 position for a row's hg38 coordinate.

    Args:
        row: Mapping with 'CHROM' and 'POS_hg38'.

    Returns:
        Element [1] of the liftover result, or None when the coordinate could
        not be lifted over (``_liftover_to_hg19`` returned None). Callers
        should drop or handle such rows before writing coordinates out.
    """
    converted = _liftover_to_hg19(row['CHROM'], row['POS_hg38'])
    if converted is None:
        return None
    return converted[1]

def remove_dot_ver(x):
    """Strip the version suffix from a versioned accession.

    The result is the leading run of letters/underscores followed by digits,
    e.g. 'NM_000267.3' -> 'NM_000267'.

    Args:
        x: Accession string, the '.' placeholder, or a missing value.

    Returns:
        The unversioned accession, or None when ``x`` is the '.' placeholder,
        is not a string (e.g. NaN), or does not start with such a prefix.
    """
    if not isinstance(x, str) or x == '.':
        return None
    matched = re.match(r'[a-zA-Z_]+\d+', x)
    return matched.group() if matched else None
    
def remove_non_canon(x):
    """Flag canonical transcripts: True when ``x`` is in ``enst_set``, else None."""
    return True if x in enst_set else None

def create_refalt(row, nt):
    """Return the allele in column ``nt`` for a '+' or '-' strand row.

    '+' rows return ``row[nt]`` unchanged, '-' rows return its complement as
    a string, and any other 'Strand' value yields None.
    """
    strand = row['Strand']
    if strand not in ('+', '-'):
        return None
    allele = row[nt]
    return allele if strand == '+' else str(Seq(allele).complement())

In [3]:
canonlist = '/Github/Projects/DeNovo/data/CanonicalTranscripts_COMP/CanonicalTranscripts.exoncount.tsv'
refseq = '/work/CanonicalTrasncripts/gencode.v43lift37.metadata.RefSeq.gz'

# Canonical transcript table, minus two transcripts that are excluded by ID
# (the reason is not recorded here), de-duplicated on ENST.
excluded_enst = ['ENST00000649912', 'ENST00000609375']
df_canon = pd.read_table(canonlist, header=0)
df_canon = df_canon[~df_canon['ENST'].isin(excluded_enst)]
df_enst = df_canon.drop_duplicates(subset='ENST')
enst_set = set(df_enst['ENST'])

# GENCODE-to-RefSeq metadata; missing values become the '.' placeholder
# before the version suffixes are stripped from all three accession columns.
df_refseq = pd.read_table(
    refseq, header=None,
    names=['ENST_refseq', 'RefSeq_RNA', 'RefSeq_Pro']).fillna(value='.')
for accession_col in ('ENST_refseq', 'RefSeq_RNA', 'RefSeq_Pro'):
    df_refseq[accession_col] = df_refseq[accession_col].apply(remove_dot_ver)

# Keep the rows whose transcript is in the canonical set
df_refseq['is_Canonical'] = df_refseq['ENST_refseq'].apply(remove_non_canon)
df_refseq_canon = df_refseq[df_refseq['is_Canonical'].eq(True)].copy()

##### Generate VCF

In [4]:
# Load the HGMD splicing variant list (TSV)
hgmd_file = '/Github/MyProjects/DeNovo/data/ValidationData/Positive/allmut.trim.colfixed.maf0.tsv.txt'
df = pd.read_table(hgmd_file, header=0).dropna(subset=['startCoord'])

# Pre-processing: integer start coordinate, RefSeq accession without version
df = df.assign(
    startCoord=df['startCoord'].astype(int),
    refseq=df['refseq'].apply(remove_dot_ver),
)

# REF is the third-from-last character of the HGVS description and ALT its
# last character (e.g. '1168-1G>C' -> 'G', 'C')
sr_alt_nt = df['hgvs'].str[-1:].rename('alt_nt')
sr_ref_nt = df['hgvs'].str[-3:-2].rename('ref_nt')

# Append the allele columns, then the placeholder VCF columns
df = df.assign(
    ref_nt=sr_ref_nt,
    alt_nt=sr_alt_nt,
    ID='.',
    QUAL='.',
    FILTER='.',
    INFO='.',
)

# Annotate ENST IDs by gene symbol
df = df.merge(df_enst, how='left', left_on='gene', right_on='GeneSymbol')

# Rename columns for downstream processing
vcf_column_names = {
    'CHROM': 'Chr',
    'chromosome': 'CHROM',
    'startCoord': 'POS_hg38',
}
df = df.rename(columns=vcf_column_names)

# Create the REF and ALT columns from ref_nt / alt_nt and the strand
for vcf_col, nt_col in (('REF', 'ref_nt'), ('ALT', 'alt_nt')):
    df[vcf_col] = df.apply(create_refalt, nt=nt_col, axis=1)

In [5]:
# Liftover to hg19: row-wise conversion of (CHROM, POS_hg38) through anno_hg19_pos
df['POS_hg19'] = df.apply(anno_hg19_pos, axis=1)

In [None]:
# Work on a copy whose hg19 position is a string, so it can be embedded in the variant ID
df2 = df.astype({'POS_hg19': str})

# Annotate variant IDs
df2['variant_id'] = df2.apply(anno_vars_id, axis=1)

# Extract columns for VCF: the same trailing columns for both builds, with
# the build-specific position in second place
vcf_trailing_cols = ['ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
df_19 = df2[['CHROM', 'POS_hg19', *vcf_trailing_cols]]
df_38 = df2[['CHROM', 'POS_hg38', *vcf_trailing_cols]]

In [265]:
# Drop rows without a REF allele (create_refalt returns None when the strand
# is neither '+' nor '-').
df_19 = df_19.dropna(subset=['REF'])
df_38 = df_38.dropna(subset=['REF'])

In [266]:
# Output as VCF: tab-separated records only, without index or column header
df_19.to_csv('./patho.hg19.vcf', sep='\t', index=False, header=False)
df_38.to_csv('./patho.hg38.vcf', sep='\t', index=False, header=False)

##### Concatenate header and variant list

In [267]:
# Prepend the shared header file to each headerless record table.
!cat ../header_for_VCF.tsv ./patho.hg19.vcf > patho.hg19.header.vcf
!cat ../header_for_VCF.tsv ./patho.hg38.vcf > patho.hg38.header.vcf

In [263]:
# Compare the HGVS-derived alleles (ref_nt / alt_nt) with the strand-adjusted REF / ALT.
df[['Strand', 'ref_nt', 'REF', 'alt_nt', 'ALT']]

Unnamed: 0,Strand,ref_nt,REF,alt_nt,ALT
0,+,A,A,G,G
1,-,T,A,G,C
2,-,G,C,A,T
3,+,A,A,G,G
4,-,A,T,G,C
...,...,...,...,...,...
10553,,G,,A,
10554,+,G,G,A,A
10555,-,G,C,A,T
10556,-,G,C,A,T


In [277]:
# Restrict df2 to the listed columns, in this order.
df2 =df2[['gene', 'gdbid', 'omimid', 'amino', 'codon', 'codonAff', 'descr',
       'refseq', 'hgvs', 'CHROM', 'POS_hg38', 'endCoord', 
       'expected_inheritance', 'dmsupport', 'mutype', 'acc_num', 'new_date', 
       'clinvarID', 'clinvar_clnsig', 'Chr', 'Start', 'End', 'Strand', 
       'GeneSymbol', 'HGNC_ID', 'ENSG', 'ENST', 'GeneType', 'Tag', 
       'REF', 'ALT', 'POS_hg19', 'variant_id']]

In [278]:
# Write the annotated variant table as a tab-separated file, without the index.
df2.to_csv('./patho2.tsv', sep='\t', index=False)