In [1]:
# Standard modules
from dataclasses import dataclass
from logging import getLogger, StreamHandler, DEBUG, config
import os

# Third party modules
import gffutils
import pybedtools
import pandas as pd
import numpy as np
from pathlib2 import Path
import pysam
from tqdm import tqdm
import yaml

# Local modules
from libs.args_notebook import parser_setting
from libs.utils import load_config, OutputSettings
from libs.preprocess import PreProcessExomeSummary
from libs.modesamples import ModeSamples
from libs.annolib.anno import Anno
from libs.annolib.genebased import GeneBasedAnno
from libs.filter.maffilter import MafFilter
from libs.filter.typefilter import TypeFilter
from libs.filter.qcfilter import QcFilter
from libs.filter.gtfilter import GtFilter
from libs.filter.hardfilter import HardFilter
from libs.filter.counter import counter

# Settings
# tqdm.pandas()

#----- STEP 0. Logging settings
# Configure the logging module from a YAML dictConfig file and create the
# notebook-level logger.
# parent_directory = os.path.dirname(os.path.dirname(__file__))
# os.path.abspath() resolves '../__file__' against the current working
# directory, so dirname() of it is the parent of the working directory; the
# literal '__file__' path component is only a placeholder that gets stripped.
parent_directory = os.path.dirname(os.path.abspath('../__file__'))
config_path: str = os.path.join(parent_directory, 'config/logging.yaml')
with open(config_path, 'r') as f:
    # `config` is the logging.config submodule imported at the top of the cell.
    config.dictConfig(yaml.safe_load(f))
logger = getLogger(__name__)

# http://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz
# conda create -n env python=3.9 --yes
# pip install jupyter cyvcf2 pybedtools gffutils biopython pathlib2 pyarrow openpyxl pandarallel pysam tqdm pandas scipy numpy pyhpo toml pyyaml 
# conda install --channel bioconda --channel conda-forge bedtools --yes 

libs.filter.counter


In [2]:
#-----   STEP 1. Argument settings
# parser_setting() supplies the argument mapping; the input path is then
# overridden with a local test file.
args = parser_setting()
# Only one input can be active: every assignment to args['input'] overwrites
# the previous one, so the alternatives are kept as comments to switch between.
# args['input'] = '/Volumes/vol/work/workspace/30829/exome_summary.20220427_194150.txt'
# args['input'] = '/Volumes/SSD_480GB/workspace/Github/TestData/proband/32805/exome_summary_20240410_001219.txt'
# args['input'] = '/Volumes/SSD_480GB/workspace/Github/TestData/proband/32968/exome_summary_20240410_080519.txt'
args['input'] = '/Volumes/vol/work/Github/TestData/proband/32097/exome_summary_20240417_201801.txt'



     input: /Volumes/vol/work/Github/TestData/proband/31070/annovar/exome_summary.20220607_145914.txt
     output: None
     xhmm: /work/Github/TestData/proband/xhmm/data.segdup.strvar.haplo.deciph.omim.xcnv.gene.uniq
     phenotype: None
     mode: auto
     samples: auto
     assembly: hg19
     config: /Volumes/vol/work/Github/playground/wesanno/config/config.toml
     resources: /Volumes/vol/work/Github/playground/wesanno/resources
     no_gnomad: False
     no_hgmd: False
     no_decipher: False
     no_ddg2p: False
     no_jarvis: False
     no_spliceai: False
     no_syno: None
     no_alphamissense: False
     no_revel: False
     no_trap: False
     excel_formating: True




In [3]:

#-----   STEP 2. Input file and Output settings
# dtype=str: every column of the input table is read as a string.
df = pd.read_table(args['input'], header=0, dtype=str)
configs: dict = load_config(args['config'])


#-----   STEP 3. Get Mode and Samples information
modesamples = ModeSamples(df=df, args=args)
mode_samples_info = modesamples.get_mode_samples_info()
logger.info(f"mode_samples_info: {mode_samples_info.mode}")

#-----   STEP 4. Output settings
output_settings = OutputSettings(
    args=args, mode_samples_info=mode_samples_info)
output_file_path = output_settings.get_saving_file_path()

#-----   STEP 5. Pre-processing
preprocessing = PreProcessExomeSummary(
    df=df, args=args, mode_samples_info=mode_samples_info)
# `df` is rebound to the pre-processed frame; the raw table is no longer
# reachable under this name after this line.
df = preprocessing.all_pre_processing()


2024/04/18 01:33:07 [INFO   ] (libs.preprocess) - Drop ALL dot rows columns
2024/04/18 01:33:08 [INFO   ] (libs.preprocess) - Extract MAF from each database cols


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['HGVD_AF'] = df[col].str.extract('(\.|[0-1]\.\d{,6})')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['HGVD_AF'] = df['HGVD_AF'].replace('.', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['HGVD_AF'] = df['HGVD_AF'].astype(float)


2024/04/18 01:33:08 [INFO   ] (libs.preprocess) - Extract genotypeing info
2024/04/18 01:33:08 [INFO   ] (libs.preprocess) - Split QC info
2024/04/18 01:33:09 [INFO   ] (libs.preprocess) - Rename gnomAD MAF cols
2024/04/18 01:33:11 [INFO   ] (libs.preprocess) - Replace "." to np.nan
2024/04/18 01:33:11 [INFO   ] (libs.preprocess) - Split ALT column
2024/04/18 01:33:13 [INFO   ] (libs.preprocess) - Extract SpliceAI
2024/04/18 01:33:14 [INFO   ] (libs.preprocess) - Drop unused columns


In [4]:
#-----   STEP 6. Annotation
# NOTE: this cell needs the GeneBasedAnno imported from libs.annolib.genebased.
# A scratch cell further down the notebook redefines a class of the same name
# that has no dm_filter / summarize_moi methods; running that cell first makes
# the calls below fail.
gba = GeneBasedAnno(args)
df = gba.anno_hgmd(df=df)
df = gba.dm_filter(df=df)
df = gba.anno_dcpr(df=df)
# Row-wise apply: one MOI value per variant row.
df['MOI'] = df.apply(gba.summarize_moi, axis=1)


#-----   STEP 7. Filtering
# Each filter object takes the current frame and its result replaces `df`.
maffilter = MafFilter(
    df=df, mode_samples_info=mode_samples_info, configs=configs)
df = maffilter.all_filtering()

typefilter = TypeFilter(df=df)
df = typefilter.exclude_hlamuc_and_exonicsyno()

qcfilter = QcFilter(df=df, configs=configs)
df = qcfilter.exclude_low_quality()

gtfilter = GtFilter(
    df=df, mode_samples_info=mode_samples_info)
# From here on the data lives in `dfs`, an object with AD / Hm / CH / XL
# attributes (see df_to_excel below), rather than in the single frame `df`.
dfs = gtfilter.genotypeing_filter()


#-----  Variant reduction and count variants of filtering process
# The count summary workbook is written next to the main output file.
countsummery_file = str(Path(output_file_path).parent) + '/CountSummary.xlsx'
dfs_filtered = counter(dfs=dfs, output_excel=countsummery_file)


#-----   STEP 8. Hard filtering
# `dfs` is rebound to the hard-filtered result.
hard = HardFilter(dfs=dfs_filtered)
dfs = hard.hard_filtering()


#-----   STEP 9. Output as an Excel
def df_to_excel(dfs: dataclass, output_xlsx) -> None:
    """Write the four inheritance-model tables into one Excel workbook.

    Args:
        dfs: Object exposing ``AD``, ``Hm``, ``CH`` and ``XL`` attributes,
            each of which provides a ``to_excel`` method.
        output_xlsx: Destination handed to ``pd.ExcelWriter``.

    Sheets are written in the order AD, Homo, CH, XL, without the index.
    """
    with pd.ExcelWriter(output_xlsx) as workbook:
        # Pair every sheet title with its table up front, then write in order.
        sheets = (
            ('AD', dfs.AD),
            ('Homo', dfs.Hm),
            ('CH', dfs.CH),
            ('XL', dfs.XL),
        )
        for title, table in sheets:
            table.to_excel(workbook, sheet_name=title, index=False)

df_to_excel(dfs, f"{output_file_path}.xlsx")




2024/04/18 01:33:16 [INFO   ] (libs.filter.qcfilter) - FILTER cutoffs: GQ >= 20  DP >= 10


In [8]:
dfs.AD[dfs.AD['HARD_FILTER'] == 'PASS']

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,MAF_0.1%_FILTER,MAF_1%_FILTER,FlaggedSNP_FILTER,HLAMUC_FILTER,ExonicSyno_FILTER,GQ_FILTER,Hetero_count,GT_FILTER,MOI_FILTER,HARD_FILTER
1571,1,150529731,150529731,A,G,exonic;splicing,ADAMTSL4;ADAMTSL4,NM_001288607:exon13:c.1857-7A>G,nonsynonymous SNV,"ADAMTSL4:NM_019032:exon12:c.A1967G:p.H656R,ADA...",...,PASS,PASS,PASS,PASS,PASS,PASS,1,PASS,PASS,PASS
1578,1,150940556,150940556,C,A,splicing,CERS2,NM_181746:exon4:c.410+3G>T;NM_022075:exon4:c.4...,.,.,...,PASS,PASS,PASS,PASS,PASS,PASS,1,PASS,PASS,PASS
6023,3,132036365,132036365,T,C,exonic,ACPP,.,nonsynonymous SNV,"ACPP:NM_001099:exon1:c.T65C:p.L22P,ACPP:NM_001...",...,PASS,PASS,PASS,PASS,PASS,PASS,1,PASS,PASS,PASS
7054,4,85678317,85678321,AAGAG,-,splicing,WDFY3,NM_014991:exon33:r.spl,.,.,...,PASS,PASS,PASS,PASS,PASS,PASS,1,PASS,PASS,PASS
8205,5,110456178,110456178,T,C,splicing,WDR36,NM_139281:exon18:c.2073-16T>C,.,.,...,PASS,PASS,PASS,PASS,PASS,PASS,1,PASS,PASS,PASS
13358,9,21994323,21994323,C,T,exonic,CDKN2A,.,nonsynonymous SNV,CDKN2A:NM_058195:exon1:c.G8A:p.R3H,...,PASS,PASS,PASS,PASS,PASS,PASS,1,PASS,PASS,PASS
22095,16,30709488,30709488,G,A,exonic,LOC730183,.,nonsynonymous SNV,LOC730183:NM_001256932:exon1:c.C142T:p.R48C,...,PASS,PASS,PASS,PASS,PASS,PASS,1,PASS,PASS,PASS
25224,19,3959260,3959260,C,T,exonic,DAPK3,.,nonsynonymous SNV,DAPK3:NM_001348:exon9:c.G1204A:p.A402T,...,PASS,PASS,PASS,PASS,PASS,PASS,1,PASS,PASS,PASS
25684,19,12740051,12740051,C,-,exonic,ZNF791,.,frameshift deletion,ZNF791:NM_153358:exon4:c.1708delC:p.H570Ifs*2,...,PASS,PASS,PASS,PASS,PASS,PASS,1,PASS,PASS,PASS
25925,19,18895130,18895130,G,A,exonic,COMP,.,nonsynonymous SNV,COMP:NM_000095:exon17:c.C1958T:p.A653V,...,PASS,PASS,PASS,PASS,PASS,PASS,1,PASS,PASS,PASS


### Memo, Test & Dev.

In [3]:
import glob

class GeneBasedAnno:
    """Gene-level annotation of a variant table from local HGMD / G2P files.

    Work-in-progress re-implementation kept in the notebook for development.
    Defining it shadows the ``GeneBasedAnno`` imported from
    ``libs.annolib.genebased`` for every cell executed afterwards.
    """

    def __init__(self, args: dict):
        """Locate the resource files under ``args['resources']``.

        Every ``path_to_*`` attribute holds the list returned by ``glob.glob``
        for that resource; the list is empty when no file matches.
        """
        self._root_path = args['resources']
        self.path_to_hgmd = glob.glob(f'{self._root_path}/HGMD*.tsv.gz')
        self.path_to_ddg2p = glob.glob(f'{self._root_path}/DDG2P*.gz')
        self.path_to_eyeg2p = glob.glob(f'{self._root_path}/EyeG2P*.gz')
        self.path_to_sking2p = glob.glob(f'{self._root_path}/SkinG2P*.gz')
        self.path_to_cancer2p = glob.glob(f'{self._root_path}/CancerG2P*.gz')
        self.path_to_cardiacg2p = glob.glob(f'{self._root_path}/CardiacG2P*.gz')
        self.path_to_skeletalg2p = glob.glob(f'{self._root_path}/SkeletalG2P*.gz')

    def anno_hgmd(self, df: pd.DataFrame) -> pd.DataFrame:
        """Left-join HGMD gene-level columns onto ``df``.

        Args:
            df: Variant table with a ``Gene.refGene`` column.

        Returns:
            A new frame with ``altsymbol``, ``refseq``, ``expected_inheritance``,
            ``hgncID``, ``omimid`` and ``DM`` added; rows whose gene is not in
            the HGMD table get NaN in those columns.

        Raises:
            IndexError: If no ``HGMD*.tsv.gz`` file was found.
        """
        hgmd = pd.read_table(self.path_to_hgmd[0], header=0, dtype=str)
        hgmd = hgmd[
            ['gene', 'altsymbol', 'refseq',
             'expected_inheritance', 'hgncID', 'omimid', 'DM']
             ]
        df = pd.merge(
            df, hgmd, left_on='Gene.refGene', right_on='gene', how='left'
            )
        # 'gene' only served as the join key; 'Gene.refGene' already holds it.
        df = df.drop(columns=['gene'])

        return df

    def anno_dcpr(self, df: pd.DataFrame) -> pd.DataFrame:
        """Load the G2P panel tables and return ``df``.

        The merge of the panel tables into ``df`` is not implemented yet, so
        the frame is returned unchanged. Returning it (instead of falling
        through and returning None) keeps ``df = gba.anno_dcpr(df=df)`` from
        destroying the caller's table.

        Raises:
            IndexError: If any of the six G2P resource files was not found.
        """
        # TODO: restrict to the columns intended for the merge:
        #   'gene symbol', 'disease name', 'allelic requirement', 'hgnc id'
        panel_paths = {
            'DD': self.path_to_ddg2p,
            'Eye': self.path_to_eyeg2p,
            'Skin': self.path_to_sking2p,
            'Cancer': self.path_to_cancer2p,
            'Cardiac': self.path_to_cardiacg2p,
            'Skeletal': self.path_to_skeletalg2p,
        }
        # Read every panel so that a missing or unreadable resource still
        # fails here, exactly where the merge will later need it.
        g2p_tables = {
            panel: pd.read_csv(paths[0], header=0, dtype=str)
            for panel, paths in panel_paths.items()
        }
        # TODO: merge g2p_tables into df.
        del g2p_tables

        return df

    def anno_gnomad(self, da):
        """Placeholder, not implemented; returns None."""
        pass

    def match_g2p_phenotypes(self, df):
        """Placeholder, not implemented; returns None."""
        pass

In [10]:
import pyhpo

In [3]:
gba = GeneBasedAnno(args)
df = gba.anno_hgmd(df=df)
df = gba.anno_dcpr(df=df)
df['MOI'] = df.apply(gba.summarize_moi, axis=1)



In [12]:
# AB = second comma-separated value of 'AD' divided by 'DP'.
# NOTE(review): in the comparison cell below this differs from the existing
# 'AB (Allele Balance)' column for some rows (e.g. index 5 and 7860) — confirm
# which denominator is intended before relying on either column.
df['AB'] = df['AD'].str.split(',', expand=True)[1].astype(int) / df['DP'].astype(int)

In [22]:
df.loc[df['GT_Pro'] == '0/1', ['AD', 'DP', 'GT_Pro', 'AB (Allele Balance)', 'AB']]

Unnamed: 0,AD,DP,GT_Pro,AB (Allele Balance),AB
5,79,28,0/1,0.562500,0.321429
6,149,23,0/1,0.391304,0.391304
7,95,14,0/1,0.357143,0.357143
12,5826,84,0/1,0.309524,0.309524
13,3212,44,0/1,0.272727,0.272727
...,...,...,...,...,...
7856,5363,116,0/1,0.543103,0.543103
7860,136,23,0/1,0.315789,0.260870
7861,8028,108,0/1,0.259259,0.259259
7862,3655,91,0/1,0.604396,0.604396


In [6]:
df['MOI'].value_counts()

MOI
.     6780
AR     608
AD     448
XL      26
YL       1
Name: count, dtype: int64

In [73]:
def summarize_allelic_requirement(row, not_ad, not_ar, not_xl, not_yl):
    """Reduce HGMD inheritance and G2P allele info to a single MOI label.

    Args:
        row: Mapping with ``expected_inheritance`` and ``G2P_alleleinfo``.
        not_ad, not_ar, not_xl, not_yl: Iterables of words; if any word of
            the list matching the inheritance occurs in ``G2P_alleleinfo``,
            the label is rejected.

    Returns:
        'AD', 'AR', 'XL' or 'YL' when the inheritance is recognised and none
        of its contradicting words occurs; otherwise '.'.
    """
    inheritance = row['expected_inheritance']
    alleleinfo = row['G2P_alleleinfo']

    # (accepted inheritance codes, label to emit, words that veto the label)
    rules = (
        (('AD',), 'AD', not_ad),
        (('AR',), 'AR', not_ar),
        (('XLD', 'XLR'), 'XL', not_xl),
        (('YL',), 'YL', not_yl),
    )
    for codes, label, veto_words in rules:
        if inheritance not in codes:
            continue
        for word in veto_words:
            if word in alleleinfo:
                return '.'
        return label

    return '.'

df['MOI'] = df.apply(lambda row: summarize_allelic_requirement(row, not_ad, not_ar, not_xl, not_yl), axis=1)


In [41]:
df['check'] = df['G2P_alleleinfo'].apply(lambda x: any(word in x for word in not_ad))


In [None]:
#-----   STEP 6. Additional annotations
anno = Anno(df=df, args=args)
df = anno.anno_scores()

#-----   STEP 7. Filtering
# The loaded settings are passed with the `configs` keyword, matching the
# MafFilter call in the main pipeline cell.
maffilter = MafFilter(
    df=df, mode_samples_info=mode_samples_info, configs=configs)
df = maffilter.all_filtering()

typefilter = TypeFilter(df=df)
df = typefilter.exclude_hlamuc_and_exonicsyno()

gtfilter = GtFilter(
    df=df, mode_samples_info=mode_samples_info)
dfs = gtfilter.genotypeing_filter()

#-----   STEP 8. Output as an Excel 
def df_to_excel(dfs: dataclass, output_xlsx) -> None:
    """Write dfs.AD / dfs.Hm / dfs.CH / dfs.XL to the sheets AD / Homo / CH / XL.

    The index is not written. `output_xlsx` is handed to pd.ExcelWriter.
    """
    sheet_names = ['AD', 'Homo', 'CH', 'XL']
    with pd.ExcelWriter(output_xlsx) as writer:
        for df, sheet_name in zip([dfs.AD, dfs.Hm, dfs.CH, dfs.XL], sheet_names):
            df.to_excel(writer, sheet_name=sheet_name, index=False)

df_to_excel(dfs, f"{output_file_path}.xlsx")

############################################
# Keep a pickle of the per-model tables so later cells can reload them
# without re-running the pipeline.
import pickle
with open(f"{output_file_path}.pkl", mode="wb") as f:
    pickle.dump(dfs, f)
############################################

#-----   STEP (Final Step). Count variants of filtering process
# The count summary workbook is written next to the main output file.
countsummery_file = str(Path(output_file_path).parent) + '/CountSummary.xlsx'
counter_result = counter(dfs=dfs, output_excel=countsummery_file)
print(counter_result)



In [6]:
import pickle
# Reload the per-model tables saved by an earlier pipeline run.
pkl = '/work/Github/TestData/trio/29881/Sample_29881-trio_results/Sample_29881-trio.tsv.pkl'
# SECURITY: pickle.load can execute arbitrary code contained in the file;
# only load pickles that this pipeline produced itself.
with open(pkl, mode='rb') as f:
    dfs = pickle.load(f)

print(type(dfs))

<class 'libs.filter.gtfilter.ModelDataFrame'>


In [10]:
hgmd_pkl = '/work/resources/HGMD/hgmd_info_2023.3'
hgmd = pd.read_pickle(hgmd_pkl)

Unnamed: 0,gene,altsymbol,refseq,expected_inheritance,hgncID,omimid,DFP,DM,DM?,DP,FP,R
0,RBFOX1,2BP1|A2BP1|FOX-1|FOX1|HRNBP1,NM_145891.3,AD,18222,605104,0.0,15.0,47.0,4.0,0.0,0.0
1,ABCA3,ABC-C|ABC3|EST111653|LBM180|SMDP3,NM_001089.3,AR,33,601615,0.0,230.0,152.0,1.0,2.0,0.0
2,AKAP13,AKAP-13|AKAP-Lbc|ARHGEF13|BRX|c-lbc|HA-3|Ht31|...,NM_007200.5,UNK,371,604686,0.0,0.0,11.0,1.0,5.0,0.0
3,GSS,GSHS|HEL-S-64p|HEL-S-88n,NM_000178.4,AR,4624,601002,0.0,40.0,5.0,0.0,0.0,0.0
4,BRF1,BRF|BRF-1|CFDS|GTF3B|hBRF|HEL-S-76p|TAF3B2|TAF...,NM_001519.4,ADAR,11551,604902,0.0,19.0,15.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
17668,TNFRSF21,BM-018|CD358|DR6,NM_014452.5,UNK,13469,605732,0.0,4.0,7.0,0.0,0.0,0.0
17669,C12orf4,MRT66,NM_020374.4,AR,1184,616082,0.0,7.0,4.0,0.0,0.0,0.0
17670,DMBX1,Atx|MBX|OTX3|PAXB,NM_147192.3,UNK,19026,607410,0.0,1.0,5.0,0.0,0.0,0.0
17671,WDR93,C1d-87|CFAP297|FAP297,NM_020212.2,UNK,26924,,0.0,1.0,1.0,0.0,0.0,0.0


In [None]:
genocode_file_hg19 = '/resources/GENCODE_Basic_Annotations/gencode.v44lift37.basic.annotation.gtf.gz'
genocode_db_hg19 = '/resources/GENCODE_Basic_Annotations/gencode.v44lift37.basic.annotation.db.bak'

db = gffutils.FeatureDB(genocode_db_hg19)

In [None]:
import re

@dataclass
class GencodeInfo:
    """Identifiers collected from one GENCODE feature (all fields are strings)."""
    gene_name: str
    hgnc: str
    ensg: str
    enst: str
    ensg_full: str
    enst_full: str
    strand: str

def __anno_gencode_info(row) -> list:
    """Early draft of the GENCODE lookup for one variant row.

    Queries the module-level gffutils ``db`` for 'gene' features overlapping
    ``chr{CHROM}:{POS}-{POS}`` and extracts strand, gene name and Ensembl gene
    ID from each hit. Nothing is ever appended to ``result``, so the function
    returns an empty list whenever it returns. A later cell redefines both
    this function and ``GencodeInfo``.
    """
    query_region: str = f"chr{row['CHROM']}:{row['POS']}-{row['POS']}"
    # query_region: str = f"chr17:1132706-1132706"
    fetched_data = db.region(region=query_region, 
                            featuretype='gene',
                            completely_within=False)
    
    result = []
    while 1:
        try:
            data = next(fetched_data)
        except StopIteration:
            # Iterator exhausted: all overlapping features were visited.
            break
        else:
            strand = data.strand
            gene_name = data.attributes['gene_name'][0]
            
            try:
                ensg_full = data.attributes['gene_id'][0]
            except KeyError:
                ensg = '.'
            else:
                # Keep only the leading 'ENSG<digits>' part of the gene ID.
                ensg = re.match(r'ENSG\d+', ensg_full).group()
    
    # print(result)
    return result

In [None]:
import re

@dataclass
class GencodeInfo:
    """Identifiers collected from one GENCODE feature (all fields are strings)."""
    gene_name: str   # first 'gene_name' attribute value
    hgnc: str        # digits extracted from 'hgnc_id', or '.'
    ensg: str        # leading 'ENSG<digits>' of 'gene_id', or '.'
    enst: str        # leading 'ENST<digits>' of 'transcript_id', or '.'
    ensg_full: str   # first 'gene_id' attribute value as stored
    enst_full: str   # first 'transcript_id' attribute value as stored
    strand: str      # strand of the feature

def __anno_gencode_info(row) -> list:
    """Collect GENCODE records for the gene features overlapping a variant.

    Args:
        row: Mapping with ``CHROM`` and ``POS``; the query region is
            ``chr{CHROM}:{POS}-{POS}``.

    Returns:
        A list of ``GencodeInfo``, one per 'gene' feature returned by the
        module-level gffutils ``db`` for that region (empty when there is
        none). Every identifier that is absent from the feature, or does not
        have the expected form, is reported as '.'.

    Raises:
        KeyError: If a feature has no ``gene_name`` attribute.
    """
    query_region: str = f"chr{row['CHROM']}:{row['POS']}-{row['POS']}"
    # query_region: str = f"chr17:1132706-1132706"
    fetched_data = db.region(region=query_region,
                             featuretype='gene',
                             completely_within=False)

    def first_value(feature, key: str) -> str:
        # A missing attribute yields '.', so the *_full fields are always
        # bound and never carry over a value from the previous feature.
        try:
            return feature.attributes[key][0]
        except KeyError:
            return '.'

    def matched(found) -> str:
        # re.match / re.search return None when the pattern is not found.
        return found.group() if found else '.'

    result = []
    for data in fetched_data:
        strand = data.strand
        gene_name = data.attributes['gene_name'][0]

        hgnc_info = first_value(data, 'hgnc_id')
        ensg_full = first_value(data, 'gene_id')
        enst_full = first_value(data, 'transcript_id')

        # Numeric part of the HGNC ID; versionless Ensembl gene/transcript IDs.
        hgnc = matched(re.search(r'\d+', hgnc_info))
        ensg = matched(re.match(r'ENSG\d+', ensg_full))
        enst = matched(re.match(r'ENST\d+', enst_full))

        genecode_info = GencodeInfo(
            gene_name, hgnc, ensg, enst, ensg_full, enst_full, strand)
        result.append(genecode_info)

    # print(result)
    return result
        


In [None]:
# Collect GencodeInfo records for all transcript features overlapping one
# fixed test position, using the gffutils database opened as `db`.
query_region: str = f"chr17:1132706-1132706"
fetched_data = db.region(region=query_region, 
                            featuretype='transcript',
                            completely_within=False)

result = []
for data in fetched_data:
    strand = data.strand
    gene_name = data.attributes['gene_name'][0]

    try:
        hgnc_info = data.attributes['hgnc_id'][0]
    except KeyError:
        hgnc = '.'
    else:
        hgnc = re.search(r'\d+', hgnc_info).group()

    # When an ID attribute is missing, the *_full value is set to '.' as well;
    # otherwise it would be unbound, or left over from the previous feature.
    try:
        ensg_full = data.attributes['gene_id'][0]
    except KeyError:
        ensg_full = ensg = '.'
    else:
        ensg = re.match(r'ENSG\d+', ensg_full).group()

    try:
        enst_full = data.attributes['transcript_id'][0]
    except KeyError:
        enst_full = enst = '.'
    else:
        enst = re.match(r'ENST\d+', enst_full).group()

    genecode_info = GencodeInfo(
        gene_name, hgnc, ensg, enst, ensg_full, enst_full, strand)
    result.append(genecode_info)

In [None]:
query_region: str = f"chr17:1132706-1132706"
fetched_gene = db.region(region=query_region, 
                         featuretype=['gene', 'transcript'],
                         completely_within=False)


In [None]:
gtf = db.children('', featuretype='transcript', order_by='start')

In [None]:
for g in gtf:
    print(g)
    print(g.attributes['tag'])

In [None]:

df['gencode'] = df.progress_apply(__anno_gencode_info, axis=1)

100%|██████████| 6443/6443 [01:25<00:00, 75.63it/s] 


In [None]:
for row in df.iterrows():
    print(row[0], row[1]['Gene.refGene'], row[1]['gencode'])

In [None]:
import re

# Print the identifiers of every transcript feature overlapping one position.
query_region: str = f'chr1: 69610-69610'
rows = db.region(region=query_region, featuretype='transcript')

for row in rows:
    print(row)

    # NOTE(review): these three lookups are unguarded; a feature lacking one
    # of the attributes presumably raises here (KeyError), before the
    # try/except blocks below are reached — confirm.
    enst_full = row.attributes['transcript_id'][0]
    ensg_full = row.attributes['gene_id'][0]
    hgnc_info = row.attributes['hgnc_id'][0]

    # AttributeError occurs when re.match / re.search returns None, i.e. when
    # the value does not contain the expected pattern.
    try:
        enst = re.match(r'ENST\d+', enst_full).group()
    except AttributeError:
        enst = '.'
    try:
        ensg = re.match(r'ENSG\d+', ensg_full).group()
    except AttributeError:
        ensg = '.'
    try:
        hgnc = re.search(r'\d+', hgnc_info).group()
    except AttributeError:
        hgnc = '.'

    print(f"Strand: {row.strand}")
    print(f"GeneName : {row.attributes['gene_name'][0]}")
    print(f"HGNC_ID  : {hgnc}")
    print(f"ENSG_Full: {ensg_full}")
    print(f"ENSG     : {ensg}")
    print(f"ENST_Full: {enst_full}")
    print(f"ENST     : {enst}")

chr1	HAVANA	transcript	65419	71585	.	+	.	gene_id "ENSG00000186092.7_9"; transcript_id "ENST00000641515.2_5"; gene_type "protein_coding"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_name "OR4F5-201"; level "2"; protein_id "ENSP00000493376.2"; hgnc_id "HGNC:14825"; tag "RNA_Seq_supported_partial"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS30547.2"; havana_gene "OTTHUMG00000001094.4_9"; havana_transcript "OTTHUMT00000003223.4_5"; remap_num_mappings "1"; remap_status "full_contig"; remap_target_status "new";
Strand: +
ENSG_Full: ENSG00000186092.7_9
ENSG     : ENSG00000186092
ENST_Full: ENST00000641515.2_5
ENST     : ENST00000641515
GeneName : OR4F5
HGNC_ID  : 14825


In [None]:
#-----   STEP 5. Pre-processing
# `args` is passed as in the main pipeline cell.
preprocessing = PreProcessExomeSummary(
    df=df, args=args, mode_samples_info=mode_samples_info)
df = preprocessing.all_pre_processing()

#-----   STEP 6. Additional annotations
anno = Anno(df=df, args=args)
df = anno.anno_scores()

#-----   STEP 7. Filtering
# Pass the settings loaded by load_config (`configs`). The bare name `config`
# is the logging.config module imported at the top of the notebook.
maffilter = MafFilter(
    df=df, mode_samples_info=mode_samples_info, configs=configs)
df = maffilter.all_filtering()

typefilter = TypeFilter(df=df)
df = typefilter.exclude_hlamuc_and_exonicsyno()

gtfilter = GtFilter(
    df=df, mode_samples_info=mode_samples_info)
dfs = gtfilter.genotypeing_filter()

#-----   STEP 8. Output as an Excel and a pickle
def df_to_excel(dfs: dataclass, output_xlsx) -> None:
    """Write dfs.AD / dfs.Hm / dfs.CH / dfs.XL to the sheets AD / Homo / CH / XL.

    The index is not written. `output_xlsx` is handed to pd.ExcelWriter.
    """
    sheet_names = ['AD', 'Homo', 'CH', 'XL']
    with pd.ExcelWriter(output_xlsx) as writer:
        for df, sheet_name in zip([dfs.AD, dfs.Hm, dfs.CH, dfs.XL], sheet_names):
            df.to_excel(writer, sheet_name=sheet_name, index=False)

output_xlsx = f"{output_file_path}.xlsx"
output_pickle = f"{output_file_path}.pkl"
df_to_excel(dfs, output_xlsx)
# `dfs` is the per-model container, not a DataFrame, so it is serialised with
# the pickle module, the same way the other export cell does.
import pickle
with open(output_pickle, mode="wb") as f:
    pickle.dump(dfs, f)

#-----   STEP 9. Count variants of filtering process
# The count summary workbook is written next to the main output file.
countsummery_file = str(Path(output_file_path).parent) + '/CountSummary.xlsx'
counter_result = counter(dfs=dfs, output_excel=countsummery_file)
print(counter_result)


In [None]:
dfs.AD.columns

Index(['InHouse_absent_FILTER', 'InHouse_1%_FILTER', 'MAF_0.1%_FILTER',
       'MAF_1%_FILTER', 'HLAMUC_FILTER', 'ExonicSyno_FILTER', 'GT_FILTER'],
      dtype='object')

In [None]:
print(output_file_path)

/work/Github/TestData/trio/29881/Sample_29881-trio_results/Sample_29881-trio.tsv


In [None]:
#-----   STEP 9. Output as an Excel 
def df_to_excel(dfs: dataclass, output_xlsx) -> None:
    """Write dfs.AD / dfs.Hm / dfs.CH / dfs.XL to the sheets AD / Homo / CH / XL.

    The index is not written. `output_xlsx` is handed to pd.ExcelWriter.
    This is a repeat of the definition used by the main pipeline cell;
    running this cell rebinds the name.
    """
    sheet_names = ['AD', 'Homo', 'CH', 'XL']
    with pd.ExcelWriter(output_xlsx) as writer:
        for df, sheet_name in zip([dfs.AD, dfs.Hm, dfs.CH, dfs.XL], sheet_names):
            df.to_excel(writer, sheet_name=sheet_name, index=False)

# Export the current `dfs` next to the pipeline output path.
output_xlsx = f"{output_file_path}.xlsx"
df_to_excel(dfs, output_xlsx)

In [None]:
hgmd_resource: str = [
    str(x) for x in self.resources_dir.glob('HGMD/HGMD_gene_based.tsv.gz')][0]
        return pd.read_csv(hgmd_resource, sep='\t')

'Sample_32741-proband_countsummary.txt'

In [None]:
couter_result = counter(dfs=dfs, output_excel=output_file_path)


In [None]:
df = counter(dfs=dfs, output_excel='./test.xlsx')

In [None]:
df.to_pickle('./post7.pkl')

In [None]:
df = pd.read_pickle('./post7.pkl')

In [None]:
df = df.head(100)

In [None]:
#-----   STEP 8. Output as an Excel 
output_xlsx = './head100.xlsx'
def df_to_excel(dfs: dataclass, output_xlsx) -> None:
    """Write dfs.AD / dfs.Hm / dfs.CH / dfs.XL to the sheets AD / Homo / CH / XL.

    The index is not written. `output_xlsx` is handed to pd.ExcelWriter.
    """
    sheet_names = ['AD', 'Homo', 'CH', 'XL']
    with pd.ExcelWriter(output_xlsx) as writer:
        for df, sheet_name in zip([dfs.AD, dfs.Hm, dfs.CH, dfs.XL], sheet_names):
            df.to_excel(writer, sheet_name=sheet_name, index=False)

# Pass the path variable, not the literal string 'output_xlsx': the literal
# has no '.xlsx' extension and is not the file configured above.
# NOTE(review): this exports `dfs`, not the 100-row `df` created in the
# preceding cell — confirm that is the intended object.
df_to_excel(dfs, output_xlsx)

In [102]:
from cyvcf2 import VCF
from pysam import VariantFile


In [103]:
tommo_bulk_file = '/Users/utsu/Downloads/tommo-54kjpn-20230626r3-GRCh37_lifted_from_GRCh38-af-chrX_PAR2.vcf.gz'
tbx = pysam.TabixFile(tommo_bulk_file)


In [126]:
# Look up the allele frequency of one hard-coded variant in the ToMMo VCF.
query_start, query_end = 60145, 60146
query_chrom: str = 'X'
query_ref, query_alt = 'G', 'A'
# Variant key in the form '<chrom>:<pos>-<ref>-<alt>', using query_end as pos.
query_variant = f"{query_chrom}:{query_end}-{query_ref}-{query_alt}"

# for r in tbx.fetch(query_chrom, query_start, query_end, parser=pysam.asVCF()):
#     print(r.info)

vcfin = VariantFile(tommo_bulk_file)
for rec in vcfin.fetch(query_chrom, query_start, query_end):
    # Records with more than one ALT allele are skipped entirely.
    if len(rec.alts) > 1:
        continue
    # Only records whose first FILTER entry is 'PASS' are considered.
    if rec.filter.keys()[0] != 'PASS':
        continue
    
    tommo_id = f"{rec.contig}:{rec.pos}-{rec.ref}-{rec.alts[0]}"
    if tommo_id == query_variant:
        # Print the AF of the first matching record and stop.
        print(rec.info['AF'][0])
        break
    else:
        pass
        

    


1.9999999494757503e-05


In [30]:
df = pd.read_table('/Users/utsu/work/Github/TestData/proband/32487_IRUD_TTR/annovar/exome_summary.20230413_004616.txt', header=0, dtype=str)

In [31]:
from pysam import VariantFile
tommo_bulk_file = '/Users/utsu/Downloads/tommo-54kjpn-20230626r3-GRCh37_lifted_from_GRCh38-af-chrX_PAR2.vcf.gz'
tommovcf = VariantFile(tommo_bulk_file)

def anno_tommo(row, tommovcf):
    """Return the ToMMo allele frequency of the variant described by ``row``.

    Args:
        row: Mapping with ``CHROM``, ``POS``, ``REF`` and ``ALT``.
        tommovcf: Object with a ``fetch(chrom, start, end)`` method yielding
            VCF records (``contig``, ``pos``, ``ref``, ``alts``, ``filter``,
            ``info``), e.g. an open ``pysam.VariantFile``.

    Returns:
        The ``AF`` value of the matching ALT allele taken from the first
        PASS record that contains the variant, or '.' when no such record
        exists.

    All records returned for the position are examined: a non-PASS record or
    a record carrying a different allele no longer ends the search, and an
    empty fetch result yields '.' rather than None.
    """
    # Set queries
    query_chrom: str = row['CHROM']
    query_end: int = int(row['POS'])
    query_start: int = query_end - 1
    query_variant_id = f"{query_chrom}:{query_end}-{row['REF']}-{row['ALT']}"

    for rec in tommovcf.fetch(query_chrom, query_start, query_end):
        # Skip records whose first FILTER entry is not PASS; a record with no
        # FILTER entry at all is skipped too instead of raising IndexError.
        if list(rec.filter.keys())[:1] != ['PASS']:
            continue

        # A record may carry several ALT alleles; AF is indexed per allele.
        for i, alt in enumerate(rec.alts or ()):
            tommo_variant_id = f"{rec.contig}:{rec.pos}-{rec.ref}-{alt}"
            if tommo_variant_id == query_variant_id:
                return rec.info['AF'][i]

    return '.'

In [32]:

df['ToMMo_AF'] = df.progress_apply(anno_tommo, tommovcf=tommovcf, axis=1)



  0%|          | 0/7940 [00:00<?, ?it/s]

100%|██████████| 7940/7940 [00:01<00:00, 6623.29it/s] 


In [None]:
# Replace '.' to np.nan in 'ToMMo_AF' column
df['ToMMo_AF'] = df['ToMMo_AF'].replace('.', np.nan)
df = df.astype({'ToMMo_AF': float})
df.loc[((df['CHROM'] == 'X') & (df['ToMMo_AF'] < 0.01)), 'ToMMo_AF']

In [52]:
# Print the core fields and every INFO entry of the first two records of the
# ToMMo VCF, read with cyvcf2.
vcf = VCF(tommo_bulk_file)
i = 0
while 1:
    i += 1
    # Stop after two records have been printed.
    if i > 2:
        break

    v = next(vcf)
    
    print(f"""
CHROM:  {v.CHROM}
POS:    {v.POS}
ID:     {v.ID}
REF:    {v.REF}
ALT:    {v.ALT}
QUAL:   {v.QUAL}
INFO:   {v.INFO}
starat: {v.start}
end:    {v.end}
          """)
    
    # Iterating v.INFO prints one (key, value) pair per line.
    for inf in v.INFO:
        print(inf)
    



CHROM:  X
POS:    60170
ID:     rs2068139808
REF:    C
ALT:    ['G']
QUAL:   47.290000915527344
INFO:   <cyvcf2.cyvcf2.INFO object at 0x108081c60>
starat: 60169
end:    60170
          
('AC', 2)
('AF', 1.9999999494757503e-05)
('AN', 102418)
('DP', 807194)
('ExcessHet', 0.008299999870359898)
('FS', 0.0)
('InbreedingCoeff', 0.0013000000035390258)
('MLEAC', 1)
('MLEAF', 7.679999725951348e-06)
('QD', 15.760000228881836)
('SOR', 2.8329999446868896)
('NEGATIVE_TRAIN_SITE', True)
('VQSLOD', -3.8850998878479004)
('culprit', 'DP')
('AC_XX', 2)
('AN_XX', 67988)
('AF_XX', 2.9000000722589903e-05)
('AC_XY', 0)
('AN_XY', 34430)
('AF_XY', 0.0)
('AC_Illumina_HiSeq_2500_162PE', 0)
('AN_Illumina_HiSeq_2500_162PE', 5876)
('AF_Illumina_HiSeq_2500_162PE', 0.0)
('AC_Illumina_NovaSeq_6000_150PE', 0)
('AN_Illumina_NovaSeq_6000_150PE', 45688)
('AF_Illumina_NovaSeq_6000_150PE', 0.0)
('AC_Illumina_NovaSeq_6000_161PE', 2)
('AN_Illumina_NovaSeq_6000_161PE', 33674)
('AF_Illumina_NovaSeq_6000_161PE', 5.90000017837

In [53]:
import pysam

In [None]:
#----   STEP 9. Insert hyperlinks
# 0. First, load the Excel file
# 1. Create the columns for notes (columns 1-2)
# 2. Create the columns for hyperlinks (columns 3-7)
# 3. Insert the links

In [None]:
from libs.excelibs.excel_format import ExcelFormat
from libs.excelibs.hyperlinks import HyperLinks

excelformat = ExcelFormat('./test1.xlsx')

Formatng ......


In [None]:
excelformat.insert_comment_cols()

In [None]:
excelformat.insert_hyperlink_cols()

In [None]:
excelformat.workbook.save('./test2.xlsx')

##### CREATE dbs

In [None]:
## Create db for gffutils
import gffutils
import gffutils.pybedtools_integration


genocode_file_hg19 = '/resources/GENCODE_Basic_Annotations/gencode.v44lift37.basic.annotation.gtf.gz'
genocode_db_hg19 = '/resources/GENCODE_Basic_Annotations/gencode.v44lift37.basic.annotation.db'


In [None]:
db = gffutils.create_db(data=genocode_file_hg19, dbfn=genocode_db_hg19, 
                        disable_infer_genes=True,
                        disable_infer_transcripts=True,
                        keep_order=True, 
                        force=True)

In [None]:
# Derive intron features from the exon features of the GENCODE database and
# save them to a file.
# NOTE(review): the target name ends in '.intron.db', but the file is written
# with BedTool.saveas — presumably a plain-text feature file rather than a
# gffutils database; confirm before opening it with gffutils.FeatureDB.
genocode_db_intron_hg19 = '/resources/GENCODE_Basic_Annotations/gencode.v44lift37.basic.annotation.intron.db'

db = gffutils.FeatureDB(genocode_db_hg19)
introns = db.create_introns(exon_featuretype='exon', 
                            new_featuretype='intron', 
                            merge_attributes=True, 
                            numeric_sort=True)
pybed = gffutils.pybedtools_integration.to_bedtool(introns)
pybed.saveas(genocode_db_intron_hg19)

<BedTool(/resources/GENCODE_Basic_Annotations/gencode.v44lift37.basic.annotation.intron.db)>

In [None]:
def generate_intoron_gtf(db: gffutils.FeatureDB, output: str) -> None:
    """Derive intron features from ``db`` and save them to ``output``.

    Args:
        db: gffutils database whose 'exon' features are used to create
            'intron' features (attributes merged, numeric sort).
        output: Path handed to ``BedTool.saveas``.
    """
    intron_options = {
        'exon_featuretype': 'exon',
        'new_featuretype': 'intron',
        'merge_attributes': True,
        'numeric_sort': True,
    }
    intron_features = db.create_introns(**intron_options)
    # Convert the feature iterator to a BedTool and write it out in one go.
    gffutils.pybedtools_integration.to_bedtool(intron_features).saveas(output)