In [1]:
# Standard modules
from dataclasses import dataclass
from logging import getLogger, StreamHandler, DEBUG, Formatter

# Third party modules
import gffutils
import pybedtools
import pandas as pd
import numpy as np
from pathlib2 import Path
from tqdm import tqdm

# Local modules
from libs.args import parser_setting
from libs.utils import load_config, OutputSettings
from libs.preprocess import PreProcessExomeSummary
from libs.modesamples import ModeSamples
from libs.annolibs.anno import Anno
from libs.filter.maffilter import MafFilter
from libs.filter.typefilter import TypeFilter
from libs.filter.gtfilter import GtFilter
from libs.filter.counter import counter

# Settings
tqdm.pandas()
logger = getLogger(__name__)
handler = StreamHandler()
handler.setLevel(DEBUG)
logger.setLevel(DEBUG)
logger.addHandler(handler)

# http://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
import pysam

In [63]:
#-----   STEP 1. Argument settings
args = parser_setting()

#-----   STEP 2. Input file and Output settings
df = pd.read_table(args['input'], header=0, dtype=str)
config: dict = load_config(args['config'])

#-----   STEP 3. Get Mode and Samples information
modesamples = ModeSamples(df=df, args=args)
mode_samples_info = modesamples.get_mode_samples_info()

#-----   STEP 4. Output settings
output_settings = OutputSettings(
    args=args, mode_samples_info=mode_samples_info)
output_file_path = output_settings.get_saving_file_path()


     input: /work/Github/TestData/trio/29881/exome_summary.20230420_014141.txt
     output: None
     xhmm: /work/Github/TestData/proband/xhmm/data.segdup.strvar.haplo.deciph.omim.xcnv.gene.uniq
     vcf: None
     phenotype: None
     mode: auto
     samples: auto
     config: /work/Github/playground/wesanno/config/config.toml
     resources: /resources
     no_gnomad: False
     no_hgmd: False
     no_decipher: False
     no_ddg2p: False
     no_jarvis: False
     no_spliceai: False
     no_syno: None
     no_alphamissense: False
     no_revel: False
     no_trap: False
     excel_formating: True




In [64]:
def process_dataframe_optimized(df):
    # 処理結果を格納するためのリストを初期化
    rows_list = []

    # データフレームの各行に対して処理を行う
    for row in df.itertuples(index=False):
        # REF列の値をカンマで分割
        # refs = row['ALT'].split(',')
        alts = row.ALT.split(',')
        # 分割した値ごとに新しい行を作成
        for alt in alts:
            new_row = row._asdict()
            new_row['SplitALT'] = alt
            rows_list.append(new_row)

    # 新しいデータフレームを作成
    result_df = pd.DataFrame(rows_list)

    return result_df

# 関数を実行し、結果を表示
# processed_df = process_dataframe_optimized(df)
# print(processed_df)

In [65]:
processed_df = process_dataframe_optimized(df)

In [66]:
def split_qc_col(df: pd.DataFrame) -> pd.DataFrame:
    for i, new_col in enumerate(['GT', 'AD', 'DP', 'GQ', 'PL']):
        df[new_col] = df[mode_samples_info.proband_id].str.split(':').str[i]
    df.fillna({'GQ': 0}, inplace=True)
    df = df.astype({'GQ': 'int32'})
   
    return df

In [67]:
df = split_qc_col(df)
processed_df = split_qc_col(processed_df)
df2 = processed_df.drop_duplicates(subset=['CHROM', 'POS', 'ALT', 'SplitALT'])


In [82]:
df['GT'].unique()

array(['0/1', '0/0', '1/1', './.', '0/2', '1/2', '1/3', '2/2', '2/3',
       '0/3', '3/3'], dtype=object)

In [88]:
df.loc[((df['GT'] == '2/3')), ['REF', 'ALT', 'GT', 'PL']].head(50)

Unnamed: 0,REF,ALT,GT,PL
2827,ATGTG,"A,ATG,ATGTGTGTGTG",2/3,9781028156377110539902702910625
2828,ATGTG,"A,ATG,ATGTGTGTGTG",2/3,9781028156377110539902702910625
2829,ATGTG,"A,ATG,ATGTGTGTGTG",2/3,9781028156377110539902702910625
3564,CTCTCTGTGTGTGTGTGTGTGTG,"CTGTGTGTGTGTG,CTG,C",2/3,2783130312552432141952211670173
3565,CTCTCTGTGTGTGTGTGTGTGTG,"CTGTGTGTGTGTG,CTG,C",2/3,2783130312552432141952211670173
3566,CTCTCTGTGTGTGTGTGTGTGTG,"CTGTGTGTGTGTG,CTG,C",2/3,2783130312552432141952211670173
4296,TAGCAGCAGC,"TAGC,T,TAGCAGC",2/3,4058221823001039109718112190123002077
4297,TAGCAGCAGC,"TAGC,T,TAGCAGC",2/3,4058221823001039109718112190123002077
9610,G,GGGCTCCGTAGGTTCCATGGGCTCCGTAGGTTCCATGGGCTCCGTA...,2/3,3268049724719697333926720306138053
9611,G,GGGCTCCGTAGGTTCCATGGGCTCCGTAGGTTCCATGGGCTCCGTA...,2/3,3268049724719697333926720306138053


In [89]:
processed_df.loc[((processed_df['GT'] == '2/3')), ['REF', 'ALT', 'SplitALT', 'GT', 'PL']]

Unnamed: 0,REF,ALT,SplitALT,GT,PL
3044,ATGTG,"A,ATG,ATGTGTGTGTG",A,2/3,9781028156377110539902702910625
3045,ATGTG,"A,ATG,ATGTGTGTGTG",ATG,2/3,9781028156377110539902702910625
3046,ATGTG,"A,ATG,ATGTGTGTGTG",ATGTGTGTGTG,2/3,9781028156377110539902702910625
3047,ATGTG,"A,ATG,ATGTGTGTGTG",A,2/3,9781028156377110539902702910625
3048,ATGTG,"A,ATG,ATGTGTGTGTG",ATG,2/3,9781028156377110539902702910625
3049,ATGTG,"A,ATG,ATGTGTGTGTG",ATGTGTGTGTG,2/3,9781028156377110539902702910625
3050,ATGTG,"A,ATG,ATGTGTGTGTG",A,2/3,9781028156377110539902702910625
3051,ATGTG,"A,ATG,ATGTGTGTGTG",ATG,2/3,9781028156377110539902702910625
3052,ATGTG,"A,ATG,ATGTGTGTGTG",ATGTGTGTGTG,2/3,9781028156377110539902702910625
3845,CTCTCTGTGTGTGTGTGTGTGTG,"CTGTGTGTGTGTG,CTG,C",CTGTGTGTGTGTG,2/3,2783130312552432141952211670173


In [90]:
df2.loc[((df2['GT'] == '2/3')), ['REF', 'ALT', 'SplitALT', 'GT', 'PL']].head(50)

Unnamed: 0,REF,ALT,SplitALT,GT,PL
3044,ATGTG,"A,ATG,ATGTGTGTGTG",A,2/3,9781028156377110539902702910625
3045,ATGTG,"A,ATG,ATGTGTGTGTG",ATG,2/3,9781028156377110539902702910625
3046,ATGTG,"A,ATG,ATGTGTGTGTG",ATGTGTGTGTG,2/3,9781028156377110539902702910625
3845,CTCTCTGTGTGTGTGTGTGTGTG,"CTGTGTGTGTGTG,CTG,C",CTGTGTGTGTGTG,2/3,2783130312552432141952211670173
3846,CTCTCTGTGTGTGTGTGTGTGTG,"CTGTGTGTGTGTG,CTG,C",CTG,2/3,2783130312552432141952211670173
3847,CTCTCTGTGTGTGTGTGTGTGTG,"CTGTGTGTGTGTG,CTG,C",C,2/3,2783130312552432141952211670173
4628,TAGCAGCAGC,"TAGC,T,TAGCAGC",TAGC,2/3,4058221823001039109718112190123002077
4629,TAGCAGCAGC,"TAGC,T,TAGCAGC",T,2/3,4058221823001039109718112190123002077
4630,TAGCAGCAGC,"TAGC,T,TAGCAGC",TAGCAGC,2/3,4058221823001039109718112190123002077
10234,G,GGGCTCCGTAGGTTCCATGGGCTCCGTAGGTTCCATGGGCTCCGTA...,GGGCTCCGTAGGTTCCATGGGCTCCGTAGGTTCCATGGGCTCCGTA...,2/3,3268049724719697333926720306138053


In [72]:
df2 = processed_df.drop_duplicates(subset=['CHROM', 'POS', 'ALT', 'SplitALT', 'PL'])

In [73]:
print(len(df))
print(len(processed_df))
print(len(df2))

11471
12181
11481


In [62]:
pd.merge(df,df2, how ="outer", indicator=True).query(f'_merge != "both"')


Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,_32,_39,_57,_58,_65,_66,_67,_68,SplitALT,_merge
25,1,1247579,1247579,G,-,splicing,INTS11,NM_001256463:exon13:c.1304+27C>-;NM_001256462:...,.,.,...,,,,,,,,,,left_only
381,1,47610436,47610436,G,C,splicing,CYP4A22,NM_001308102:exon7:c.743-21G>C;NM_001010969:ex...,.,.,...,,,,,,,,,,left_only
401,1,54605318,54605318,-,G,exonic,CDCP2,.,frameshift insertion,CDCP2:NM_201546:exon5:c.1224dupC:p.L409fs,...,,,,,,,,,,left_only
410,1,55505558,55505558,-,CTG,exonic,PCSK9,.,nonframeshift insertion,PCSK9:NM_174936:exon1:c.48_49insCTG:p.L23_G24insL,...,,,,,,,,,,left_only
413,1,55524340,55524355,GTGTGTGTGTGTGTGT,-,splicing,PCSK9,NM_174936:exon9:r.spl,.,.,...,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11583,X,27765403,27765405,GAG,-,exonic,DCAF8L2,.,unknown,UNKNOWN,...,,,,,,,,,,left_only
11595,X,44935925,44935926,TT,-,splicing,KDM6A,NM_001291418:exon16:r.spl;NM_001291421:exon16:...,.,.,...,,,,,,,,,,left_only
11635,X,56296780,56296780,-,ACACAC,splicing,KLF8,NM_001324105:exon5:c.889+26->ACACAC;NM_0013241...,.,.,...,,,,,,,,,,left_only
11645,X,66765159,66765161,GCA,-,exonic,AR,.,nonframeshift deletion,"AR:NM_000044:exon1:c.171_173del:p.Q80del,AR:NM...",...,,,,,,,,,,left_only


In [None]:
def separate_alt(info):
    info = info.split(':')
    separated_alt = []
    return separated_alt

def generate_tmp_vcf(df, threads) -> pysam.TabixFile:
    #1. Sort by CHROM and POS
    df = df.astype({'CHROM': str, 'POS': int})
    df = df.sort_values(by=['CHROM', 'POS'])

    #2. Generate VCF file
    df_vcf = df[['CHROM', 'POS', 'ID', 'REF', 'ALT', 'FILTER', 'QUAL', mode_samples_info.proband_id, 'FORMAT']]
    df_vcf.columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'FILTER', 'QUAL', 'INFO', 'FORMAT']
    df_vcf = df_vcf.drop_duplicates(subset=['#CHROM', 'POS', 'REF', 'ALT'])
    df_vcf.to_csv('./temp.vcf', sep='\t', index=False, header=True)
    
    #3. Compress and Index
    pysam.tabix_compress('./temp.vcf', './temp.vcf.gz', force=True)
    pysam.tabix_index('./temp.vcf.gz', preset='vcf', force=True)
    
    #4. Load and Return as TabixFile
    tbx_vcf = pysam.TabixFile('./temp.vcf.gz', threads=threads)

    return tbx_vcf

{
            'gnomad40_genome_AF': '.', 'gnomad40_genome_AF_afr': '.',
            'gnomad40_genome_AF_ami': '.', 'gnomad40_genome_AF_amr': '.',
            'gnomad40_genome_AF_asj': '.', 'gnomad40_genome_AF_eas': '.',
            'gnomad40_genome_AF_fin': '.', 'gnomad40_genome_AF_mid': '.',
            'gnomad40_genome_AF_nfe': '.', 'gnomad40_genome_AF_remaining': '.',
            'gnomad40_genome_AF_sas': '.', 'ALL.sites.2015_08': '.',
            'HRC_AF': '.', 'HRC_non1000G_AF': '.', 'Kaviar_AF': '.', 
            'GME_AF': '.', 'GME_NWA': '.', 'GME_NEA': '.', 'GME_AP': '.', 
            'GME_Israel': '.', 'GME_SD': '.', 'GME_TP': '.', 'GME_CA': '.'
            }

df = df.astype({'CHROM': str, 'POS': int})
df = df.sort_values(by=['CHROM', 'POS'])
df_vcf = df[['CHROM', 'POS', 'ID', 'REF', 'ALT', 'FILTER', 'QUAL', mode_samples_info.proband_id, 'FORMAT']]
df_vcf.columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'FILTER', 'QUAL', 'INFO', 'FORMAT']
df_vcf = df_vcf.drop_duplicates(subset=['#CHROM', 'POS', 'REF', 'ALT'])
df_vcf.to_csv('./temp.vcf', sep='\t', index=False, header=True)

pysam.tabix_compress('./temp.vcf', './temp.vcf.gz', force=True)
pysam.tabix_index('./temp.vcf.gz', preset='vcf', force=True)
tbx_vcf = pysam.TabixFile('./temp.vcf.gz', threads=4)


for row in tbx_vcf.fetch(parser=pysam.asVCF()):
    gt = row.info.split(':')[0]
    for alt in row.alt.split(','):
        print(alt, gt, sep='\t')
        break


In [None]:
o = './temp_out.vcf'
with open(o, 'w') as outfh:
    vcf = tbx_vcf.fetch(parser=pysam.asVCF())
    for record in vcf:
        for sample in snp_record.samples:
            gt_list.append(sample['GT'])
        
        new_record = []
        
        new_record.extend(gt_list)
        
        outfh.write('\t'.join(new_record) + '\n')

In [None]:
for row in tbx_vcf.fetch(parser=pysam.asVCF()):
    # 各行（row）に対する処理をここに記述
    # 例えば、行の内容を印刷する
    print(row)


In [61]:
pysam.tabix_index('./temp.tsv.gz', preset='vcf', force=True)

'./temp.tsv.gz'

In [36]:
import pandas as pd

# VCFデータのDataFrameを作成
df_vcf = df[['CHROM', 'POS', 'REF', 'ALT', 'FILTER', 'QUAL', 'FORMAT', mode_samples_info.proband_id]]
df_vcf.columns = ['#CHROM', 'POS', 'REF', 'ALT', 'FILTER', 'QUAL', 'FORMAT', 'INFO']

# VCFのヘッダーを用意
vcf_header = """##fileformat=VCFv4.2


#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE
"""

# ヘッダーとデータを組み合わせてファイルに書き出す
with open('./temp.vcf', 'w') as outfile:
    outfile.write(vcf_header)
    df_vcf.to_csv(outfile, sep='\t', index=False, header=False)

# pysamでVCFファイルを読み込む
vcf = pysam.VariantFile('./temp.vcf')
# filename = './temp.vcf.idx'
# vcf.index(filename=filename)
# # ファイルを閉じる
# vcf.close()


In [37]:
for rec in vcf:
    print(rec)
    break

[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse_filter] FILTER 'GT:AD:DP:GQ:PL:JpnAD2' is not defined in the header
[W::vcf_parse_info] INFO '0/1:18,16:34:99:324,0,405:0.471' is not defined in the header, assuming Type=String
[E::bcf_hdr_parse_line] Could not parse the header line: "##INFO=<ID=0/1:18,16:34:99:324,0,405:0.471,Number=1,Type=String,Description=\"Dummy\">"
[E::vcf_parse_info] Could not add dummy header for INFO '0/1:18,16:34:99:324,0,405:0.471' at 1:871193


OSError: unable to parse next record

In [53]:
"""GT:AD:DP:GQ:PL:JpnAD2"""
# df['AD'] = df[mode_samples_info.proband_id].str.split(':')[1]
# df['DP'] = df[mode_samples_info.proband_id].str.split(':')[2]
# df['GQ'] = df[mode_samples_info.proband_id].str.split(':')[3]
# df['PL'] = df[mode_samples_info.proband_id].str.split(':')[4]

for i, new_col in enumerate(['GT', 'AD', 'DP', 'GQ', 'PL']):
    df[new_col] = df[mode_samples_info.proband_id].str.split(':').str[i]




In [54]:
import numpy as np
df.fillna({'GQ': 0}, inplace=True)
df = df.astype({'GQ': 'int32'})
df.loc[((df['GT'] == '1/2') & (df['GQ'] < 20)), ['REF', 'ALT', 'GT', 'GQ', 'PL']]

Unnamed: 0,REF,ALT,GT,GQ,PL
1433,TAA,"TA,T",1/2,17,37320173010389
1434,TAA,"TA,T",1/2,17,37320173010389
2789,G,"GGTGT,GGT",1/2,6,148321516061
2790,G,"GGTGT,GGT",1/2,6,148321516061
3148,GAA,"GA,G",1/2,14,36314372520410
3149,GAA,"GA,G",1/2,14,36314372520410
4053,T,"TAA,TA",1/2,16,24216153490128
4054,T,"TAA,TA",1/2,16,24216153490128
4164,G,"GAAAAAAA,GAAAAAA",1/2,15,522729615019
4165,G,"GAAAAAAA,GAAAAAA",1/2,15,522729615019


In [None]:

#-----   STEP 5. Pre-processing
preprocessing = PreProcessExomeSummary(
    df=df, mode_samples_info=mode_samples_info)
df = preprocessing.all_pre_processing()

#-----   STEP 6. Additional annotations
anno = Anno(df=df, args=args)
df = anno.anno_scores()

#-----   STEP 7. Filtering
maffilter = MafFilter(
    df=df, mode_samples_info=mode_samples_info, config=config)
df = maffilter.all_filtering()

typefilter = TypeFilter(df=df)
df = typefilter.exclude_hlamuc_and_exonicsyno()

gtfilter = GtFilter(
    df=df, mode_samples_info=mode_samples_info)
dfs = gtfilter.genotypeing_filter()

#-----   STEP 8. Output as an Excel 
def df_to_excel(dfs: dataclass, output_xlsx) -> None:
    sheet_names = ['AD', 'Homo', 'CH', 'XL']
    with pd.ExcelWriter(output_xlsx) as writer:
        for df, sheet_name in zip([dfs.AD, dfs.Hm, dfs.CH, dfs.XL], sheet_names):
            df.to_excel(writer, sheet_name=sheet_name, index=False)

df_to_excel(dfs, f"{output_file_path}.xlsx")

############################################
import pickle
with open(f"{output_file_path}.pkl", mode="wb") as f:
    pickle.dump(dfs, f)
############################################

#-----   STEP (Final Step). Count variants of filtering process
countsummery_file = str(Path(output_file_path).parent) + '/CountSummary.xlsx'
counter_result = counter(dfs=dfs, output_excel=countsummery_file)
print(counter_result)



In [6]:
import pickle
pkl = '/work/Github/TestData/trio/29881/Sample_29881-trio_results/Sample_29881-trio.tsv.pkl'
with open(pkl, mode='rb') as f:
    dfs = pickle.load(f)

print(type(dfs))

<class 'libs.filter.gtfilter.ModelDataFrame'>


In [10]:
hgmd_pkl = '/work/resources/HGMD/hgmd_info_2023.3'
hgmd = pd.read_pickle(hgmd_pkl)

Unnamed: 0,gene,altsymbol,refseq,expected_inheritance,hgncID,omimid,DFP,DM,DM?,DP,FP,R
0,RBFOX1,2BP1|A2BP1|FOX-1|FOX1|HRNBP1,NM_145891.3,AD,18222,605104,0.0,15.0,47.0,4.0,0.0,0.0
1,ABCA3,ABC-C|ABC3|EST111653|LBM180|SMDP3,NM_001089.3,AR,33,601615,0.0,230.0,152.0,1.0,2.0,0.0
2,AKAP13,AKAP-13|AKAP-Lbc|ARHGEF13|BRX|c-lbc|HA-3|Ht31|...,NM_007200.5,UNK,371,604686,0.0,0.0,11.0,1.0,5.0,0.0
3,GSS,GSHS|HEL-S-64p|HEL-S-88n,NM_000178.4,AR,4624,601002,0.0,40.0,5.0,0.0,0.0,0.0
4,BRF1,BRF|BRF-1|CFDS|GTF3B|hBRF|HEL-S-76p|TAF3B2|TAF...,NM_001519.4,ADAR,11551,604902,0.0,19.0,15.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
17668,TNFRSF21,BM-018|CD358|DR6,NM_014452.5,UNK,13469,605732,0.0,4.0,7.0,0.0,0.0,0.0
17669,C12orf4,MRT66,NM_020374.4,AR,1184,616082,0.0,7.0,4.0,0.0,0.0,0.0
17670,DMBX1,Atx|MBX|OTX3|PAXB,NM_147192.3,UNK,19026,607410,0.0,1.0,5.0,0.0,0.0,0.0
17671,WDR93,C1d-87|CFAP297|FAP297,NM_020212.2,UNK,26924,,0.0,1.0,1.0,0.0,0.0,0.0


In [None]:
genocode_file_hg19 = '/resources/GENCODE_Basic_Annotations/gencode.v44lift37.basic.annotation.gtf.gz'
genocode_db_hg19 = '/resources/GENCODE_Basic_Annotations/gencode.v44lift37.basic.annotation.db.bak'

db = gffutils.FeatureDB(genocode_db_hg19)

In [None]:
import re

@dataclass
class GencodeInfo:
    gene_name: str
    hgnc: str
    ensg: str
    enst: str
    ensg_full: str
    enst_full: str
    strand: str

def __anno_gencode_info(row) -> str:
    query_region: str = f"chr{row['CHROM']}:{row['POS']}-{row['POS']}"
    # query_region: str = f"chr17:1132706-1132706"
    fetched_data = db.region(region=query_region, 
                            featuretype='gene',
                            completely_within=False)
    
    result = []
    while 1:
        try:
            data = next(fetched_data)
        except StopIteration:
            break
        else:
            strand = data.strand
            gene_name = data.attributes['gene_name'][0]
            
            try:
                ensg_full = data.attributes['gene_id'][0]
            except KeyError:
                ensg = '.'
            else:
                ensg = re.match(r'ENSG\d+', ensg_full).group()
    
    # print(result)
    return result

In [None]:
import re

@dataclass
class GencodeInfo:
    gene_name: str
    hgnc: str
    ensg: str
    enst: str
    ensg_full: str
    enst_full: str
    strand: str

def __anno_gencode_info(row) -> str:
    query_region: str = f"chr{row['CHROM']}:{row['POS']}-{row['POS']}"
    # query_region: str = f"chr17:1132706-1132706"
    fetched_data = db.region(region=query_region, 
                            featuretype='gene',
                            completely_within=False)
    
    result = []
    while 1:
        try:
            data = next(fetched_data)
        except StopIteration:
            break
        else:
            strand = data.strand
            gene_name = data.attributes['gene_name'][0]

            try:
                hgnc_info = data.attributes['hgnc_id'][0]
            except KeyError:
                hgnc = '.'
            else:
                hgnc = re.search(r'\d+', hgnc_info).group()
            
            try:
                ensg_full = data.attributes['gene_id'][0]
            except KeyError:
                ensg = '.'
            else:
                ensg = re.match(r'ENSG\d+', ensg_full).group()
            
            try:
                enst_full = data.attributes['transcript_id'][0]
            except KeyError:
                enst = '.'
            else:
                enst = re.match(r'ENST\d+', enst_full).group()

            genecode_info = GencodeInfo(
                gene_name, hgnc, ensg, enst, ensg_full, enst_full, strand)
            result.append(genecode_info)
    
    # print(result)
    return result
        


In [None]:
query_region: str = f"chr17:1132706-1132706"
fetched_data = db.region(region=query_region, 
                            featuretype='transcript',
                            completely_within=False)
    
result = []
while 1:
    try:
        data = next(fetched_data)
    except StopIteration:
        break
    else:
        strand = data.strand
        gene_name = data.attributes['gene_name'][0]

        try:
            hgnc_info = data.attributes['hgnc_id'][0]
        except KeyError:
            hgnc = '.'
        else:
            hgnc = re.search(r'\d+', hgnc_info).group()
        
        try:
            ensg_full = data.attributes['gene_id'][0]
        except KeyError:
            ensg = '.'
        else:
            ensg = re.match(r'ENSG\d+', ensg_full).group()
        
        try:
            enst_full = data.attributes['transcript_id'][0]
        except KeyError:
            enst = '.'
        else:
            enst = re.match(r'ENST\d+', enst_full).group()

        genecode_info = GencodeInfo(
            gene_name, hgnc, ensg, enst, ensg_full, enst_fuœll, strand)
        result.append(genecode_info)

In [None]:
query_region: str = f"chr17:1132706-1132706"
fetched_gene = db.region(region=query_region, 
                         featuretype=['gene', 'transcript'],
                         completely_within=False)


In [None]:
for data in fetched_gene:
    print(data)

chr17	HAVANA	gene	906759	1133032	.	-	.	gene_id "ENSG00000159842.16_14"; gene_type "protein_coding"; gene_name "ABR"; level "1"; hgnc_id "HGNC:81"; tag "ncRNA_host"; havana_gene "OTTHUMG00000090313.18_14"; remap_status "full_contig"; remap_num_mappings "1"; remap_target_status "overlap";


In [None]:
gtf = db.children('', featuretype='transcript', order_by='start')

In [None]:
for g in gtf:
    print(g)
    print(g.attributes['tag'])

In [None]:

df['gencode'] = df.progress_apply(__anno_gencode_info, axis=1)

100%|██████████| 6443/6443 [01:25<00:00, 75.63it/s] 


In [None]:
for row in df.iterrows():
    print(row[0], row[1]['Gene.refGene'], row[1]['gencode'])

In [None]:
import re

query_region: str = f'chr1: 69610-69610'
rows = db.region(region=query_region, featuretype='transcript')

for row in rows:
    print(row)

    enst_full = row.attributes['transcript_id'][0]
    ensg_full = row.attributes['gene_id'][0]
    hgnc_info = row.attributes['hgnc_id'][0]

    try:
        enst = re.match(r'ENST\d+', enst_full).group()
    except AttributeError:
        enst = '.'
    try:
        ensg = re.match(r'ENSG\d+', ensg_full).group()
    except AttributeError:
        ensg = '.'
    try:
        hgnc = re.search(r'\d+', hgnc_info).group()
    except AttributeError:
        hgnc = '.'

    print(f"Strand: {row.strand}")
    print(f"GeneName : {row.attributes['gene_name'][0]}")
    print(f"HGNC_ID  : {hgnc}")
    print(f"ENSG_Full: {ensg_full}")
    print(f"ENSG     : {ensg}")
    print(f"ENST_Full: {enst_full}")
    print(f"ENST     : {enst}")

chr1	HAVANA	transcript	65419	71585	.	+	.	gene_id "ENSG00000186092.7_9"; transcript_id "ENST00000641515.2_5"; gene_type "protein_coding"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_name "OR4F5-201"; level "2"; protein_id "ENSP00000493376.2"; hgnc_id "HGNC:14825"; tag "RNA_Seq_supported_partial"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS30547.2"; havana_gene "OTTHUMG00000001094.4_9"; havana_transcript "OTTHUMT00000003223.4_5"; remap_num_mappings "1"; remap_status "full_contig"; remap_target_status "new";
Strand: +
ENSG_Full: ENSG00000186092.7_9
ENSG     : ENSG00000186092
ENST_Full: ENST00000641515.2_5
ENST     : ENST00000641515
GeneName : OR4F5
HGNC_ID  : 14825


In [None]:
#-----   STEP 5. Pre-processing
preprocessing = PreProcessExomeSummary(
    df=df, mode_samples_info=mode_samples_info)
df = preprocessing.all_pre_processing()

#-----   STEP 6. Additional annotations
anno = Anno(df=df, args=args)
df = anno.anno_scores()

#-----   STEP 7. Filtering
maffilter = MafFilter(
    df=df, mode_samples_info=mode_samples_info, config=config)
df = maffilter.all_filtering()

typefilter = TypeFilter(df=df)
df = typefilter.exclude_hlamuc_and_exonicsyno()

gtfilter = GtFilter(
    df=df, mode_samples_info=mode_samples_info)
dfs = gtfilter.genotypeing_filter()

#-----   STEP 9. Output as an Excel 
def df_to_excel(dfs: dataclass, output_xlsx) -> None:
    sheet_names = ['AD', 'Homo', 'CH', 'XL']
    with pd.ExcelWriter(output_xlsx) as writer:
        for df, sheet_name in zip([dfs.AD, dfs.Hm, dfs.CH, dfs.XL], sheet_names):
            df.to_excel(writer, sheet_name=sheet_name, index=False)

output_xlsx = f"{output_file_path}.xlsx"
output_pickle = f"{output_file_path}.pkl"
df_to_excel(dfs, output_xlsx)
dfs.to_pickle(output_pickle)

#-----   STEP 8. Count variants of filtering process
countsummery_file = str(Path(output_file_path).parent) + '/CountSummary.xlsx'
counter_result = counter(dfs=dfs, output_excel=countsummery_file)
print(counter_result)


In [None]:
dfs.AD.columns

Index(['InHouse_absent_FILTER', 'InHouse_1%_FILTER', 'MAF_0.1%_FILTER',
       'MAF_1%_FILTER', 'HLAMUC_FILTER', 'ExonicSyno_FILTER', 'GT_FILTER'],
      dtype='object')

In [None]:
print(output_file_path)

/work/Github/TestData/trio/29881/Sample_29881-trio_results/Sample_29881-trio.tsv


In [None]:
#-----   STEP 9. Output as an Excel 
def df_to_excel(dfs: dataclass, output_xlsx) -> None:
    sheet_names = ['AD', 'Homo', 'CH', 'XL']
    with pd.ExcelWriter(output_xlsx) as writer:
        for df, sheet_name in zip([dfs.AD, dfs.Hm, dfs.CH, dfs.XL], sheet_names):
            df.to_excel(writer, sheet_name=sheet_name, index=False)

output_xlsx = f"{output_file_path}.xlsx"
df_to_excel(dfs, output_xlsx)

In [None]:
hgmd_resource: str = [
    str(x) for x in self.resources_dir.glob('HGMD/HGMD_gene_based.tsv.gz')][0]
        return pd.read_csv(hgmd_resource, sep='\t')

'Sample_32741-proband_countsummary.txt'

In [None]:
couter_result = counter(dfs=dfs, output_excel=output_file_path)


In [None]:
df = counter(dfs=dfs, output_excel='./test.xlsx')

In [None]:
df.to_pickle('./post7.pkl')

In [None]:
df = pd.read_pickle('./post7.pkl')

In [None]:
df = df.head(100)

In [None]:
#-----   STEP 8. Output as an Excel 
output_xlsx = './head100.xlsx'
def df_to_excel(dfs: dataclass, output_xlsx) -> None:
    sheet_names = ['AD', 'Homo', 'CH', 'XL']
    with pd.ExcelWriter(output_xlsx) as writer:
        for df, sheet_name in zip([dfs.AD, dfs.Hm, dfs.CH, dfs.XL], sheet_names):
            df.to_excel(writer, sheet_name=sheet_name, index=False)

df_to_excel(dfs, 'output_xlsx')

In [None]:
#----   STEP 9. Insert hyperlinks
# 0. 最初にエクセルを読み込む
# 1. メモ用の列を作る (1-2列目)
# 2. リンク挿入用の列を作る (3-7列目)
# 3. リンクの挿入

In [None]:
from libs.excelibs.excel_format import ExcelFormat
from libs.excelibs.hyperlinks import HyperLinks

excelformat = ExcelFormat('./test1.xlsx')

Formatng ......


In [None]:
excelformat.insert_comment_cols()

In [None]:
excelformat.insert_hyperlink_cols()

In [None]:
excelformat.workbook.save('./test2.xlsx')

##### CREATE dbs

In [None]:
## Create db for gffutils
import gffutils
import gffutils.pybedtools_integration


genocode_file_hg19 = '/resources/GENCODE_Basic_Annotations/gencode.v44lift37.basic.annotation.gtf.gz'
genocode_db_hg19 = '/resources/GENCODE_Basic_Annotations/gencode.v44lift37.basic.annotation.db'


In [None]:
db = gffutils.create_db(data=genocode_file_hg19, dbfn=genocode_db_hg19, 
                        disable_infer_genes=True,
                        disable_infer_transcripts=True,
                        keep_order=True, 
                        force=True)

In [None]:
genocode_db_intron_hg19 = '/resources/GENCODE_Basic_Annotations/gencode.v44lift37.basic.annotation.intron.db'

db = gffutils.FeatureDB(genocode_db_hg19)
introns = db.create_introns(exon_featuretype='exon', 
                            new_featuretype='intron', 
                            merge_attributes=True, 
                            numeric_sort=True)
pybed = gffutils.pybedtools_integration.to_bedtool(introns)
pybed.saveas(genocode_db_intron_hg19)

<BedTool(/resources/GENCODE_Basic_Annotations/gencode.v44lift37.basic.annotation.intron.db)>

In [None]:
def generate_intoron_gtf(db: gffutils.FeatureDB, output: str) -> None:
    introns = db.create_introns(exon_featuretype='exon', 
                                new_featuretype='intron', 
                                merge_attributes=True, 
                                numeric_sort=True)
    pybed = gffutils.pybedtools_integration.to_bedtool(introns)
    pybed.saveas(output)
    
    return