In [1]:
import os
import pandas as pd
import numpy as np

from pyliftover import LiftOver
from pybedtools import BedTool
import altair as alt

from utils.inter_giggle import __extract_results, __extract_conEnDep, __overlap_gene_num

In [2]:
# Gene metadata table; not referenced by the other cells visible in this notebook.
meta = pd.read_table('./db/meta_genes.tsv')

# Locations of inputs/outputs, relative to the notebook's working directory.
giggle_dir = "./ReMap2022_Giggle"
query_dir = "./query"
remap2022_dir = "./db/GM12878/ReMap2022_NonRedun"

# The 'Gene Name' column of the Homo_sapiens table, exposed as a Series named
# 'Gene_Name' (presumably the list of human transcription factors - confirm).
human_TFs = (
    pd.read_table('./db/Homo_sapiens.tsv')
    .rename(columns={'Gene Name': 'Gene_Name'})
    .loc[:, 'Gene_Name']
)

# Column names handed to __overlap_gene_num together with the BED inputs.
bedcols = ['chr', 'start', 'end', 'Gene_Name']


### LOAD RESULTS DIRECTORY

In [3]:
# Escape genes (TSS windows) vs. GC-matched chrX background.
escape_bed = os.path.join(query_dir, 'hg38_escape_55TSS.bed.gz')
escapeVSchrX_resdir = os.path.join(giggle_dir, "TSS_escapeVSgcmatchedchrX_NRuniq/results")

# 55 is presumably the number of query (escape) genes - confirm against
# utils.inter_giggle.__extract_results.
(
    escape_gigresults,
    df_escape_enrichTFs,
    df_escape_depleteTFs,
) = __extract_results(escapeVSchrX_resdir, 55, human_TFs)

# 10 is presumably a minimum-count cutoff for "consistently" enriched/depleted
# TFs - confirm against utils.inter_giggle.__extract_conEnDep.
(
    escape_conEnrichTFs,
    escape_conDepleteTFs,
) = __extract_conEnDep(df_escape_enrichTFs, df_escape_depleteTFs, 10)

In [4]:
# Subject genes vs. GC-matched chrX background.
# NOTE(review): this path is identical to the escape-vs-chrX results directory
# ("TSS_escapeVSgcmatchedchrX_NRuniq/results"). That looks like a copy-paste
# slip - confirm whether a subject-specific results directory was intended.
subjectVSchrX_resdir = os.path.join(giggle_dir, "TSS_escapeVSgcmatchedchrX_NRuniq/results")

# 462 is presumably the number of subject genes in the query set - confirm.
(
    subject_gigresults,
    df_subject_enrichTFs,
    df_subject_depleteTFs,
) = __extract_results(subjectVSchrX_resdir, 462, human_TFs)

(
    subject_conEnrichTFs,
    subject_conDepleteTFs,
) = __extract_conEnDep(df_subject_enrichTFs, df_subject_depleteTFs, 10)

In [5]:
# Escape genes (upstream shell windows) vs. GC-matched chrX background.
escape_upshell_bed = os.path.join(query_dir, 'hg38_escape_55UpShell.bed.gz')
upshell_escapeVSchrX_resdir = os.path.join(giggle_dir, "UPSHELL_escapeVSgcmatchedchrX_NRuniq/results")

# Same call pattern as the TSS cell: results directory, query size (55), TF list.
(
    escapeUP_gigresults,
    df_escapeUP_enrichTFs,
    df_escapeUP_depleteTFs,
) = __extract_results(upshell_escapeVSchrX_resdir, 55, human_TFs)

(
    escapeUP_conEnrichTFs,
    escapeUP_conDepleteTFs,
) = __extract_conEnDep(df_escapeUP_enrichTFs, df_escapeUP_depleteTFs, 10)

In [6]:
# Escape genes (downstream shell windows) vs. GC-matched chrX background.
escape_downshell_bed = os.path.join(query_dir, 'hg38_escape_55DownShell.bed.gz')
downshell_escapeVSchrX_resdir = os.path.join(giggle_dir, "DOWNSHELL_escapeVSgcmatchedchrX_NRuniq/results")

# Same call pattern as the TSS cell: results directory, query size (55), TF list.
(
    escapeDOWN_gigresults,
    df_escapeDOWN_enrichTFs,
    df_escapeDOWN_depleteTFs,
) = __extract_results(downshell_escapeVSchrX_resdir, 55, human_TFs)

(
    escapeDOWN_conEnrichTFs,
    escapeDOWN_conDepleteTFs,
) = __extract_conEnDep(df_escapeDOWN_enrichTFs, df_escapeDOWN_depleteTFs, 10)

### DETERMINE NUMBER OF OVERLAPPING GENES

In [19]:
# Output directories: `overlap_dir` is handed to __overlap_gene_num and
# `summary_dir` holds the per-region summary TSV files.
overlap_dir = "./ReMap2022_Overlap/"
summary_dir = "./summary/"

# exist_ok=True creates the directory only when it is missing, so the cell is
# idempotent without a separate (racy) isdir() check.
for out_dir in (overlap_dir, summary_dir):
    os.makedirs(out_dir, exist_ok=True)

In [8]:
# TSS windows: overlap of the escape query and of the background with ReMap2022.
(
    escapeoverlapdir,
    escapeoverlap,
    bgoverlapdir,
    bgoverlap,
) = __overlap_gene_num(
    overlap_dir,
    "ReMap2022_Escape",
    escape_bed,
    "ReMap2022_BG",
    escapeVSchrX_resdir,
    bedcols,
    remap2022_dir,
)

# Upstream shell: same call, with shell=True and the TSS BED passed as tss_file.
(
    escapeUPoverlapdir,
    escapeUPoverlap,
    bgUPoverlapdir,
    bgUPoverlap,
) = __overlap_gene_num(
    overlap_dir,
    "ReMap2022_EscapeUP",
    escape_upshell_bed,
    "ReMap2022_BGUP",
    upshell_escapeVSchrX_resdir,
    bedcols,
    remap2022_dir,
    shell=True,
    tss_file=escape_bed,
)

# Downstream shell: same call, with shell=True and the TSS BED passed as tss_file.
(
    escapeDOWNoverlapdir,
    escapeDOWNoverlap,
    bgDOWNoverlapdir,
    bgDOWNoverlap,
) = __overlap_gene_num(
    overlap_dir,
    "ReMap2022_EscapeDOWN",
    escape_downshell_bed,
    "ReMap2022_BGDOWN",
    downshell_escapeVSchrX_resdir,
    bedcols,
    remap2022_dir,
    shell=True,
    tss_file=escape_bed,
)

### SUMMARY

In [142]:
# Columns taken from the enrichment tables when building the summaries.
summary_cols = ['TF', 'counts']

# One name per entry in the escape overlap directory, in sorted order, with
# every ".bed.gz" occurrence stripped from the file name.
overlap_files = sorted(os.listdir(escapeoverlapdir))
remap2022_TFs = [fname.replace(".bed.gz", "") for fname in overlap_files]

In [156]:
def _load_or_build_summary(overlap, enrich_df, con_enrich_df, region, cache_name):
    """Return the per-TF summary table for one region, using a TSV cache.

    If ``summary_dir/cache_name`` exists it is read and returned as-is (this is
    what the notebook did before). If it is missing, the table is rebuilt from
    the in-memory results and written to the cache first, so a fresh checkout
    no longer fails with FileNotFoundError. Delete the cache file to force a
    rebuild after the upstream results change.

    Parameters
    ----------
    overlap : DataFrame
        Overlap table returned by __overlap_gene_num; must share a key column
        with the TF list and provide 'query_overlap'.
    enrich_df : DataFrame
        Enrichment table from __extract_results; must contain `summary_cols`.
    con_enrich_df : DataFrame
        Table from __extract_conEnDep; its 'TF' column defines `enriched`.
    region : str
        Label stored in the 'region' column ('TSS', 'UP' or 'DOWN').
    cache_name : str
        File name of the TSV cache inside `summary_dir`.

    Returns
    -------
    DataFrame
        The summary as read back from the TSV cache, so dtypes are the same
        whether the cache was just written or already existed.
    """
    cache_path = os.path.join(summary_dir, cache_name)
    if not os.path.isfile(cache_path):
        summary = (
            pd.DataFrame(data={'TF': remap2022_TFs})
            .merge(overlap, how='left')
            .merge(enrich_df[summary_cols], how='left')
            .fillna(0)
            .query('query_overlap > 0')
            # assign() returns a new frame, avoiding SettingWithCopyWarning
            # on the filtered result.
            .assign(
                region=region,
                enriched=lambda frame: frame['TF'].isin(con_enrich_df['TF']),
            )
        )
        os.makedirs(summary_dir, exist_ok=True)
        summary.to_csv(cache_path, sep="\t", index=False)
    return pd.read_table(cache_path, sep="\t")


tss_summary = _load_or_build_summary(
    escapeoverlap, df_escape_enrichTFs, escape_conEnrichTFs,
    'TSS', 'tss_overlapescape_2022.tsv',
)

up_summary = _load_or_build_summary(
    escapeUPoverlap, df_escapeUP_enrichTFs, escapeUP_conEnrichTFs,
    'UP', 'upshell_overlapescape_2022.tsv',
)

down_summary = _load_or_build_summary(
    escapeDOWNoverlap, df_escapeDOWN_enrichTFs, escapeDOWN_conEnrichTFs,
    'DOWN', 'downshell_overlapescape_2022.tsv',
)


In [161]:
# Denominators for the percentage columns.
N_ESCAPE_GENES = 55  # presumably the number of escape genes in the query set (cf. '..._55TSS.bed.gz') - confirm
N_ITERATIONS = 20    # presumably the number of background iterations that 'counts' is out of - confirm

# Stack the three regions into one long table; any missing value becomes -0.01.
long_sum = pd.concat([tss_summary, up_summary, down_summary]).fillna(-0.01)
long_sum['qoverlap_percent'] = long_sum['query_overlap'] / N_ESCAPE_GENES * 100
long_sum['iter_per'] = long_sum['counts'] / N_ITERATIONS * 100

# 'Threshold Stringent': TSS rows with >=20% overlap and counts >= 15.
# Everything else carries the stringified `enriched` flag, with "True"
# relabelled as "Threshold Met". Vectorised replacement for the former
# iterrows() loops; np.where results are assigned positionally, so the
# duplicated index produced by concat is not an issue.
is_stringent = (
    (long_sum['qoverlap_percent'] >= 20)
    & (long_sum['counts'] >= 15)
    & (long_sum['region'] == "TSS")
).to_numpy()
enriched_label = long_sum['enriched'].astype(str).replace("True", "Threshold Met")
long_sum['category'] = np.where(is_stringent, 'Threshold Stringent', enriched_label)

# Colour category for the plots: rows at >=75% of iterations get their own
# label, and remaining "Threshold Met" rows are shown as ">=50% iterations".
long_sum['iter_cat'] = np.where(
    (long_sum['iter_per'] >= 75).to_numpy(),
    '>=75% iterations',
    long_sum['category'],
)
long_sum['iter_cat'] = long_sum['iter_cat'].str.replace('Threshold Met', '>=50% iterations', regex=False)
long_sum

Unnamed: 0,TF,query_overlap,counts,region,enriched,qoverlap_percent,iter_per,category,iter_cat
0,ARID3A,26,7.0,TSS,False,47.272727,35.0,False,False
1,ARNT,9,2.0,TSS,False,16.363636,10.0,False,False
2,ASH2L,25,0.0,TSS,False,45.454545,0.0,False,False
3,ATF2,8,14.0,TSS,True,14.545455,70.0,Threshold Met,>=50% iterations
4,ATF3,8,1.0,TSS,False,14.545455,5.0,False,False
...,...,...,...,...,...,...,...,...,...
102,ZNF24,5,2.0,DOWN,False,9.090909,10.0,False,False
103,ZNF384,1,0.0,DOWN,False,1.818182,0.0,False,False
104,ZNF592,3,4.0,DOWN,False,5.454545,20.0,False,False
105,ZNF597,1,0.0,DOWN,False,1.818182,0.0,False,False


In [165]:
# Unique TFs whose `enriched` flag is True in at least one row of long_sum
# (any region), in order of first appearance.
is_enriched = (long_sum['enriched'] == True).to_numpy()  # noqa: E712 - explicit comparison kept on purpose
enriched_escaperegions = long_sum.loc[is_enriched, 'TF'].unique().tolist()

In [181]:
# Rows for the TSS panel: TFs enriched in any region, restricted to region "TSS".
tss_data = long_sum.query('TF in @enriched_escaperegions & region == "TSS"')

# Shared x-axis order: TFs by decreasing percent of escape genes overlapped.
tss_sortorder = tss_data.sort_values(['qoverlap_percent'], ascending=False)['TF'].tolist()

# Top panel: bar height = percent of escape genes overlapped by each TF.
tss_bar = alt.Chart(tss_data).mark_bar(color='dimgrey').encode(
    alt.X('TF', title=None, axis=alt.Axis(labels=False), sort=tss_sortorder),
    alt.Y('qoverlap_percent', title="percent of escape genes", scale=alt.Scale(domain=(0,50)), axis=alt.Axis(labelFontSize=25, titleFontSize=25)),
    ).properties(width=1500, height=350)

# Bottom panel: one circle per TF, sized by % of iterations and coloured by
# enrichment category.
tss_dot = tss_bar.mark_circle().encode(
    # Fixed: this used the undefined name `long_sum_tssorder`, which raised
    # NameError on a fresh kernel; both panels must share `tss_sortorder`.
    alt.X('TF', title=None, axis=alt.Axis(labelFontSize=25), sort=tss_sortorder),
    alt.Y('region', title=None, axis=alt.Axis(labelFontSize=25)),
    alt.Color('iter_cat', scale=alt.Scale(range=['dimgrey', 'darkviolet', 'green'], domain=['False', '>=50% iterations', '>=75% iterations']), title="Enrichment", 
        legend=alt.Legend(values=['>=75% iterations', '>=50% iterations'], labelFontSize=25, titleFontSize=28)),
    alt.Size('iter_per', title="% of iterations", scale=alt.Scale(range=[0, 1000]), legend=alt.Legend(values=[0, 25, 50, 75, 100], labelFontSize=25, titleFontSize=28)),
    alt.Tooltip('counts')
).properties(height=40)

tss_plot = alt.vconcat(tss_bar, tss_dot, title=' ').configure_title(fontSize=40).configure_legend(titleFontSize=20,labelFontSize=20,labelLimit=800, titleLimit=200)
tss_plot

In [182]:
# Rows for the upstream-shell panel: TFs enriched in any region, region "UP".
up_rows = long_sum.query('TF in @enriched_escaperegions & region == "UP"')

# Shared x-axis order: TFs by decreasing percent of escape genes overlapped.
up_sortorder = up_rows.sort_values(['qoverlap_percent'], ascending=False)['TF'].tolist()

# Colour encoding by enrichment category; only the two enriched levels are
# listed in the legend.
up_color = alt.Color(
    'iter_cat',
    scale=alt.Scale(
        domain=['False', '>=50% iterations', '>=75% iterations'],
        range=['dimgrey', 'darkviolet', 'green'],
    ),
    title="Enrichment",
    legend=alt.Legend(values=['>=75% iterations', '>=50% iterations'], labelFontSize=25, titleFontSize=28),
)

# Circle size encodes the percentage of iterations.
up_size = alt.Size(
    'iter_per',
    title="% of iterations",
    scale=alt.Scale(range=[0, 1000]),
    legend=alt.Legend(values=[0, 25, 50, 75, 100], labelFontSize=25, titleFontSize=28),
)

# Top panel: bars coloured by enrichment category.
up_bar = (
    alt.Chart(up_rows)
    .mark_bar()
    .encode(
        alt.X('TF', title=None, axis=alt.Axis(labels=False), sort=up_sortorder),
        alt.Y(
            'qoverlap_percent',
            title="percent of escape genes",
            scale=alt.Scale(domain=(0,50)),
            axis=alt.Axis(labelFontSize=20, titleFontSize=25),
        ),
        up_color,
    )
    .properties(width=1500, height=300)
)

# Bottom panel: derived from up_bar, so it keeps the colour encoding and
# overrides x/y while adding size and tooltip.
up_dot = (
    up_bar
    .mark_circle()
    .encode(
        alt.X('TF', title=None, axis=alt.Axis(labelFontSize=25), sort=up_sortorder),
        alt.Y('region', title=None, axis=alt.Axis(labelFontSize=25)),
        up_size,
        alt.Tooltip('counts'),
    )
    .properties(height=40)
)

up_plot = (
    alt.vconcat(up_bar, up_dot)
    .configure_title(fontSize=40)
    .configure_legend(titleFontSize=20, labelFontSize=20)
)
up_plot

In [187]:
# Rows for the downstream-shell panel: TFs enriched in any region, region "DOWN".
down_rows = long_sum.query('TF in @enriched_escaperegions & region == "DOWN"')

# Shared x-axis order: TFs by decreasing percent of escape genes overlapped.
down_sortorder = down_rows.sort_values(['qoverlap_percent'], ascending=False)['TF'].tolist()

# Colour encoding by enrichment category; only the two enriched levels are
# listed in the legend.
down_color = alt.Color(
    'iter_cat',
    scale=alt.Scale(
        domain=['False', '>=50% iterations', '>=75% iterations'],
        range=['dimgrey', 'darkviolet', 'green'],
    ),
    title="Enrichment",
    legend=alt.Legend(values=['>=75% iterations', '>=50% iterations'], labelFontSize=25, titleFontSize=28),
)

# Circle size encodes the percentage of iterations.
down_size = alt.Size(
    'iter_per',
    title="% of iterations",
    scale=alt.Scale(range=[0, 1000]),
    legend=alt.Legend(values=[0, 25, 50, 75, 100], labelFontSize=25, titleFontSize=28),
)

# Top panel: bars coloured by enrichment category.
down_bar = (
    alt.Chart(down_rows)
    .mark_bar()
    .encode(
        alt.X('TF', title=None, axis=alt.Axis(labels=False), sort=down_sortorder),
        alt.Y(
            'qoverlap_percent',
            title="percent of escape genes",
            scale=alt.Scale(domain=(0,50)),
            axis=alt.Axis(labelFontSize=20, titleFontSize=25),
        ),
        down_color,
    )
    .properties(width=1500, height=300)
)

# Bottom panel: derived from down_bar, so it keeps the colour encoding and
# overrides x/y while adding size and tooltip.
down_dot = (
    down_bar
    .mark_circle()
    .encode(
        alt.X('TF', title=None, axis=alt.Axis(labelFontSize=25), sort=down_sortorder),
        alt.Y('region', title=None, axis=alt.Axis(labelFontSize=25)),
        down_size,
        alt.Tooltip('counts'),
    )
    .properties(height=40)
)

down_plot = (
    alt.vconcat(down_bar, down_dot)
    .configure_title(fontSize=40)
    .configure_legend(titleFontSize=20, labelFontSize=20)
)
down_plot