In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import scipy.stats as stats
import statsmodels.api as sm
from scipy.stats import mannwhitneyu
from scipy.stats import pearsonr, spearmanr

# Let pandas display up to 999 columns / rows before truncating the output.
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999
import warnings
import re
# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation and SettingWithCopy warnings raised by later
# cells. Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [37]:
# Load the combined per-gene on/off table and expose the initiator effect
# under a shorter column name.
df = pd.read_csv("genes/combined-gene-on-off.csv")
df["inr_effect"] = df["Sum of initiator effect"]

# For each motif, add a combined score column: the sum of the "<motif> +"
# and "<motif> -" columns.
for motif in ("NRF1", "ZNF143", "NFY", "ETS", "CREB", "SP"):
    plus_col = f"{motif} +"
    minus_col = f"{motif} -"
    df[motif] = df[plus_col] + df[minus_col]

# df_motif is a second name for the same DataFrame object (not a copy), so
# changes made through either name are visible through both.
df_motif = df
(df.shape, df_motif.shape)

((83, 53), (83, 53))

In [38]:
# Preview the first row to check the available columns, including the
# derived inr_effect and combined motif columns.
df.head(1)

Unnamed: 0,gene,off-period,on-period,off-mean,on-mean,K-off-rate,K-on-rate,off-median,on-median,off-period-counts,on-period-counts,TATA_group,inr_group,chr_x,strand_x,TSS,geneID,TATA +,YY1 +,SP +,SP -,ETS +,ETS -,NFY +,NFY -,CREB +,CREB -,NRF1 +,NRF1 -,ZNF143 +,ZNF143 -,U1 snRNP +,chr_y,start,end,strand_y,gene_id,transcript_id,distance2tss,Sum of initiator effect,celltype,strand,H3K27me3_tss,cluster,gene_type,comment,inr_effect,NRF1,ZNF143,NFY,ETS,CREB,SP
0,LUZP1,"[213, 55, 11, 40]","[7, 7, 2]",79.75,5.333333,0.1875,0.012539,47.5,7.0,4,3,without_TATA,without_inr,chr1,-,23178121.0,ENSG00000169641.9,0.009,0.011,-0.126,-0.16,-0.028,-0.07,0.045,0.065,0.498,0.435,-0.004,0.001,-0.001,-0.067,0.186,chr1,23084023.0,23177808.0,-,ENSG00000169641,,313.0,1.355976,HBEC,-,0.0,cluster0,simple,,1.355976,-0.003,-0.068,0.11,-0.098,0.933,-0.286


In [39]:
# Per-gene CREB table: keep the columns needed for the CREB analysis and
# only the rows that have a combined CREB score.
CREB_HIGH_THRESHOLD = 1  # scores strictly above this value are flagged "high"

creb_columns = [
    "gene", "celltype", "on-mean", "CREB +", "CREB -", "CREB",
    "on-period-counts", "on-period",
]

# Select rows and columns in a single .loc call and take an explicit copy.
# Adding columns to a filtered slice of `df` is the chained-assignment
# pattern pandas warns about (SettingWithCopyWarning); the copy makes
# df_gene an independent frame. The original row index of `df` is preserved.
df_gene = df.loc[df["CREB"].notnull(), creb_columns].copy()

# Boolean flags for plus-strand, minus-strand and combined CREB score.
df_gene["CREB + high"] = df_gene["CREB +"] > CREB_HIGH_THRESHOLD
df_gene["CREB - high"] = df_gene["CREB -"] > CREB_HIGH_THRESHOLD
df_gene["CREB high"] = df_gene["CREB"] > CREB_HIGH_THRESHOLD

In [40]:
# Rows whose plus-strand CREB score is flagged as high.
df_gene.loc[df_gene["CREB + high"]]

Unnamed: 0,gene,celltype,on-mean,CREB +,CREB -,CREB,on-period-counts,on-period,CREB + high,CREB - high,CREB high
20,EEF2,HBEC,14.611111,1.007,0.594,1.601,36,"[5, 12, 16, 6, 7, 4, 22, 38, 1, 15, 6, 8, 3, 9...",True,False,True
25,GNB1,HBEC,16.777778,1.248,0.116,1.364,54,"[21, 34, 12, 6, 15, 1, 7, 43, 50, 6, 20, 38, 8...",True,False,True
28,EIF4G1,HBEC,17.555556,1.066,0.461,1.527,9,"[55, 24, 36, 5, 6, 2, 16, 4, 10]",True,False,True
54,EEF2,H9D0,23.666667,1.007,0.594,1.601,9,"[72, 6, 11, 33, 5, 7, 19, 20, 40]",True,False,True
76,ILF3,H9D3,29.615385,1.017,0.535,1.552,13,"[57, 36, 1, 29, 33, 24, 24, 15, 15, 3, 83, 43,...",True,False,True


In [41]:
# Genes flagged high on the combined CREB score but not on the plus-strand
# score alone.
pos_gene_diff = set(df_gene.loc[df_gene["CREB high"], "gene"]).difference(
    df_gene.loc[df_gene["CREB + high"], "gene"]
)
pos_gene_diff

{'IVNS1ABP', 'PAX6', 'SEPTIN2', 'WDR4', 'ZNF146'}

In [42]:
# Genes flagged high on the combined CREB score but not on the minus-strand
# score alone.
gene_diff = set(df_gene.loc[df_gene["CREB high"], "gene"]).difference(
    df_gene.loc[df_gene["CREB - high"], "gene"]
)
gene_diff

{'EEF2', 'EIF4G1', 'GNB1', 'ILF3', 'IVNS1ABP', 'PAX6', 'SEPTIN2', 'ZNF146'}

In [43]:
# All rows (every cell type) for the genes collected in pos_gene_diff.
df_gene.loc[df_gene["gene"].isin(pos_gene_diff)]

Unnamed: 0,gene,celltype,on-mean,CREB +,CREB -,CREB,on-period-counts,on-period,CREB + high,CREB - high,CREB high
29,SEPTIN2,HBEC,18.0,0.885,0.861,1.746,29,"[92, 1, 3, 4, 9, 15, 8, 13, 7, 18, 2, 8, 3, 29...",False,False,True
43,IVNS1ABP,H9D0,32.428571,0.889,0.19,1.079,7,"[1, 11, 32, 43, 39, 59, 42]",False,False,True
49,ZNF146,H9D0,42.8,0.595,0.561,1.156,5,"[42, 5, 61, 5, 101]",False,False,True
69,WDR4,H9D3,11.25,0.064,1.059,1.123,4,"[24, 8, 3, 10]",False,True,True
77,IVNS1ABP,H9D3,5.333333,0.889,0.19,1.079,3,"[1, 12, 3]",False,False,True
82,PAX6,H9D3,5.471396,0.996,0.527,1.523,437,"[4, 8, 1, 8, 1, 1, 5, 9, 4, 2, 13, 14, 17, 10,...",False,False,True


In [None]:
# All rows (every cell type) for the genes collected in pos_gene_diff.
# NOTE(review): this repeats the expression of the cell executed just before
# it; possibly gene_diff was intended here - confirm or delete the cell.
df_gene.loc[df_gene["gene"].isin(pos_gene_diff)]

In [None]:
# Build a long-format table with one row per individual off-period
# (gene, celltype, off-period), then attach the remaining per-gene columns.
#
# Fixes relative to the previous version of this cell:
#   * the accumulator was created as `df_on` but appended to as `df_off`
#     (NameError on the first append);
#   * the loop read row['off-period'] from df_gene, which does not contain
#     that column (KeyError) - the strings are now read from `df`;
#   * DataFrame.append was removed in pandas 2.0, and growing a frame row by
#     row is quadratic - records are collected in a list and the frame is
#     built once.
MIN_OFF_PERIOD = 2  # off-periods shorter than this are discarded

# df_gene keeps the row index of `df`, so its index selects the same
# gene/celltype rows in `df`, where the raw 'off-period' strings live.
off_source = df.loc[df_gene.index, ["gene", "celltype", "off-period"]]

# Each 'off-period' cell is a string such as "[213, 55, 11, 40]"; pull out
# every integer. str() makes a missing value (NaN) yield no periods instead
# of raising inside re.findall.
off_records = [
    {"gene": gene, "celltype": celltype, "off-period": int(period)}
    for gene, celltype, raw_periods in off_source.itertuples(index=False, name=None)
    for period in re.findall(r'\d+', str(raw_periods))
    if int(period) >= MIN_OFF_PERIOD
]

# Passing `columns` keeps the expected schema even when no record qualifies.
df_off = pd.DataFrame(off_records, columns=["gene", "celltype", "off-period"])
df_off["off-period"] = df_off["off-period"].astype(int)

# Attach all other per-gene columns; the original list-valued 'off-period'
# string is dropped so it does not clash with the exploded integer column.
df_off = pd.merge(df_off, df.drop("off-period", axis=1), on=["gene", "celltype"])
df_off.shape