In [1]:
import pandas as pd
import numpy as np
import loompy as lp
import matplotlib.pyplot as plt
import scipy
import seaborn as sns
import scipy.stats

import os

# import hdbscan
import sklearn

## **Read in saved results files**

Read in files with saved DE-$\theta$ results for CBL-CNN1 and DUSP9-ETS2 conditions

**Genes with likely-repressed (by mult or super-mult strategy) $k$ in CBl/CNN1**

In [75]:
repress_gs = pd.read_csv('0607_norman_repress_genes_mult.csv')
repress_gs.head()

Unnamed: 0,Gene,Annot,BetaFC,Cond
0,CFH,mult,2.377362,CBL/CNN1
1,IVNS1ABP,mult,1.762381,CBL/CNN1
2,ZMYND11,supermult,-0.39058,CBL/CNN1
3,WDR74,supermult,1.983459,CBL/CNN1
4,PUS1,supermult,0.705497,CBL/CNN1


In [76]:
mult_gs = list(repress_gs['Gene'][repress_gs.Cond.isin(['CBL/CNN1'])])
for i in mult_gs:
    print(i)

CFH
IVNS1ABP
ZMYND11
WDR74
PUS1
RSRC2
CCNB1IP1
ATP2A3
TSEN54
STAT5A
CAVIN1
NME2
RUVBL2
CDC37
PRSS57
GPI
ATF5
HNRNPLL
HK2
SMARCB1
MRPL3
THOC7
ZNF445
USP46
RNF130
NDUFAF2
WASF1
NDUFAF4
FLOT1
LSM5
CHST12
PMS2
ARPC5L
FNBP1


In [77]:
s = ','.join(np.unique(mult_gs))
s

'ARPC5L,ATF5,ATP2A3,CAVIN1,CCNB1IP1,CDC37,CFH,CHST12,FLOT1,FNBP1,GPI,HK2,HNRNPLL,IVNS1ABP,LSM5,MRPL3,NDUFAF2,NDUFAF4,NME2,PMS2,PRSS57,PUS1,RNF130,RSRC2,RUVBL2,SMARCB1,STAT5A,THOC7,TSEN54,USP46,WASF1,WDR74,ZMYND11,ZNF445'

In [81]:
new_repr = repress_gs[repress_gs.Cond.isin(['CBL/CNN1'])]

In [82]:
repr_means = [np.mean(new_repr['BetaFC'][new_repr['Gene'].isin([i])]) for i in np.unique(mult_gs)]

repr_dict = dict(zip(np.unique(mult_gs),repr_means))

Use https://useast.ensembl.org/biomart/martview/1951423d0d221093fcafc3cdca942e65

Human genes (GRCh38.p14)

Attributes
- Gene name
- Chromosome/scaffold name
- Gene start (bp)
- Gene end (bp)

In [83]:
#Save in correct format

#chr, start, end, value
repr_annots = pd.read_csv('repress_gs_chr_start_end.txt')
repr_annots.head()

Unnamed: 0,Gene name,Chromosome/scaffold name,Gene start (bp),Gene end (bp)
0,CCNB1IP1,HG2526_HG2573_PATCH,514049,535993
1,ATP2A3,17,3923870,3964464
2,ARPC5L,9,124862130,124877733
3,CDC37,19,10391133,10420121
4,NDUFAF4,6,96889315,96897891


In [84]:
np.unique(repr_annots['Chromosome/scaffold name'])

array(['1', '10', '11', '12', '14', '17', '19', '2', '22', '3', '4', '5',
       '6', '7', '9', 'HG2526_HG2573_PATCH', 'HSCHR19_2_CTG3_1',
       'HSCHR22_1_CTG7', 'HSCHR6_MHC_APD_CTG1', 'HSCHR6_MHC_COX_CTG1',
       'HSCHR6_MHC_DBB_CTG1', 'HSCHR6_MHC_MANN_CTG1',
       'HSCHR6_MHC_MCF_CTG1', 'HSCHR6_MHC_QBL_CTG1',
       'HSCHR6_MHC_SSTO_CTG1'], dtype=object)

In [85]:
repr_annots = repr_annots[~repr_annots['Chromosome/scaffold name'].isin(['HG2526_HG2573_PATCH', 'HSCHR19_2_CTG3_1',
       'HSCHR22_1_CTG7', 'HSCHR6_MHC_APD_CTG1', 'HSCHR6_MHC_COX_CTG1',
       'HSCHR6_MHC_DBB_CTG1', 'HSCHR6_MHC_MANN_CTG1',
       'HSCHR6_MHC_MCF_CTG1', 'HSCHR6_MHC_QBL_CTG1',
       'HSCHR6_MHC_SSTO_CTG1'])]
repr_annots.head()

Unnamed: 0,Gene name,Chromosome/scaffold name,Gene start (bp),Gene end (bp)
1,ATP2A3,17,3923870,3964464
2,ARPC5L,9,124862130,124877733
3,CDC37,19,10391133,10420121
4,NDUFAF4,6,96889315,96897891
5,LSM5,7,32485338,32495283


In [123]:
repr_bar = pd.DataFrame()
repr_bar['chr'] = ['chr'+i for i in list(repr_annots['Chromosome/scaffold name'])]
repr_bar['start'] = list(repr_annots['Gene start (bp)'])
repr_bar['end'] = list(repr_annots['Gene end (bp)'])
repr_bar['value'] = [repr_dict[i] for i in list(repr_annots['Gene name'])]
repr_bar['color'] = [list(new_repr.Annot[new_repr.Gene.isin([i])])[0] for i in list(repr_annots['Gene name'])]
repr_bar.head()

Unnamed: 0,chr,start,end,value,color
0,chr17,3923870,3964464,0.841446,mult
1,chr9,124862130,124877733,1.625649,supermult
2,chr19,10391133,10420121,0.308934,supermult
3,chr6,96889315,96897891,1.275879,supermult
4,chr7,32485338,32495283,0.480737,supermult


In [87]:
#Save gene labels in correct format

#chr, start, end, label
repr_labs = pd.DataFrame()
repr_labs['chr'] = ['chr'+i for i in list(repr_annots['Chromosome/scaffold name'])]
repr_labs['start'] = list(repr_annots['Gene start (bp)'])
repr_labs['end'] = list(repr_annots['Gene end (bp)'])
repr_labs['label'] = list(repr_annots['Gene name'])
repr_labs.head()

Unnamed: 0,chr,start,end,label
0,chr17,3923870,3964464,ATP2A3
1,chr9,124862130,124877733,ARPC5L
2,chr19,10391133,10420121,CDC37
3,chr6,96889315,96897891,NDUFAF4
4,chr7,32485338,32495283,LSM5


In [124]:
repr_bar.to_csv('repress_barplot.csv',index=None)
repr_labs.to_csv('repress_gene_labels.csv',index=None)

3964464

**All genes with higher burst sizes in CBL/CNN1**

In [2]:
allguides = pd.read_csv('bFC_allguides_cblcnn1_dusp9ets2_norman.csv')
allguides.head()

Unnamed: 0,FC,Param,sgRNA,meanS,errs,Gene
0,0.032707,b,"['CBL', 'CNN1']",0.207317,0.268927,AL592183.1
1,0.021378,b,"['CBL', 'CNN1']",0.146341,0.186889,VPS13D
2,-0.143794,b,"['CBL', 'CNN1']",0.195122,0.264211,TARBP1
3,0.063379,b,"['CBL', 'CNN1']",1.715447,0.139588,HSPB11
4,0.693564,b,"['CBL', 'CNN1']",0.219512,0.309607,KLHL20


In [11]:
np.mean(allguides['FC'][allguides['Gene'].isin(['CBL','CNN1']) & allguides['sgRNA'].isin(["['CBL', 'CNN1']"]) & allguides['Param'].isin(['b'])])


2.980778963981756

In [12]:
np.mean(allguides['FC'][allguides['Gene'].isin(['DUSP9','ETS2']) & allguides['sgRNA'].isin(["['DUSP9', 'ETS2']"]) & allguides['Param'].isin(['b'])])


0.6943951374457078

In [14]:
allguides[allguides['Gene'].isin(['DUSP9','ETS2']) & allguides['sgRNA'].isin(["['DUSP9', 'ETS2']"])]

Unnamed: 0,FC,Param,sgRNA,meanS,errs,Gene
27988,0.595973,b,"['DUSP9', 'ETS2']",0.178233,0.222511,ETS2
30899,1.102949,b,"['DUSP9', 'ETS2']",0.178233,0.222511,ETS2
33810,0.384264,b,"['DUSP9', 'ETS2']",0.178233,0.222511,ETS2
36721,-2.046486,beta,"['DUSP9', 'ETS2']",0.178233,0.218436,ETS2
39632,-1.613499,beta,"['DUSP9', 'ETS2']",0.178233,0.218436,ETS2
42543,-2.205772,beta,"['DUSP9', 'ETS2']",0.178233,0.218436,ETS2
45454,-1.653086,gamma,"['DUSP9', 'ETS2']",0.178233,0.224914,ETS2
48365,-1.679085,gamma,"['DUSP9', 'ETS2']",0.178233,0.224914,ETS2
51276,-2.007366,gamma,"['DUSP9', 'ETS2']",0.178233,0.224914,ETS2


In [31]:
guides_filt = (allguides['FC']>2) & (allguides['sgRNA'].isin(["['CBL', 'CNN1']"])) & (allguides['Param'].isin(['b'])) & (allguides['meanS']>0)
cblcnn1_gs = list(allguides['Gene'][guides_filt])
cblcnn1_gs


['MKNK1',
 'NTRK1',
 'TFB2M',
 'TMEM9',
 'CHI3L2',
 'SIRT1',
 'MTPAP',
 'PTER',
 'OAT',
 'HBD',
 'AAMDC',
 'CCND2',
 'RBM19',
 'SOCS2-AS1',
 'RAB20',
 'DLST',
 'AC138649.1',
 'LPCAT2',
 'DHX8',
 'NME2',
 'ROCK1',
 'CDC37',
 'DDX49',
 'CNN1',
 'IL27RA',
 'AC016629.2',
 'LPIN1',
 'DPY30',
 'B3GNT2',
 'FEZ2',
 'RPL37A',
 'CDS2',
 'LTN1',
 'HMGXB4',
 'PARL',
 'ZNF445',
 'GATB',
 'USP46',
 'BRD8',
 'RUFY1',
 'PSPH',
 'RAB11FIP1',
 'FAM92A',
 'TBC1D13',
 'RPS6',
 'NUP188',
 'CD99',
 'CEBPB',
 'COL1A1',
 'FOXA1',
 'IER5L',
 'NANOG',
 'SLC4A1',
 'ZNF318',
 'SLC39A1',
 'NTRK1',
 'TMEM9',
 'GNAI3',
 'INPP5B',
 'TMEM35B',
 'OAT',
 'HBD',
 'UHRF1BP1L',
 'SOCS2-AS1',
 'RAB20',
 'DLST',
 'CHD8',
 'LPCAT2',
 'DHX8',
 'ROCK1',
 'HAUS5',
 'CNN1',
 'UPF1',
 'LRRC4B',
 'AC016629.2',
 'EIF2AK2',
 'DPY30',
 'B3GNT2',
 'WDR75',
 'RPL37A',
 'CDC25B',
 'DNTTIP1',
 'LTN1',
 'SENP2',
 'PPWD1',
 'RUFY1',
 'EEF1A1',
 'SNHG5',
 'SEM1',
 'RAB11FIP1',
 'FAM92A',
 'VPS13B-DT',
 'TBC1D13',
 'TMOD1',
 'RPS6',
 'NUP188'

In [63]:
new_allguides = allguides[guides_filt]

2.0688956933346

In [57]:
cbcn = [i for i in cblcnn1_gs if i not in mult_gs]
cbcn_str = ','.join(np.unique(cbcn))
cbcn_str

'AAMDC,AC016629.2,AC138649.1,ALAS2,AP000547.3,AZIN1,B3GNT2,BCL2L11,BPGM,BRD8,CBL,CCND2,CD99,CDC25B,CDC5L,CDK11B,CDS2,CEBPB,CEBPE,CHCHD2,CHD8,CHI3L2,CNN1,COL1A1,CSRNP1,DDX49,DHX8,DLST,DLX2,DNTTIP1,DPY30,EEF1A1,EEF2,EGR1,EIF2AK2,FAM92A,FEZ2,FOXA1,FOXO4,GATB,GNAI3,GORASP2,GTF2H3,HAGH,HAUS5,HBD,HMGXB4,IDH1,IER5L,IL27RA,INPP5B,KCTD3,KIAA1143,KRAS,LIG1,LPCAT2,LPIN1,LRRC4B,LTN1,LYL1,METTL9,MKNK1,MTPAP,NANOG,NTRK1,NUP188,OAT,PARL,PCMTD1,POLR3G,PPWD1,PRPF4,PSPH,PTER,QTRT1,RAB11FIP1,RAB20,RBM19,RIPK2,ROCK1,RPE,RPL37A,RPS6,RPS6KA1,RUFY1,SEM1,SENP2,SERINC1,SIRT1,SLC39A1,SLC4A1,SNHG5,SOCS2-AS1,STAC3,TBC1D13,TCAF1,TFB2M,TGFBR3,TMEM189,TMEM35B,TMEM9,TMOD1,TPGS2,TRAF3IP1,TRPM4,UHRF1BP1L,UPF1,VPS13A,VPS13B-DT,VPS35,WDR75,WRNIP1,Z98745.2,ZNF318'

In [65]:
bfc_means = [np.mean(new_allguides['FC'][new_allguides['Gene'].isin([i])]) for i in cbcn]

cbcn_dict = dict(zip(cbcn,bfc_means))

Use https://useast.ensembl.org/biomart/martview/1951423d0d221093fcafc3cdca942e65

Human genes (GRCh38.p14)

Attributes
- Gene name
- Chromosome/scaffold name
- Gene start (bp)
- Gene end (bp)

In [69]:
#Save in correct format

#chr, start, end, value
bFC_annots = pd.read_csv('bFC_cblcnn1_chr_start_end.txt')
bFC_annots.head()

Unnamed: 0,Gene name,Chromosome/scaffold name,Gene start (bp),Gene end (bp)
0,CD99,Y,2691187,2741309
1,RPS6KA1,HG2058_PATCH,17379,62648
2,KIAA1143,HG2066_PATCH,263011,286969
3,TPGS2,18,36777647,36829216
4,RAB20,13,110523066,110561722


In [70]:
np.unique(bFC_annots['Chromosome/scaffold name'])

array(['1', '10', '11', '12', '13', '14', '16', '17', '18', '19', '2',
       '20', '21', '22', '3', '4', '5', '6', '7', '8', '9', 'HG109_PATCH',
       'HG2058_PATCH', 'HG2066_PATCH', 'HG2219_PATCH', 'HG30_PATCH',
       'HG926_PATCH', 'X', 'Y'], dtype=object)

In [71]:
bFC_annots = bFC_annots[~bFC_annots['Chromosome/scaffold name'].isin(['HG109_PATCH',
       'HG2058_PATCH', 'HG2066_PATCH', 'HG2219_PATCH', 'HG30_PATCH',
       'HG926_PATCH'])]
bFC_annots.head()

Unnamed: 0,Gene name,Chromosome/scaffold name,Gene start (bp),Gene end (bp)
0,CD99,Y,2691187,2741309
3,TPGS2,18,36777647,36829216
4,RAB20,13,110523066,110561722
6,PCMTD1,8,51817575,51899186
7,TMOD1,9,97501180,97601743


In [72]:
cbcn_bar = pd.DataFrame()
cbcn_bar['chr'] = ['chr'+i for i in list(bFC_annots['Chromosome/scaffold name'])]
cbcn_bar['start'] = list(bFC_annots['Gene start (bp)'])
cbcn_bar['end'] = list(bFC_annots['Gene end (bp)'])
cbcn_bar['value'] = [cbcn_dict[i] for i in list(bFC_annots['Gene name'])]
cbcn_bar.head()

Unnamed: 0,chr,start,end,value
0,chrY,2691187,2741309,2.688198
1,chr18,36777647,36829216,2.828992
2,chr13,110523066,110561722,2.066996
3,chr8,51817575,51899186,2.037837
4,chr9,97501180,97601743,2.32195


In [73]:
#Save gene labels in correct format

#chr, start, end, label
cbcn_labs = pd.DataFrame()
cbcn_labs['chr'] = ['chr'+i for i in list(bFC_annots['Chromosome/scaffold name'])]
cbcn_labs['start'] = list(bFC_annots['Gene start (bp)'])
cbcn_labs['end'] = list(bFC_annots['Gene end (bp)'])
cbcn_labs['label'] = list(bFC_annots['Gene name'])
cbcn_labs.head()

Unnamed: 0,chr,start,end,label
0,chrY,2691187,2741309,CD99
1,chr18,36777647,36829216,TPGS2
2,chr13,110523066,110561722,RAB20
3,chr8,51817575,51899186,PCMTD1
4,chr9,97501180,97601743,TMOD1


In [116]:
cbcn_bar.to_csv('bFC_barplot.csv',index=None)
cbcn_labs.to_csv('bFC_gene_labels.csv',index=None)

**Use Grch38 chr file for circos**

In [47]:
!wget --content-disposition https://raw.githubusercontent.com/sehilyi/gemini-datasets/master/data/UCSC.HG38.Human.CytoBandIdeogram.csv

--2024-06-10 13:11:55--  https://raw.githubusercontent.com/sehilyi/gemini-datasets/master/data/UCSC.HG38.Human.CytoBandIdeogram.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30789 (30K) [text/plain]
Saving to: ‘UCSC.HG38.Human.CytoBandIdeogram.csv’


2024-06-10 13:11:55 (128 MB/s) - ‘UCSC.HG38.Human.CytoBandIdeogram.csv’ saved [30789/30789]



In [48]:
hg38 = pd.read_csv('UCSC.HG38.Human.CytoBandIdeogram.csv')
hg38.head()

Unnamed: 0,Chromosome,chromStart,chromEnd,Name,Stain
0,chr1,0,2300000,p36.33,gneg
1,chr1,2300000,5300000,p36.32,gpos25
2,chr1,5300000,7100000,p36.31,gneg
3,chr1,7100000,9100000,p36.23,gpos25
4,chr1,9100000,12500000,p36.22,gneg


In [49]:
np.unique(hg38.Chromosome)

array(['chr1', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
       'chr16', 'chr17', 'chr18', 'chr19', 'chr2', 'chr20', 'chr21',
       'chr22', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9',
       'chrX', 'chrY'], dtype=object)

In [103]:
#save in correct format
new_hg38 = pd.DataFrame()
new_hg38['chr'] = list(hg38.Chromosome)
new_hg38['start'] = [i+1 for i in list(hg38.chromStart)]
new_hg38['end'] = [i+1 for i in list(hg38.chromEnd)]
new_hg38['value1'] = list(hg38.Name)
new_hg38['value2'] = list(hg38.Stain)

new_hg38.to_csv('norman_chromosome_general.csv',index=None)
new_hg38.head()



Unnamed: 0,chr,start,end,value1,value2
0,chr1,1,2300001,p36.33,gneg
1,chr1,2300001,5300001,p36.32,gpos25
2,chr1,5300001,7100001,p36.31,gneg
3,chr1,7100001,9100001,p36.23,gpos25
4,chr1,9100001,12500001,p36.22,gneg


In [None]:
#For circos figures cite: https://venyao.xyz/shinycircos/
#Wang et al. iMeta. 2023 https://onlinelibrary.wiley.com/doi/full/10.1002/imt2.109

In [None]:
#Potential findings
#ALAS2 --> erythroid diff, near FOXO4 (TF) upreg, does it have Foxo4 binding sites?
#Several other TFs


#repression
#