In [None]:
import umap
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.figure_factory as ff
import gzip
import pickle
from random import sample

from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch

In [None]:
# Load the region / sample / TF index objects and the full region x TF matrix.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only point these paths at index files you produced or trust.
with gzip.open('../data/idx_files/regions_idx.pickle.gz', 'rb') as f:
    regions = pickle.load(f) #1817918

with gzip.open('../data/idx_files/samples_idx.pickle.gz', 'rb') as f:
    samples = pickle.load(f) #52

with gzip.open('../data/idx_files/tfs_idx.pickle.gz', 'rb') as f:
    tfs = pickle.load(f) #163

# Sort the labels by their stored value so that they can be used as the
# row / column labels of the matrix below (assumes the pickles map
# label -> integer position in the matrix -- TODO confirm).
tfs = pd.Series(tfs).sort_values()
regions = pd.Series(regions).sort_values()

# An .npz archive stays open until it is closed explicitly, so read it inside a
# context manager. As before, the last array stored in the archive is the one
# that is kept; the earlier ones are no longer decompressed for nothing.
with np.load("../data/matrices/matrix2d.ReMap+UniBind.full.npz") as data:
    matrix2d_full = data[data.files[-1]] #(1817918, 163)

df = pd.DataFrame(data=matrix2d_full, index=regions.index, columns=tfs.index)
df_transposed = df.T

In [None]:
# Keep only the TFs (rows) that have at least one non-zero entry across all regions.
tf_has_signal = (df_transposed != 0).any(axis=1)
X = df_transposed[tf_has_signal]

## UMAP plot

In [None]:
# UMAP is stochastic: without a fixed seed every run yields a different layout,
# so the figure below could not be reproduced after a kernel restart.
reducer = umap.UMAP(random_state=42)

# One 2-D point per TF (row of X).
embedding = reducer.fit_transform(X.to_numpy())

In [None]:
# Label the two embedding coordinates and index them by TF name for plotting.
umap_embedding = pd.DataFrame(
    embedding,
    index=X.index,
    columns=["x", "y"],
)

In [None]:
import plotly.express as px

# Scatter plot of the UMAP embedding, one labelled point per TF.
fig = px.scatter(umap_embedding, x="x", y="y", text=umap_embedding.index)

# Style the points and put the TF labels underneath them. No `selector` is
# used: because `text` is passed, Plotly Express creates the trace with mode
# 'markers+text', so the former selector=dict(mode='markers') matched no trace
# and the marker styling was silently ignored.
fig.update_traces(
    marker=dict(size=12, line=dict(width=2, color='DarkSlateGrey')),
    textposition="bottom center",
)

axis_title_font = dict(family='Arial', size=18, color='black')

fig.update_layout(
    title_text='',
    # `title=dict(text=..., font=...)` replaces the deprecated `titlefont`
    # axis attribute.
    xaxis=dict(title=dict(text='UMAP1', font=axis_title_font)),
    yaxis=dict(title=dict(text='UMAP2', font=axis_title_font)),
    # Transparent plot and paper backgrounds.
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    width=800,
    height=800,
    showlegend=False,
    uniformtext_minsize=8,
)

# Draw solid black axis lines.
fig.update_xaxes(showline=True, linewidth=2, linecolor='black')
fig.update_yaxes(showline=True, linewidth=2, linecolor='black')

fig.show()

## Heatmap of the TF-TF similarity matrix

In [None]:
# Load the "partial" region x TF matrix. The archive is read inside a context
# manager so its file handle is closed afterwards; as before, the last array
# stored in the archive is the one that is kept.
with np.load("../data/matrices/matrix2d.ReMap+UniBind.partial.npz") as data:
    matrix2d_partial = data[data.files[-1]] #(1817918, 163)

# Requires `regions` and `tfs` to still be the index Series built when the
# full matrix was loaded (`tfs` is re-bound further down in this notebook).
df_partial = pd.DataFrame(data=matrix2d_partial, index=regions.index, columns=tfs.index)

In [None]:
# Build a TF name -> class lookup from clusters.txt.
# Each non-comment line is whitespace-separated; the first field is the TF
# name (upper-cased here) and the last field is its class.
tf_classes = {}

with open("../data/clusters.txt", "r") as f:
    for line in f:
        if line.startswith("#"):
            continue

        line_parts = line.strip().split()
        if not line_parts:
            # Blank line (e.g. a trailing newline at the end of the file):
            # there is nothing to parse and indexing would raise IndexError.
            continue

        tf_classes[line_parts[0].upper()] = line_parts[-1]

tf_classes = pd.Series(tf_classes)

In [None]:
# Order the TFs by their class label (ascending is the default).
tf_classes = tf_classes.sort_values()

In [None]:
# Drop the TF columns that have no entry in `tf_classes`.
partial_columns = df_partial.columns
col_del = partial_columns[~partial_columns.isin(tf_classes.index)]
df_partial = df_partial.drop(columns=col_del)

# Reorder the remaining TF columns so that they are grouped by class.
tf_classes_df = tf_classes[list(df_partial)].sort_values(ascending=True)
df_partial = df_partial[tf_classes_df.index]

# One row per TF; keep only TFs with at least one non-zero value
# (NaN entries are counted as zero for this check).
df_partial_transposed = df_partial.T
row_has_signal = (df_partial_transposed.fillna(0) != 0).any(axis=1)
df_partial_transposed = df_partial_transposed[row_has_signal]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform

def cosine_without_nones(u, v):
    """Cosine similarity of ``u`` and ``v`` over positions observed in both.

    Positions where either vector is missing (NaN / None) are ignored.

    The computation is done with a NumPy mask instead of building,
    transposing and validating a DataFrame on every call; this function is
    used as a pairwise metric, so it runs once per pair of vectors.

    Parameters
    ----------
    u, v : array-like of equal length
        Numeric vectors; missing values are given as NaN or None.

    Returns
    -------
    float
        The cosine similarity over the shared observed positions. ``0.0`` if
        either restricted vector has zero norm, and ``nan`` if the two vectors
        share no observed position (the similarity is undefined in that case).
    """
    # dtype=float turns None into NaN, so both kinds of "missing" are handled.
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)

    observed = ~(np.isnan(u) | np.isnan(v))
    if not observed.any():
        return np.nan

    u_obs = u[observed]
    v_obs = v[observed]

    norm_product = np.linalg.norm(u_obs) * np.linalg.norm(v_obs)
    if norm_product == 0:
        # An all-zero vector has no direction; report "no similarity" rather
        # than dividing by zero.
        return 0.0

    return float(np.dot(u_obs, v_obs) / norm_product)

In [None]:
# Pairwise TF-TF cosine similarity: pdist returns the condensed (upper-triangle)
# vector, squareform expands it into a symmetric square matrix.
condensed_similarity = pdist(df_partial_transposed.to_numpy(), metric=cosine_without_nones)
dm = squareform(condensed_similarity)

In [None]:
# `dm` holds similarities, so every TF gets a self-similarity of 1 on the
# diagonal (modified in place).
np.fill_diagonal(dm, 1)

# Remove SMAD3 from the class table. errors="ignore" keeps this cell re-runnable:
# a second execution (or a data set without SMAD3) used to raise KeyError.
# NOTE(review): presumably SMAD3 is the TF filtered out of
# `df_partial_transposed` as an all-zero row -- confirm.
tf_classes_df = tf_classes_df.drop(labels=["SMAD3"], errors="ignore")

In [None]:
# Persist the TF-TF similarity matrix.
with open('../data/tf_cosine_matrix.pickle', 'wb') as out_file:
    pickle.dump(dm, out_file)

In [None]:
# Heatmap of the TF-TF similarity matrix; both axes carry the TF names.
tf_labels = df_partial_transposed.index
tick_font = dict(family='Arial', size=3, color='black')

fig = go.Figure(
    data=go.Heatmap(z=dm, x=tf_labels, y=tf_labels, hoverongaps=False)
)

fig.update_layout(
    title='',
    font=dict(family="Arial", size=14, color="black"),
    # Small tick labels for both axes.
    xaxis=dict(title='', tickfont=tick_font),
    yaxis=dict(title='', tickfont=tick_font),
    # Fixed 800 x 800 canvas.
    autosize=False,
    width=800,
    height=800,
)

fig.show()

## Finding the TFs for the TL experiment

### Looking for TFs with the same binding mode (BM)

In [None]:
# Load the "sparse" region x TF matrix. The archive is read inside a context
# manager so its file handle is closed afterwards; as before, the last array
# stored in the archive is the one that is kept.
with np.load("../data/matrices/matrix2d.ReMap+UniBind.sparse.npz") as data:
    matrix2d_sparse = data[data.files[-1]] #(1817918, 163)

# Requires `regions` and `tfs` to still be the index Series built when the
# full matrix was loaded (`tfs` is re-bound further down in this notebook).
df_sparse = pd.DataFrame(data=matrix2d_sparse, index=regions.index, columns=tfs.index)

In [None]:
# Keep only the TF columns with at least one non-zero value (NaN counts as zero).
tf_has_signal = (df_sparse.fillna(0) != 0).any(axis=0)
df_sparse = df_sparse.loc[:, tf_has_signal]

In [None]:
# Load the TF - JASPAR cluster relations: column 0 becomes the index,
# column 1 holds a comma-separated list of cluster ids.
tf_clust_table = pd.read_csv("../data/TF_clust_correspond.tsv", sep="\t", header=None)
tf_clust_corr = pd.Series(tf_clust_table[1].values, index=tf_clust_table[0].values)

# Invert the relation: cluster id -> list of TFs (columns of df_sparse) in it.
clusters_multi_modes = {}
for tf in df_sparse.columns:
    for cluster_id in tf_clust_corr[tf].split(","):
        clusters_multi_modes.setdefault(cluster_id, []).append(tf)

clusters_multi_modes = pd.Series(clusters_multi_modes)

In [None]:
def tf_sum(tf):
    """Return the sum of the `tf` column of `df_sparse`."""
    tf_column = df_sparse[tf]
    return tf_column.sum()


# Within every cluster, order the TFs by decreasing column total.
clusters_multi_modes_sorted = clusters_multi_modes.apply(
    lambda members: sorted(members, key=tf_sum, reverse=True)
)

In [None]:
# Cluster ids ordered from the cluster with the most TFs to the one with the fewest.
bm_sizes = clusters_multi_modes_sorted.apply(len)
bms = bm_sizes.sort_values(ascending=False).index.tolist()

In [None]:
# Build a TF name -> class lookup from clusters.txt.
# Each non-comment line is whitespace-separated; the first field is the TF
# name (upper-cased here) and the last field is its class.
tf_classes = {}

with open("../data/clusters.txt", "r") as f:
    for line in f:
        if line.startswith("#"):
            continue

        line_parts = line.strip().split()
        if not line_parts:
            # Blank line (e.g. a trailing newline at the end of the file):
            # there is nothing to parse and indexing would raise IndexError.
            continue

        tf_classes[line_parts[0].upper()] = line_parts[-1]

tf_classes = pd.Series(tf_classes)

tf_classes = tf_classes.sort_values(ascending=True)
# Restrict to the TFs that are columns of df_sparse (raises KeyError if one of
# them has no class entry).
tf_classes_df = tf_classes[list(df_sparse)].sort_values(ascending=True)

# Keep only the first two dot-separated components of each class label.
tf_classes = tf_classes_df.apply(lambda x: ".".join(x.split(".")[:2]))

In [None]:
# Walk the clusters from largest to smallest and take the top-ranked TF of each
# one, unless a TF with the same (truncated) class has already been selected.
tfs = {}

for bm in bms:
    candidate = clusters_multi_modes_sorted[bm][0]
    class_already_used = any(
        tf_classes[candidate] == tf_classes[chosen] for chosen in tfs.values()
    )
    if not class_already_used:
        tfs[bm] = candidate

tfs = pd.Series(tfs)

In [None]:
# Persist the per-cluster TF ranking ...
with open('../data/clusters_multi_modes_sorted.pickle', 'wb') as out_file:
    pickle.dump(clusters_multi_modes_sorted, out_file)

# ... and the TF -> cluster relation restricted to the TFs in df_sparse.
with open('../data/tf_clust_corr.pickle', 'wb') as out_file:
    pickle.dump(tf_clust_corr[list(df_sparse)], out_file)

### Looking for cofactors

In [None]:
TFs_to_analyze = ['MAX', 'JUND', 'SPI1', 'SP1', 'HNF4A', 'EGR1']

# Map every TF of the similarity matrix to its row position
# (all TFs are used here, not just TFs_to_analyze).
tf_labels = df_partial_transposed.index
tfs = pd.Series({tf: np.where(tf_labels == tf)[0][0] for tf in tf_labels})

In [None]:
# Reload the TF -> cluster relation written earlier in this notebook.
with open('../data/tf_clust_corr.pickle', 'rb') as in_file:
    tf_clust_corr = pickle.load(in_file)

In [None]:
# For every TF, pick the five most similar TFs that share no cluster with it.

# The split of the comma-separated cluster ids does not depend on the TF being
# processed, so it is computed once instead of once per loop iteration.
tf_clusters = tf_clust_corr.apply(lambda x: x.split(","))
tf_labels = df_partial_transposed.index

cofactors = {}
for tf in tfs.index:
    similarity = pd.Series(dm[tfs[tf], :], index=tf_labels).sort_values(ascending=False)

    # True for TFs whose clusters are disjoint from those of `tf`
    # (`tf` itself is therefore always excluded).
    own_clusters = tf_clusters[tf]
    unrelated = tf_clusters.apply(
        lambda clusters: not np.any(np.isin(clusters, own_clusters))
    )

    # The boolean mask is aligned on TF name, so it must cover every TF of the matrix.
    ranked = similarity[unrelated].sort_values(ascending=False)
    cofactors[tf] = list(ranked[:5].index)

cofactors = pd.Series(cofactors)

In [None]:
# Persist the cofactor lists.
with open('../data/cofactors.pickle', 'wb') as out_file:
    pickle.dump(cofactors, out_file)

### Looking for BM partners with the smallest correlation

In [None]:
# Reload the per-cluster TF ranking and the TF -> cluster relation from disk.
with open('../data/clusters_multi_modes_sorted.pickle', 'rb') as in_file:
    clusters_multi_modes_sorted = pickle.load(in_file)

with open('../data/tf_clust_corr.pickle', 'rb') as in_file:
    tf_clust_corr = pickle.load(in_file)

In [None]:
# Cluster id used as the binding mode of each TF of interest.
binding_modes = {"MAX":"7", "JUND":"1", "SPI1":"16", "SP1":"34", "EGR1":"34", "HNF4A":"4"}
TFs_to_analyze = ["MAX", "JUND", "SPI1", "SP1", "EGR1", "HNF4A", "FOXA1", "SOX6", "TBP"]

# Row position of every analysed TF in the similarity matrix
# (IndexError if a TF is not part of the matrix).
tf_labels = df_partial_transposed.index
tfs = pd.Series({tf: np.where(tf_labels == tf)[0][0] for tf in TFs_to_analyze})

In [None]:
# For every TF with a known binding mode, list the (up to) five TFs of the same
# binding-mode cluster that are least similar to it.
not_cor_bms = {}
for tf in tfs.index:
    if tf not in binding_modes:
        # `tfs` also contains TFs without a binding_modes entry (FOXA1, SOX6,
        # TBP); looking them up used to abort the cell with a KeyError.
        continue

    test = pd.Series(dm[tfs[tf], :], index=df_partial_transposed.index)
    bm_members = clusters_multi_modes_sorted[binding_modes[tf]]
    # Sorted by decreasing similarity, so the last five entries are the least similar.
    not_cor_bms[tf] = list(test[bm_members].sort_values(ascending=False)[-5:].index)
not_cor_bms = pd.Series(not_cor_bms)

In [None]:
# Persist the least-similar binding-mode partners.
with open('../data/not_cor_bms.pickle', 'wb') as out_file:
    pickle.dump(not_cor_bms, out_file)

### Looking for binding partners from STRING

In [None]:
#load string data
TFs_to_analyze = ['MAX', 'JUND', 'SPI1', 'SP1', 'HNF4A', 'EGR1']

string_data = pd.read_csv("../data/string.sorted.tsv", sep="\t", header=None)

tf_clust_corr = pd.read_csv("../data/TF_clust_correspond.tsv", sep="\t", header=None)
tf_clust_corr = pd.Series(tf_clust_corr[1].values, index = tf_clust_corr[0].values)

string_partners = {}
for tf in TFs_to_analyze:
    TF_data = string_data[string_data[0] == tf]
    TF_binding_buds = TF_data[1]
    TF_binding_buds = TF_binding_buds[np.isin(TF_binding_buds, df_partial_transposed.index)]
    
    test2 = tf_clust_corr.apply(lambda x: x.split(","))
    test2 = test2.apply(lambda x: ~np.any(np.isin(x, test2[tf])))
    test2 = test2[test2 == True]
    TF_binding_buds = TF_binding_buds[np.isin(TF_binding_buds.values, test2.index)]
    string_partners[tf]= list(TF_binding_buds.values)[:5]
    
string_partners = pd.Series(string_partners)

In [None]:
# Persist the STRING partners that share no cluster with the query TF.
with open('../data/string_partners.pickle', 'wb') as out_file:
    pickle.dump(string_partners, out_file)

### Looking for best correlated TFs (doesn't matter if BM or not)

In [None]:
# Reload the per-cluster TF ranking and the TF -> cluster relation from disk.
with open('../data/clusters_multi_modes_sorted.pickle', 'rb') as in_file:
    clusters_multi_modes_sorted = pickle.load(in_file)

with open('../data/tf_clust_corr.pickle', 'rb') as in_file:
    tf_clust_corr = pickle.load(in_file)

In [None]:
TFs_to_analyze = ["MAX", "JUND", "SPI1", "SP1", "HNF4A"]

# Row position of every analysed TF in the similarity matrix
# (IndexError if a TF is not part of the matrix).
tf_labels = df_partial_transposed.index
tfs = pd.Series({tf: np.where(tf_labels == tf)[0][0] for tf in TFs_to_analyze})

In [None]:
# For every analysed TF, list the five TFs most similar to it.
best_cor_tfs = {}

for tf in tfs.index:
    test = pd.Series(dm[tfs[tf], :], index=df_partial_transposed.index)
    # Remove the TF itself by label rather than by skipping the first sorted
    # entry: if another TF ties with (or rounds above) the self-similarity of 1,
    # the TF would not sort first and would end up in its own result.
    test = test.drop(tf).sort_values(ascending=False)[:5]

    best_cor_tfs[tf] = list(test.index)

best_cor_tfs = pd.Series(best_cor_tfs)

In [None]:
# Persist the most similar TFs.
with open('../data/best_cor_tfs.pickle', 'wb') as out_file:
    pickle.dump(best_cor_tfs, out_file)

### Looking for STRING TFs (doesn't matter if BM or not)

In [None]:
# Reload the per-cluster TF ranking and the TF -> cluster relation from disk.
with open('../data/clusters_multi_modes_sorted.pickle', 'rb') as in_file:
    clusters_multi_modes_sorted = pickle.load(in_file)

with open('../data/tf_clust_corr.pickle', 'rb') as in_file:
    tf_clust_corr = pickle.load(in_file)

In [None]:
# Load the STRING data and the TF -> cluster relations.
TFs_to_analyze = ['MAX', 'JUND', 'SPI1', 'SP1', 'HNF4A']

string_data = pd.read_csv("../data/string.sorted.tsv", sep="\t", header=None)

# Column 0 becomes the index, column 1 the values.
tf_clust_table = pd.read_csv("../data/TF_clust_correspond.tsv", sep="\t", header=None)
tf_clust_corr = pd.Series(tf_clust_table[1].values, index=tf_clust_table[0].values)

In [None]:
# For every analysed TF, keep its first five STRING partners that are part of
# the similarity matrix (no cluster filtering here).
known_tfs = df_partial_transposed.index

string_partners = {}
for tf in TFs_to_analyze:
    partners = string_data.loc[string_data[0] == tf, 1]
    partners = partners[np.isin(partners, known_tfs)]
    string_partners[tf] = list(partners.values)[:5]

string_partners = pd.Series(string_partners)

In [None]:
# Persist the unfiltered STRING partners.
with open('../data/string_partners_best.pickle', 'wb') as out_file:
    pickle.dump(string_partners, out_file)