In [None]:
from tools import *
from models import *
import plotly.graph_objects as go
import plotly.figure_factory as ff
from Bio.SeqUtils import GC
from Bio import SeqIO
import os
from random import sample
from plotly.subplots import make_subplots
import pickle
from scipy import stats
from collections import Counter

plt.ioff()

import warnings
warnings.filterwarnings('ignore')

In [None]:
# For reproducibility: seed every random number generator this notebook can reach,
# so that "Restart Kernel -> Run All" yields the same numbers.
import random

seed = 42
random.seed(seed)        # Python's built-in RNG (backs `random.sample`, imported above)
np.random.seed(seed)     # NumPy's global RNG
torch.manual_seed(seed)  # PyTorch's RNG

In [None]:
# Names of the analysed TFs: one per line of the text file, with surrounding
# whitespace (including the newline) stripped.
with open("../data/Analyzed_TFs.txt", "r") as f:
    TFs = [raw_line.strip() for raw_line in f]

In [None]:
#RECORDING THE SIZES
# For each of the 10 iterations, open every TF's HDF5 file and record the number
# of rows of its 'train_out' dataset.
# Result: {tf_name: [size in iteration 1, size in iteration 2, ...]}.
tf_train_sizes = {}

for i in range(1, 11):
    h5_dir = "../TRAIN_DATA_50_SORTED/iterat_" + str(i) + "/h5_files/"
    tfs = os.listdir(h5_dir)
    for tf in tfs:
        # File names look like "<TF>.<extension>"; keep the part before the first dot.
        tf_name = tf.split(".")[0]
        # The context manager closes each handle as soon as the shape is read;
        # previously every opened file stayed open for the rest of the session.
        with h5py.File(h5_dir + tf, 'r') as data:
            n_train = data['train_out'].shape[0]
        tf_train_sizes.setdefault(tf_name, []).append(n_train)
    print("Done with " + str(i))

In [None]:
# Re-pack the {TF: [sizes]} dict as a Series indexed by TF name.
tf_train_sizes = pd.Series(data=tf_train_sizes)

In [None]:
# Mean training-set size of every TF over the recorded iterations.
tf_train_sizes_mean = tf_train_sizes.map(lambda sizes: np.mean(sizes))

In [None]:
#RECORDING THE PERFORMANCE
# For every analysed TF, gather the value stored under its name in each
# iteration's result pickles: {TF: [value from iteration 1, 2, ...]}.
# A TF that is absent from a given pickle simply contributes nothing for that iteration.
results = {}

old_model = {}
new_model = {}
old_model_TL = {}
new_model_TL = {}
new_model_no_TL = {}

for i in range(1, 11):

    # `with` guarantees the file is closed even if unpickling fails.
    # NOTE(review): "mccoef_old.pkl" is what fills `new_model` - the file name is
    # surprising next to the variable name; confirm this pairing is intended.
    with open("../RESULTS_50_SORTED/iterat_TL_"+str(i)+"/mccoef_old.pkl", 'rb') as pkl_file:
        mccoef_new_model = pickle.load(pkl_file)

    with open("../RESULTS_50_SORTED/iterat_TL_"+str(i)+"/mccoef.pkl", 'rb') as pkl_file:
        mccoef_new_model_TL = pickle.load(pkl_file)

    with open("../RESULTS_50_SORTED/iterat_noTL_"+str(i)+"/mccoef.pkl", 'rb') as pkl_file:
        mccoef_new_model_no_TL = pickle.load(pkl_file)

    # (accumulator, this iteration's values) - one pair per model variant.
    per_model = (
        (new_model, mccoef_new_model),
        (new_model_TL, mccoef_new_model_TL),
        (new_model_no_TL, mccoef_new_model_no_TL),
    )
    for collected, iteration_scores in per_model:
        for TF in TFs:
            if TF in iteration_scores.keys():
                # Create the per-TF list on first sight, then append.
                collected.setdefault(TF, []).append(iteration_scores[TF])

In [None]:
# Turn the three {TF: [values]} dicts into Series indexed by TF name.
# Sizes noted by the author: new_model 49 (no ARNT, because it didn't have enough
# data), new_model_TL 148, new_model_no_TL 148.
new_model, new_model_TL, new_model_no_TL = (
    pd.Series(per_tf) for per_tf in (new_model, new_model_TL, new_model_no_TL)
)

## Checking negative impact of TL

In [None]:
# Per-TF mean over iterations, for the TL and the no-TL individual models.
new_model_TL_mean_perf = new_model_TL.apply(lambda scores: np.mean(scores))
new_model_no_TL_mean_perf = new_model_no_TL.apply(lambda scores: np.mean(scores))

In [None]:
# Per-TF change brought by TL (mean with TL minus mean without TL), smallest first.
mcoerf_difference = (new_model_TL_mean_perf - new_model_no_TL_mean_perf).sort_values()

In [None]:
# TFs whose mean score is lower with TL than without it.
neg_changes = mcoerf_difference[mcoerf_difference < 0]

# Mean multi-model score for those TFs; TFs that are not in the multi-model get -1.
in_multi = np.isin(neg_changes.index, new_model.index)
multi = new_model[neg_changes.index[in_multi]].apply(lambda x: np.mean(x))
rem = neg_changes.index[~in_multi]
x = pd.Series(np.ones(rem.shape), index=rem)*-1
# pd.concat replaces Series.append, which was removed in pandas 2.0.
multi = pd.concat([multi, x])

# One row per TF, one named column per quantity (`keys` names the columns directly).
neg_impact = pd.concat([neg_changes,
                new_model_TL_mean_perf[neg_changes.index],
                new_model_no_TL_mean_perf[neg_changes.index],
                tf_train_sizes_mean[neg_changes.index],
                multi[neg_changes.index]],
               axis=1,
               keys=["McoefDiff", "McoefTL", "McoefNoTL", "TrainSize", "MultiModel"])

neg_impact.to_csv("../data/neg_impact_TL.csv", sep='\t')

In [None]:
# Same table as for the negative changes, but for every TF in `mcoerf_difference`.
# Mean multi-model score per TF; TFs that are not in the multi-model get -1.
in_multi = np.isin(mcoerf_difference.index, new_model.index)
multi = new_model[mcoerf_difference.index[in_multi]].apply(lambda x: np.mean(x))
rem = mcoerf_difference.index[~in_multi]
x = pd.Series(np.ones(rem.shape), index=rem)*-1
# pd.concat replaces Series.append, which was removed in pandas 2.0.
multi = pd.concat([multi, x])

# One row per TF, one named column per quantity (`keys` names the columns directly).
all_impact = pd.concat([mcoerf_difference,
                new_model_TL_mean_perf[mcoerf_difference.index],
                new_model_no_TL_mean_perf[mcoerf_difference.index],
                tf_train_sizes_mean[mcoerf_difference.index],
                multi[mcoerf_difference.index]],
               axis=1,
               keys=["McoefDiff", "McoefTL", "McoefNoTL", "TrainSize", "MultiModel"])

all_impact.to_csv("../data/all_impact_TL.csv", sep='\t')

## BOX PLOT

In [None]:
# Prepare the flat x/y lists consumed by the box-plot cell.
# Comprehensions replace the repeated `lst = lst + other` loops, which re-copied
# the growing list on every step.

# TFs ordered from the largest to the smallest mean training-set size.
tf_train_sizes_mean_sorted = tf_train_sizes_mean.sort_values(ascending=False)

train_sizes_tfs = [tf_train_sizes_mean[tf] for tf in new_model.index]

# Multi-model entries re-ordered by decreasing training-set size.
sorted_in_multi = np.isin(tf_train_sizes_mean_sorted.index, new_model.index)
new_model_sorted = new_model[tf_train_sizes_mean_sorted[sorted_in_multi].index].dropna()

# x labels: each TF name repeated once per score.
# NOTE(review): the hard-coded 10 assumes every TF has exactly 10 scores (one per
# iteration of the loading loop); with fewer, labels and scores would misalign.
x_models = [tf for tf in new_model.index for _ in range(10)]
x_models_sorted = [tf for tf in new_model_sorted.index for _ in range(10)]

# Multi-model TFs that also have both individual models. #49
tf_to_test = [tf for tf in new_model.index
              if tf in new_model_TL.index and tf in new_model_no_TL.index]

# y values: per-TF score lists flattened in the order of the matching x labels.
new_model_tfs = [score for tf in new_model.index for score in new_model[tf]]
new_model_TL_tfs = [score for tf in new_model.index for score in new_model_TL[tf]]
new_model_no_TL_tfs = [score for tf in new_model.index for score in new_model_no_TL[tf]]

new_model_tfs_sorted = [score for tf in new_model_sorted.index for score in new_model[tf]]
new_model_TL_tfs_sorted = [score for tf in new_model_sorted.index for score in new_model_TL[tf]]
new_model_no_TL_tfs_sorted = [score for tf in new_model_sorted.index for score in new_model_no_TL[tf]]

In [None]:
#COMPARING MODELS
# Box plots of the per-iteration scores of the three models for each multi-model
# TF (x order = `x_models_sorted`), with the log10 mean training-set size drawn as
# black markers on a secondary y-axis.
# The unsorted inputs (`x_models`, `new_model_tfs`, ...) can be swapped in for the
# *_sorted ones.
fig = make_subplots(specs=[[{"secondary_y": True}]])  # one-cell subplot, needed for the secondary axis

box_traces = (
    (new_model_tfs_sorted, 'Multi model', 'rgb(247,240,86)'),
    (new_model_TL_tfs_sorted, 'Individual model with TL', 'rgb(25,101,176)'),
    (new_model_no_TL_tfs_sorted, 'Individual model without TL', 'rgb(220,5,12)'),
)
for box_scores, box_name, box_color in box_traces:
    fig.add_trace(go.Box(
        y=box_scores,
        x=x_models_sorted,
        name=box_name,
        marker_color=box_color,
        showlegend=True
    ))

fig.add_trace(go.Scatter(
    x=new_model_sorted.index,
    y=np.log10(tf_train_sizes_mean[new_model_sorted.index].values),
    mode='markers',
    name='Sizes of the data sets',
    marker=dict(size=8, color='black'),
    showlegend=False), secondary_y=True)

# Axis-title fonts use `title.font`; the older `titlefont` attribute is deprecated.
axis_title_font = dict(family='Arial', size=18, color='black')

layout = go.Layout(
    title="",
    xaxis=dict(
        title=dict(
            text='',
            font=dict(family='Arial', size=12, color='black')
        )
    ),
    yaxis=dict(
        title=dict(
            text='MCC',
            font=axis_title_font
        )
    )
)

fig.update_yaxes(range=[0, 1], secondary_y=False)
fig.update_yaxes(range=[2.5, 5.5],
                 title_text='Train data size (log10)',
                 title_font=axis_title_font,
                 secondary_y=True)

fig.update_layout(layout)

# Transparent backgrounds, legend moved right of the secondary axis, fixed canvas size.
fig.update_layout(title='',
                  plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')
fig.update_layout(legend=dict(x=1.1, y=1))
fig.update_layout(autosize=False, width=1200, height=500)

fig.update_xaxes(showline=True, linewidth=2, linecolor='black')
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', secondary_y=False)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', secondary_y=True)

fig.show()

In [None]:
# Per-TF mean over iterations for the TL and no-TL individual models.
new_model_TL_mean = new_model_TL.apply(lambda scores: np.mean(scores))
new_model_no_TL_mean = new_model_no_TL.apply(lambda scores: np.mean(scores))

# Shared regressor: log10 of the mean training-set size, in the TL model's TF order.
log_train_sizes = np.log10(tf_train_sizes_mean[new_model_TL_mean.index])

# Linear fit of the mean score against log10 training size - with TL ...
slope_tl, intercept_tl, r_value_tl, p_value_tl, std_err_tl = stats.linregress(
    log_train_sizes, new_model_TL_mean.values)
line_tl = slope_tl*log_train_sizes + intercept_tl

# ... and without TL (scores re-ordered to the same TF order).
slope_no_tl, intercept_no_tl, r_value_no_tl, p_value_no_tl, std_err_no_tl = stats.linregress(
    log_train_sizes, new_model_no_TL_mean[new_model_TL_mean.index].values)
line_no_tl = slope_no_tl*log_train_sizes + intercept_no_tl

In [None]:
#Scatter plot with performance and data size
# Mean score per TF against log10 mean training-set size, one marker series for
# the TL models and one for the no-TL models. The fitted lines `line_tl` /
# `line_no_tl` are intentionally not drawn.
fig = go.Figure()

log_sizes_tl_order = np.log10(tf_train_sizes_mean[new_model_TL_mean.index])

fig.add_trace(go.Scatter(x=log_sizes_tl_order,
                         y=new_model_TL_mean.values,
                         mode='markers',
                         name='With TL',
                         marker=dict(size=8, color='rgb(25,101,176)'),
                         showlegend=True))

fig.add_trace(go.Scatter(x=log_sizes_tl_order,
                         y=new_model_no_TL_mean[new_model_TL_mean.index].values,
                         mode='markers',
                         name='Without TL',
                         marker=dict(size=8, color='rgb(220,5,12)'),
                         showlegend=True))

# Single layout update (the original repeated the same keys across several calls).
fig.update_layout(autosize=False, width=1000, height=500,
                  title='',
                  yaxis_title='MCC',
                  xaxis_title='Train data size, log10',
                  plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)',
                  font=dict(
                      family="Arial",
                      size=14,
                      color="black"
                  ))

# Axis-title fonts use `title_font`; the older `titlefont` attribute is deprecated.
axis_title_font = dict(family='Arial', size=18, color='black')
fig.update_xaxes(showline=True, linewidth=2, linecolor='black', title_font=axis_title_font)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', title_font=axis_title_font)

fig.show()

In [None]:
# Spread of the scores over iterations: per-TF standard deviation against log10
# mean training-set size, with and without TL.
new_model_TL_std = new_model_TL.apply(lambda x: np.std(x))
new_model_no_TL_std = new_model_no_TL.apply(lambda x: np.std(x))

# Use this cell's own index (identical to new_model_TL's) rather than one
# borrowed from the mean-performance cell.
log_sizes_std_order = np.log10(tf_train_sizes_mean[new_model_TL_std.index])

fig = go.Figure()

fig.add_trace(go.Scatter(x=log_sizes_std_order,
                         y=new_model_TL_std.values,
                         mode='markers',
                         name='With TL',
                         marker=dict(size=8, color='red')))

fig.add_trace(go.Scatter(x=log_sizes_std_order,
                         y=new_model_no_TL_std[new_model_TL_std.index].values,
                         mode='markers',
                         name='Without TL',
                         marker=dict(size=8, color='green')))

# The title now names what the y-axis actually shows (it previously read
# 'MCC vs training size' although the standard deviation is plotted).
fig.update_layout(title='Standard deviation of MCC vs training size',
                 yaxis_title='Standard deviation of Mcor',
                 xaxis_title='Data size, log10',
                 plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')
fig.update_xaxes(showline=True, linewidth=2, linecolor='black')
fig.update_yaxes(showline=True, linewidth=2, linecolor='black')

fig.show()

## Checking the effect of the binding mode

In [None]:
# Load the TF - Jaspar cluster relations (tab-separated, no header row) and turn
# them into a lookup Series: column 0 becomes the index, column 1 the values.
tf_clust_table = pd.read_csv("../data/TF_clust_correspond.tsv", sep="\t", header=None)
tf_clust_corr = pd.Series(tf_clust_table[1].values, index=tf_clust_table[0].values)

In [None]:
# TFs whose mean training-set size is at most 20000.
# (An extra "> 1000" lower bound is left disabled by the author; their note: #56.)
tf_train_sizes_mean_small = tf_train_sizes_mean[tf_train_sizes_mean <= 20000]

In [None]:
# Mean scores restricted to the small-data TFs that each model actually has;
# entries that are NaN are dropped.
small_tfs = tf_train_sizes_mean_small.index

small_with_tl = small_tfs[np.isin(small_tfs, new_model_TL_mean.index)]
new_model_TL_mean_small = new_model_TL_mean[small_with_tl].dropna()

small_without_tl = small_tfs[np.isin(small_tfs, new_model_no_TL_mean.index)]
new_model_no_TL_mean_small = new_model_no_TL_mean[small_without_tl].dropna()

In [None]:
# Change brought by TL for the small-data TFs (with TL minus without TL).
mccoef_differences = new_model_TL_mean_small - new_model_no_TL_mean_small

In [None]:
# Collect the clusters of every multi-model TF (comma-separated in the lookup
# table), then count how often each cluster occurs.
clusters_multi_modes = []
for tf in new_model.index:  # ARNT is missing from this index
    clusters_multi_modes.extend(tf_clust_corr[tf].split(","))

# ARNT is added by hand because it is absent from new_model.
clusts = tf_clust_corr["ARNT"].split(",")
clusters_multi_modes.extend(clusts)

# Cluster -> number of occurrences, most frequent first.
# Author's note: there are 34 clusters in the multi model (out of 50 TFs).
cluster_counts = Counter(clusters_multi_modes)
clusters_multi_modes_sorted = pd.Series(cluster_counts).sort_values(ascending=False)

In [None]:
# Cluster list of every TF that has a TL model: look up its comma-separated
# cluster string and split it into a list.
tf_analyzed_modes = tf_clust_corr[new_model_TL.index].apply(
    lambda cluster_string: cluster_string.split(",")
)

In [None]:
# Restrict the per-TF cluster lists to the TFs present in `mccoef_differences`,
# in that Series' index order.
tf_analyzed_modes_small = tf_analyzed_modes[mccoef_differences.index]

In [None]:
# Group code per TF, in the order of tf_analyzed_modes_small (later rules override earlier ones):
#   0 - TF has a different BM than the TFs from the multi model (author's count: 40)
#   1 - TF has the same BM as the multi model TFs, but is not a multi model output (59)
#   2 - TF is an output of the multi model (49)
id_vector = np.zeros(len(tf_analyzed_modes_small))

# True where at least one of the TF's clusters also occurs among the multi-model clusters.
pos_ind = tf_analyzed_modes_small.apply(
    lambda clusters: np.any(np.isin(clusters, clusters_multi_modes_sorted.index))
)
id_vector[pos_ind.astype(bool).values] = 1

# Multi-model outputs take precedence over the shared-cluster label.
id_vector[np.isin(tf_analyzed_modes_small.index, new_model.index)] = 2

In [None]:
# Attach the TF names to the group codes.
tf_analyzed_modes_ids = pd.Series(data=id_vector, index=tf_analyzed_modes_small.index)

In [None]:
# Split the MCC differences by group code: 0 = different BM, 1 = same BM but not
# a multi-model output, 2 = multi-model output.
group_tfs = tf_analyzed_modes_ids.index
diff_bm_not_in_multi = mccoef_differences[group_tfs[(tf_analyzed_modes_ids == 0).values]]
same_bm_not_in_multi = mccoef_differences[group_tfs[(tf_analyzed_modes_ids == 1).values]]
multi_output = mccoef_differences[group_tfs[(tf_analyzed_modes_ids == 2).values]]

In [None]:
# Optional linear fits, one per group: log10 mean training-set size as a function
# of the change in MCC. The resulting lines are only needed if fit traces are drawn.
log_size_multi = np.log10(tf_train_sizes_mean[multi_output.index].values)
slope_multi, intercept_multi, r_value_multi, p_value_multi, std_err_multi = stats.linregress(
    multi_output.values, log_size_multi)
line_multi = slope_multi*multi_output.values + intercept_multi

log_size_same_bm = np.log10(tf_train_sizes_mean[same_bm_not_in_multi.index].values)
slope_tl_bm, intercept_tl_bm, r_value_tl_bm, p_value_tl_bm, std_err_tl_bm = stats.linregress(
    same_bm_not_in_multi.values, log_size_same_bm)
line_tl_bm = slope_tl_bm*same_bm_not_in_multi.values + intercept_tl_bm

log_size_diff_bm = np.log10(tf_train_sizes_mean[diff_bm_not_in_multi.index].values)
slope_tl_nobm, intercept_tl_nobm, r_value_tl_nobm, p_value_tl_nobm, std_err_tl_nobm = stats.linregress(
    diff_bm_not_in_multi.values, log_size_diff_bm)
line_tl_nobm = slope_tl_nobm*diff_bm_not_in_multi.values + intercept_tl_nobm

In [None]:
# Change in MCC (x) against log10 mean training-set size (y), one marker series
# per binding-mode group. The optional fit lines (`line_multi`, `line_tl_bm`,
# `line_tl_nobm`) are not drawn.
fig = go.Figure()

scatter_groups = (
    (multi_output, 'Multi-model TFs', "rgb(247,240,86)"),
    (same_bm_not_in_multi, 'Same BM is the multi-model', "rgb(25,101,176)"),
    (diff_bm_not_in_multi, 'Different BM is the multi model', "rgb(220,5,12)"),
)
for group_diff, group_name, group_color in scatter_groups:
    fig.add_trace(go.Scatter(x=group_diff.values,
                             y=np.log10(tf_train_sizes_mean[group_diff.index].values),
                             mode='markers',
                             name=group_name,
                             marker=dict(size=10, color=group_color)))

# Reference lines: vertical at "no change" (x = 0) and horizontal at the
# 20000-sample size threshold.
zero_change_line = {'type': 'line',
                    'x0': 0, 'x1': 0,
                    'y0': 2.5, 'y1': max(np.log10(tf_train_sizes_mean)),
                    'xref': 'x1', 'yref': 'y1',
                    'line': {'color': 'black', 'width': 2.5}}
size_threshold_line = {'type': 'line',
                       'x0': -0.4, 'x1': 0.8,
                       'y0': np.log10(20000), 'y1': np.log10(20000),
                       'xref': 'x1', 'yref': 'y1',
                       'line': {'color': 'black', 'width': 2.5}}
fig.update_layout(shapes=[zero_change_line, size_threshold_line])

fig.update_layout(title_text='',
                  xaxis_title='Change in MCC',
                  yaxis_title='Train dataset size, log10',
                  font=dict(
                     family="Arial",
                     size=14,
                     color="black"
                  ),
                  plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')
fig.update_xaxes(showline=True, linewidth=2, linecolor='black')
fig.update_yaxes(showline=True, linewidth=2, linecolor='black')
fig.show()

In [None]:
# Distribution of the change in MCC within each binding-mode group.
fig = go.Figure()

box_groups = (
    (multi_output, "Multi-model TFs", "rgb(247,240,86)"),
    (same_bm_not_in_multi, "Same BM is the multi-model", "rgb(25,101,176)"),
    (diff_bm_not_in_multi, "Different BM is the multi model", "rgb(220,5,12)"),
)
for group_diff, group_name, group_color in box_groups:
    fig.add_trace(go.Box(y=group_diff, name=group_name, marker_color=group_color))

fig.update_layout(title='MCC vs Binding mode',
                  plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)',
                  font=dict(
                      family="Arial",
                      size=14,
                      color="black"
                  ))

fig.update_xaxes(showline=True, linewidth=2, linecolor='black')
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', title="Change in MCC")

fig.show()

In [None]:
# Two-sample t-test without the equal-variance assumption:
# same-BM group vs different-BM group.
stats.ttest_ind(a=same_bm_not_in_multi.values, b=diff_bm_not_in_multi.values, equal_var=False)

In [None]:
# Two-sample t-test without the equal-variance assumption:
# same-BM group vs multi-model outputs.
stats.ttest_ind(a=same_bm_not_in_multi.values, b=multi_output.values, equal_var=False)

In [None]:
# Two-sample t-test without the equal-variance assumption:
# different-BM group vs multi-model outputs.
stats.ttest_ind(a=diff_bm_not_in_multi.values, b=multi_output.values, equal_var=False)