In [None]:
from tools import *
from models import *
import plotly.graph_objects as go
import plotly.figure_factory as ff
from Bio.SeqUtils import GC
import pickle

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Seed every random number generator in use so repeated runs give the same results.
import random

seed = 42
random.seed(seed)     # Python's built-in RNG (previously left unseeded)
np.random.seed(seed)  # NumPy's global RNG
# Ask cuDNN for deterministic kernels and disable its auto-tuner; without this,
# GPU runs can differ from each other even when the seeds are fixed.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# PyTorch RNG. Kept as the last statement so the cell output is unchanged.
torch.manual_seed(seed)

In [None]:
# Run configuration: compute device and training hyperparameters (TODO: move to yaml)

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Training hyperparameters
num_epochs, batch_size = 15, 100
learning_rate = 3e-3

## Train UniBind+ReMap.partial

Split the data set by running the following command:

`python split_the_dataset.py ../data/tf_peaks_50_noNs_partial.pkl 1 ../data/fasta_sequences_50_partial.pkl 0.1 0.1 ../data/tf_peaks_50_partial.h5 True`

In [None]:
# Load the data set from the HDF5 file, passing the configured batch size.
# NOTE(review): presumably `dataloaders` is a mapping of data loaders keyed by split
# (the keys 'train', 'valid' and 'test' are used further down) -- confirm against load_datas.
dataloaders, target_labels, train_out = load_datas("../data/tf_peaks_50_partial.h5", batch_size)

In [None]:
# Decode the label names to str. Entries that are already str are kept as they are,
# so re-running this cell no longer fails with AttributeError ('str' has no 'decode').
target_labels = [
    label if isinstance(label, str) else label.decode("utf-8")
    for label in target_labels
]

# Save the decoded label names to disk.
with open('../data/multi_model_target_labels.pkl', 'wb') as f:
    pickle.dump(target_labels, f)

In [None]:
# The number of classes equals the number of target labels.
num_classes = len(target_labels)

# Binary cross-entropy on logits, without class weights.
criterion = nn.BCEWithLogitsLoss()

# Build the network, move it to the selected device and attach an Adam optimizer.
model = ConvNetDeep(num_classes)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Train the model on the 'train' loader and evaluate it on the 'valid' loader.
# Note that the validation results are bound to the names test_error / test_fscore.
# NOTE(review): the returned error / F-score values are presumably per-epoch
# histories, and the empty string looks like a file-name suffix -- confirm against train_model.
(model,
 train_error, test_error,
 train_fscore, test_fscore) = train_model(
    dataloaders['train'],
    dataloaders['valid'],
    model,
    device,
    criterion,
    optimizer,
    num_epochs,
    "../weights_multimodel_partial",
    "",
    verbose=True,
)

In [None]:
%matplotlib inline
# Plot the two error series returned by train_model (training vs. validation loader).
showPlot(train_error, test_error, "Loss trend", "Loss")

In [None]:
# Load the best model.
# map_location=device remaps the stored tensors onto the configured device, so a
# checkpoint written on a GPU can also be loaded when running on the CPU fallback.
# NOTE(review): the checkpoint (epoch 6) is hard-coded -- re-check it after retraining.
model.load_state_dict(
    torch.load("../weights_multimodel_partial/model_epoch_6_.pth", map_location=device)
)
model.eval();

In [None]:
# Perform the evaluation on the 'test' split.
%matplotlib inline
# Make sure the model is on the configured device before running it.
model.to(device);

# labels_E / outputs_E are indexed as [sample, label] further down.
# NOTE(review): presumably ground-truth labels and predicted probabilities -- confirm against run_test.
labels_E, outputs_E = run_test(model, dataloaders['test'], device)

compute_metrics(labels_E, outputs_E)

In [None]:
# Plot the evaluation results for the test labels / outputs, passing the label names.
plot_results(labels_E, outputs_E, target_labels)

In [None]:
# Per-label confusion counts on the test outputs, with the scores rounded to 0/1.
predicted_E = np.round(outputs_E)
TP = np.sum(np.logical_and(labels_E == 1, predicted_E == 1), axis=0)
FP = np.sum(np.logical_and(labels_E == 0, predicted_E == 1), axis=0)
TN = np.sum(np.logical_and(labels_E == 0, predicted_E == 0), axis=0)
FN = np.sum(np.logical_and(labels_E == 1, predicted_E == 0), axis=0)


# Stacked bar chart of the TP / FP / TN / FN counts for every label.
layout = go.Layout(
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis={'title': 'Transcription factors'},
    yaxis={'title': 'Sequences'},
    font={'family': 'Courier New, monospace', 'size': 18, 'color': 'black'},
)

# One bar trace per count type, in the order TP, FP, TN, FN.
bars = [
    go.Bar(name=count_name, x=target_labels, y=counts)
    for count_name, counts in (('TP', TP), ('FP', FP), ('TN', TN), ('FN', FN))
]
fig = go.Figure(data=bars, layout=layout)

# Stack the bars and fix the figure size; the legend is hidden.
fig.update_layout(barmode='stack', width=1000, height=500, showlegend=False)

fig.show()

In [None]:
# Reference ROC curve: a constant score of 0 for every sample, using the first label column.
no_skill_probs = [0] * len(labels_E[:, 0])
ns_fpr, ns_tpr, _ = metrics.roc_curve(labels_E[:, 0], no_skill_probs)

# Curves (roc_*) and areas under them (raw_*) for each label, keyed by label name.
roc_aucs, raw_aucs = {}, {}
roc_prcs, raw_prcs = {}, {}
for column, label in enumerate(target_labels):
    y_true, y_score = labels_E[:, column], outputs_E[:, column]

    # ROC curve and its area
    nn_fpr, nn_tpr, _ = metrics.roc_curve(y_true, y_score)
    roc_aucs[label] = nn_fpr, nn_tpr
    raw_aucs[label] = metrics.auc(nn_fpr, nn_tpr)

    # Precision-recall curve and its area (stored as recall, precision)
    precision_nn, recall_nn, _ = metrics.precision_recall_curve(y_true, y_score)
    roc_prcs[label] = recall_nn, precision_nn
    raw_prcs[label] = metrics.auc(recall_nn, precision_nn)

In [None]:
# Convert the per-label area values into pandas Series sorted from highest to lowest.
raw_prcs = pd.Series(raw_prcs).sort_values(ascending=False)
raw_aucs = pd.Series(raw_aucs).sort_values(ascending=False)

In [None]:
# Reorder the AUROC values to follow the index of raw_prcs, so the first eight
# entries below are the first eight labels of the AUPRC ranking.
raw_aucs = raw_aucs[raw_prcs.index]

fig = go.Figure()

# One ROC curve per selected label; roc_aucs[label] holds the (fpr, tpr) pair.
for i in raw_aucs[:8].index:
    fig.add_trace(go.Scatter(x=roc_aucs[i][0], y=roc_aucs[i][1],
                             mode='lines',
                             name=i))

# Reference curve obtained from constant (all-zero) scores.
fig.add_trace(go.Scatter(x=ns_fpr, y=ns_tpr,
                         mode='lines',
                         name='random'))

# Axis titles use title.text / title.font instead of the deprecated `titlefont`
# attribute, which newer Plotly releases no longer accept.
layout = go.Layout(
    title="",
    xaxis=dict(
        title=dict(
            text='FPR',
            font=dict(family='Courier New, monospace', size=18, color='black'),
        )
    ),
    yaxis=dict(
        title=dict(
            text='TPR',
            font=dict(family='Courier New, monospace', size=18, color='black'),
        )
    ),
)
fig.update_layout(layout)

# Square figure with legend and transparent backgrounds.
fig.update_layout(width=500, height=500, showlegend=True,
                  title_text='',
                  plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')

# Draw black axis lines.
fig.update_xaxes(showline=True, linewidth=2, linecolor='black')
fig.update_yaxes(showline=True, linewidth=2, linecolor='black')
fig.show()

In [None]:
fig = go.Figure()

# One precision-recall curve for each of the first eight labels in raw_prcs;
# roc_prcs[label] holds the (recall, precision) pair.
for i in raw_prcs[:8].index:
    fig.add_trace(go.Scatter(x=roc_prcs[i][0], y=roc_prcs[i][1],
                             mode='lines',
                             name=i))

# Axis titles use title.text / title.font instead of the deprecated `titlefont`
# attribute, which newer Plotly releases no longer accept.
layout = go.Layout(
    title="",
    xaxis=dict(
        title=dict(
            text='Recall',
            font=dict(family='Courier New, monospace', size=18, color='black'),
        )
    ),
    yaxis=dict(
        title=dict(
            text='Precision',
            font=dict(family='Courier New, monospace', size=18, color='black'),
        )
    ),
)
fig.update_layout(layout)

# Square figure without legend and with transparent backgrounds.
fig.update_layout(width=500, height=500, showlegend=False,
                  title_text='',
                  plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')

# Draw black axis lines.
fig.update_xaxes(showline=True, linewidth=2, linecolor='black')
fig.update_yaxes(showline=True, linewidth=2, linecolor='black')

fig.show()

## Build a table of results

In [None]:
import gzip

# Load the index objects from gzip-compressed pickles.
# NOTE: pickle.load can execute arbitrary code -- only use it on trusted files.
# The trailing numbers are the original author's size notes.
with gzip.open('../data/idx_files/regions_idx.pickle.gz', 'rb') as f:
    regions = pickle.load(f) #1817918

with gzip.open('../data/idx_files/samples_idx.pickle.gz', 'rb') as f:
    samples = pickle.load(f) #52

with gzip.open('../data/idx_files/tfs_idx.pickle.gz', 'rb') as f:
    tfs = pickle.load(f) #163

# Sort both objects by value; the resulting index order labels the matrix axes below.
tfs = pd.Series(tfs).sort_values()
regions = pd.Series(regions).sort_values()

# Read the matrix from the .npz archive. The previous loop loaded every stored
# array only to keep the last one; read just that array and close the archive.
with np.load("../data/matrices/matrix2d.ReMap+UniBind.partial.npz") as data:
    matrix2d_partial = data[data.files[-1]] #(1817918, 163)

# Rows are labelled with the regions index, columns with the tfs index.
df = pd.DataFrame(data=matrix2d_partial, index=regions.index, columns=tfs.index)

In [None]:
# Copy the 'train_out' dataset of the HDF5 file into a NumPy array.
# The context manager closes the file once the data is in memory; it was previously left open.
with h5py.File("../data/tf_peaks_50_partial.h5", 'r') as data:
    labels_train_matrix = np.array(data['train_out'])

In [None]:
# Per-label summary table: value counts, confusion counts and metrics.
rounded_outputs = np.round(outputs_E)
actual_1, actual_0 = labels_E == 1, labels_E == 0
called_1, called_0 = rounded_outputs == 1, rounded_outputs == 0

# Matthews correlation coefficient for each label column, keyed by label name.
all_mcoef = pd.Series({
    label: matthews_corrcoef(labels_E[:, column], rounded_outputs[:, column])
    for column, label in enumerate(target_labels)
})

res_df = pd.DataFrame(index=target_labels)

# Counts of 1 / 0 / NaN entries per column of the full matrix `df`.
res_df["Ones_all"] = df.eq(1).astype(int).sum()[target_labels]
res_df["Zeros_all"] = df.eq(0).astype(int).sum()[target_labels]
res_df["NaN_all"] = df.isna().sum()[target_labels]

# Column sums and zero counts of the training label matrix.
res_df["Ones_train"] = labels_train_matrix.sum(axis=0)
res_df["Zeros_train"] = (labels_train_matrix == 0).sum(axis=0)

# Confusion counts on the test outputs.
res_df["TP"] = (actual_1 & called_1).sum(axis=0)
res_df["FP"] = (actual_0 & called_1).sum(axis=0)
res_df["TN"] = (actual_0 & called_0).sum(axis=0)
res_df["FN"] = (actual_1 & called_0).sum(axis=0)

# Metrics, looked up by label name.
res_df["AUROC"] = raw_aucs[target_labels]
res_df["AUPRC"] = raw_prcs[target_labels]
res_df["MCC"] = all_mcoef[target_labels]

In [None]:
# Write the results table to a tab-separated file.
res_df.to_csv("../data/multimodel_results_df.tsv", sep="\t")