In [None]:
from tools import *
from models import *
import plotly.graph_objects as go
import plotly.figure_factory as ff
from Bio.SeqUtils import GC
import pickle

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
_device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

# Mini-batch size setting.
batch_size = 100

## JUND cofactors

cofactors - MCC:
* SP1      0.349809
* MAFG     0.307209
* MAFF     0.268983
* NFIC     0.253942
* CEBPB    0.245328

### JUND multi-model

In [None]:
# Inspecting both set-ups (cofactor TFs and random TFs): uncomment the path of
# the data set to analyse.
#data_path = "../for_Manu/TRAIN_DATA_COFACTORS_SUBSAMPLE_I_False/JUND_multi_1/h5_files/tf_peaks_JUND.h5"
data_path = "../for_Manu/TRAIN_DATA_RANDOM_SUBSAMPLE_I_False/JUND_multi_1/h5_files/tf_peaks_JUND.h5"

# The context manager closes the HDF5 handle once the splits have been copied
# into tensors, instead of leaving an open file behind for every section.
with h5py.File(data_path, 'r') as data:
    x = torch.Tensor(data['train_in'])
    y = torch.Tensor(data['valid_in'])
    z = torch.Tensor(data['test_in'])

    x_lab = torch.Tensor(data['train_out'])
    y_lab = torch.Tensor(data['valid_out'])
    z_lab = torch.Tensor(data['test_out'])

# Stack train, valid and test (in that order) into one set of inputs / labels.
res = torch.cat((x, y, z), dim=0)
res_lab = torch.cat((x_lab, y_lab, z_lab), dim=0)

all_dataset = torch.utils.data.TensorDataset(res, res_lab)
# shuffle=False keeps the batches in the row order of res / res_lab, which the
# index-based selection in later cells relies on. batch_size is the value
# configured at the top of the notebook rather than a repeated literal.
dataloader = torch.utils.data.DataLoader(all_dataset,
                                         batch_size=batch_size, shuffle=False,
                                         num_workers=0)

In [None]:
model = ConvNetDeep(5).to(device)

# Checkpoint to analyse: cofactor-trained weights (commented) or random-TF weights (active).
#weights_path = "../for_Manu/MODEL_WEIGHTS_COFACTORS_SUBSAMPLE_I_False/JUND_real_multimodel_weights_1/model_epoch_4_.pth"
weights_path = "../for_Manu/MODEL_WEIGHTS_RANDOM_SUBSAMPLE_I_False/JUND_real_multimodel_weights_1/model_epoch_4_.pth"
# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved on a GPU still loads when this notebook falls back to the CPU.
model.load_state_dict(torch.load(weights_path, map_location=device))
model.eval();

#copy trained model weights to motif extraction model
motif_model = motifCNN(model, 5).to(device)
motif_model.load_state_dict(model.state_dict())
motif_model.eval();

In [None]:
# Score every sequence with the full model, batch by batch, collecting the
# inputs, the sigmoid outputs and the labels in matching order.
sigmoid = nn.Sigmoid()
sequences, running_outputs, running_labels = [], [], []
with torch.no_grad():
    for seq_batch, lbl_batch in dataloader:
        # the inputs are recorded before the batch is moved to the device
        sequences.extend(seq_batch.numpy())
        logits = model(seq_batch.to(device))
        probs = sigmoid(logits.detach().cpu())  # for BCEWithLogits
        running_outputs.extend(probs.numpy())   # for BCEWithLogits
        running_labels.extend(lbl_batch.numpy())

# Convert the per-sample lists into arrays (first axis = sample).
running_labels = np.array(running_labels)
running_outputs = np.array(running_outputs)
sequences = np.array(sequences)

In [None]:
# Round the sigmoid outputs to the nearest integer to get hard 0/1 calls
# (NumPy rounds exact halves to the nearest even value, so 0.5 becomes 0).
pred_full_round = running_outputs.round()

In [None]:
# Element-wise agreement between the rounded predictions and the true labels.
arr_comp = np.equal(pred_full_round, running_labels)
# Keep the samples where at least 5 outputs agree with the labels
# (presumably all outputs of ConvNetDeep(5) — confirm).
# np.flatnonzero always yields a 1-D index array; the previous
# argwhere(...).squeeze() collapsed to a 0-d array when exactly one sample
# qualified, which breaks the sampling and fancy indexing in later cells.
idx = np.flatnonzero(np.sum(arr_comp, axis=1) >= 5)  # 43563 (count noted from an earlier run)

In [None]:
# Draw up to 80000 of the eligible samples without replacement.
# - The request is capped at the pool size: asking for more items than exist
#   with replace=False raises a ValueError.
# - np.atleast_1d guards against a 0-d idx (a single eligible sample).
# - A fixed-seed generator makes the subset identical on every run.
idx_pool = np.atleast_1d(idx)
sampled_idx = np.random.default_rng(0).choice(idx_pool,
                                              size=min(80000, idx_pool.size),
                                              replace=False)

In [None]:
# Pull the sampled rows out of the stacked inputs and labels; indexing only the
# first axis keeps every trailing dimension intact.
res2 = res[sampled_idx]
res_lab2 = res_lab[sampled_idx]

In [None]:
# Wrap the selected sequences/labels for the motif-extraction pass.
dataset = torch.utils.data.TensorDataset(res2, res_lab2)
# Unshuffled so that outputs stay aligned with the rows of res2 / res_lab2;
# batch_size is the notebook-level setting instead of a repeated literal 100.
data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                          batch_size=batch_size, shuffle=False,
                                          num_workers=0)

In [None]:
# Run the selected sequences through motif_model. get_motifs comes from the
# star-imports at the top of the notebook and returns a pair, unpacked here.
# NOTE(review): presumably model predictions and convolution-filter
# activations — confirm against the definition of get_motifs.
predictions, activations = get_motifs(data_loader, motif_model, device)

In [None]:
# Path passed to get_memes in a following cell. The commented line is the
# variant for the cofactor model; the active line is the random-TF variant.
#output_file_path = "../for_Manu/motifs/motifs_for_JUND_multimodel.meme"
output_file_path = "../for_Manu/motifs/motifs_for_JUND_random_multimodel.meme"

In [None]:
# Hand the activations plus the sequences/labels they were computed on to
# get_memes (star-imported helper).
# NOTE(review): presumably writes MEME-format motifs to output_file_path — confirm.
get_memes(activations, res2, res_lab2, output_file_path)

### JUND individual 

In [None]:
# Data set to analyse: cofactor set-up (commented) or random-TF set-up (active).
#data_path = "../for_Manu/TRAIN_DATA_COFACTORS_SUBSAMPLE_I_False/JUND_indiv_1/h5_files/JUND_tl.h5"
data_path = "../for_Manu/TRAIN_DATA_RANDOM_SUBSAMPLE_I_False/JUND_indiv_1/h5_files/JUND_tl.h5"

# The context manager closes the HDF5 handle once the splits have been copied
# into tensors, instead of leaving an open file behind for every section.
with h5py.File(data_path, 'r') as data:
    x = torch.Tensor(data['train_in'])
    y = torch.Tensor(data['valid_in'])
    z = torch.Tensor(data['test_in'])

    x_lab = torch.Tensor(data['train_out'])
    y_lab = torch.Tensor(data['valid_out'])
    z_lab = torch.Tensor(data['test_out'])

# Stack train, valid and test (in that order) into one set of inputs / labels.
res = torch.cat((x, y, z), dim=0)
res_lab = torch.cat((x_lab, y_lab, z_lab), dim=0)

all_dataset = torch.utils.data.TensorDataset(res, res_lab)
# shuffle=False keeps the batches in the row order of res / res_lab, which the
# index-based selection in later cells relies on. batch_size is the value
# configured at the top of the notebook rather than a repeated literal.
dataloader = torch.utils.data.DataLoader(all_dataset,
                                         batch_size=batch_size, shuffle=False,
                                         num_workers=0)

In [None]:
model = ConvNetDeep(1).to(device)

# Checkpoint to analyse: cofactor-trained weights (commented, epoch 2) or
# random-TF weights (active, epoch 3).
#weights_path = "../for_Manu/MODEL_WEIGHTS_COFACTORS_SUBSAMPLE_I_False/JUND_real_indiv_weights_1/JUND_tl_weights/model_epoch_2_.pth"
weights_path = "../for_Manu/MODEL_WEIGHTS_RANDOM_SUBSAMPLE_I_False/JUND_real_indiv_weights_1/JUND_tl_weights/model_epoch_3_.pth"
# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved on a GPU still loads when this notebook falls back to the CPU.
model.load_state_dict(torch.load(weights_path, map_location=device))
model.eval();

#copy trained model weights to motif extraction model
motif_model = motifCNN(model, 1).to(device)
motif_model.load_state_dict(model.state_dict())
motif_model.eval();

In [None]:
# Score every sequence with the full model, batch by batch, collecting the
# inputs, the sigmoid outputs and the labels in matching order.
sigmoid = nn.Sigmoid()
sequences, running_outputs, running_labels = [], [], []
with torch.no_grad():
    for seq_batch, lbl_batch in dataloader:
        # the inputs are recorded before the batch is moved to the device
        sequences.extend(seq_batch.numpy())
        logits = model(seq_batch.to(device))
        probs = sigmoid(logits.detach().cpu())  # for BCEWithLogits
        running_outputs.extend(probs.numpy())   # for BCEWithLogits
        running_labels.extend(lbl_batch.numpy())

# Convert the per-sample lists into arrays (first axis = sample).
running_labels = np.array(running_labels)
running_outputs = np.array(running_outputs)
sequences = np.array(sequences)

In [None]:
# Round the sigmoid outputs to the nearest integer to get hard 0/1 calls
# (NumPy rounds exact halves to the nearest even value, so 0.5 becomes 0).
pred_full_round = running_outputs.round()

In [None]:
# Element-wise agreement between the rounded predictions and the true labels.
arr_comp = np.equal(pred_full_round, running_labels)
# Keep the samples with at least one matching output (presumably the single
# output of ConvNetDeep(1) — confirm). np.flatnonzero always yields a 1-D index
# array; the previous argwhere(...).squeeze() collapsed to a 0-d array when
# exactly one sample qualified, which would drop the sample axis below.
idx = np.flatnonzero(np.sum(arr_comp, axis=1) >= 1)  # 43563 (count noted from an earlier run)

# Restrict inputs and labels to the selected samples.
res2 = res[idx, :, :]
res_lab2 = res_lab[idx, :]

dataset = torch.utils.data.TensorDataset(res2, res_lab2)
# Unshuffled so that outputs stay aligned with the rows of res2 / res_lab2;
# batch_size is the notebook-level setting instead of a repeated literal 100.
data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                          batch_size=batch_size, shuffle=False,
                                          num_workers=0)

In [None]:
# Run the selected sequences through motif_model. get_motifs comes from the
# star-imports at the top of the notebook and returns a pair, unpacked here.
# NOTE(review): presumably model predictions and convolution-filter
# activations — confirm against the definition of get_motifs.
predictions, activations = get_motifs(data_loader, motif_model, device)

In [None]:
# Path passed to get_memes in a following cell (random-TF, individual-model variant).
output_file_path = "../for_Manu/motifs/motifs_for_JUND_random_individual.meme"

In [None]:
# Hand the activations plus the sequences/labels they were computed on to
# get_memes (star-imported helper).
# NOTE(review): presumably writes MEME-format motifs to output_file_path — confirm.
get_memes(activations, res2, res_lab2, output_file_path)

In [None]:
# Hand-recorded filter -> matched motif annotations for the JUND models.
# results for multimodel_cofactors (q.value 0.01)
# NOTE(review): the labels 'Fos:Jun' and 'Fos:JUN' are both used below —
# confirm whether they are meant to be the same motif name.
multi_cofactors = {
    "filter11": "JUND", "filter30": "MAFG/MAFF", "filter5": "SP1",
    "filter72": "Fos:Jun", "filter75": "CEBPB", "filter64": "CTCF",
    "filter40": "CEBPB", "filter8": "MAFG", "filter37": "MAFG/MAFF",
    "filter19": "MAFG/MAFF", "filter80": "MAFG/MAFF", "filter94": "JUND",
    "filter60": "NRL", "filter43": "Gmeb1",
}

indiv_cofactors = {
    "filter11": "JUND", "filter30": "MAFG/MAFF", "filter72": "Fos:Jun",
    "filter5": "SP1", "filter8": "Fos:JUN", "filter94": "JUND",
    "filter40": "CEBPB", "filter37": "Fos:JUN", "filter76": "JUND",
    "filter90": "JUND", "filter52": "JUND", "filter36": "MAFG",
    "filter39": "JUND", "filter43": "NRL", "filter93": "JUND",
    "filter27": "JUND", "filter1": "JUND", "filter60": "NRL",
    "filter50": "JUND", "filter73": "JUND", "filter63": "JUND",
    "filter19": "MAFG", "filter82": "JUND", "filter18": "MAFG",
}

# ['ZNF143', 'TP63', 'GATA3', 'ELK1', 'RXRA']
# NOTE(review): the key "filter" (value "HOXD3") carries no filter index —
# looks like a truncated filter name; confirm the intended number.
multi_random = {
    "filter17": "RXRA", "filter87": "TP73", "filter96": "ELF5",
    "filter68": "CTCF", "filter74": "NRL", "filter41": "CTCF",
    "filter44": "CTCF", "filter43": "TP73", "filter88": "CTCF",
    "filter99": "TP73", "filter": "HOXD3", "filter42": "TP73",
    "filter66": "CTCF", "filter91": "Rhox11", "filter89": "Gmeb1",
}

indiv_random = {
    "filter85": "JUND", "filter74": "NRL", "filter87": "TP73",
    "filter75": "HOXD3", "filter30": "JUND", "filter96": "ELF5",
    "filter39": "Gmeb1", "filter89": "Gmeb1", "filter91": "Rhox11",
    "filter43": "TP73",
}

## HNF4A cofactors

cofactors - MCC:
* NR2F2    0.253237
* FOXA2    0.238407
* FOXA1    0.235406
* SP1      0.212225
* MYBL2    0.197924

### HNF4A multi-model

In [None]:
# Data set to analyse: cofactor set-up (commented) or random-TF set-up (active).
#data_path = "../for_Manu/TRAIN_DATA_COFACTORS_SUBSAMPLE_I_False/HNF4A_multi_1/h5_files/tf_peaks_HNF4A.h5"
data_path = "../for_Manu/TRAIN_DATA_RANDOM_SUBSAMPLE_I_False/HNF4A_multi_1/h5_files/tf_peaks_HNF4A.h5"

# The context manager closes the HDF5 handle once the splits have been copied
# into tensors, instead of leaving an open file behind for every section.
with h5py.File(data_path, 'r') as data:
    x = torch.Tensor(data['train_in'])
    y = torch.Tensor(data['valid_in'])
    z = torch.Tensor(data['test_in'])

    x_lab = torch.Tensor(data['train_out'])
    y_lab = torch.Tensor(data['valid_out'])
    z_lab = torch.Tensor(data['test_out'])

# Stack train, valid and test (in that order) into one set of inputs / labels.
res = torch.cat((x, y, z), dim=0)
res_lab = torch.cat((x_lab, y_lab, z_lab), dim=0)

all_dataset = torch.utils.data.TensorDataset(res, res_lab)
# shuffle=False keeps the batches in the row order of res / res_lab, which the
# index-based selection in later cells relies on. batch_size is the value
# configured at the top of the notebook rather than a repeated literal.
dataloader = torch.utils.data.DataLoader(all_dataset,
                                         batch_size=batch_size, shuffle=False,
                                         num_workers=0)

In [None]:
model = ConvNetDeep(5).to(device)

# Checkpoint to analyse: cofactor-trained weights (commented) or random-TF weights (active).
#weights_path = "../for_Manu/MODEL_WEIGHTS_COFACTORS_SUBSAMPLE_I_False/HNF4A_real_multimodel_weights_1/model_epoch_4_.pth"
weights_path = "../for_Manu/MODEL_WEIGHTS_RANDOM_SUBSAMPLE_I_False/HNF4A_real_multimodel_weights_1/model_epoch_4_.pth"
# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved on a GPU still loads when this notebook falls back to the CPU.
model.load_state_dict(torch.load(weights_path, map_location=device))
model.eval();

#copy trained model weights to motif extraction model
motif_model = motifCNN(model, 5).to(device)
motif_model.load_state_dict(model.state_dict())
motif_model.eval();

In [None]:
# Score every sequence with the full model, batch by batch, collecting the
# inputs, the sigmoid outputs and the labels in matching order.
sigmoid = nn.Sigmoid()
sequences, running_outputs, running_labels = [], [], []
with torch.no_grad():
    for seq_batch, lbl_batch in dataloader:
        # the inputs are recorded before the batch is moved to the device
        sequences.extend(seq_batch.numpy())
        logits = model(seq_batch.to(device))
        probs = sigmoid(logits.detach().cpu())  # for BCEWithLogits
        running_outputs.extend(probs.numpy())   # for BCEWithLogits
        running_labels.extend(lbl_batch.numpy())

# Convert the per-sample lists into arrays (first axis = sample).
running_labels = np.array(running_labels)
running_outputs = np.array(running_outputs)
sequences = np.array(sequences)

In [None]:
# Round the sigmoid outputs to the nearest integer to get hard 0/1 calls
# (NumPy rounds exact halves to the nearest even value, so 0.5 becomes 0).
pred_full_round = running_outputs.round()

In [None]:
# Element-wise agreement between the rounded predictions and the true labels.
arr_comp = np.equal(pred_full_round, running_labels)
# Keep the samples where at least 5 outputs agree with the labels
# (presumably all outputs of ConvNetDeep(5) — confirm).
# np.flatnonzero always yields a 1-D index array; the previous
# argwhere(...).squeeze() collapsed to a 0-d array when exactly one sample
# qualified, which breaks the sampling and fancy indexing in later cells.
idx = np.flatnonzero(np.sum(arr_comp, axis=1) >= 5)  # 43563 (count noted from an earlier run)

In [None]:
# Draw up to 80000 of the eligible samples without replacement.
# - The request is capped at the pool size: asking for more items than exist
#   with replace=False raises a ValueError.
# - np.atleast_1d guards against a 0-d idx (a single eligible sample).
# - A fixed-seed generator makes the subset identical on every run.
idx_pool = np.atleast_1d(idx)
sampled_idx = np.random.default_rng(0).choice(idx_pool,
                                              size=min(80000, idx_pool.size),
                                              replace=False)

# Restrict inputs and labels to the sampled rows.
res2 = res[sampled_idx, :, :]
res_lab2 = res_lab[sampled_idx, :]

dataset = torch.utils.data.TensorDataset(res2, res_lab2)
# Unshuffled so that outputs stay aligned with the rows of res2 / res_lab2;
# batch_size is the notebook-level setting instead of a repeated literal 100.
data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                          batch_size=batch_size, shuffle=False,
                                          num_workers=0)

In [None]:
# Run the selected sequences through motif_model. get_motifs comes from the
# star-imports at the top of the notebook and returns a pair, unpacked here.
# NOTE(review): presumably model predictions and convolution-filter
# activations — confirm against the definition of get_motifs.
predictions, activations = get_motifs(data_loader, motif_model, device)

In [None]:
# Path passed to get_memes in a following cell. The commented line is the
# variant for the cofactor model; the active line is the random-TF variant.
#output_file_path = "../for_Manu/motifs/motifs_for_HNF4A_multimodel.meme"
output_file_path = "../for_Manu/motifs/motifs_for_HNF4A_random_multimodel.meme"

In [None]:
# Hand the activations plus the sequences/labels they were computed on to
# get_memes (star-imported helper).
# NOTE(review): presumably writes MEME-format motifs to output_file_path — confirm.
get_memes(activations, res2, res_lab2, output_file_path)

### HNF4A individual 

In [None]:
# Data set to analyse: cofactor set-up (commented) or random-TF set-up (active).
# The handle is deliberately left open here: the next cell still reads
# data['target_labels'] from it.
#data = h5py.File("../for_Manu/TRAIN_DATA_COFACTORS_SUBSAMPLE_I_False/HNF4A_indiv_1/h5_files/HNF4A_tl.h5", 'r')
data = h5py.File("../for_Manu/TRAIN_DATA_RANDOM_SUBSAMPLE_I_False/HNF4A_indiv_1/h5_files/HNF4A_tl.h5", 'r')

# Inputs of the three splits.
x = torch.Tensor(data['train_in'])
y = torch.Tensor(data['valid_in'])
z = torch.Tensor(data['test_in'])

# Labels of the three splits.
x_lab = torch.Tensor(data['train_out'])
y_lab = torch.Tensor(data['valid_out'])
z_lab = torch.Tensor(data['test_out'])

# Stack train, valid and test (in that order) into one set of inputs / labels.
res = torch.cat((x, y, z), dim=0)
res_lab = torch.cat((x_lab, y_lab, z_lab), dim=0)

all_dataset = torch.utils.data.TensorDataset(res, res_lab)
# shuffle=False keeps the batches in the row order of res / res_lab, which the
# index-based selection in later cells relies on. batch_size is the value
# configured at the top of the notebook rather than a repeated literal.
dataloader = torch.utils.data.DataLoader(all_dataset,
                                         batch_size=batch_size, shuffle=False,
                                         num_workers=0)

In [None]:
# Read the stored target labels (byte strings) and decode each one as UTF-8.
target_labels = [raw_label.decode("utf-8") for raw_label in data['target_labels']]
# Bare expression so the notebook displays the decoded list.
target_labels

In [None]:
model = ConvNetDeep(1).to(device)

# Checkpoint to analyse: cofactor-trained weights (commented) or random-TF weights (active).
#weights_path = "../for_Manu/MODEL_WEIGHTS_COFACTORS_SUBSAMPLE_I_False/HNF4A_real_indiv_weights_1/HNF4A_tl_weights/model_epoch_4_.pth"
weights_path = "../for_Manu/MODEL_WEIGHTS_RANDOM_SUBSAMPLE_I_False/HNF4A_real_indiv_weights_1/HNF4A_tl_weights/model_epoch_4_.pth"
# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved on a GPU still loads when this notebook falls back to the CPU.
model.load_state_dict(torch.load(weights_path, map_location=device))
model.eval();

#copy trained model weights to motif extraction model
motif_model = motifCNN(model, 1).to(device)
motif_model.load_state_dict(model.state_dict())
motif_model.eval();

In [None]:
# Score every sequence with the full model, batch by batch, collecting the
# inputs, the sigmoid outputs and the labels in matching order.
sigmoid = nn.Sigmoid()
sequences, running_outputs, running_labels = [], [], []
with torch.no_grad():
    for seq_batch, lbl_batch in dataloader:
        # the inputs are recorded before the batch is moved to the device
        sequences.extend(seq_batch.numpy())
        logits = model(seq_batch.to(device))
        probs = sigmoid(logits.detach().cpu())  # for BCEWithLogits
        running_outputs.extend(probs.numpy())   # for BCEWithLogits
        running_labels.extend(lbl_batch.numpy())

# Convert the per-sample lists into arrays (first axis = sample).
running_labels = np.array(running_labels)
running_outputs = np.array(running_outputs)
sequences = np.array(sequences)

In [None]:
# Round the sigmoid outputs to the nearest integer to get hard 0/1 calls
# (NumPy rounds exact halves to the nearest even value, so 0.5 becomes 0).
pred_full_round = running_outputs.round()

In [None]:
# Element-wise agreement between the rounded predictions and the true labels.
arr_comp = np.equal(pred_full_round, running_labels)
# Keep the samples with at least one matching output (presumably the single
# output of ConvNetDeep(1) — confirm). np.flatnonzero always yields a 1-D index
# array; the previous argwhere(...).squeeze() collapsed to a 0-d array when
# exactly one sample qualified, which breaks the fancy indexing in later cells.
idx = np.flatnonzero(np.sum(arr_comp, axis=1) >= 1)  # 43563 (count noted from an earlier run)

In [None]:
# Restrict inputs and labels to the selected samples.
res2 = res[idx, :, :]
res_lab2 = res_lab[idx, :]

dataset = torch.utils.data.TensorDataset(res2, res_lab2)
# Unshuffled so that outputs stay aligned with the rows of res2 / res_lab2;
# batch_size is the notebook-level setting instead of a repeated literal 100.
data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                          batch_size=batch_size, shuffle=False,
                                          num_workers=0)

In [None]:
# Run the selected sequences through motif_model. get_motifs comes from the
# star-imports at the top of the notebook and returns a pair, unpacked here.
# NOTE(review): presumably model predictions and convolution-filter
# activations — confirm against the definition of get_motifs.
predictions, activations = get_motifs(data_loader, motif_model, device)

In [None]:
# Path passed to get_memes in a following cell. The commented line is the
# variant for the cofactor model; the active line is the random-TF variant.
#output_file_path = "../for_Manu/motifs/motifs_for_HNF4A_individual.meme"
output_file_path = "../for_Manu/motifs/motifs_for_HNF4A_random_individual.meme"

In [None]:
# Hand the activations plus the sequences/labels they were computed on to
# get_memes (star-imported helper).
# NOTE(review): presumably writes MEME-format motifs to output_file_path — confirm.
get_memes(activations, res2, res_lab2, output_file_path)

In [None]:
# Hand-recorded filter -> matched motif annotations for the HNF4A models.
# NOTE(review): these assignments reuse the names multi_cofactors,
# indiv_cofactors, multi_random and indiv_random from the JUND section, so the
# JUND dictionaries are overwritten once this cell runs.
#
# results for multimodel_cofactors (q.value 0.01)
# NR2F2 FOXA2 FOXA1 SP1 MYBL2
# NR2F2 and RXRA - 0.24 (RXRA and HNF4A are 0.41), FOXA2/FOXA1 and RXRA - 0.16/0.12
# SP1 and RXRA - 0.23, MYBL2 and RXRA - 0.20;
#
# and RXRA and HNFA are the same BM - 13
multi_cofactors = {
    "filter28": "RXRA/HNF4G/NR2C2", "filter66": "Gmeb1", "filter5": "FOXA1",
    "filter34": "Gmeb1", "filter12": "FOXJ3", "filter19": "Gmeb1",
    "filter64": "Fos:JUN", "filter99": "FOXA2", "filter6": "FOXA1",
    "filter2": "NR2F1", "filter92": "RXRA/NR4A2/HNF4G", "filter40": "NR1H4",
    "filter96": "PPARA:RXRA/HNF4G", "filter29": "PPARA:RXRA/HNF4G",
    "filter62": "RARA:RXRG/FOXA1", "filter71": "MEOX1",
    "filter78": "Gmeb1", "filter54": "Gmeb1", "filter13": "FOXJ3/NR1H3:RXRA",
    "filter21": "Fos:JUN/CREB1",
}

# NOTE(review): "filter40" is "NR1A4:RXRA" here but "NR1H4" in multi_cofactors —
# confirm which spelling is intended.
indiv_cofactors = {
    "filter28": "HNF4G", "filter96": "HNF4G", "filter66": "Gmeb1",
    "filter5": "FOXA1", "filter29": "HNF4G", "filter2": "RXRA/HNF4G",
    "filter99": "FOXA2", "filter92": "HNF4G", "filter12": "FOXJ3",
    "filter34": "Gmeb1", "filter16": "HNF4G", "filter19": "HNF4G",
    "filter71": "MEOX2", "filter62": "FOXA1", "filter27": "Gmeb1",
    "filter67": "FOXA1", "filter95": "Gmeb1", "filter13": "FOXJ3",
    "filter64": "Gmeb1", "filter47": "Gmeb1", "filter6": "FOXK1",
    "filter40": "NR1A4:RXRA",
}

# ['NR3C1', 'MEF2A', 'TFAP4 (NEUROD1/TWIST1/FIGLA)', 'KLF1', 'ATF1 (FOS:JUN)']
multi_random = {
    "filter18": "Gmeb1/FOS:JUN", "filter64": "Gmeb1", "filter14": "HES2/MYC",
    "filter17": "FIGLA/NEUROD1", "filter28": "FOS:JUN", "filter79": "NEUROD1/TWIST1",
    "filter2": "Gmeb1", "filter76": "MITF/USF2", "filter9": "NEUROD1/TWIST1",
    "filter5": "Gmeb1", "filter31": "Gmeb1", "filter70": "TWIST1",
    "filter43": "RARA:RXRG", "filter33": "Gmeb1", "filter29": "FOS:JUN",
    "filter85": "GATA1:TAL1", "filter27": "Gmeb1", "filter7": "USF2/ZEB1",
    "filter47": "FOS:JUN", "filter60": "Gmeb1", "filter12": "Myog",
    "filter36": "Gmeb1",
}

# No annotations recorded for the individual random-TF model.
indiv_random = dict()