In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import multibind as mb
import numpy as np
import pandas as pd
import torch
import bindome as bd
bd.constants.ANNOTATIONS_DIRECTORY = 'annotations'
# mb.models.MultiBind
import torch.optim as topti
import torch.utils.data as tdata
import matplotlib.pyplot as plt
import logomaker
import seaborn as sns
from sklearn.metrics import r2_score

# Use a GPU if available, as it should be faster.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device: " + str(device))

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


In [3]:
%load_ext line_profiler

In [3]:
from matplotlib import rcParams
rcParams['figure.figsize'] = 5, 1

In [4]:
df = mb.bindome.datasets.ProBound.ctcf(flank_length=0)
data = df
# data = df.sample(n=2000)
# data['seq'] = data['seq'].str[-30:]

In [5]:
data

Unnamed: 0,seq,0,1
0,AAAAAAAGCCCGGAAATAGGCAACTTGTAG,0,1
1,AAAAAAAGGATGTTCCTAGCAACTTATAAA,1,0
2,AAAAAACAACGATAACCAACTGCTGCCGGA,0,1
3,AAAAAACACATGTATGAGTTTTTGATGGAG,1,0
4,AAAAAACCCTCCTTGGTGTCGGACGGCTAT,0,1
...,...,...,...
120091,TTTTTTTTCTTCATTGTTACAGTAGGTAGC,1,0
120092,TTTTTTTTGACTGCTTGGCTGGCTCCTGTG,1,0
120093,TTTTTTTTGGTCGGATTCGCTGTTGTTCAC,0,1
120094,TTTTTTTTTGAACCGGCCGCTCCTATGATC,1,0


In [5]:
n_rounds = 1
dataset = mb.datasets.SelexDataset(data, n_rounds=n_rounds)
train = tdata.DataLoader(dataset=dataset, batch_size=256, shuffle=True)

## Optimizing last steps of forward

In [7]:
model = mb.models.DinucSelex(use_dinuc=False, kernels=[0, 14, 12], n_rounds=1, n_batches=1).to(device)
optimiser = topti.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)
# optimiser = topti.LBFGS(model.parameters())
criterion = mb.tl.PoissonLoss()

In [16]:
i, batch = enumerate(train).__next__()
mononuc = batch["mononuc"].to(device)
b = batch["batch"].to(device) if "batch" in batch else None
countsum = batch["countsum"].to(device) if "countsum" in batch else None
inputs = (mononuc, b, countsum)

optimiser.zero_grad()

In [24]:
%lprun -f model.forward model(inputs)

Timer unit: 1e-06 s

Total time: 0.006034 s
File: /home/johanna/ICB/multibind/multibind/models/models.py
Function: forward at line 105

Line #      Hits         Time  Per Hit   % Time  Line Contents
   105                                               def forward(self, x, min_value=1e-15):
   106                                                   # Create the forward pass through the network.
   107         1          1.0      1.0      0.0          mono, batch, countsum = x
   108                                           
   109                                                   # convert mono to dinuc
   110                                                   # print(mono.shape)
   111                                           
   112                                                   # print(mono.shape, di.shape)
   113                                                   # assert False
   114                                           
   115                                                 

## Testing flip vs advanced indexing for _mono2revmono

In [6]:
i, batch = enumerate(train).__next__()
mononuc = batch["mononuc"].to(device)
b = batch["batch"].to(device) if "batch" in batch else None
countsum = batch["countsum"].to(device) if "countsum" in batch else None
inputs = (mononuc, b, countsum)

In [7]:
mononuc.shape

torch.Size([256, 4, 30])

In [8]:
mononuc.device

device(type='cuda', index=0)

In [16]:
def _mono2revmono_flip(x):
    return torch.flip(x, [2])[:, [3, 2, 1, 0], :]

In [66]:
%timeit _mono2revmono_flip(mononuc) #CPU

62.5 µs ± 8.34 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [42]:
%timeit _mono2revmono_flip(mononuc) #GPU

67.4 µs ± 109 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [18]:
def _mono2revmono_flip2(x):
    return torch.flip(x, [1, 2])

In [68]:
%timeit _mono2revmono_flip2(mononuc) #CPU

18.6 µs ± 135 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [43]:
%timeit _mono2revmono_flip2(mononuc) #GPU

73.4 µs ± 150 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [24]:
def _mono2revmono_index(x):
    n = mononuc.shape[2]
    reverse_index = torch.arange(n - 1, -1, -1, device=mononuc.device)
    # reverse_index
    compl_bases = torch.tensor([3, 2, 1, 0], device=mononuc.device)
    compl_bases = compl_bases.repeat(30, 1).T
    reverse_index = reverse_index.repeat(4, 1)
    return mononuc[:, compl_bases, reverse_index]

In [70]:
%timeit _mono2revmono_index(mononuc) #CPU

47 µs ± 2.73 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [44]:
%timeit _mono2revmono_index(mononuc) #GPU

141 µs ± 202 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [28]:
def _mono2revmono_index2(x):
    n = mononuc.shape[2]
    reverse_index = torch.arange(n - 1, -1, -1, device=mononuc.device)
    # reverse_index
    compl_bases = torch.tensor([3, 2, 1, 0], device=mononuc.device)
    compl_bases = compl_bases.expand(30, 4).T
    reverse_index = reverse_index.expand(4, 30)
    return mononuc[:, compl_bases, reverse_index]

In [72]:
%timeit _mono2revmono_index2(mononuc) #CPU

38.4 µs ± 2.86 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [45]:
%timeit _mono2revmono_index2(mononuc) #GPU

113 µs ± 88.5 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [36]:
%lprun -f _mono2revmono_index2 _mono2revmono_index2(mononuc)

Timer unit: 1e-06 s

Total time: 0.00049 s
File: <ipython-input-28-ede875c70c4c>
Function: _mono2revmono_index2 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def _mono2revmono_index2(x):
     2         1          9.0      9.0      1.8      n = mononuc.shape[2]
     3         1        204.0    204.0     41.6      reverse_index = torch.arange(n - 1, -1, -1, device=mononuc.device)
     4                                               # reverse_index
     5         1        107.0    107.0     21.8      compl_bases = torch.tensor([3, 2, 1, 0], device=mononuc.device)
     6         1         30.0     30.0      6.1      compl_bases = compl_bases.expand(30, 4).T
     7         1          5.0      5.0      1.0      reverse_index = reverse_index.expand(4, 30)
     8         1        135.0    135.0     27.6      return mononuc[:, compl_bases, reverse_index]

In [51]:
def _mono2revmono_index3(x):
    n = mononuc.shape[2]
    reverse_index = torch.tensor(np.arange(n-1, -1, -1), device=mononuc.device)
    # reverse_index
    compl_bases = torch.tensor([3, 2, 1, 0], device=mononuc.device)
    compl_bases = compl_bases.expand(30, 4).T
    reverse_index = reverse_index.expand(4, 30)
    return mononuc[:, compl_bases, reverse_index]

In [52]:
%timeit _mono2revmono_index3(mononuc) #GPU

129 µs ± 116 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [9]:
n = mononuc.shape[2]
reverse_index = torch.tensor(np.arange(n-1, -1, -1), device=mononuc.device).expand(4, 30)
compl_bases = torch.tensor([3, 2, 1, 0], device=mononuc.device).expand(30, 4).T

def _mono2revmono_index4(x):
    return mononuc[:, compl_bases, reverse_index]

In [10]:
%timeit _mono2revmono_index4(mononuc) #GPU

25.8 µs ± 95.4 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [11]:
from timeit import default_timer as timer
def measure_time_gpu(func, x, n_repeats=10000, n_runs=7, n_warmups=10):
    for _ in range(n_warmups):
        _ = func(x)
        
    elapsed_time_ms = 0

    for _ in range(n_runs):
        start = timer()
        for _ in range(n_repeats):
            _ = func(x)
        torch.cuda.synchronize()
        end = timer()
        elapsed_time_ms = (end - start) * 1000

    return elapsed_time_ms / (n_repeats*n_runs)

In [63]:
measure_time_gpu(_mono2revmono_flip, mononuc)

0.009582211183650153

In [64]:
measure_time_gpu(_mono2revmono_flip2, mononuc)

0.010386869417769569

In [65]:
measure_time_gpu(_mono2revmono_index, mononuc)

0.020045482793024608

In [66]:
measure_time_gpu(_mono2revmono_index2, mononuc)

0.016025118849107196

In [12]:
measure_time_gpu(_mono2revmono_index4, mononuc)

0.0037535334272044045

In [21]:
import time
import torch

n = 1024
batch_size = 256
ntrials = 1000
x = torch.randn(batch_size, n)

In [75]:
start = time.perf_counter()
[x.flip(-1) for _ in range(ntrials)]
end = time.perf_counter()
print('Flip time (CPU): {}s'.format(end - start))

reverse_index = torch.arange(n - 1, -1, -1)
start = time.perf_counter()
[x[..., reverse_index] for _ in range(ntrials)]
end = time.perf_counter()
print('Advanced indexing time (CPU): {}s'.format(end - start))  #calculated on my local CPU

Flip time (CPU): 0.0904085500005749s
Advanced indexing time (CPU): 0.12989644399931422s


In [22]:
start = time.perf_counter()
[x.flip(-1) for _ in range(ntrials)]
end = time.perf_counter()
print('Flip time (CPU): {}s'.format(end - start))

reverse_index = torch.arange(n - 1, -1, -1)
start = time.perf_counter()
[x[..., reverse_index] for _ in range(ntrials)]
end = time.perf_counter()
print('Advanced indexing time (CPU): {}s'.format(end - start))  #calculated on a GPU server

Flip time (CPU): 2.085365444421768s
Advanced indexing time (CPU): 0.8215824514627457s


In [41]:
start = time.perf_counter()
[x.flip(-1) for _ in range(ntrials)]
end = time.perf_counter()
print('Flip time (CPU): {}s'.format(end - start))

start = time.perf_counter()
reverse_index = torch.arange(n - 1, -1, -1)
[x[..., reverse_index] for _ in range(ntrials)]
end = time.perf_counter()
print('Advanced indexing time (CPU): {}s'.format(end - start))  #calculated on a GPU server

Flip time (CPU): 0.012969546020030975s
Advanced indexing time (CPU): 0.05524316430091858s


In [23]:
x = x.to('cuda')
reverse_index = reverse_index.to('cuda')

torch.cuda.synchronize()
start = time.perf_counter()
[x.flip(-1) for _ in range(ntrials)]
torch.cuda.synchronize()
end = time.perf_counter()
print('Flip time (CUDA): {}s'.format(end - start))

start = time.perf_counter()
[x[..., reverse_index] for _ in range(ntrials)]
torch.cuda.synchronize()
end = time.perf_counter()
print('Advanced indexing time (CUDA): {}s'.format(end - start))

Flip time (CUDA): 0.09787975251674652s
Advanced indexing time (CUDA): 0.021840713918209076s


In [40]:
x = x.to('cuda')
reverse_index = reverse_index.to('cuda')

torch.cuda.synchronize()
start = time.perf_counter()
[x.flip(-1) for _ in range(ntrials)]
torch.cuda.synchronize()
end = time.perf_counter()
print('Flip time (CUDA): {}s'.format(end - start))

start = time.perf_counter()
reverse_index = torch.arange(n - 1, -1, -1, device=x.device)  # creating reverse_index within measured time
[x[..., reverse_index] for _ in range(ntrials)]
torch.cuda.synchronize()
end = time.perf_counter()
print('Advanced indexing time (CUDA): {}s'.format(end - start))

Flip time (CUDA): 0.013288140296936035s
Advanced indexing time (CUDA): 0.023818977177143097s


## Testing encoding methods for storing the reverse strand
This works with numpy on CPU, since it is dataloading.

In [6]:
n_rounds = 1

In [10]:
%timeit dataset = mb.datasets.SelexDataset(data, n_rounds=n_rounds, single_encoding_step=False)

2.95 s ± 26.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
%timeit dataset = mb.datasets.SelexDataset(data, n_rounds=n_rounds, single_encoding_step=True)

1.2 s ± 11.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [48]:
dataset1 = mb.datasets.SelexDataset(data, n_rounds=n_rounds, single_encoding_step=False)
dataset2 = mb.datasets.SelexDataset(data, n_rounds=n_rounds, single_encoding_step=True)

False

In [53]:
np.all(dataset2.mononuc == dataset1.mononuc)

True

In [54]:
mononuc = dataset1.mononuc

In [55]:
def revert_onehot_mononuc(mononuc):
    return np.flip(mononuc, (1, 2))

In [56]:
%timeit mononuc_rev = revert_onehot_mononuc(mononuc)

1.78 µs ± 6.21 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [61]:
def revert_onehot_mononuc2(mononuc):
    n = mononuc.shape[2]
    reverse_index = np.arange(n - 1, -1, -1)
    # reverse_index
    compl_bases = np.array([3, 2, 1, 0])
    compl_bases = np.tile(compl_bases, (30, 1)).T
    reverse_index = np.tile(reverse_index, (4, 1))
    return mononuc[:, compl_bases, reverse_index]

In [62]:
%timeit mononuc_rev = revert_onehot_mononuc2(mononuc)

18.4 ms ± 229 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [63]:
mononuc_rev1 = revert_onehot_mononuc(mononuc)
mononuc_rev2 = revert_onehot_mononuc2(mononuc)
np.all(mononuc_rev1 == mononuc_rev2)

True