# Data preprocessing
This notebook cleans data for TF-MoDISco workflow. Specifically, it 
- filters out sequences that contain all-zero positions (positions where no base is encoded)
- corrects the one-hot encoding format from AGCT to ACGT ordering
- creates a sample test dataset

In [1]:
# Select Environment(conda_kipoi-shared__env__kipoi-py3-keras2)
import os
import time
import h5py
import torch
import scipy.io
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Locations used throughout the notebook.
# data_dir   : folder the raw DeepSEA files are read from
# result_dir : folder the cleaned arrays are written to (currently the same folder)
data_dir = '../../../data/deepsea_train/'
result_dir = data_dir

### Load data

In [3]:
# Read the test split from its MATLAB file and report how long the load took.
test_path = data_dir + 'test.mat'
tic = time.time()
test_mat = scipy.io.loadmat(test_path)
# Only the 'testxdata' entry is used in this notebook; it is cast to float32.
# (The 'testdata' entry of the same file is not loaded here.)
X_test = test_mat['testxdata'].astype('float32')
toc = time.time()
elapsed = toc - tic
print(round(elapsed), 'sec elapsed')

12 sec elapsed


In [4]:
# Display the dimensions of the loaded array as a quick sanity check
# (the recorded output of this cell is (455024, 4, 1000)).
X_test.shape

(455024, 4, 1000)

### Delete sequences containing all-zero positions

In [5]:
# Collect the indices of sequences that contain at least one position whose
# encoding is zero in every channel (axis 1 of X_test). A single such position
# is enough to flag the sequence; it does not have to be zero everywhere.
#
# The scan is vectorised and run over fixed-size slices of X_test, so the
# boolean temporaries stay small compared with the full array. Slices are
# views, so no sequence data is copied.
# NOTE: assumes X_test is 3-D, (n_seqs, channels, length).
CHUNK_SIZE = 100000  # slice size; also the cadence of the progress messages
idx = []
for start in range(0, X_test.shape[0], CHUNK_SIZE):
    chunk = X_test[start:start + CHUNK_SIZE]
    # (n, length) mask: True where no channel is non-zero at that position
    blank_position = ~chunk.any(axis=1)
    # (n,) mask: True for sequences with at least one blank position
    has_blank = blank_position.any(axis=1)
    # flatnonzero is relative to the chunk, so shift back to global indices
    idx.extend((start + np.flatnonzero(has_blank)).tolist())
    print(start, 'seqs checked')

0 seqs checked
100000 seqs checked
200000 seqs checked
300000 seqs checked
400000 seqs checked


In [6]:
# Remove the flagged sequences along axis 0 using a boolean keep-mask.
# NOTE: idx holds positions in the unfiltered array and X_test is rebound here,
# so this cell must be run exactly once after idx has been computed.
keep = np.ones(X_test.shape[0], dtype=bool)
keep[idx] = False
X_test = X_test[keep]

In [7]:
# Persist the filtered array so the next stage can start from a fresh kernel.
clean_path = result_dir + 'X_test_clean.npy'
np.save(clean_path, X_test)

### Correct the one-hot encoding format
Restart the kernel before running the cells below (re-running the import and path cells first) so that memory is freed and the kernel does not crash.

In [3]:
# Reload the filtered sequences that were written to result_dir earlier.
X_test = np.load(result_dir + 'X_test_clean.npy')

In [4]:
# Reorder the channel axis from AGCT to ACGT by swapping channels 1 and 2.
# The swap is done in place on X_test with a single temporary copy of one
# channel, which is released right away so it does not linger in the kernel.
# WARNING: this cell is not idempotent. Running it a second time swaps the
# channels back to AGCT, so run it exactly once after loading.
channel_1 = X_test[:, 1, :].copy()   # G under the incoming AGCT order
X_test[:, 1, :] = X_test[:, 2, :]    # C moves into slot 1
X_test[:, 2, :] = channel_1          # G moves into slot 2
del channel_1

In [5]:
# Reshape for kipoi input: insert a length-1 axis at position 2.
# NOTE: X_test is rebound here, so re-running the cell adds another axis.
X_test = X_test[:, :, np.newaxis]
X_test.shape

(454912, 4, 1, 1000)

### Create a sample test set

In [6]:
# Draw a fixed random subset of the sequences; the full set would crash the
# server when importance scores are computed.
sample_seed = 42
sample_size = 10000
np.random.seed(sample_seed)  # fixed seed so the same subset is drawn each run
shuffled = np.random.permutation(X_test.shape[0])
sample_idx = shuffled[:sample_size]
X_test_sample = X_test[sample_idx]
print(X_test_sample.shape)

(10000, 4, 1, 1000)


In [7]:
# Save both arrays to result_dir for future use: the full reordered set and
# the random sample drawn from it.
outputs = (
    ('X_test_final.npy', X_test),
    ('X_test_sample.npy', X_test_sample),
)
for filename, array in outputs:
    np.save(result_dir + filename, array)