In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
import scipy.io as sio
import numpy as np
import scipy
import pandas as pd

from concise.preprocessing import encodeDNA
import h5py

## Summary
- Extract data from data files
- Preprocess data (clean, normalize, encode)
- Split the data into training, validation, and test sets and save each as an HDF5 file to support the `fit_generator` API in Keras

## Helper functions

In [10]:
## prepare Seq and Y value to train the model

def get_data(seqL=100,
             prefix='GACAAGCAGAAGAACGGCATCAAAGTGAACTTCAAGATCCGCCACAACATCGAGG',
             data_file='../data/Reads.mat',
             seq_file='../data/A5SS_Seqs.csv'):
    """Build sequence windows, per-position read fractions and read totals.

    Parameters
    ----------
    seqL : int
        Length of the sequence window extracted for every retained position.
    prefix : str
        Sequence placed in front of each minigene sequence; only its last
        ``int(seqL / 2)`` characters are used.
    data_file : str
        Path to a MATLAB file whose ``'A5SS'`` entry is a sparse read-count
        matrix (one row per minigene).
    seq_file : str
        Path to a CSV file with a ``Seq`` column, row-aligned with the
        read-count matrix.

    Returns
    -------
    X : numpy.ndarray of str, shape (n_kept, n_positions + 1)
        For each kept minigene, one ``seqL``-long window per retained position
        followed by an all-``'N'`` window paired with the unspliced column.
    Y : numpy.matrix, shape (n_kept, n_positions + 1)
        Read fractions for the retained positions followed by the unspliced
        (last) column.
    count : numpy.ndarray, shape (n_kept,)
        Total number of reads of each kept minigene.
    """
    A5SS_data = np.array(sio.loadmat(data_file)['A5SS'].todense())

    # Keep only the minigenes that have at least one read.
    rows_with_reads = np.flatnonzero(A5SS_data.sum(axis=1))
    A5SS_data = A5SS_data[rows_with_reads]

    # Total reads per minigene, then the fraction of reads at each position.
    count = A5SS_data.sum(axis=1)
    A5SS_data = A5SS_data / count[:, np.newaxis]

    # Only include the first 80 positions and the last position (unspliced).
    # np.matrix / np.hstack replace the deprecated scipy.matrix / scipy.hstack
    # aliases; the matrix type is kept so the returned Y is unchanged.
    YA5 = np.matrix(np.hstack((A5SS_data[:, :80], A5SS_data[:, -1:])))
    A5Seq = pd.read_csv(seq_file)['Seq'].iloc[rows_with_reads]

    # Positions used by at least one minigene. The trailing unspliced column is
    # excluded explicitly rather than by dropping the last non-zero index,
    # which would discard a real position whenever that column is all zero.
    position_totals = np.asarray(YA5[:, :-1].sum(axis=0)).ravel()
    nonZeroIndex = np.flatnonzero(position_totals != 0)

    ## extract X
    nonSlicingSeq = 'N' * seqL
    Seqs = list(prefix[len(prefix) - int(seqL / 2):] + A5Seq)
    # One row of windows per retained position, built in a single pass instead
    # of re-allocating the whole array with np.concatenate on every iteration.
    windows = [[seq[i:i + seqL] for seq in Seqs] for i in nonZeroIndex]
    windows.append([nonSlicingSeq] * len(Seqs))
    X = np.array(windows).T

    # extract Y
    Y = np.hstack((YA5[:, nonZeroIndex], YA5[:, -1:]))

    # Remove noise: keep only minigenes whose retained fractions sum to 1.
    # A small tolerance is used because the fractions are floating-point
    # quotients and an exact ``== 1`` test can reject clean rows.
    row_totals = np.asarray(Y.sum(axis=1)).ravel()
    unoiseIndexes = np.flatnonzero(np.abs(row_totals - 1.0) < 1e-9)
    X = X[unoiseIndexes]
    Y = Y[unoiseIndexes]
    count = count[unoiseIndexes]
    return X, Y, count

In [1]:
## extract X, Y from index start to end and save as .h5 file format
def create_h5_file(X, Y, start, end, file_name):
    """One-hot encode rows ``start:end`` of X and store them with Y in an HDF5 file.

    Parameters
    ----------
    X : numpy.ndarray
        2-D array of sequence strings (rows are samples, columns are windows).
    Y : array-like
        2-D target values, row-aligned with ``X``.
    start, end : int
        Row slice ``[start, end)`` to export; the slice is clamped to the
        available rows exactly like ordinary Python slicing.
    file_name : str
        Destination path; an existing file is overwritten.

    The file receives datasets ``"X"`` and ``"Y"`` (32-bit floats) and a
    ``sample_count`` attribute holding the number of exported rows.
    """
    x_rows = X[start:end]
    # Use the real slice length: ``start - end`` is negative, and ``end - start``
    # is wrong as soon as ``end`` runs past the last row.
    sample_count = x_rows.shape[0]

    # encodeDNA receives a flat list of sequences; its output is regrouped so
    # that axis 0 is the sample and axis 1 the window within that sample.
    encodeX = encodeDNA(np.array(np.ravel(x_rows), dtype='object'))
    encodeTrain = encodeX.reshape(sample_count, X.shape[1], encodeX.shape[1], encodeX.shape[2])
    Ytrain = Y[start:end, :]

    with h5py.File(file_name, "w") as f:
        f.attrs['sample_count'] = sample_count

        xDSet = f.create_dataset("X", encodeTrain.shape, dtype='f')
        xDSet[...] = encodeTrain

        yDset = f.create_dataset("Y", Ytrain.shape, dtype='f')
        yDset[...] = Ytrain


Plasmid sequence around the random sequence

"gacaagcagaagaacggcatcaaagtgaacttcaagatccgccacaacatcgaggtgcttggnnnnnnnnnnnnnnnnnnnnnnnnnggtcgacccaggttcgtgnnnnnnnnnnnnnnnnnnnnnnnnngaggtattcttatcaccttcgtggctacagagtttcctta"

## Extract X, Y

In [11]:
# Load sequences (X), targets (Y) and per-row read totals using get_data's default arguments.
X, Y, count = get_data()

## Create training/validation/test datasets

In [2]:
# Output directory for the generated HDF5 files.
resultsdir = '../results'
# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(resultsdir, exist_ok=True)

In [56]:
# Training split: rows [0, 150000). os.path.join supplies the path separator
# that plain string concatenation with resultsdir was missing.
create_h5_file(X, Y, 0, 150000, os.path.join(resultsdir, "train_data_full.hdf5"))

In [57]:
# Validation split: rows [150000, 200000). X and Y must be passed explicitly,
# and os.path.join keeps the file inside resultsdir.
create_h5_file(X, Y, 150000, 200000, os.path.join(resultsdir, "val_data_full.hdf5"))

In [58]:
# Test split: rows from 200000 to the end. X and Y must be passed explicitly,
# and os.path.join keeps the file inside resultsdir.
create_h5_file(X, Y, 200000, X.shape[0], os.path.join(resultsdir, "test_data_full.hdf5"))

In [59]:
# Drop the dataset arrays from the kernel namespace once the split files have been written.
del X, Y, count