In [1]:
import pandas as pd
import numpy as np
import scipy.io as sio
import scipy as scipy
import itertools as it
import os
from concise.preprocessing import encodeDNA

Using TensorFlow backend.


In [None]:
import sys
sys.path.append('../helper')
import common as cm

Using TensorFlow backend.
  'Matplotlib is building the font cache using fc-list. '


## Summary

- Extract X and Y from the data files, preprocess them (clean, normalize, encode), and save the preprocessed data to file for modelling
<br />
- Create the cross-validation partition information and save it to an external file

## Helper functions

In [3]:
def featurize_donor_seq(seqs, boundary = 40, end = 100):
    """Split every sequence into two consecutive segments.

    Parameters
    ----------
    seqs : iterable
        Sliceable items (e.g. strings). Each slice is converted with ``str``.
    boundary : int, default 40
        Split position; the first segment is ``seq[0:boundary]``.
    end : int, default 100
        Exclusive end of the second segment, ``seq[boundary:end]``.
        The default reproduces the previously hard-coded value of 100.

    Returns
    -------
    tuple of (list of str, list of str)
        ``seq1`` with the ``[0:boundary]`` slices and ``seq2`` with the
        ``[boundary:end]`` slices, both in input order.
    """
    seq1 = []
    seq2 = []
    # Iterate over the items directly instead of using seqs[i]: this also
    # supports inputs without len() or 0..n-1 positional lookup (iterators,
    # a pandas Series with a non-default index).
    for cur_seq in seqs:
        seq1.append(str(cur_seq[:boundary]))
        seq2.append(str(cur_seq[boundary:end]))
    return seq1, seq2

In [4]:
def get_data(data_file = '../data/Reads.mat', seq_file = '../data/A5SS_Seqs.csv'):
    """Load the A5SS read counts and sequences and build X1, X2 and Y.

    Parameters
    ----------
    data_file : str
        Path to a MATLAB file whose ``'A5SS'`` entry holds the read counts
        (a sparse matrix, since it is densified below).
    seq_file : str
        Path to a CSV file with a ``Seq`` column; its first column is used
        as the index.

    Returns
    -------
    X1, X2 : list of str
        The two segments of every kept sequence, as returned by
        ``featurize_donor_seq``.
    Y : numpy.matrix
        Row-normalized read counts restricted to the first 80 columns plus
        the last column, for rows with a non-zero read total.
    """
    data = sio.loadmat(data_file)
    # A5SS read counts, densified so rows can be filtered and normalized
    A5SS_data = data['A5SS'].toarray()

    # Keep only minigenes (rows) whose total read count is non-zero; this
    # also guarantees the normalization below never divides by zero.
    A5SS_nn = np.where(A5SS_data.sum(axis=1))
    A5SS_data = A5SS_data[A5SS_nn]

    # Normalize data: every row is divided by its own sum
    A5SS_data = A5SS_data/A5SS_data.sum(axis=1)[:,np.newaxis]

    # Only include the first 80 positions and the last position (unspliced).
    # np.hstack / np.matrix replace the deprecated scipy.hstack / scipy.matrix
    # aliases; np.matrix is kept so the return type stays the same.
    Y = np.matrix(np.hstack((A5SS_data[:,:80], A5SS_data[:,-1:])))

    # Get X: sequences of the kept rows.
    # NOTE(review): Seq[...] selects by index label when the CSV index is
    # integer-valued; this assumes labels equal row positions -- confirm.
    A5SS_seqs = pd.read_csv(seq_file,index_col=0).Seq[A5SS_nn[0]]
    X1, X2 = featurize_donor_seq(A5SS_seqs.values)
    return X1, X2, Y

## Extract data and save to results folder
    

In [5]:
# Create the folder where results are saved: '../results' is resolved to an
# absolute path and handed to the project helper cm.create_folder.
# NOTE(review): resultsdir is later used as a path prefix, so create_folder
# presumably returns the folder path -- confirm in ../helper/common.py.
resultsdir = cm.create_folder(os.path.abspath('../results'))

In [7]:
# Load the two sequence segments (X1, X2) and the normalized targets (Y)
X1, X2, Y = get_data()

# Encode both sequence segments with concise's encodeDNA
encodeSeq1 = encodeDNA(X1)
encodeSeq2 = encodeDNA(X2)

# Save the raw segments, the targets and the encodings into the results
# folder; os.path.join builds the path instead of concatenating with '/'.
sio.savemat(
    os.path.join(resultsdir, 'feature_donor_seq.mat'),
    {'X1': X1, 'X2': X2, 'Y': Y, 'encodeSeq1': encodeSeq1, 'encodeSeq2': encodeSeq2}
)

In [10]:
# Partition the samples into 10 folds for cross validation
partition = cm.partition_data( 10, len(Y))
# NOTE(review): the file stem 'paritions' looks like a typo for 'partitions';
# it is left unchanged because downstream loaders presumably expect this name.
# os.path.join builds the path instead of concatenating with '/'.
np.save(os.path.join(resultsdir, 'paritions'), partition)

## Clean up (delete variables that are no longer needed)

In [11]:
# Drop the names bound to the extracted data and the fold partition; both have
# already been written to the results folder above.
del X1, X2, Y, partition