In [1]:
import os
import random
import _pickle as pkl
from glob import glob

import numpy as np
import pandas as pd
import pgzip as gz
import seaborn as sns

## Reading Files

### X1: Sequence

In [2]:
# Map each peak ID (column 0 of the header-less TSV) to the value in the
# remaining column, which is later looked up character by character as a
# nucleotide sequence.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
# pandas 2.0; squeezing the column axis after reading yields the same Series.
dict_seq = (
    pd.read_csv('DATA_scMNT/X1.tsv', sep='\t', header=None, index_col=0)
    .squeeze('columns')
    .to_dict()
)

### X2: Gene Expression, Y: Epigenomic peaks
1. X1: peaks x 1000 bp
1. X2: genes x cells
1. Ys: peaks x cells x number of Y features

In [17]:
# Load the bundled archive: cell IDs, gene IDs, peak IDs, the X2 matrix and
# the Ys array are stored together as one pickled 5-tuple.
# NOTE: unpickling can execute arbitrary code; only load archives from a
# trusted source.
with gz.open('DATA_scMNT/X2_Y.pkl.gz') as archive:
    (l_cells, l_genes, l_peaks, np_X2, np_Ys) = pkl.load(archive)

## X1: Sequence processing

In [4]:
# One-hot lookup table for nucleotides. Channel order is A, C, G, T; lower- and
# upper-case letters share the same encoding, and the ambiguous base n/N maps
# to an all-zero row.
C_1HE = {
    **{base: [int(pos == hot) for pos in range(4)] for hot, base in enumerate('acgt')},
    **{base: [int(pos == hot) for pos in range(4)] for hot, base in enumerate('ACGT')},
    **{base: [0] * 4 for base in 'nN'},
}

def seq_to_numpy(seq):
    """One-hot encode a nucleotide string using the C_1HE table.

    Args:
        seq: iterable of single characters; each must be a key of C_1HE.

    Returns:
        NumPy array with one 4-element row per character of `seq`
        (shape (len(seq), 4) for a non-empty sequence), using NumPy's
        default integer dtype.

    Raises:
        KeyError: if `seq` contains a character that is not in C_1HE.
    """
    return np.array([C_1HE[base] for base in seq])

def peak_id_to_numpy(pid):
    """Look up the sequence stored for a peak ID and one-hot encode it.

    Args:
        pid: key into the module-level `dict_seq` mapping.

    Returns:
        The array produced by `seq_to_numpy` for that peak's sequence.

    Raises:
        KeyError: if `pid` is not present in `dict_seq` (or the sequence
            contains a character `seq_to_numpy` cannot encode).
    """
    return seq_to_numpy(dict_seq[pid])

In [5]:
# One-hot encode every peak, in l_peaks order. Each per-peak array is cast to
# int8 right away: the values are only 0/1, and keeping the intermediates at
# NumPy's default integer width makes this list several times larger than the
# final int8 array.
l_X1 = [ peak_id_to_numpy(x).astype(np.int8) for x in l_peaks ]

# Forward-strand tensor. This requires every sequence to have the same length;
# otherwise the rows cannot be stacked into one rectangular int8 array.
NP_seq_0 = np.array(l_X1, dtype=np.int8)

# Reverse complement: flipping axis 1 reverses the positions, and flipping
# axis 2 reverses the channel order (A, C, G, T -> T, G, C, A), which swaps
# A<->T and C<->G. All-zero rows (n/N) stay all-zero.
NP_seq_1 = np.flip(NP_seq_0, (1,2))

# Forward rows first, reverse-complement rows second, so NP_seq has
# 2 * len(l_peaks) rows.
# NOTE(review): presumably the consumer pairs row i + len(l_peaks) with the
# labels of peak i — confirm against the training code.
NP_seq = np.concatenate( [NP_seq_0, NP_seq_1] )

## Writing X1, X2, Ys and IDs

In [18]:
# Report the shape of the sequence tensor before it is written out.
print('Writing X1:seq x 1000bp =', np.shape(NP_seq))

Writing X1:seq x 1000bp = (882970, 1000, 4)


In [10]:
# Create the output directory on first use so the notebook also runs on a
# fresh checkout where TR_DATA/ does not exist yet.
os.makedirs('TR_DATA', exist_ok=True)

# Persist the one-hot sequence tensor as a gzip-compressed pickle.
with gz.open('TR_DATA/data_X1.pkl.gz', 'wb') as f:
    pkl.dump(NP_seq, f)

In [19]:
# Report the shape of the X2 matrix before it is written out.
print('Writing X2:genes x cells =', np.shape(np_X2))

Writing X2:genes x cells = (12817, 102)


In [20]:
# Persist the X2 matrix as a gzip-compressed pickle.
with gz.open('TR_DATA/data_X2.pkl.gz', 'wb') as out_fh:
    pkl.dump(np_X2, out_fh)

In [21]:
# Report the shape of the Ys array before it is written out.
print('Writing Ys:seq x cells x # of features =', np.shape(np_Ys))

Writing Ys:seq x cells x # of features = (441485, 102, 2)


In [22]:
# Persist the Ys array as a gzip-compressed pickle.
with gz.open('TR_DATA/data_Ys.pkl.gz', 'wb') as out_fh:
    pkl.dump(np_Ys, out_fh)

In [27]:
# Report how many cell IDs are about to be written.
print('Writing cells =', len(l_cells))

Writing cells = 102


In [28]:
# Persist the cell IDs as a gzip-compressed pickle.
with gz.open('TR_DATA/id_cells.pkl.gz', 'wb') as out_fh:
    pkl.dump(l_cells, out_fh)

In [29]:
# Report how many gene IDs are about to be written.
print('Writing genes =', len(l_genes))

Writing genes = 12817


In [30]:
# Persist the gene IDs as a gzip-compressed pickle.
with gz.open('TR_DATA/id_genes.pkl.gz', 'wb') as out_fh:
    pkl.dump(l_genes, out_fh)

In [31]:
# Report how many peak IDs are about to be written.
print('Writing peaks =', len(l_peaks))

Writing peaks = 441485


In [32]:
# Persist the peak IDs as a gzip-compressed pickle.
with gz.open('TR_DATA/id_peaks.pkl.gz', 'wb') as out_fh:
    pkl.dump(l_peaks, out_fh)