In [60]:
import keras

Using TensorFlow backend.


In [61]:
import numpy as np
import random
import pandas as pd

np.random.seed(55555)
random.seed(55555)

In [62]:
N_SENTENCES = 10000
TEST_RATIO = .5
MAX_LENGTH = 50
N_EPOCHS = 10
N_STATES = 4
BATCH_SIZE = 32

In [63]:
def solve_expr(S):
    """Evaluate a flat boolean expression strictly left to right.

    The expression is scanned one character at a time with no operator
    precedence: each '0'/'1' operand is combined with the running result
    using the most recently seen operator ('&' or '|').  The running result
    starts at 1 and the pending operator starts as '&', so the first operand
    is taken as-is.  Any other character (including the 'X' padding symbol)
    is ignored.

    Parameters
    ----------
    S : iterable of single-character strings (typically a str)

    Returns
    -------
    int
        0 or 1; 1 for an input containing no operands.
    """
    acc = 1
    pending_op = '&'
    for token in S:
        if token in ('&', '|'):
            # Remember the operator; it is applied when the next operand arrives.
            pending_op = token
            continue
        if token not in ('0', '1'):
            # Padding and any unrecognised symbol do not affect the result.
            continue
        bit = int(token)
        acc = (acc & bit) if pending_op == '&' else (acc | bit)
    return acc

def gen_expr(n_expr, max_size):
    """Generate random boolean expressions together with their solutions.

    The `random` module is re-seeded with 55555 on every call, so the same
    arguments always yield the same expressions.  Each expression has an odd
    length between 1 and ``max_size`` and alternates operands ('0'/'1', at
    even positions) with operators ('&'/'|', at odd positions).

    Parameters
    ----------
    n_expr : int
        Number of expressions to generate.
    max_size : int
        Upper bound on the expression length in characters.

    Returns
    -------
    pandas.DataFrame
        Columns 'sequence' (the expression string) and 'target' (the value
        returned by ``solve_expr`` for that expression).
    """
    random.seed(55555)

    expressions, solutions = [], []
    for _ in range(n_expr):
        # Draw a length and force it to be odd without leaving [1, max_size].
        length = random.randint(1, max_size)
        if length % 2 == 0:
            length += 1 if length < max_size else -1

        # One random bit per position: an operand at even positions,
        # an index into '&|' at odd positions.
        tokens = []
        for pos in range(length):
            pick = random.randint(0, 1)
            tokens.append('&|'[pick] if pos % 2 else str(pick))
        expr = ''.join(tokens)

        expressions.append(expr)
        solutions.append(solve_expr(expr))

    return pd.DataFrame({
        'sequence': expressions,
        'target'  : solutions
    })
        
df = gen_expr(N_SENTENCES, MAX_LENGTH)

In [64]:
print(df.head())
print('Max size:', max(len(s) for s in df['sequence']))

                                        sequence  target
0                                          0|1&0       0
1                                    0|1&1&1|1&0       0
2              1&1|1|1|1&1|0|1|0&0|0&0|1&1&1&0&0       0
3  0&0|1|0|0|0|0|1&0|0|1|1|1|1&1|1&1&0&0|0|0|1|1       1
4                        1&0&0|1|1|0|0|0|0|0&1|0       1
Max size: 49


In [65]:
def pad(s, L):
    """Left-pad ``s`` with 'X' characters so that it is exactly ``L`` long.

    Raises
    ------
    ValueError
        If ``s`` is already longer than ``L``.
    """
    missing = L - len(s)
    if missing < 0:
        raise ValueError('Nope')
    return 'X' * missing + s

# Pad to the configured MAX_LENGTH instead of a hard-coded 50 so this cell
# stays consistent with the constant used to generate the expressions.
df['sequence'] = [pad(s, MAX_LENGTH) for s in df['sequence']]

In [66]:
df

Unnamed: 0,sequence,target
0,XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0...,0
1,XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|1&1&1...,0
2,XXXXXXXXXXXXXXXXX1&1|1|1|1&1|0|1|0&0|0&0|1&1&1...,0
3,XXXXX0&0|1|0|0|0|0|1&0|0|1|1|1|1&1|1&1&0&0|0|0...,1
4,XXXXXXXXXXXXXXXXXXXXXXXXXXX1&0&0|1|1|0|0|0|0|0...,1
...,...,...
9995,XXXXXXX0&0|0&0|0|0|0|0|0&0&0&0&1|0&1|1&1|0|0|0...,0
9996,XXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|1&1&0&0&1|1|0|1...,1
9997,XXXXXXXXXXXXXXXXXXXXXXX1|0&1|1&0&0|1&1&0&1&0&0...,0
9998,XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|1|1|0|0|0|0...,1


In [67]:
# End index of the training slice: the (1 - TEST_RATIO) share of the rows,
# rounded down to a multiple of BATCH_SIZE.
# NOTE(review): presumably required because the LSTM layer is built with a
# fixed batch_size - confirm.
TRAIN_SPLIT = int(len(df)*(1-TEST_RATIO))
TRAIN_SPLIT = TRAIN_SPLIT - TRAIN_SPLIT%BATCH_SIZE

# End index of the test slice: the total row count rounded down to a multiple
# of BATCH_SIZE. Rows from TEST_SPLIT onward are not used by either split.
TEST_SPLIT = int(len(df))
TEST_SPLIT = TEST_SPLIT - TEST_SPLIT%BATCH_SIZE

# Rows [0, TRAIN_SPLIT) form the training set.
train_sequences = list(df['sequence'][:TRAIN_SPLIT])
train_labels = list(df['target'][:TRAIN_SPLIT])

# Rows [TRAIN_SPLIT, TEST_SPLIT) form the test set.
test_sequences = list(df['sequence'][TRAIN_SPLIT:TEST_SPLIT])
test_labels = list(df['target'][TRAIN_SPLIT:TEST_SPLIT])

print(len(train_sequences))
print(len(test_sequences))

4992
4992


In [68]:
char2int = {'X': 0, '0':1, '1':2, '|':3, '&':4}

# ENCODING MATRICES OF CHARS
class TwoDimEncoders:
    """Static helpers that turn character sequences into one-hot tensors.

    The pipeline has two stages: characters -> integer codes
    (``raw_to_encoded``) and integer codes -> binary tensor
    (``encoded_to_bin_tensor``).  ``raw_to_bin_tensor`` chains both.
    """

    @staticmethod
    def raw_to_encoded(seqs, seqs2=None, cust_char2int=None):
        """Replace every character of every sequence by its integer code.

        Parameters
        ----------
        seqs : list of iterables of characters
            Primary collection of sequences.
        seqs2 : list of iterables of characters, optional
            Second collection encoded with the same mapping.
        cust_char2int : dict, optional
            Mapping character -> int to use.  When omitted, the mapping is
            built from the sorted set of characters found in ``seqs`` (and
            ``seqs2``) and the vocabulary size is printed.

        Returns
        -------
        tuple
            ``(encoded_seqs, char2int)`` when ``seqs2`` is None, otherwise
            ``(encoded_seqs, char2int, encoded_seqs2)``.

        Raises
        ------
        KeyError
            If a sequence contains a character missing from the mapping.
        """
        if cust_char2int is None:
            # set().union(*...) replaces the former reduce(...) call, which
            # was never imported (NameError on Python 3) and returned a plain
            # str - not a set - when given a single sequence.
            all_chars = set().union(*seqs)
            if seqs2 is not None:
                all_chars |= set().union(*seqs2)
            all_chars = sorted(all_chars)

            char2int = {c: i for i, c in enumerate(all_chars)}
            print("Total vocabulary len_sequence: ", len(all_chars))
        else:
            char2int = cust_char2int

        encoded_seqs = [[char2int[x] for x in seq] for seq in seqs]
        if seqs2 is None:
            return encoded_seqs, char2int

        encoded_seqs2 = [[char2int[x] for x in seq] for seq in seqs2]
        return encoded_seqs, char2int, encoded_seqs2

    @staticmethod
    def encoded_to_bin_tensor(enc_seqs, char2int, enc_seqs2=None,
                                start_at_min=False):
        """One-hot encode integer sequences.

        Parameters
        ----------
        enc_seqs : list of lists of int
            Encoded sequences; the length of the first one fixes the time
            dimension of every returned tensor.
        char2int : dict
            Mapping whose values determine the size of the last axis.
        enc_seqs2 : list of lists of int, optional
            Second collection encoded into a tensor with the same time and
            channel dimensions.
        start_at_min : bool
            When True, channel 0 corresponds to the smallest code in
            ``char2int``; otherwise channel 0 corresponds to code 0.

        Returns
        -------
        numpy.ndarray or tuple of two numpy.ndarray
            Integer tensor(s) of shape (n_sequences, seq_len, n_chars).
        """
        min_x = min(char2int.values()) if start_at_min else 0
        n_chars = max(char2int.values()) - min_x + 1
        seq_len = len(enc_seqs[0])

        def _one_hot(encoded):
            # Builtin int replaces the np.int alias removed in NumPy 1.24;
            # both name the same dtype.
            tensor = np.zeros((len(encoded), seq_len, n_chars), dtype=int)
            for i, enc_seq in enumerate(encoded):
                for j, x in enumerate(enc_seq):
                    tensor[i, j, x - min_x] = 1
            return tensor

        X = _one_hot(enc_seqs)
        if enc_seqs2 is None:
            return X

        return X, _one_hot(enc_seqs2)

    @staticmethod
    def raw_to_bin_tensor(seqs, seqs2=None, cust_char2int=None):
        """Encode raw character sequences straight to one-hot tensors.

        Returns
        -------
        tuple
            ``(X, char2int)`` when ``seqs2`` is None, otherwise
            ``(X, X2, char2int)``.
        """
        encoding_out = TwoDimEncoders.raw_to_encoded(seqs, seqs2, cust_char2int)
        if len(encoding_out) == 2:
            encoded_seqs, char2int = encoding_out
            encoded_seqs2 = None
        else:
            encoded_seqs, char2int, encoded_seqs2 = encoding_out

        bin_tensors = TwoDimEncoders.encoded_to_bin_tensor(
            encoded_seqs, char2int, encoded_seqs2)

        # A single tensor comes back bare; two tensors come back as a tuple.
        if not isinstance(bin_tensors, tuple):
            return bin_tensors, char2int

        return bin_tensors + (char2int,)

In [71]:
# Encode both splits in a single call so that X_test (needed by
# model.evaluate) exists after a fresh Restart & Run All.
X_train, X_test, char2int = TwoDimEncoders.raw_to_bin_tensor(
    train_sequences, test_sequences, cust_char2int=char2int)

In [76]:
X_train[0]

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0,

In [116]:
# Sanity check: decode the first one-hot sample back into its characters.
int2char = {0: 'X', 1: '0', 2: '1', 3: '|', 4: '&'}
seq = ''.join(
    int2char[col]
    for row in X_train[0]
    for col, flag in enumerate(row)
    if flag == 1
)

In [117]:
seq

'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|1&0'

In [108]:
train_sequences[0]

'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|1&0'

In [20]:
y_train, y_test = np.array(train_labels), np.array(test_labels)

In [23]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM,SimpleRNN
from keras.layers import Lambda
from keras import regularizers

In [29]:
in_dim = X_train.shape[1:]

In [69]:
# Build the optimizer once and pass the instance to compile(). Previously the
# object was created but compile() received the string "adam", so the
# learning rate configured here was never actually used.
adam = keras.optimizers.Adam(lr=0.001)

model = Sequential()
model.add(LSTM(N_STATES, return_sequences=True,
               stateful=False,
               batch_size=BATCH_SIZE,
               input_shape=in_dim,
               activity_regularizer=regularizers.l1(0.00001)))
# Keep only the last time step of the LSTM output sequence.
model.add(Lambda(lambda x: x[:, -1, :]))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer=adam,
              metrics=['acc'])

In [70]:
from keras.models import load_model

# Set CACHED to True to load the previously saved model from
# 'boolean_nopriorities.h5' instead of training again.
CACHED = False
if not CACHED:
    model.summary()
    model.fit(X_train, y_train,
                        batch_size=BATCH_SIZE,
                        epochs=N_EPOCHS,
                        verbose=1,
                        shuffle=False)
    # Persist the trained model so later runs can take the CACHED branch.
    model.save('boolean_nopriorities.h5')
else:
    model = load_model('boolean_nopriorities.h5')

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (32, 50, 4)               160       
_________________________________________________________________
lambda_2 (Lambda)            (32, 4)                   0         
_________________________________________________________________
dense_2 (Dense)              (32, 1)                   5         
Total params: 165
Trainable params: 165
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [71]:
model.reset_states()
score = model.evaluate(X_test, y_test, batch_size=BATCH_SIZE, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.07672305187831323
Test accuracy: 0.9983974099159241


In [118]:
def fsm_states(transition_table, dictionary, init_state = 0, id_fsm = None):
    """Build a feature extractor that traces a finite-state machine.

    Parameters
    ----------
    transition_table : indexable as ``table[state][symbol_column]``
        Next state for every (current state, symbol column) pair.
    dictionary : dict
        Maps each symbol character to its column in ``transition_table``.
    init_state : int
        State the machine starts in before reading the first symbol.
    id_fsm : str, optional
        Suffix appended to the feature name ('states_<id_fsm>').

    Returns
    -------
    callable
        ``F(seq_raw) -> (features, name)`` where ``features`` is a float
        array holding the state reached after each decoded symbol and
        ``name`` is 'states' or 'states_<id_fsm>'.
    """
    def F(seq_raw):
        # Decode the one-hot rows: every position whose value equals 1
        # contributes the character assigned to that position.
        int2char = {0:'X', 1:'0', 2:'1', 3:'|', 4:'&'}
        symbols = ''.join(
            int2char[col]
            for row in seq_raw
            for col in range(len(row))
            if row[col] == 1
        )

        # Walk the machine and record the state after each symbol.
        visited = []
        state = init_state
        for symbol in symbols:
            state = transition_table[state][dictionary[symbol]]
            visited.append(state)
        trace = np.array(visited, dtype=float)

        if id_fsm is None:
            label = 'states'
        else:
            label = 'states_' + id_fsm
        return trace, label
    return F

In [119]:
# Column index of each input symbol in the transition tables below.
fsm_dict = {'X': 0, '0':1, '1':2, '|':3, '&':4}

# Transition tables: row = current state, column = symbol (see fsm_dict),
# entry = next state.
# NOTE(review): the -1 entries presumably mark transitions that should not
# occur on well-formed input; if one is ever reached, indexing the table with
# -1 selects the last row rather than raising - confirm this is intended.

# 7-state machine.
fsm_tbl =  [[0, 2, 1, -1, -1],
            [0, -1, -1, 4, 3],
            [0, -1, -1, 6, 5],
            [0, 2, 1, -1, -1],
            [0, 1, 1, -1, -1],
            [0, 2, 2, -1, -1],
            [0, 2, 1, -1, -1]]
fsm_states_7 = fsm_states(fsm_tbl, fsm_dict, id_fsm = '7')

# 4-state machine.
fsm_tbl4 = [[0,2,1,-1,-1],
            [0,1,1,1,3],
            [0,2,2,3,2],
            [0,2,1,2,1]]
fsm_states_4 = fsm_states(fsm_tbl4, fsm_dict, id_fsm = '4')


# 3-state machine.
fsm_tbl3 = [[0,2,1,-1,-1],
           [0,2,1,1,0],
           [0,2,2,0,2]]
fsm_states_3 = fsm_states(fsm_tbl3, fsm_dict, id_fsm = '3')


# Feature extractors built from the hand-written tables.
features = [fsm_states_7, fsm_states_4, fsm_states_3]

def random_fsm(n_states, fsm_dict=fsm_dict):
    """Draw a random transition table with ``n_states`` states.

    Returns an integer array of shape (n_states, len(fsm_dict)) whose entries
    are drawn by ``np.random.randint`` from [0, n_states).
    """
    n_symbols = len(fsm_dict)
    flat = np.random.randint(n_states, size=n_states * n_symbols)
    return flat.reshape((n_states, n_symbols))

fsm_states_3rd = fsm_states(random_fsm(3), fsm_dict, id_fsm = 'rd_3')
fsm_states_4rd = fsm_states(random_fsm(4), fsm_dict, id_fsm = 'rd_4')
fsm_states_7rd = fsm_states(random_fsm(7), fsm_dict, id_fsm = 'rd_7')

features += [fsm_states_3rd, fsm_states_4rd, fsm_states_7rd]

In [120]:
fsm_states_7(X_train[0])[0].shape

(50,)

In [123]:
fsm_states_3(X_train[0])[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0., 1., 0., 2.])

In [75]:
[l.input_shape for l in model.layers]

[(32, 50, 5), (32, 50, 4), (32, 4)]

In [76]:
for i,shape in enumerate([l.input_shape for l in model.layers]):
    print(i,shape[0:1])

0 (32,)
1 (32,)
2 (32,)


In [77]:
model.input_shape[0:1]

(32,)

In [78]:
same_dim = [i for i,shape in enumerate([l.input_shape for l in model.layers])
                      if shape[0:1] == model.input_shape[0:1]]

In [79]:
from keras.layers import SimpleRNN, GRU, LSTM
RECURRENT_LAYERS = (SimpleRNN, GRU, LSTM)
out = []
for i, layer in enumerate(model.layers):
    if isinstance(layer, RECURRENT_LAYERS):
        out.append(i)

In [80]:
recur = out
recur

[0]

In [81]:
intersect = sorted(set(same_dim) & set(recur))

In [82]:
layer_ids = intersect
layer_ids

[0]

In [83]:
outputs = [model.layers[l].output for l in layer_ids]

In [84]:
from keras.models import Model
spy_model = Model(inputs = model.input, outputs = outputs)

In [88]:
raw_activations = spy_model.predict(X_train,
                                            batch_size=32)

In [127]:
a = raw_activations[0,:,0]
a

array([ 4.8889272e-04,  9.3079545e-04,  1.2901225e-03,  1.5700043e-03,
        1.7843596e-03,  1.9477402e-03,  2.0723476e-03,  2.1676428e-03,
        2.2407547e-03,  2.2970124e-03,  2.3404087e-03,  2.3739489e-03,
        2.3999116e-03,  2.4200284e-03,  2.4356304e-03,  2.4477406e-03,
        2.4571414e-03,  2.4644434e-03,  2.4701168e-03,  2.4745238e-03,
        2.4779509e-03,  2.4806140e-03,  2.4826846e-03,  2.4842916e-03,
        2.4855421e-03,  2.4865153e-03,  2.4872711e-03,  2.4878595e-03,
        2.4883172e-03,  2.4886727e-03,  2.4889500e-03,  2.4891668e-03,
        2.4893347e-03,  2.4894653e-03,  2.4895642e-03,  2.4896432e-03,
        2.4897044e-03,  2.4897512e-03,  2.4897866e-03,  2.4898141e-03,
        2.4898374e-03,  2.4898546e-03,  2.4898688e-03,  2.4898800e-03,
        2.4898872e-03,  4.7461429e-01,  2.1423253e-01, -3.3533663e-01,
        3.7080324e-01,  6.6535527e-01], dtype=float32)

In [128]:
b = fsm_states_7(X_train[0])[0]
b

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 6., 1., 3., 2.])

In [131]:
corr = abs(np.corrcoef(a, b)[1,0])
corr

0.5783246604097833

In [134]:
type(X_train)

numpy.ndarray

In [175]:
with open('data.txt', 'w') as f:
    for item in train_sequences:
        f.write("%s\n" % item)

In [176]:
type(train_sequences)

list

In [180]:
%time
read_seq = []
with open('data.txt', 'r') as f:
    line = f.readline().rstrip('\n') 
    read_seq.append(line)
    while line: 
        line = f.readline().rstrip('\n') 
        read_seq.append(line)
        

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.96 µs


In [181]:
train_sequences

['XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|1&0',
 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|1&1&1|1&0',
 'XXXXXXXXXXXXXXXXX1&1|1|1|1&1|0|1|0&0|0&0|1&1&1&0&0',
 'XXXXX0&0|1|0|0|0|0|1&0|0|1|1|1|1&1|1&1&0&0|0|0|1|1',
 'XXXXXXXXXXXXXXXXXXXXXXXXXXX1&0&0|1|1|0|0|0|0|0&1|0',
 'XXXXXXXXXXXXXXXXXXXXX1&0|1&0|1&1&1&0|1&1|0&1&0|0&0',
 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX1&1&0&0|1|1&0&1&0',
 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX1&1&0|1|1&1&1',
 'XXX1|1|0&0&1&1|1|1&1|1|0&1&0|1&1&0|1|1&1|0|1|1&1|0',
 'X1&1|0&0|0&0|1&0|1&0|1&1&1&0&0|1&0&0&0&0|1|1|0|1&1',
 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|0&1&1&1',
 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|1|0&1|1|1|0&0&0',
 'XXXXXXXXXXXXXXXXXXXXX1|1&0&0|1|1&1&0&0&0|1|0&1&0|1',
 'XXXXXXXXXXXXXXXXX1|0&0&1|1|1&1|0|1&1&0&1&1|0|0&0|1',
 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|0|0|0&1|1&0&1|0|1',
 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|0&0|1&1|0&0&1',
 'XXX0|0&1&1&1&1&0&0|1&0&0&0&0&1|1|1|1&0|1&0&1&0&1&1',
 'XXXXXXXXXXXXXXXXXXX1&1&1&1&0|1|1|0|0|0&0|0&1&0|0|1',
 'X0&1&0|1

In [194]:
if read_seq[-1] == '':
    del read_seq[-1]

In [218]:
type(read_seq[0])

str

In [195]:
len(read_seq)

4992

In [196]:
read_seq == train_sequences

True

In [187]:
len(train_sequences)

4992

In [7]:
from google.cloud import storage
storage_client = storage.Client()

In [8]:
bucketn = "dni-storage"
bucket = storage_client.get_bucket(bucketn)
blob = bucket.get_blob('Datasets/l_data.txt')
%time s = blob.download_as_string(start=0, end=1)

CPU times: user 56 ms, sys: 16 ms, total: 72 ms
Wall time: 204 ms


In [7]:
%%time
blob = bucket.get_blob('Datasets/l_data.txt')


CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 143 ms


In [51]:
i = 0

In [59]:
step = 20 # number of records to fetch per request
# Fetch the i-th chunk of `step` records. Each record occupies 51 bytes
# (50 characters plus the trailing '\n', as the downloaded output shows) and
# `end` is an inclusive byte offset, hence the -1.
s = blob.download_as_string(start=i*51*step, end=i*51*step+51*step-1)
# Advance to the next chunk; re-running this cell pages through the blob.
i += 1
s

b'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|1&0&0&0&1|1\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX1|1|1&1|1&0&1|0\nXXXXXXXXXXXXXXXXXXXXXXXXXXX1&1&0|0&0&1&1|0&1&0&1&0\nXXXXXXXXXXXXXXXXX0&0&1&0|1|0|0&1|1|0&0|0|1&1&1|0|0\nXXXXXXXXX0&1&1|0|1|1&0&1|0|1&1&0|0&1&0|0|0|1|1&1|1\nXXXXXXXXX1&1|0|1&1|0&0&1|0&0&0&0&0&1|1&0|1|1|1|1&1\nXXXXXXXXXXXXXXXXXXXXXXX1|1|0|1&1&1&1&1&0|0&1|1&0|1\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX1&1\nXXXXXXX0|1&1|0&0&1|1|1|1|1|1&0|0&1|1|1&1|1&0&0&0|1\nXXX0|0|0|1&0|0|0|0|1&0|0|1|0&1|1|0|0|1|1&0&0|0|1&1\nXXXXXXXXXXXXXXXXXXXXXXXXXXX1|1|1&0|1|1|0&1|1|1&1&0\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0&1|0&1\nXXXXXXX0|0|0|1&1|0&1|1|0&0|0|1&0&1&1&1&1&0&0&1&0&1\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0&0|0|1&0|0|1|1&1&0\nXXXXXXXXXXX0&1&0&0|1|1|0&0&0&1&0|0|0&0|0|1&0|0&1|0\nXXXXXXXXX1&0&1|0|0&0&0|1&0&0|1&1&1&0&1|0|0|1|0|1|0\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX1&0&0|1\nXXXXXXXXXXXXX0&1&1&1|1&1|1&0&1|1|1&1|0|0|1&1|0|0&1\nXXXXXXXXXXXXXXXXX1&1&0&0|1|0&0&1|0&1|1&1&1|1|0&0&1\nXXXXXXXXXX

In [46]:
s = blob.download_as_string(start=0, end=101)
s

b'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|1&0\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|1&1&1|1&0\n'

In [18]:
s = blob.download_as_string(start=102, end=152)
s

b'XXXXXXXXXXXXXXXXX1&1|1|1|1&1|0|1|0&0|0&0|1&1&1&0&0\n'

In [4]:
%%time
# s = blob.download_as_string(start=1, end=101)
s = blob.download_as_string()

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 98.3 ms


In [14]:
type(s)

bytes

In [242]:
%time
decoded_s = s.decode("utf-8") 
decoded_s

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.2 µs


'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|1&0\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|1&1&1|1&0\nXXXXXXXXXXXXXXXXX1&1|1|1|1&1|0|1|0&0|0&0|1&1&1&0&0\nXXXXX0&0|1|0|0|0|0|1&0|0|1|1|1|1&1|1&1&0&0|0|0|1|1\nXXXXXXXXXXXXXXXXXXXXXXXXXXX1&0&0|1|1|0|0|0|0|0&1|0\nXXXXXXXXXXXXXXXXXXXXX1&0|1&0|1&1&1&0|1&1|0&1&0|0&0\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX1&1&0&0|1|1&0&1&0\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX1&1&0|1|1&1&1\nXXX1|1|0&0&1&1|1|1&1|1|0&1&0|1&1&0|1|1&1|0|1|1&1|0\nX1&1|0&0|0&0|1&0|1&0|1&1&1&0&0|1&0&0&0&0|1|1|0|1&1\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|0&1&1&1\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|1|0&1|1|1|0&0&0\nXXXXXXXXXXXXXXXXXXXXX1|1&0&0|1|1&1&0&0&0|1|0&1&0|1\nXXXXXXXXXXXXXXXXX1|0&0&1|1|1&1|0|1&1&0&1&1|0|0&0|1\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|0|0|0&1|1&0&1|0|1\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|0&0|1&1|0&0&1\nXXX0|0&1&1&1&1&0&0|1&0&0&0&0&1|1|1|1&0|1&0&1&0&1&1\nXXXXXXXXXXXXXXXXXXX1&1&1&1&0|1|1|0|0|0&0|0&1&0|0|1\nX0&1&0|1|1|0&1|0|1|1|1|1|1&0&1&0|1|0&0&1&1|0|1|0&0\nXXXXXXXXXXX

In [243]:
type(decoded_s)

str

In [245]:
final_s = decoded_s.split('\n')

In [251]:
len(final_s)

4992

In [250]:
if final_s[-1] == '':
    del final_s[-1]

In [252]:
final_s == train_sequences

True