# Process Data from Seq2Vec (Virtifier)

In [10]:
import os
import numpy as np
import pandas as pd
import json
import shutil

from pathlib import Path
from pprint import pprint

# Load Training Datasets

File Structure:
```
project
  |
  |---data
  |     |
  |     |-virtifier
  |     |     |
  |     |     | "all processed files for virtifier and seq2vec"
  | 
  |---repos
  |     |
  |     |--- seq2vec (cloned from https://github.com/crazyinter/Seq2Vec)
  |     |
  |     |---data
  |     |
  |     |---suplementary_files
```

In [14]:
# Processed-data folder and cloned Seq2Vec repository, resolved to absolute paths
# (relative paths are interpreted from the current working directory)
proc_data = Path('../data/virtifier').resolve()
seq2vec_repo = Path('../repos/seq2vec').resolve()

# Fail early when the expected project layout is not in place
for required_dir in (proc_data, seq2vec_repo):
    assert required_dir.is_dir()

In [15]:
# Training files shipped with the Seq2Vec repository
train_file_300 = seq2vec_repo / 'data/train_300bp.fasta'
train_file_500 = seq2vec_repo / 'data/train_500bp.fasta'
assert train_file_300.is_file()
assert train_file_500.is_file()

# The files are read as comma-separated numeric values, one sequence per row.
# The path is passed directly so that numpy opens AND closes the file itself;
# wrapping it in a bare open(...) would leave the handle unclosed.
X_train_300 = np.loadtxt(train_file_300, delimiter=",")
X_train_500 = np.loadtxt(train_file_500, delimiter=",")
print(X_train_300.shape, X_train_500.shape)

# Show the first ten codes of the first 300bp sequence
X_train_300[0, :10]

(9000, 298) (9000, 498)


array([46., 57., 35., 61., 16.,  6.,  9., 35., 61., 16.])

# Codon 2 Numerical Code
Conversion between codons and numerical codes follows the Virtifier code. The conversion dictionary is saved in a JSON file. For a conversion identical to the one used by the `Seq2Vec` package (Virtifier), the file is `seq2vec_codon2codes.json`.

In [18]:
def decode_sequence(seq, code2codon=None):
    """Decode a Seq2Vec-encoded sequence (stride 1) back into a letter string.

    Every element of `seq` is the numerical code of a three-letter codon. The
    decoded string is made of the first letter of each codon, followed by the
    last two letters of the final codon.

    Parameters
    ----------
    seq : list, tuple or 1D np.ndarray
        Codon codes. Each element must be convertible with `int()`.
    code2codon : list of str, optional
        Lookup table where `code2codon[i]` is the three-letter codon for code `i`.
        When omitted, it is built from `seq2vec_codon2codes.json` located in the
        `proc_data` folder.

    Returns
    -------
    str
        The decoded sequence, or an empty string when `seq` is empty.

    Raises
    ------
    ValueError
        If the JSON file is required but missing, or if `seq` is not one-dimensional.
    """
    if code2codon is None:
        # Build the list converting a codon code into its three-letter string
        json_fname = proc_data / 'seq2vec_codon2codes.json'
        if not json_fname.is_file():
            raise ValueError(f"Must have {json_fname.name} file in virtifier data folder")
        with open(json_fname, 'r') as fp:
            codon2code = json.load(fp)

        # Codes absent from the dictionary keep the 'Unknow' placeholder
        code2codon = ['Unknow'] * 65
        for codon, i in codon2code.items():
            code2codon[i] = codon

    # Normalise any sequence-like input (list, tuple, array) and validate its shape.
    # The check applies to every input type, including lists of lists.
    seq = np.asarray(seq)
    if seq.ndim != 1:
        raise ValueError('seq should be a list or a 1D np.array')
    if seq.shape[0] == 0:
        # Nothing to decode; avoids indexing the last element of an empty array
        return ''

    codons = [code2codon[int(code)] for code in seq]
    # First letter of every codon, then the two remaining letters of the last one
    return ''.join(codon[0] for codon in codons) + codons[-1][1:]

# Sanity check: decode a few rows of the 500bp training set back to letters
for row_idx in (0, 10, 30):
    print(decode_sequence(X_train_500[row_idx, :]))

TGTCTATTCTATTATATTATATAGACGGATTAGTCTCAAACCTTTGATATTAAAAGGTTTGAGTTTTTTATTTTTATCTAACAATAGAATTAACAGAGTTTTTAACAGAGTTCTATTTTAAAAGTTGGCATACTTAGTAAATAGTTCAAGCTCTTTATTCTTCTTTTCGGGCTCAAGATGTGAGTATAGGTCCATAGTCAATTGCAACTTTGAATGACCTAGCCTTTCTTGAATCACTTTATAACTCATTTCAGCATTTAAACAAAGGCTAGCGTGTGAACGCCTGAATCCATGAAAGCTCAACAACGGCAGTTCAGCATTTTTGATTATATTATTCAGCTTGTAGATAAGGTTGTGATAATCCATCACTCCACCTTCTATTTTTGGAAATACTAGGTTTTGTTGTGGATTTCCTAACTTCATAAAGTGTTTCTTTTGAAAGAAGTACCAGCTTTTTAATACATAAATTGCCTTATCGTCAATGCTAATAATTCGATTGC
GAAATTACTAAGACAAGGTTCTTTGTCAGTAAATCTGGGACACGTACTGGTAAAGGGTTGCGTCATAAAGTAATTTCATCAATTTTTGATACGAAACATGTCAATCTAGACGAACTCTCAAATAAGGCAACGGCTGCAATGGCTTGGGCTGGATTTGATGGCGGAGAAATGTTACTGGTGACTGAATCAGGAGAAATCGGAAAAAGCTTAGAACGTTATTTGAAAATACTGGCTACTGAAAGTACATATCGTGGTCGAGGTATCGGTCAAAACTATGCGGATATTAATCTTACTGGTGTGCTGTCTATTGATTCAAATGAAAAAGTTCTGTTTTCTTCTGAGATGAATAGCAGGGCTGTGAACATTGCTTTTAAGAATCGTCCTAAAGGGGAAACTGATAGCGAACGTGAATCAATCTTTGCGCCCTACTGGGAAGCATTTACTGAACAGCGTGTATCTGAGACCAGTCGAGAAGCAACGGCGCTTGCAGGAGTGCTGC

In [21]:
def array2sequences(a):
    """Decode every row of an encoded array into FASTA-like text.

    Each row of `a` is decoded with `decode_sequence` and emitted as a header
    line `> Sequence n` (n is the zero-based row index) followed by the decoded
    letter string on the next line.

    Parameters
    ----------
    a : iterable of 1D sequences
        Rows of codon codes, e.g. a 2D np.ndarray with one sequence per row.

    Returns
    -------
    str
        All records concatenated; an empty string when `a` has no rows.

    Raises
    ------
    ValueError
        If `seq2vec_codon2codes.json` is missing from the `proc_data` folder.
    """
    # Load the codon -> code dictionary once, instead of once per decoded row
    seq2vec_json = proc_data / 'seq2vec_codon2codes.json'
    if not seq2vec_json.is_file():
        raise ValueError(f"Must have {seq2vec_json.name} file in data folder")
    with open(seq2vec_json, 'r') as fp:
        codon2code = json.load(fp)

    # code2codon[i] returns the three-letter codon; missing codes stay 'Unknow'
    code2codon = ['Unknow'] * 65
    for codon, i in codon2code.items():
        code2codon[i] = codon

    # Collect the records and join once rather than growing a string in the loop
    records = [
        f"> Sequence {n}\n{decode_sequence(row, code2codon=code2codon)}\n"
        for n, row in enumerate(a)
    ]
    return ''.join(records)

# Preview the FASTA text produced for the first five 300bp training rows
first_rows_fasta = array2sequences(X_train_300[:5, :])
print(first_rows_fasta)

> Sequence 0
TGTCTATTCTATTATATTATATAGACGGATTAGTCTCAAACCTTTGATATTAAAAGGTTTGAGTTTTTTATTTTTATCTAACAATAGAATTAACAGAGTTTTTAACAGAGTTCTATTTTAAAAGTTGGCATACTTAGTAAATAGTTCAAGCTCTTTATTCTTCTTTTCGGGCTCAAGATGTGAGTATAGGTCCATAGTCAATTGCAACTTTGAATGACCTAGCCTTTCTTGAATCACTTTATAACTCATTTCAGCATTTAAACAAAGGCTAGCGTGTGAACGCCTGAATCCATGAAAGCT
> Sequence 1
TTTTATTGCTTTTTGGTGTATCTTGAATTTGAAATCGTCTAACTGTGGTTTTGCTGACAGATACAGTCTGGTTTTGAAAGTTAATATCAGACCATGATAAGGCCATAGCTTCGCCAATACGCAAACCAGAAGCCACAAGCAAGCGTAGAAGAGCTTTAAAGTATTCATTTGACCACTGGCCACTCTTTAGAGATTCAAGGTAATTAAAGAGTTTTGCTAATTCCGATTTTTTGTAGAACTTTATCTCTTTTTTTGCTTGTTCCACCTTTACTTGGGGTACGATCACTGACTGACAAGGAT
> Sequence 2
GGCAAAGTATAATTATCGAGATTTGATCTCATTTGGTCAATAGAACCTATTTTACGTTGCAGCTTATAAAGTTCAAAAAAGCTCTCAGCAACCTCTTCAAAAGTTTCGAGTGATTCTTTACCTTTGGTGTTACCGTTCTTTTTAAAATTATTTTTCAGGCGTTCTAGTTCATTCTTGACGCCTGTTTTTGTGCGCCCTCTAATATCTGTTTTTATTTGCTTACCTGTCAGCACATCAGTACCGATATAAGCGCCACGGAGAATATAACGCACCTCTCCGGCTTTTGTTTTGTATTCTTTT
> Sequence 3
TCTAAGAAATCCTCTAGAGAATGTAGCCTTTCAAGTATTTCCCTA

In [22]:
# Same preview for the last three 300bp training rows
last_rows_fasta = array2sequences(X_train_300[-3:, :])
print(last_rows_fasta)

> Sequence 0
GCTCCAGTAATGACCGCTGACATCGCTTGTTCTGGAACCGCCAAGTCTTTAGCAATTGTTAATTCTGAAGGATTAGTGAAACGGCTGATATAAGGAGAAACAGCATTCAAGAAATCTGCATGTTCAGATTTAATCAACATTGGGACAGCTTTAGACAATGGAGTATTAACCTCAGCACGAATATTACGAACGGCTGTAATTAATTCGATTAACATCGCCACACCTTCGCTTGCCTTGTCATCATTAAATTCTGGACGTACTTTTGGATATTCGGCAACAACAATTGAACCAGAAGTATTT
> Sequence 1
AACACCAAATTCAAATTTATCCATTTGTTCAGTCACGCGCTCAACAGTATCATTCAAACGTGTCAAAATCCAACGGTCAGTGACATTACCAGCAGTTTTATTTGCAACTTTTGTCAAAGCCGAAGAAACAGCATCAGCGCTGATATCTTCTGCATTCATCAAAATATAACGTGAAACATTCCAAATTTTATTGATGAAATTCCAAGCGGCATCCATTTTATCATAAGAAAAACGTACATCTTGTCCTGGTGCAGAACCATTTGATAGGAACCAACGGAGTGCATCAGCTCCATACTTTTC
> Sequence 2
TGAAGTCTGCTGAGTTTTCGTCTGGCCAACCCATTGTTGAAAACGGCCACAAAGCTGAACTAAACCAAGTATCGAGCACATCTTCATCTTGTGTCCAACCTTCTCCCTCAGGTGCTTCTTCACCAACATACATTTCACCCGCTTCATTATACCAAGCTGGAATTTGATGTCCCCACCAAAGTTGACGTGAAATAACCCAATCATGAACATTTTCCATCCATTGCATGAAAGTATCATTGAAACGTGGCGGATAAAATTCTACTGCATCTTCTGTTGTTTGATTAGCAATTGCATTCTTAG



In [23]:
# Path to the repository's data folder; the cell displays whether it exists
p2data = seq2vec_repo.joinpath('data')
p2data.is_dir()

True

In [24]:
# Output file name -> encoded training array to decode into it
training_outputs = {
    'training_sequences_300bp.fasta': X_train_300,
    'training_sequences_500bp.fasta': X_train_500,
}

# Write each decoded training set into the processed-data folder
for fasta_name, encoded_rows in training_outputs.items():
    with open(proc_data / fasta_name, 'w') as fp:
        fp.write(array2sequences(encoded_rows))

# Test data
The test file must be processed line by line because its sequences do not all have the same length.

In [27]:
test_file = seq2vec_repo / 'data/test_real_data.fasta'
test_fasta = proc_data / 'test_sequences.fasta'

# `is_file` must be CALLED: the bare bound method is always truthy,
# so asserting on it would never detect a missing file.
assert test_file.is_file()

# Mode 'w' truncates any pre-existing output file, so there is no need to
# delete it first and then append.
with open(test_file, 'r') as fp_in, open(test_fasta, 'w') as fp_out:
    n = 1
    for line in fp_in:
        if not line.strip():
            # Skip blank lines; int() cannot convert an empty field
            continue
        # Each line holds the comma-separated codes of one sequence
        s = decode_sequence(line.split(','))
        fp_out.write(f"> Sequence {n}\n")
        fp_out.write(f"{s}\n")
        n += 1
    print('Reached EOF')

Reached EOF
