# Make fasta sets from Seq2Vec raw data (Virtifier)

Data in the Seq2Vec repo are encoded into sequences of numbers, instead of letter codons.

This notebook makes proper fasta files out of the raw data.


In [None]:
import os
import numpy as np
import pandas as pd
import json
import shutil

from pathlib import Path
from pprint import pprint

# Load Training Datasets

File Structure:
```
project
  |
  |---data
  |     |
  |     |-virtifier
  |     |     |
  |     |     |- raw  (data from Seq2Vec repo: https://github.com/crazyinter/Seq2Vec)
  |     |     |   |- embedding_matrix.csv
  |     |     |   |- NCBI_accession_numbers_of_the_whole_Refseq_genomes.xlsx
  |     |     |   |- test_real_data.fasta
  
  |     |     |   |- train_300bp.fasta
  |     |     |   |- train_500bp.fasta
  |     |     |- processed  (all processed files from raw)
  |     |     |      |
  |     |     |      |
  

```

In [None]:
# Locate the virtifier data folder and its raw / processed sub-folders
data_dir = Path('../../data/virtifier').resolve()
raw, processed = data_dir / 'raw', data_dir / 'processed'

# Stop right away if any of the expected folders is missing
for required_dir in (data_dir, raw, processed):
    assert required_dir.is_dir()

In [None]:
# Raw training sets: despite the .fasta extension, these files hold
# comma separated numerical codes, one sequence per line.
train_file_300 = raw / 'train_300bp.fasta'
train_file_500 = raw / 'train_500bp.fasta'

assert train_file_300.is_file()
assert train_file_500.is_file()

# Pass the path itself to np.loadtxt so that numpy opens *and closes* the file;
# handing it an already opened file object left the handle open.
X_train_300 = np.loadtxt(train_file_300, delimiter=",")
X_train_500 = np.loadtxt(train_file_500, delimiter=",")
print(X_train_300.shape, X_train_500.shape)
X_train_300[0,:10]

(9000, 298) (9000, 498)


array([46., 57., 35., 61., 16.,  6.,  9., 35., 61., 16.])

# Codon 2 Numerical Code
Conversion between codons and numerical codes follows the Virtifier code. The conversion dictionary is saved in a json file. For a conversion identical to the one used by the `Seq2Vec` package (Virtifier), the file is `seq2vec_codon2codes.json` in the processed directory.

In [None]:
def decode_sequence(seq, code2codon=None):
    """Recover the letter string of a sequence encoded for Seq2Vec (stride 1).

    Each code stands for a three letter codon and consecutive codons overlap
    (stride 1), so the decoded string is made of the first letter of every
    codon, followed by the last two letters of the final codon.

    Parameters
    ----------
    seq : list, tuple or 1D np.array
        Codon codes. Every element must be convertible with `int()`.
    code2codon : list of str, optional
        Lookup table where `code2codon[i]` is the three letter codon for code `i`.
        When None, the table is built from `seq2vec_codon2codes.json` in the
        `processed` directory.

    Returns
    -------
    str
        The decoded sequence; an empty string when `seq` is empty.

    Raises
    ------
    ValueError
        If the json file is required but missing, or if `seq` is not one dimensional.
    """
    if code2codon is None:
        # Build the list to convert a codon code into a three letter string
        # from the codon2code dictionary stored in the json file
        json_fname = processed / 'seq2vec_codon2codes.json'

        if not json_fname.is_file():
            raise ValueError(f"Must have {json_fname.name} file in virtifier data folder")
        with open(json_fname, 'r') as fp:
            codon2code = json.load(fp)

        # Codes absent from the dictionary keep the 'Unknow' placeholder,
        # hence they decode to the letter 'U'
        code2codon = ['Unknow'] * 65
        for codon, i in codon2code.items():
            code2codon[i] = codon

    # Cast any input (list, tuple, array) to an array and validate its shape.
    # The check is applied to lists as well, so nested lists are rejected with
    # a clear error instead of failing later on int().
    seq = np.asarray(seq)
    if seq.ndim != 1:
        raise ValueError('seq should be a list or a 1D np.array')

    # Nothing to decode: avoid indexing the last element of an empty array
    if seq.size == 0:
        return ''

    codons = [code2codon[int(code)] for code in seq]
    return ''.join(codon[0] for codon in codons) + codons[-1][1:]

# Sanity check: decode a few rows of the 500bp training set and show them
for seq_nbr in (0, 10, 30):
    print(decode_sequence(X_train_500[seq_nbr, :]))

TGTCTATTCTATTATATTATATAGACGGATTAGTCTCAAACCTTTGATATTAAAAGGTTTGAGTTTTTTATTTTTATCTAACAATAGAATTAACAGAGTTTTTAACAGAGTTCTATTTTAAAAGTTGGCATACTTAGTAAATAGTTCAAGCTCTTTATTCTTCTTTTCGGGCTCAAGATGTGAGTATAGGTCCATAGTCAATTGCAACTTTGAATGACCTAGCCTTTCTTGAATCACTTTATAACTCATTTCAGCATTTAAACAAAGGCTAGCGTGTGAACGCCTGAATCCATGAAAGCTCAACAACGGCAGTTCAGCATTTTTGATTATATTATTCAGCTTGTAGATAAGGTTGTGATAATCCATCACTCCACCTTCTATTTTTGGAAATACTAGGTTTTGTTGTGGATTTCCTAACTTCATAAAGTGTTTCTTTTGAAAGAAGTACCAGCTTTTTAATACATAAATTGCCTTATCGTCAATGCTAATAATTCGATTGC
GAAATTACTAAGACAAGGTTCTTTGTCAGTAAATCTGGGACACGTACTGGTAAAGGGTTGCGTCATAAAGTAATTTCATCAATTTTTGATACGAAACATGTCAATCTAGACGAACTCTCAAATAAGGCAACGGCTGCAATGGCTTGGGCTGGATTTGATGGCGGAGAAATGTTACTGGTGACTGAATCAGGAGAAATCGGAAAAAGCTTAGAACGTTATTTGAAAATACTGGCTACTGAAAGTACATATCGTGGTCGAGGTATCGGTCAAAACTATGCGGATATTAATCTTACTGGTGTGCTGTCTATTGATTCAAATGAAAAAGTTCTGTTTTCTTCTGAGATGAATAGCAGGGCTGTGAACATTGCTTTTAAGAATCGTCCTAAAGGGGAAACTGATAGCGAACGTGAATCAATCTTTGCGCCCTACTGGGAAGCATTTACTGAACAGCGTGTATCTGAGACCAGTCGAGAAGCAACGGCGCTTGCAGGAGTGCTGC

In [None]:
def array2sequences(a):
    """Decode every row of an encoded array into one fasta formatted string.

    Each row of `a` is decoded with `decode_sequence` and yields two lines:
    a header `> Sequence {n}`, where n is the 0-based position of the row in `a`,
    followed by the decoded letter string.

    Parameters
    ----------
    a : iterable of 1D sequences of codon codes
        Typically a 2D np.array holding one encoded sequence per row.

    Returns
    -------
    str
        All the fasta records concatenated; an empty string when `a` has no row.

    Raises
    ------
    ValueError
        If `seq2vec_codon2codes.json` is missing from the `processed` directory.
    """
    # load the codon2code dictionary from json file
    seq2vec_json = processed / 'seq2vec_codon2codes.json'

    if not seq2vec_json.is_file():
        raise ValueError(f"Must have {seq2vec_json.name} file in data folder")
    with open(seq2vec_json, 'r') as fp:
        codon2code = json.load(fp)

    # Build the lookup list once, where code2codon[i] is the three letter codon
    # for code i, and share it across all rows so that the json file is not
    # reloaded for each sequence. Codes absent from the dictionary stay 'Unknow'.
    code2codon = ['Unknow'] * 65
    for codon, i in codon2code.items():
        code2codon[i] = codon

    # Assemble all records in a single join instead of growing a string row by row
    return ''.join(
        f"> Sequence {n}\n{decode_sequence(row, code2codon=code2codon)}\n"
        for n, row in enumerate(a)
    )

# Preview the fasta records of the first five 300bp training sequences
print(array2sequences(X_train_300[:5]))

> Sequence 0
TGTCTATTCTATTATATTATATAGACGGATTAGTCTCAAACCTTTGATATTAAAAGGTTTGAGTTTTTTATTTTTATCTAACAATAGAATTAACAGAGTTTTTAACAGAGTTCTATTTTAAAAGTTGGCATACTTAGTAAATAGTTCAAGCTCTTTATTCTTCTTTTCGGGCTCAAGATGTGAGTATAGGTCCATAGTCAATTGCAACTTTGAATGACCTAGCCTTTCTTGAATCACTTTATAACTCATTTCAGCATTTAAACAAAGGCTAGCGTGTGAACGCCTGAATCCATGAAAGCT
> Sequence 1
TTTTATTGCTTTTTGGTGTATCTTGAATTTGAAATCGTCTAACTGTGGTTTTGCTGACAGATACAGTCTGGTTTTGAAAGTTAATATCAGACCATGATAAGGCCATAGCTTCGCCAATACGCAAACCAGAAGCCACAAGCAAGCGTAGAAGAGCTTTAAAGTATTCATTTGACCACTGGCCACTCTTTAGAGATTCAAGGTAATTAAAGAGTTTTGCTAATTCCGATTTTTTGTAGAACTTTATCTCTTTTTTTGCTTGTTCCACCTTTACTTGGGGTACGATCACTGACTGACAAGGAT
> Sequence 2
GGCAAAGTATAATTATCGAGATTTGATCTCATTTGGTCAATAGAACCTATTTTACGTTGCAGCTTATAAAGTTCAAAAAAGCTCTCAGCAACCTCTTCAAAAGTTTCGAGTGATTCTTTACCTTTGGTGTTACCGTTCTTTTTAAAATTATTTTTCAGGCGTTCTAGTTCATTCTTGACGCCTGTTTTTGTGCGCCCTCTAATATCTGTTTTTATTTGCTTACCTGTCAGCACATCAGTACCGATATAAGCGCCACGGAGAATATAACGCACCTCTCCGGCTTTTGTTTTGTATTCTTTT
> Sequence 3
TCTAAGAAATCCTCTAGAGAATGTAGCCTTTCAAGTATTTCCCTA

In [None]:
# Preview the fasta records of the last three 300bp training sequences
print(array2sequences(X_train_300[-3:]))

> Sequence 0
GCTCCAGTAATGACCGCTGACATCGCTTGTTCTGGAACCGCCAAGTCTTTAGCAATTGTTAATTCTGAAGGATTAGTGAAACGGCTGATATAAGGAGAAACAGCATTCAAGAAATCTGCATGTTCAGATTTAATCAACATTGGGACAGCTTTAGACAATGGAGTATTAACCTCAGCACGAATATTACGAACGGCTGTAATTAATTCGATTAACATCGCCACACCTTCGCTTGCCTTGTCATCATTAAATTCTGGACGTACTTTTGGATATTCGGCAACAACAATTGAACCAGAAGTATTT
> Sequence 1
AACACCAAATTCAAATTTATCCATTTGTTCAGTCACGCGCTCAACAGTATCATTCAAACGTGTCAAAATCCAACGGTCAGTGACATTACCAGCAGTTTTATTTGCAACTTTTGTCAAAGCCGAAGAAACAGCATCAGCGCTGATATCTTCTGCATTCATCAAAATATAACGTGAAACATTCCAAATTTTATTGATGAAATTCCAAGCGGCATCCATTTTATCATAAGAAAAACGTACATCTTGTCCTGGTGCAGAACCATTTGATAGGAACCAACGGAGTGCATCAGCTCCATACTTTTC
> Sequence 2
TGAAGTCTGCTGAGTTTTCGTCTGGCCAACCCATTGTTGAAAACGGCCACAAAGCTGAACTAAACCAAGTATCGAGCACATCTTCATCTTGTGTCCAACCTTCTCCCTCAGGTGCTTCTTCACCAACATACATTTCACCCGCTTCATTATACCAAGCTGGAATTTGATGTCCCCACCAAAGTTGACGTGAAATAACCCAATCATGAACATTTTCCATCCATTGCATGAAAGTATCATTGAAACGTGGCGGATAAAATTCTACTGCATCTTCTGTTGTTTGATTAGCAATTGCATTCTTAG



## Save sequences in fasta file

In [None]:
# Map each output fasta file name to the encoded training array it is built from
training_fasta_outputs = {
    'training_sequences_300bp.fasta': X_train_300,
    'training_sequences_500bp.fasta': X_train_500,
}

# Decode each array and (over)write the matching file in the processed directory
for fasta_name, encoded_array in training_fasta_outputs.items():
    (processed / fasta_name).write_text(array2sequences(encoded_array))

# Load and Convert Test data
Must handle line by line because the sequences do not have the same length.

In [None]:
test_file = raw / 'test_real_data.fasta'
test_fasta = processed / 'test_sequences.fasta'

# is_file must be *called*: the bare bound method is always truthy,
# so the assertion could never fail.
assert test_file.is_file()

# Opening the output in 'w' mode truncates any pre-existing file, so there is
# no need to delete it first and then append.
with open(test_file, 'r') as fp_in, open(test_fasta, 'w') as fp_out:
    n = 1
    # Sequences have different lengths, so they are decoded line by line
    for line in fp_in:
        # Skip blank lines (e.g. a trailing empty line), which cannot be decoded
        if not line.strip():
            continue
        s = decode_sequence(line.strip().split(','))
        fp_out.write(f"> Sequence {n}\n")
        fp_out.write(f"{s}\n")
        n += 1

print('Reached EOF')

Reached EOF
