# Explore Seq2Vec - Build Code to Codon Dictionary

# Imports and setup environment

### Install and import packages

In [1]:
# Install required custom packages if not installed yet.
import importlib.util
if not importlib.util.find_spec('ecutilities'):
    print('installing package: `ecutilities`')
    ! pip install -qqU ecutilities
else:
    print('`ecutilities` already installed')
if not importlib.util.find_spec('metagentools'):
    print('installing package: `metagentools')
    ! pip install -qqU metagentools
else:
    print('`metagentools` already installed')

`ecutilities` already installed
`metagentools` already installed


In [2]:
# Import all required packages
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from ecutilities.core import path_to_parent_dir
from ecutilities.ipython import nb_setup
from itertools import product
from pathlib import Path
from pprint import pprint

# Setup the notebook for development
nb_setup()

from metagentools.core import TextFileBaseIterator, TextFileBaseReader, ProjectFileSystem

Set autoreload mode


In [6]:
pfs = ProjectFileSystem()
pfs.home, pfs.project_root, pfs.data, pfs.nbs

(PosixPath('/home/vtec'),
 PosixPath('/home/vtec/projects/bio/metagentools'),
 PosixPath('/home/vtec/projects/bio/metagentools/data'),
 PosixPath('/home/vtec/projects/bio/metagentools/nbs'))

## Load Training Data

Two sequence files, with sequences of about 300 bps and aboput 500 bps. Sequences are already encoded as numbers.

In [13]:
data_dir = pfs.data / 'virtifier/'

ds_file_300 = data_dir / 'raw/train_300bp.fasta'
ds_file_500 = data_dir / 'raw/train_500bp.fasta'
ds_file_300.is_file(), ds_file_500.is_file(), 

(True, True)

In [15]:
X_train_300=np.loadtxt(open(ds_file_300,"rb"),delimiter=",",skiprows=0)
X_train_500=np.loadtxt(open(ds_file_500,"rb"),delimiter=",",skiprows=0)
print(f"Loaded data shapes:\n - 300 bp: {X_train_300.shape}, \n - 500 bp: {X_train_500.shape}")

Loaded data shapes:
 - 300 bp: (9000, 298), 
 - 500 bp: (9000, 498)


In [18]:
X_train_300[:3,:10]

array([[46., 57., 35., 61., 16.,  6.,  9., 35., 61., 16.],
       [ 2.,  2., 14., 16.,  6., 20., 43., 34., 24.,  2.],
       [29., 30.,  7.,  1.,  4., 49., 52., 16., 21., 13.]])

In [21]:
X_train_300[:5, :12]

array([[46., 57., 35., 61., 16.,  6.,  9., 35., 61., 16.,  6., 14.],
       [ 2.,  2., 14., 16.,  6., 20., 43., 34., 24.,  2.,  2.,  2.],
       [29., 30.,  7.,  1.,  4., 49., 52., 16., 21., 13.,  5.,  6.],
       [35., 61., 13.,  4., 10.,  3.,  1.,  5., 19., 58., 59., 60.],
       [37., 59., 24.,  2.,  2.,  9., 58., 42., 26., 38., 32., 33.]])

## Build code to codon dictionary

Recover codons from numerical codes. Encoding from  the original code in [Seq2Vec](https://github.com/crazyinter/Seq2Vec/blob/165b2abb1cbc792c1a67f61cb6bc7f07cd38ce2b/preprocessing.py#L7):

```shell
("AAA","1"),  ("TTT","2"),  ("GAA","3"),  ("AAG","4"),  ("AAT","5"),  ("ATT","6"),  ("CAA","7"),  ("TGA","8"),  ("TTC","9"), ("AGA","10"), 
("GAT","11"), ("AAC","12"), ("TAA","13"), ("TTA","14"), ("TCA","15"), ("TAT","16"), ("ATG","17"), ("TGG","18"), ("ATC","19"), 
("TTG","20"), ("ATA","21"), ("GTT","22"), ("CTG","23"), ("CTT","24"), ("ACA","25"), ("CAG","26"), ("CGA","27"), ("GGT","28"), 
("GGC","29"), ("GCA","30"), ("CAT","31"), ("GCG","32"), ("CGC","33"), ("GCT","34"), ("TCT","35"), ("TCG","36"), ("ACC","37"), 
("AGC","38"), ("CGG","39"), ("GAC","40"), ("CCG","41"), ("CCA","42"), ("TGC","43"), ("ACG","44"), ("GGA","45"), ("TGT","46"), 
("ACT","47"), ("TAC","48"), ("AGT","49"), ("GCC","50"), ("GAG","51"), ("GTA","52"), ("GTG","53"), ("AGG","54"), ("CGT","55"), 
("CAC","56"), ("GTC","57"), ("TCC","58"), ("CCT","59"), ("CTC","60"), ("CTA","61"), ("GGG","62"), ("TAG","63"), ("CCC","64")
```

In [None]:
c = '"AAA","1").replace("TTT","2").replace("GAA","3").replace("AAG","4").replace("AAT","5").replace("ATT","6").replace("CAA","7").replace("TGA","8").replace("TTC","9").replace("AGA","10").replace("GAT","11").replace("AAC","12").replace("TAA","13").replace("TTA","14").replace("TCA","15").replace("TAT","16").replace("ATG","17").replace("TGG","18").replace("ATC","19").replace("TTG","20").replace("ATA","21").replace("GTT","22").replace("CTG","23").replace("CTT","24").replace("ACA","25").replace("CAG","26").replace("CGA","27").replace("GGT","28").replace("GGC","29").replace("GCA","30").replace("CAT","31").replace("GCG","32").replace("CGC","33").replace("GCT","34").replace("TCT","35").replace("TCG","36").replace("ACC","37").replace("AGC","38").replace("CGG","39").replace("GAC","40").replace("CCG","41").replace("CCA","42").replace("TGC","43").replace("ACG","44").replace("GGA","45").replace("TGT","46").replace("ACT","47").replace("TAC","48").replace("AGT","49").replace("GCC","50").replace("GAG","51").replace("GTA","52").replace("GTG","53").replace("AGG","54").replace("CGT","55").replace("CAC","56").replace("GTC","57").replace("TCC","58").replace("CCT","59").replace("CTC","60").replace("CTA","61").replace("GGG","62").replace("TAG","63").replace("CCC","64"'

code_pattern_to_replace = ').replace('
c = c.replace('        ', '').replace(code_pattern_to_replace, ';').replace('","', '",') #.replace('";', ';')
c.split(';')
codon2number = {s[1:4]:int(s[6:-1]) for s in c.split(';')}
codon2number['Unknown'] = 0
# print([v for k, v in codon2number.items()])
# pprint(codon2number)

number2codon = ['Unknown']
number2codon.extend([s[1:4] for s in c.split(';')])

number2codon[:5]

['Unknown', 'AAA', 'TTT', 'GAA', 'AAG']

Save dictionary as json file

In [None]:
json_fname = data_dir /  'processed/seq2vec_codon2codes.json'

# Testing that codon2number is a proper dictionary
# 
# 1. All codes from 0 to 64 are included in the dictionary
assert set(codon2number.values()).difference(set(range(65))) == set()

# 2. dictionary keys include each of the possible 3 letter codons
full_codon_set = set([''.join(t) for t in product('ACGT', repeat=3)])
assert full_codon_set.difference(set(codon2number.keys())) == set()

# 3. Dictionary keys also include a code for "Unknow"
assert set(codon2number.keys()).difference(full_codon_set) == {'Unknown'}

with open(json_fname, 'w') as fp:
    json.dump(codon2number, fp, indent=4)  
    print(f"Saved {json_fname}")

Saved /home/vtec/projects/bio/metagentools/data/virtifier/processed/seq2vec_codon2codes.json


## Convert sequence in number into letters

In [None]:
print(X_train_300.shape)
print(X_train_500.shape)
X_train_300[:5,:10]

(9000, 298)
(9000, 498)


array([[46., 57., 35., 61., 16.,  6.,  9., 35., 61., 16.],
       [ 2.,  2., 14., 16.,  6., 20., 43., 34., 24.,  2.],
       [29., 30.,  7.,  1.,  4., 49., 52., 16., 21., 13.],
       [35., 61., 13.,  4., 10.,  3.,  1.,  5., 19., 58.],
       [37., 59., 24.,  2.,  2.,  9., 58., 42., 26., 38.]])

In [None]:
seq_nbr = 0

[number2codon[int(i)] for i in X_train_300[seq_nbr, :]][:10]

['TGT', 'GTC', 'TCT', 'CTA', 'TAT', 'ATT', 'TTC', 'TCT', 'CTA', 'TAT']

Sequences encoding:
1. TGTCTATTCTA... is split in overlapping codons with stride 1:
    - 'TGT', 'GTC', 'TCT', 'CTA', 'TAT', 'ATT', 'TTC', 'TCT', 'CTA', ...
2. Then encoded into numbers:
    - 46., 57., 35., 61., 16.,  6.,  9., 35., 61., ...

In [None]:
[number2codon[int(i)][0] for i in X_train_300[seq_nbr, :]][:10]

['T', 'G', 'T', 'C', 'T', 'A', 'T', 'T', 'C', 'T']

to get the initial sequence, without repetition:
- decode each codon in encoded sequence
- keep only the first letter
- add the last two letters of the last codon

In [None]:
seq_len = X_train_300.shape[1]
"-".join([number2codon[int(i)][0] for i in X_train_300[seq_nbr, :]]) + '-' + number2codon[int(X_train_300[seq_nbr, seq_len-1])][1:]

'T-G-T-C-T-A-T-T-C-T-A-T-T-A-T-A-T-T-A-T-A-T-A-G-A-C-G-G-A-T-T-A-G-T-C-T-C-A-A-A-C-C-T-T-T-G-A-T-A-T-T-A-A-A-A-G-G-T-T-T-G-A-G-T-T-T-T-T-T-A-T-T-T-T-T-A-T-C-T-A-A-C-A-A-T-A-G-A-A-T-T-A-A-C-A-G-A-G-T-T-T-T-T-A-A-C-A-G-A-G-T-T-C-T-A-T-T-T-T-A-A-A-A-G-T-T-G-G-C-A-T-A-C-T-T-A-G-T-A-A-A-T-A-G-T-T-C-A-A-G-C-T-C-T-T-T-A-T-T-C-T-T-C-T-T-T-T-C-G-G-G-C-T-C-A-A-G-A-T-G-T-G-A-G-T-A-T-A-G-G-T-C-C-A-T-A-G-T-C-A-A-T-T-G-C-A-A-C-T-T-T-G-A-A-T-G-A-C-C-T-A-G-C-C-T-T-T-C-T-T-G-A-A-T-C-A-C-T-T-T-A-T-A-A-C-T-C-A-T-T-T-C-A-G-C-A-T-T-T-A-A-A-C-A-A-A-G-G-C-T-A-G-C-G-T-G-T-G-A-A-C-G-C-C-T-G-A-A-T-C-C-A-T-G-A-A-A-G-CT'

In [None]:
def decode_sequence(seq):
    """Takes a sequence encoded for Seq2Vec (stride 1) and recover the initial sequence as a letter string"""
    if isinstance(seq, list):
        seq = np.array(seq)
    elif seq.ndim > 1:
        raise ValueError('seq should be a list or a 1D np.array')
    seq_len = seq.shape[0]
    sep = ''
    s = sep.join([number2codon[int(i)][0] for i in seq]) + sep + number2codon[int(seq[seq_len-1])][1:]
    return s

seq_decoded = decode_sequence(X_train_300[seq_nbr, :])
print(seq_decoded)
len(seq_decoded)

TGTCTATTCTATTATATTATATAGACGGATTAGTCTCAAACCTTTGATATTAAAAGGTTTGAGTTTTTTATTTTTATCTAACAATAGAATTAACAGAGTTTTTAACAGAGTTCTATTTTAAAAGTTGGCATACTTAGTAAATAGTTCAAGCTCTTTATTCTTCTTTTCGGGCTCAAGATGTGAGTATAGGTCCATAGTCAATTGCAACTTTGAATGACCTAGCCTTTCTTGAATCACTTTATAACTCATTTCAGCATTTAAACAAAGGCTAGCGTGTGAACGCCTGAATCCATGAAAGCT


300

In [None]:
seq_decoded = decode_sequence(X_train_500[seq_nbr, :])
print(seq_decoded)
len(seq_decoded)

TGTCTATTCTATTATATTATATAGACGGATTAGTCTCAAACCTTTGATATTAAAAGGTTTGAGTTTTTTATTTTTATCTAACAATAGAATTAACAGAGTTTTTAACAGAGTTCTATTTTAAAAGTTGGCATACTTAGTAAATAGTTCAAGCTCTTTATTCTTCTTTTCGGGCTCAAGATGTGAGTATAGGTCCATAGTCAATTGCAACTTTGAATGACCTAGCCTTTCTTGAATCACTTTATAACTCATTTCAGCATTTAAACAAAGGCTAGCGTGTGAACGCCTGAATCCATGAAAGCTCAACAACGGCAGTTCAGCATTTTTGATTATATTATTCAGCTTGTAGATAAGGTTGTGATAATCCATCACTCCACCTTCTATTTTTGGAAATACTAGGTTTTGTTGTGGATTTCCTAACTTCATAAAGTGTTTCTTTTGAAAGAAGTACCAGCTTTTTAATACATAAATTGCCTTATCGTCAATGCTAATAATTCGATTGC


500

# Embedding Matrix

```
,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.25005418,0.059795063,0.009158412,-0.4391078,-0.09833209, ... ,-0.19221346,0.23508973,-0.07497623,-0.29959005,0.24146456
1,-0.08675727,0.060839232,-0.07424304,-0.37983173,-0.25841987, ...,-0.24511199,-0.083285585,-0.24021326,0.062484346,0.4966392
```

In [None]:
embedding_file = data_dir /'raw/embedding_matrix.csv'
assert embedding_file.is_file()

embedding_matrix = np.loadtxt(open(embedding_file,"rb"), delimiter=",",skiprows=1, usecols=range(1, 21))
print(embedding_matrix.shape)
display(embedding_matrix[:2,0:5])
display(embedding_matrix[:2,-5:])
display(embedding_matrix[-2:,-5:])

(65, 20)


array([[-0.25005418,  0.05979506,  0.00915841, -0.4391078 , -0.09833209],
       [-0.08675727,  0.06083923, -0.07424304, -0.37983173, -0.25841987]])

array([[-0.19221346,  0.23508973, -0.07497623, -0.29959005,  0.24146456],
       [-0.24511199, -0.08328558, -0.24021326,  0.06248435,  0.4966392 ]])

array([[-0.23886187,  0.00119262, -0.25492588, -0.2362128 ,  0.00675272],
       [-0.52064896,  0.167717  , -0.19897348, -0.295527  ,  0.04861289]])

In [None]:
embedding_df = pd.DataFrame(embedding_matrix, index=number2codon)
embedding_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Unknown,-0.250054,0.059795,0.009158,-0.439108,-0.098332,0.185745,-0.196381,0.404267,-0.005326,0.160307,0.210252,-0.246796,0.190436,-0.07488,-0.276083,-0.192213,0.23509,-0.074976,-0.29959,0.241465
AAA,-0.086757,0.060839,-0.074243,-0.379832,-0.25842,-0.087894,-0.167048,0.222247,-0.006491,0.007524,0.175967,-0.185069,0.253603,-0.136537,-0.404523,-0.245112,-0.083286,-0.240213,0.062484,0.496639
TTT,-0.425821,0.26143,-0.061543,-0.058999,-0.020832,-0.003754,0.080069,0.069302,0.111467,0.046123,0.149116,-0.259215,0.188353,0.034878,-0.386942,-0.316909,0.402015,-0.251909,-0.338319,0.032208
GAA,-0.045727,0.044461,-0.15342,-0.254663,-0.199528,-0.259784,-0.132734,0.44487,0.161986,0.146042,0.147589,-0.154322,0.260501,-0.014606,0.021609,0.029186,0.079643,-0.347668,-0.141596,0.5243
AAG,-0.033699,-0.121312,-0.023058,-0.374022,-0.521882,0.163355,-0.132236,0.179604,0.009331,0.196445,-0.027424,-0.141101,0.367374,-0.384175,-0.056314,-0.090067,-0.152333,-0.240178,-0.104698,0.223395


# Review CAMI information

In [26]:
fasta_single = pfs.data / 'cami/gold_standard_high_single-288-seq.fasta'
fasta_assy =  pfs.data / 'cami/gold_standard_high_assy-398-seq.fasta'
fasta_assy.is_file(), fasta_single.is_file()

(True, True)

In [27]:
with open(fasta_assy, 'r') as fp:
    seq_id = []
    sequences = []
    while True:
        line = fp.readline()
        if line == '':
            print('Read all sequences')
            break
        elif line[0] == '>':
            seq_id.append(line[1:-1])
        else:
            sequences.append(line[:-1])

fasta_df = pd.DataFrame({'SeqID': seq_id, 'Sequence': sequences})
fasta_df['Prefix'] = fasta_df.SeqID.str.slice(0,5)
fasta_df['SeqID'] = fasta_df.SeqID.str.slice(5)
fasta_df.head(10)

Read all sequences


Unnamed: 0,SeqID,Sequence,Prefix
0,C0,GGCCGACTGGGACATCATCGCCAAGAGCGTGTTCGACCGCCTCGTC...,RH|P|
1,C1,TGTTTATCTATTTTACCCAGATCGGAGCTTTTTAAAAAATTTATTT...,RH|P|
2,C2,AACTATGGAACTTTTTGAAGAGGTTTCTACATATTTAAAAGGGTTT...,RH|P|
3,C3,AACCGGAGGGACCGCTTCCGGGGTTCTTCCCGTACCGGCATCGACA...,RH|P|
4,C4,TCCCGCGCGCGCCATTCGCCATGCCGGAGTCCTTGTCGGTGATGGC...,RH|P|
5,C5,GGGTTGGACGGGTTTGAGAGTAGCTCGTCGAGGGCTTCGGCGGGTT...,RH|P|
6,C6,CaccAGTTCGCTCTGGCTGTTGGCGCCCTCGAAACGGATGGGGGCG...,RH|P|
7,C7,CCGCATGAGCGCGGCATACATCGATACCTTGCGCTAAGGCGCAGGG...,RH|P|
8,C8,TCGCTTGGCCGCCGACAAGGGCTACGACGCCGACTGGCTCCGGGCC...,RH|P|
9,C9,ACATGAAGTTGAAGAGAGCCAGCTTCCACTGAAATAAGTGTGAAGG...,RH|P|


In [28]:
profile_pool = pfs.data / 'cami/goldstandard_high_pool.profile'
profile_pool.is_file()

True

In [29]:
with open(profile_pool, 'r') as fp:
    print(fp.read(500))

@SampleID:
@Version:0.9.1
@Ranks:superkingdom|phylum|class|order|family|genus|species|strain

@@TAXID	RANK	TAXPATH	TAXPATHSN	PERCENTAGE	_CAMI_genomeID	_CAMI_OTU
2157	superkingdom	2157	Archaea	0.3353		
2	superkingdom	2	Bacteria	28.2362		
976	phylum	2|976	Bacteria|Bacteroidetes	2.9050		
1224	phylum	2|1224	Bacteria|Proteobacteria	14.6433		
203691	phylum	2|203691	Bacteria|Spirochaetes	0.1037		
544448	phylum	2|544448	Bacteria|Tenericutes	0.1024		
74201	phylum	2|74201	Bacteria|Verrucomicrobia	0.1410		


In [30]:
profile_pool_df = pd.read_csv(profile_pool, sep='\t', skiprows=4)
profile_pool_df.sample(20)

Unnamed: 0,@@TAXID,RANK,TAXPATH,TAXPATHSN,PERCENTAGE,_CAMI_genomeID,_CAMI_OTU
1673,32644.7,strain,||||||32644|32644.70,||||||unidentified|unidentified strain,0.1848,Sample18_82,p2
999,45202.41,strain,||||||45202|45202.41,||||||unidentified plasmid|unidentified plasmi...,0.1077,Sample16_136,p1
38,72274.0,order,2|1224|1236|72274,Bacteria|Proteobacteria|Gammaproteobacteria|Ps...,1.0071,,
1068,379.11,strain,2|1224|28211|356|82115|379||379.11,Bacteria|Proteobacteria|Alphaproteobacteria|Rh...,0.0487,1021_AR,272
96,135620.0,family,2|1224|1236|135619|135620,Bacteria|Proteobacteria|Gammaproteobacteria|Oc...,0.1077,,
987,575302.1,strain,2|1224|28216|80840|119060|32008|575302|575302.1,Bacteria|Proteobacteria|Betaproteobacteria|Bur...,0.0191,1287_B,469
967,223904.1,strain,2|201174|1760|2037|85015|1839|223904|223904.1,Bacteria|Actinobacteria|Actinobacteria|Actinom...,0.0197,1139_BO,269
1356,32644.234,strain,||||||32644|32644.234,||||||unidentified|unidentified strain,0.0689,Sample9_68,p2
1352,33886.4,strain,2|201174|1760|2037|85023|33886||33886.4,Bacteria|Actinobacteria|Actinobacteria|Actinom...,0.0117,1285_C,396
14,1236.0,class,2|1224|1236,Bacteria|Proteobacteria|Gammaproteobacteria,2.6543,,


In [31]:
fasta_df.tail()

Unnamed: 0,SeqID,Sequence,Prefix
394,C394,CAGCCACCGGCCGGCGGTTCAACAGGAAGCCTCATCCGACGCGACC...,RH|P|
395,C395,CCTTCAGGATGCCGATGATCTGCTCTTCCGAAAACTGCTTCCGCTT...,RH|P|
396,C396,CCGACGTCGCGATCGGCGGGGGTTCGGACATCGCCGCCGGCGCGGA...,RH|P|
397,C397,ATTTAATTCCCTGAGCTCCATCAATTTGTCGTTTGGCAATTTTATA...,RH|P|
398,C398,AGCCTCCCGGGCACGTTGTTCCAGCCTCGATCCCATCTGCAGCACT...,RH|P|


In [32]:
mask = profile_pool_df.dropna()._CAMI_genomeID.str.contains('Sample18')
mask
cami_seq_idxs = profile_pool_df.dropna().loc[mask, '_CAMI_genomeID'].str.replace('Sample18_', '').astype(int)

In [33]:
cami_seq_idxs.sort_values()

1143      1
1448      2
1141      3
1659      5
899      11
       ... 
1666    360
1669    363
1667    364
1670    369
884     370
Name: _CAMI_genomeID, Length: 81, dtype: int64

# Experiment with counters

In [34]:
from collections import Counter
words = 'this is a sentence where each word is represented once or several times and the sentence is then split into a list'.split(' ')
count = Counter(words)
count.most_common(8)

[('is', 3),
 ('a', 2),
 ('sentence', 2),
 ('this', 1),
 ('where', 1),
 ('each', 1),
 ('word', 1),
 ('represented', 1)]

In [35]:
count.get('sentence'), count['sentence'], count.get('is') , count.get('blabla')

(2, 2, 3, None)

In [36]:
count.keys()

dict_keys(['this', 'is', 'a', 'sentence', 'where', 'each', 'word', 'represented', 'once', 'or', 'several', 'times', 'and', 'the', 'then', 'split', 'into', 'list'])