# Biology Reference Info

- List of reference information
- Creation of reference `json` files with bio metadata

In [None]:
import json

from pathlib import Path
from pprint import pprint

## DNA Codons

List of groups as per Virtifier paper's supplementary info: Fig. S6

In [None]:
# Codon groups taken from Fig. S6 of the Virtifier paper's supplementary info.
# Each value is the list of DNA codons assigned to that group.
codon_groups = {
    group: codons.split(' ')
    for group, codons in (
        ('Leucine', 'TTA CTA'),
        ('Group 2', 'CTG CTT'),
        ('Glycine', 'GGA GGC GGG GGT'),
        ('Threonine', 'ACA ACC ACG ACT'),
        ('Alanine', 'GCA GCC GCG GCT'),
        ('Proline', 'CCA CCC CCG CCT'),
    )
}

### Standard DNA Codon table

- [Wikipedia](https://en.wikipedia.org/wiki/DNA_and_RNA_codon_tables)
- 64 codons

#### Direct DNA codon table


<table>
<caption>Standard genetic code</caption>
<tbody><tr>
<th rowspan="2">1st<br>base
</th>
<th colspan="8">2nd base
</th>
<th rowspan="2">3rd<br>base
</th></tr>
<tr>
<th colspan="2" style="width:150px;">T
</th>
<th colspan="2" style="width:150px;">C
</th>
<th colspan="2" style="width:150px;">A
</th>
<th colspan="2" style="width:150px;">G
</th></tr>
<tr>
<th rowspan="4"><a href="/wiki/Thymine" title="Thymine">T</a>
</th>
<td>TTT
</td>
<td rowspan="2" >(Phe/F) <a href="/wiki/Phenylalanine" title="Phenylalanine">Phenylalanine</a>  
</td>
<td>TCT
</td>
<td rowspan="4" >(Ser/S) <a href="/wiki/Serine" title="Serine">Serine</a>  
</td>
<td>TAT
</td>
<td rowspan="2" >(Tyr/Y) <a href="/wiki/Tyrosine" title="Tyrosine">Tyrosine</a>  
</td>
<td>TGT
</td>
<td rowspan="2" >(Cys/C) <a href="/wiki/Cysteine" title="Cysteine">Cysteine</a>  
</td>
<th>T
</th></tr>
<tr>
<td>TTC
</td>
<td>TCC
</td>
<td>TAC
</td>
<td>TGC
</td>
<th>C
</th></tr>
<tr>
<td>TTA
</td>
<td rowspan="6" >(Leu/L) <a href="/wiki/Leucine" title="Leucine">Leucine</a>  
</td>
<td>TCA
</td>
<td>TAA
</td>
<td ><a href="/wiki/Stop_codon" title="Stop codon">Stop</a> (<i>Ochre</i>) 
</td>
<td>TGA
</td>
<td ><a href="/wiki/Stop_codon" title="Stop codon">Stop</a> (<i>Opal</i>) 
</td>
<th>A
</th></tr>
<tr>
<td >TTG  
</td>
<td>TCG
</td>
<td>TAG
</td>
<td ><a href="/wiki/Stop_codon" title="Stop codon">Stop</a> (<i>Amber</i>)
</td>
<td>TGG
</td>
<td >(Trp/W) <a href="/wiki/Tryptophan" title="Tryptophan">Tryptophan</a>  
</td>
<th>G
</th></tr>

<tr>
<th rowspan="4"><a href="/wiki/Cytosine" title="Cytosine">C</a>
</th>
<td>CTT
</td>
<td>CCT
</td>
<td rowspan="4" >(Pro/P) <a href="/wiki/Proline" title="Proline">Proline</a>  
</td>
<td>CAT
</td>
<td rowspan="2" >(His/H) <a href="/wiki/Histidine" title="Histidine">Histidine</a>  
</td>
<td>CGT
</td>
<td rowspan="4" >(Arg/R) <a href="/wiki/Arginine" title="Arginine">Arginine</a>  
</td>
<th>T
</th></tr>
<tr>
<td>CTC
</td>
<td>CCC
</td>
<td>CAC
</td>
<td>CGC
</td>
<th>C
</th></tr>
<tr>
<td>CTA
</td>
<td>CCA
</td>
<td>CAA
</td>
<td rowspan="2" >(Gln/Q) <a href="/wiki/Glutamine" title="Glutamine">Glutamine</a>  
</td>
<td>CGA
</td>
<th>A
</th></tr>
<tr>
<td>CTG
</td>
<td>CCG
</td>
<td>CAG
</td>
<td>CGG
</td>
<th>G
</th></tr>
<tr>
<th rowspan="4"><a href="/wiki/Adenine" title="Adenine">A</a>
</th>
<td>ATT
</td>
<td rowspan="3" >(Ile/I) <a href="/wiki/Isoleucine" title="Isoleucine">Isoleucine</a>  
</td>
<td>ACT
</td>
<td rowspan="4" >(Thr/T) <a href="/wiki/Threonine" title="Threonine">Threonine</a>  
</td>
<td>AAT
</td>
<td rowspan="2" >(Asn/N) <a href="/wiki/Asparagine" title="Asparagine">Asparagine</a>  
</td>
<td>AGT
</td>
<td rowspan="2" >(Ser/S) <a href="/wiki/Serine" title="Serine">Serine</a>  
</td>
<th>T
</th></tr>
<tr>
<td>ATC
</td>
<td>ACC
</td>
<td>AAC
</td>
<td>AGC
</td>
<th>C
</th></tr>
<tr>
<td>ATA
</td>
<td>ACA
</td>
<td>AAA
</td>
<td rowspan="2" >(Lys/K) <a href="/wiki/Lysine" title="Lysine">Lysine</a>  
</td>
<td>AGA
</td>
<td rowspan="2" >(Arg/R) <a href="/wiki/Arginine" title="Arginine">Arginine</a>  
</td>
<th>A
</th></tr>
<tr>
<td>ATG  
</td>
<td>(Met/M) <a href="/wiki/Methionine" title="Methionine">Methionine</a>  
</td>
<td>ACG
</td>
<td>AAG
</td>
<td>AGG
</td>
<th>G
</th></tr>

<tr>
<th rowspan="4"><a href="/wiki/Guanine" title="Guanine">G</a>
</th>
<td>GTT
</td>
<td rowspan="4" >(Val/V) <a href="/wiki/Valine" title="Valine">Valine</a>  
</td>
<td>GCT
</td>
<td rowspan="4">(Ala/A) <a href="/wiki/Alanine" title="Alanine">Alanine</a>  
</td>
<td>GAT
</td>
<td rowspan="2" >(Asp/D) <a href="/wiki/Aspartic_acid" title="Aspartic acid">Aspartic acid</a> ↓
</td>
<td>GGT
</td>
<td rowspan="4" >(Gly/G) <a href="/wiki/Glycine" title="Glycine">Glycine</a>  
</td>
<th>T
</th></tr>
<tr>
<td>GTC
</td>
<td>GCC
</td>
<td>GAC
</td>
<td>GGC
</td>
<th>C
</th></tr>
<tr>
<td>GTA
</td>
<td>GCA
</td>
<td>GAA
</td>
<td rowspan="2" >(Glu/E) <a href="/wiki/Glutamic_acid" title="Glutamic acid">Glutamic acid</a> ↓
</td>
<td>GGA
</td>
<th>A
</th></tr>
<tr>
<td>GTG  
</td>
<td>GCG
</td>
<td>GAG
</td>
<td>GGG
</td>
<th>G
</th></tr>
</tbody>
</table>

#### Inverse DNA Codon Table

<table style="vertical-align:top;">
<caption>Inverse table for the standard genetic code (compressed using <a href="/wiki/Nucleic_acid_notation" title="Nucleic acid notation">IUPAC notation</a>)
</caption>
<tbody><tr>
<th>Amino acid</th>
<th>DNA codons</th>
<th>Compressed
</th>
<td rowspan="13">
</td>
<th>Amino acid</th>
<th>DNA codons</th>
<th>Compressed
</th></tr>
<tr>
<th style="text-align:center;">Ala, A
</th>
<td>GCT, GCC, GCA, GCG
</td>
<td>GCN
</td>
<th style="text-align:center;">Ile, I
</th>
<td>ATT, ATC, ATA
</td>
<td>ATH
</td></tr>
<tr>
<th style="text-align:center;">Arg, R
</th>
<td>CGT, CGC, CGA, CGG; AGA, AGG
</td>
<td>CGN, AGR; or<br> CGY, MGR
</td>
<th style="text-align:center;">Leu, L
</th>
<td>CTT, CTC, CTA, CTG; TTA, TTG
</td>
<td>CTN, TTR; or <br>CTY, YTR
</td></tr>
<tr>
<th style="text-align:center;">Asn, N
</th>
<td>AAT, AAC
</td>
<td>AAY
</td>
<th style="text-align:center;">Lys, K
</th>
<td>AAA, AAG
</td>
<td>AAR
</td></tr>
<tr>
<th style="text-align:center;">Asp, D
</th>
<td>GAT, GAC
</td>
<td>GAY
</td>
<th style="text-align:center;">Met, M
</th>
<td colspan="2">ATG
</td></tr>
<tr>
<th style="text-align:center;">Asn or Asp, B
</th>
<td>AAT, AAC; GAT, GAC
</td>
<td>RAY
</td>
<th style="text-align:center;">Phe, F
</th>
<td>TTT, TTC
</td>
<td>TTY
</td></tr>
<tr>
<th style="text-align:center;">Cys, C
</th>
<td>TGT, TGC
</td>
<td>TGY
</td>
<th style="text-align:center;">Pro, P
</th>
<td>CCT, CCC, CCA, CCG
</td>
<td>CCN
</td></tr>
<tr>
<th style="text-align:center;">Gln, Q
</th>
<td>CAA, CAG
</td>
<td>CAR
</td>
<th style="text-align:center;">Ser, S
</th>
<td>TCT, TCC, TCA, TCG; AGT, AGC
</td>
<td>TCN, AGY
</td></tr>
<tr>
<th style="text-align:center;">Glu, E
</th>
<td>GAA, GAG
</td>
<td>GAR
</td>
<th style="text-align:center;">Thr, T
</th>
<td>ACT, ACC, ACA, ACG
</td>
<td>ACN
</td></tr>
<tr>
<th style="text-align:center;">Gln or Glu, Z
</th>
<td>CAA, CAG; GAA, GAG
</td>
<td>SAR
</td>
<th style="text-align:center;">Trp, W
</th>
<td colspan="2">TGG
</td></tr>

<tr>
<th style="text-align:center;">Gly, G
</th>
<td>GGT, GGC, GGA, GGG
</td>
<td>GGN
</td>
<th style="text-align:center;">Tyr, Y
</th>
<td>TAT, TAC
</td>
<td>TAY
</td></tr>
<tr>
<th style="text-align:center;">His, H
</th>
<td>CAT, CAC
</td>
<td>CAY
</td>
<th style="text-align:center;">Val, V
</th>
<td>GTT, GTC, GTA, GTG
</td>
<td>GTN
</td></tr>
<tr>
<th style="text-align:center;">START
</th>
<td colspan="2">ATG
</td>
<th style="text-align:center;">STOP
</th>
<td>TAA, TGA, TAG
</td>
<td>TRA, TAR
</td></tr></tbody></table>

# Save Codon info into json files

Encode the inverse table as a list of `(short name, long name, codons)` tuples; it is converted into a dictionary further below.

In [None]:
# Inverse codon table as (short name, long name, codons) tuples.
# Codons within an entry are separated by ', ' only: the consumer splits on
# that exact separator, so a ';' (as used in the source table between codon
# families) would fuse two codons into one bogus entry such as 'CTG; TTA'.
# The ambiguity codes B (Asn or Asp) and Z (Gln or Glu) are left commented out;
# their codons are already listed under the individual amino acids.
l = [
    ('Ala, A','Alanine','GCT, GCC, GCA, GCG'),
    ('Arg, R','Arginine','CGT, CGC, CGA, CGG, AGA, AGG'),
    ('Asn, N','Asparagine','AAT, AAC'),
    # ('Asn, B','Asn or Asp, B','AAT, AAC, GAT, GAC'),
    ('Asp, D','Aspartic acid','GAT, GAC'),
    ('Cys, C','Cysteine','TGT, TGC'),
    ('Gln, Q','Glutamine','CAA, CAG'),
    ('Glu, E','Glutamic acid','GAA, GAG'),
    # ('Gln, Z','Gln or Glu, Z','CAA, CAG, GAA, GAG'),
    ('Gly, G','Glycine','GGT, GGC, GGA, GGG'),
    ('His, H','Histidine','CAT, CAC'),
    ('Ile, I','Isoleucine','ATT, ATC, ATA'),
    ('Leu, L','Leucine','CTT, CTC, CTA, CTG, TTA, TTG'),
    ('Lys, K','Lysine','AAA, AAG'),
    ('Met, M','Methionine','ATG'),
    ('Phe, F','Phenylalanine','TTT, TTC'),
    ('Pro, P','Proline','CCT, CCC, CCA, CCG'),
    ('Ser, S','Serine','TCT, TCC, TCA, TCG, AGT, AGC'),
    ('Thr, T','Threonine','ACT, ACC, ACA, ACG'),
    ('Trp, W','Tryptophan','TGG'),
    ('Tyr, Y','Tyrosine','TAT, TAC'),
    ('Val, V','Valine','GTT, GTC, GTA, GTG'),
    # 'ATG' is listed under both 'Met, M' and 'START'.
    ('START','START','ATG'),
    ('STOP','STOP','TAA, TGA, TAG')
]

Create a json file with the inverse table:
```json
{
    'amino acid': {
        'amino acid long': long name,
        'codons': list of codons
    },
    'Phe, F': {
        'amino acid long': 'Phenylalanine',
        'codons': ['TTT', 'TTC']
    }
}
```

In [None]:
# Build the inverse table: short amino-acid key -> long name and list of codons.
# The source table separates codon families with ';' (e.g. 'CTG; TTA'), so both
# ',' and ';' are accepted as separators and surrounding whitespace is dropped.
# Splitting on ', ' alone would leave 'CTG; TTA' fused into one bogus codon.
dna_codon_inv_table = {
    k: {
        'amino acid long': n,
        'codons': [c.strip() for c in s.replace(';', ',').split(',') if c.strip()],
    }
    for k, n, s in l
}

# Spot-check one entry.
key = 'Phe, F'
dna_codon_inv_table[key]

{'amino acid long': 'Phenylalanine', 'codons': ['TTT', 'TTC']}

In [None]:
# Resolve the data directory (relative to the current working directory)
# and stop early if it does not exist.
proc_data = Path('../data/').resolve()
assert proc_data.is_dir()

# Location of the inverse codon table JSON file.
p2dna_inv_table = proc_data.joinpath('dna_codon_inverse_table.json')

# Writing is currently disabled; uncomment to (re)generate the file.
# with open(p2dna_inv_table, 'w') as fp:
#     json.dump(dna_codon_inv_table, fp, indent=4)

Create a json file with the direct table:
```json
{
    'codon':{
        'amino acid': name of the amino acid,
        'amino acid long': long name
    },
    'TTT':{
        'amino acid': 'Phe, F',
        'amino acid long': 'Phenylalanine'
    }
}
```

In [None]:
# Invert the inverse table: codon -> short and long amino-acid names.
# Entries are processed in table order, so a codon listed under several keys
# keeps the last one seen (e.g. 'ATG' appears under both 'Met, M' and 'START').
dna_codon_table = {}
for amino_acid, info in dna_codon_inv_table.items():
    long_name = info['amino acid long']
    dna_codon_table.update(
        (codon, {'amino acid': amino_acid, 'amino acid long': long_name})
        for codon in info['codons']
    )

# Spot-check one entry.
key = 'TTT'
dna_codon_table[key]

{'amino acid': 'Phe, F', 'amino acid long': 'Phenylalanine'}

In [None]:
# Location of the direct codon table JSON file.
p2dna_table = proc_data.joinpath('dna_codon_table.json')

# Writing is currently disabled; uncomment to (re)generate the file.
# with open(p2dna_table, 'w') as fp:
#     json.dump(dna_codon_table, fp, indent=4)

# dna_codon_metadata

In [None]:
# Read the inverse codon table back from disk and display it.
with p2dna_inv_table.open('r') as fp:
    dna_info = json.load(fp)
dna_info

{'Ala, A': {'amino acid long': 'Alanine',
  'codons': ['GCT', 'GCC', 'GCA', 'GCG']},
 'Arg, R': {'amino acid long': 'Arginine',
  'codons': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG']},
 'Asn, N': {'amino acid long': 'Asparagine', 'codons': ['AAT', 'AAC']},
 'Asp, D': {'amino acid long': 'Aspartic acid', 'codons': ['GAT', 'GAC']},
 'Cys, C': {'amino acid long': 'Cysteine', 'codons': ['TGT', 'TGC']},
 'Gln, Q': {'amino acid long': 'Glutamine', 'codons': ['CAA', 'CAG']},
 'Glu, E': {'amino acid long': 'Glutamic acid', 'codons': ['GAA', 'GAG']},
 'Gly, G': {'amino acid long': 'Glycine',
  'codons': ['GGT', 'GGC', 'GGA', 'GGG']},
 'His, H': {'amino acid long': 'Histidine', 'codons': ['CAT', 'CAC']},
 'Ile, I': {'amino acid long': 'Isoleucine', 'codons': ['ATT', 'ATC', 'ATA']},
 'Leu, L': {'amino acid long': 'Leucine',
  'codons': ['CTT', 'CTC', 'CTA', 'CTG; TTA', 'TTG']},
 'Lys, K': {'amino acid long': 'Lysine', 'codons': ['AAA', 'AAG']},
 'Met, M': {'amino acid long': 'Methionine', 'codons

In [None]:
# Read the direct codon table back from disk and display it.
with p2dna_table.open('r') as fp:
    dna_info = json.load(fp)
dna_info

{'GCT': {'amino acid': 'Ala, A', 'amino acid long': 'Alanine'},
 'GCC': {'amino acid': 'Ala, A', 'amino acid long': 'Alanine'},
 'GCA': {'amino acid': 'Ala, A', 'amino acid long': 'Alanine'},
 'GCG': {'amino acid': 'Ala, A', 'amino acid long': 'Alanine'},
 'CGT': {'amino acid': 'Arg, R', 'amino acid long': 'Arginine'},
 'CGC': {'amino acid': 'Arg, R', 'amino acid long': 'Arginine'},
 'CGA': {'amino acid': 'Arg, R', 'amino acid long': 'Arginine'},
 'CGG': {'amino acid': 'Arg, R', 'amino acid long': 'Arginine'},
 'AGA': {'amino acid': 'Arg, R', 'amino acid long': 'Arginine'},
 'AGG': {'amino acid': 'Arg, R', 'amino acid long': 'Arginine'},
 'AAT': {'amino acid': 'Asn, B', 'amino acid long': 'Asn or Asp, B'},
 'AAC': {'amino acid': 'Asn, B', 'amino acid long': 'Asn or Asp, B'},
 'GAT': {'amino acid': 'Asp, D', 'amino acid long': 'Aspartic acid'},
 'GAC': {'amino acid': 'Asp, D', 'amino acid long': 'Aspartic acid'},
 'TGT': {'amino acid': 'Cys, C', 'amino acid long': 'Cysteine'},
 'TGC': {