## Tests

Test: check that the gb genebank file info matches a fafsa file sequence. 
A 'diff'.

python3 -m venv .venv-dependencies
source .venv-dependencies/bin/activate
pip install -r requirements.txt
python -m ipykernel install --user \
    --name dependencies-venv \
    --display-name "Python (.venv-dependencies)"


In [1]:
# Run this code inside a Jupyter cell. If all three lines point to your .venv folder in WSL, your setup is perfect:

import sys
import os

print(f"1. Executable: {sys.executable}") 
print(f"2. Version: {sys.version}")
print(f"3. Env Path: {os.getenv('VIRTUAL_ENV')}")

1. Executable: /home/morgan/projects/bioinfo-projects/comparative-sequence-analysis/.venv/bin/python
2. Version: 3.11.2 (main, Apr 28 2025, 14:11:48) [GCC 12.2.0]
3. Env Path: /home/morgan/projects/bioinfo-projects/comparative-sequence-analysis/.venv


In [2]:
from os import path

In [3]:
import Bio
from Bio import SeqIO


In [None]:
from Bio import GenBank
def get_gb_sequence__orig(g_path):
    with open(g_path, 'r') as G_file: 
        records = [x for  x in GenBank.parse(G_file)]
        if not len(records)==1:
           raise RuntimeError(f"Number of records not handled: {str(len(records))}")
        return records[0].sequence

In [54]:
def get_gb_sequence(g_path):
    for ind,seq_record in enumerate(SeqIO.parse(g_path, "gb")):
        print('--- seq number', ind)
        print(seq_record.id)
        print(repr(seq_record.seq))
        print(len(seq_record))
    return seq_record.seq

In [57]:
def get_fasta_sequence(f_path):
    for ind,seq_record in enumerate(SeqIO.parse(f_path, "fasta")):
        print('--- seq number', ind)
        print(seq_record.id)
        print(repr(seq_record.seq))
        print(len(seq_record))
    return seq_record.seq

In [63]:
def verify_gb_fafsa_compatibility(g, f, relpath=''):
    g_path = path.join(relpath, g)
    f_path = path.join(relpath, f)
    print('gb:')
    gb_seq = get_gb_sequence(g_path)
    print('fasta:')
    fa_seq = get_fasta_sequence(f_path)
    print('done')
    return gb_seq, fa_seq


In [64]:
gb_seq, fa_seq = verify_gb_fafsa_compatibility('sequence.gb', 'ncbi_dataset/data/gene.fna', relpath='./data/sirt1-human')

gb:
--- seq number 0
NM_001142498.2
Seq('GCATCTCCTCCTCCCTCTCCCCGGGCTCCTACTGGCCTGAGGTTGAGGGCGGCT...TTA')
3588
fasta:
--- seq number 0
NC_000010.11:67884656-67918390
Seq('GCCAGTGCCGCGCGTCGAGCGGGAGCAGAGGAGGCGAGGGAGGAGGGCCAGAGA...TTA')
33735
--- seq number 1
NC_060934.1:68753401-68787131
Seq('GCCAGTGCCGCGCGTCGAGCGGGAGCAGAGGAGGCGAGGGAGGAGGGCCAGAGA...TTA')
33731
done


In [69]:
print(gb_seq[:120])
print(fa_seq[25:][:120])


GCATCTCCTCCTCCCTCTCCCCGGGCTCCTACTGGCCTGAGGTTGAGGGCGGCTGGGGGCTCGGGGCAGGCTCCGCGGCGTTCCCCTCCCCACCCCGGCCCTCCGTTCAGCCGCGCTCCT
CAGAGGAGGCGAGGGAGGAGGGCCAGAGAGGCAGTTGGAAGATGGCGGACGAGGCGGCCCTCGCCCTTCAGCCCGGCGGCTCCCCCTCGGCGGCGGGGGCCGACAGGGAGGCCGCGTCGT


In [36]:
def get_fasta_sequence_length(f_path):
    for ind,seq_record in enumerate(SeqIO.parse(f_path, "fasta")):
        return len(seq_record.seq)

files_found = []
for root, dirs, files in os.walk("data"):
    # print(f"Current Directory: {root}")
    # print(f"Subdirectories: {dirs}")
    # print(f"Files: {files}")
    # print("-" * 20)
    DEBUG=False
    for f in files:
        if f[-6:]=='.fasta':
            fpath = root.split('/') + dirs + [f]
            printing_str = '__'.join(fpath)
            if DEBUG and 'human' in root:
                print (fpath)
            try:
                with open('/'.join(fpath), 'r') as filetoread:
                    print (printing_str)
                    print('  ', str(get_fasta_sequence_length(filetoread)))
                files_found.append('/'.join(fpath))
            except:
                print("Could not open this file or commence sequence-length-getting operations on it:", printing_str)
    


data__sir2-p-aeruginosa__sequence(4).fasta
   250
data__sir2-yeast__sequence(3).fasta
   562
data__sirt1-horn-fly__sequence-sirtuin-1.fasta
   872
data__sirt1-horn-fly__sequence-sirtuin-2.fasta
   381
data__sirt1-mouse__sequence(1).fasta
   576
Could not open this file or commence sequence-length-getting operations on it: data__sirt1-human__ncbi_dataset__sequence(2).fasta


In [37]:
files_found

['data/sir2-p-aeruginosa/sequence(4).fasta',
 'data/sir2-yeast/sequence(3).fasta',
 'data/sirt1-horn-fly/sequence-sirtuin-1.fasta',
 'data/sirt1-horn-fly/sequence-sirtuin-2.fasta',
 'data/sirt1-mouse/sequence(1).fasta']

In [None]:
from Bio import SeqIO
sequences = [SeqIO.read(file_, 'fasta') for file_ in files_found]

In [54]:
for s in sequences:
    print(s, '\n')

ID: XLJ82145.1
Name: XLJ82145.1
Description: XLJ82145.1 SIR2 family NAD-dependent protein deacylase [Pseudomonas aeruginosa]
Number of features: 0
Seq('MRAVVELLAGARRLVIFTGAGVSAESGIPTFRDALGGLWARYDPAALATPAAFA...FPG') 

ID: CAA96447.1
Name: CAA96447.1
Description: CAA96447.1 SIR2 [Saccharomyces cerevisiae]
Number of features: 0
Seq('MTIPHMKYAVSKTSENKVSNTVSPTQDKDAIRKQPDDIINNDEPSHKKIKVAQP...KTL') 

ID: XP_075153850.1
Name: XP_075153850.1
Description: XP_075153850.1 sirtuin 1 [Haematobia irritans]
Number of features: 0
Seq('MMDSSRQAVLSSERLKDIDDIHPIEFPEKADFDKFSVKTQNFTFGANILNTTTM...GPS') 

ID: XP_075148462.1
Name: XP_075148462.1
Description: XP_075148462.1 sirtuin 2 [Haematobia irritans]
Number of features: 0
Seq('MSESPTSSNKNKSKDDNETSSSAQNEEDNTIEVIRKFFTQKLNLVTSLDEEDGA...DAV') 

ID: AAI52315.1
Name: AAI52315.1
Description: AAI52315.1 Sirt1 protein [Mus musculus]
Number of features: 0
Seq('MAAAAAAAAIGYRGPYTFVQQHLMIGTDPRTILKDLLPETIPPPELDDMTLWQI...DKS') 

