# SeqDatSim Notebook

Simulations of gene sequencing with shotgun sequencing methods.

In [None]:
# Allow imports from the current directory.
import os
import sys
sys.path.append( os.path.abspath(os.path.join('.')) )

import numpy as np

# This workflow uses the silvio methods directly, but it would also be possible to route every
# single method call through the SeqExperiment.
from silvio import (
    estimate_from_overlap, evaluate_sequence,
    print_scaffold, print_assembly_evaluation, print_estimation_evaluation,
    write_scaffolds_to_file
)
from catalog import SeqExperiment

In [None]:
# Set up the experiment with a fixed seed (presumably for reproducible runs — confirm).
exp = SeqExperiment(seed=2021)

# Create the host organism and fetch its genome sequence, which later cells
# use as the reference to compare assemblies against.
host = exp.create_host(
    name="origin",
    bg_size=120,
    bg_gc_content=0.6,
)
genome = host.get_genome_sequence()

# Configure the sequencer that is applied to the genome in the next cell.
shotgun = exp.create_sequencer(
    library_size_mean=80,
    library_size_sd=10,
    read_method='paired-end',
    read_length=30,
    average_coverage=5,
    call_error_beta=6,
)

# Two assemblers to compare: a random one, which is told the genome length,
# and a greedy one.
rnd_assembler = exp.create_random_assembler(
    expected_genome_size=len(genome),
)
gca_assembler = exp.create_greedy_assembler()

# Show the current state of the experiment and of the host.
exp.print_status()
host.print_status()

In [None]:
# Apply the sequencer (created in the setup cell) to the genome to obtain the scaffolds.
scafs = shotgun.apply( genome )

# Print the obtained scaffolds, framed by blank lines.
print("\n")
for scaf in scafs :
    print_scaffold(scaf)
print("\n")

# Store them in files.
# NOTE(review): this assumes the "output" directory already exists — confirm whether
# write_scaffolds_to_file creates it, otherwise create it before running this cell.
write_scaffolds_to_file( scafs, "output/tryseq_R1.fastq", "output/tryseq_R2.fastq" )


In [None]:

# Baseline: let the random assembler position the scaffolds, then build an
# estimated sequence from the positioned sequences with estimate_from_overlap.
# (Presumably rnd_assembler.apply(scafs) would do both steps in one call — confirm.)
rnd_locseqs = rnd_assembler.apply_internal(scafs)
rnd_estseq = estimate_from_overlap(rnd_locseqs)

# Shannon entropy of the estimated sequence.
rnd_entropy = rnd_estseq.calc_shannon_entropy()
print("\nRandom Positioning - Shannon Entropy: {:.4f}".format(rnd_entropy))

# Compare the positioned sequences against the real genome.
print("\nRandom Positioning - Positioning <=> Real Genome:")
print_assembly_evaluation(rnd_locseqs, genome)

# Compare the estimated sequence against the real genome.
print("\nRandom Positioning - Estimation <=> Real Genome:")
print_estimation_evaluation(rnd_estseq, genome)

# Single-number evaluation of the estimate against the real genome.
rnd_score = evaluate_sequence(rnd_estseq, genome)
print("\nRandom Positioning - Evaluation: {:.4f}".format(rnd_score))


In [None]:

# Let the greedy assembler position the scaffolds, then build an estimated
# sequence from the positioned sequences with estimate_from_overlap.
# (Presumably gca_assembler.apply(scafs) would do both steps in one call — confirm.)
gca_locseqs = gca_assembler.apply_internal(scafs)
gca_estseq = estimate_from_overlap(gca_locseqs)

# Shannon entropy of the estimated sequence.
gca_entropy = gca_estseq.calc_shannon_entropy()
print("\nGreedy Contig Assembler - Shannon Entropy: {:.4f}".format(gca_entropy))

# Compare the positioned sequences against the real genome.
print("\nGreedy Contig Assembler - Positioning <=> Real Genome:")
print_assembly_evaluation(gca_locseqs, genome)

# Compare the estimated sequence against the real genome.
print("\nGreedy Contig Assembler - Estimation <=> Real Genome:")
print_estimation_evaluation(gca_estseq, genome)

# Single-number evaluation of the estimate against the real genome.
gca_score = evaluate_sequence(gca_estseq, genome)
print("\nGreedy Contig Assembler - Evaluation: {:.4f}".format(gca_score))


