# SeqDatSim Notebook

Simulations of gene sequencing with shotgun sequencing methods.

In [1]:
# Make modules that live next to this notebook importable by appending the
# absolute path of the current working directory to the module search path.
import os
import sys
current_dir = os.path.abspath('.')
sys.path.append(current_dir)

import numpy as np

# This workflow uses the silvio methods directly, but it would be possible to redirect every
# single method call through the SeqExperiment.
from silvio import (
    estimate_from_overlap, evaluate_sequence,
    print_scaffold, print_assembly_evaluation, print_estimation_evaluation,
    write_scaffolds_to_file
)
from catalog import SeqExperiment

In [2]:
# Experiment with a fixed seed.
exp = SeqExperiment(seed=2021)

# Host named "origin" and the genome sequence obtained from it.
host = exp.create_host(name="origin", bg_size=120, bg_gc_content=0.6)
genome = host.get_genome_sequence()

# Keyword arguments for the sequencer, gathered in one place so they are easy to tune.
sequencer_settings = dict(
    library_size_mean=80,
    library_size_sd=10,
    read_method='paired-end',
    read_length=30,
    average_coverage=5,
    call_error_beta=6,
)
shotgun = exp.create_sequencer(**sequencer_settings)

# The two assemblers used further down; the random one is told the genome length.
rnd_assembler = exp.create_random_assembler(expected_genome_size=len(genome))
gca_assembler = exp.create_greedy_assembler()

# Report the status of the experiment and of the host.
exp.print_status()
host.print_status()

Experiment:
  hosts = [ origin ]
Host [origin]:
  seed plus counter = 2282102634837760632 + 1
  Gene List: 0 genes
  Event History: 0 events


In [4]:
# Apply the sequencer to the genome, which fragments it into scaffolds.
scafs = shotgun.apply(genome)

# Show every scaffold, framed by blank lines above and below.
print("\n")
for scaf in scafs:
    print_scaffold(scaf)
print("\n")

# Write the scaffolds to the R1 and R2 FASTQ files.
write_scaffolds_to_file(scafs, "output/tryseq_R1.fastq", "output/tryseq_R2.fastq")




| TCGTGCCCGGGTTCGGTTTTGAGCCGTCTA~~~~~~~~~~~(ca.20)~~~~~~~~~~~TGTAAACGGCGATTACGGCTAGGGTGGCGC
| CAAGGGGACGTTTCGGCAGGCCTCGTGCCC~~~~~~~~~~~(ca.20)~~~~~~~~~~~CGGTTTTGAGCCGTCTACCCCGCGCGAAGC
| TTGAGCCGTCTACCCCGCGCGAAGCTTCAC~~~~~~~~~~~(ca.20)~~~~~~~~~~~AACGGCGATTACGGCTAGGGTGGCGCGGGG
| CGGGTTCGGTTTTGAGCCGTCTACCCCGCG~~~~~~~~~~~(ca.20)~~~~~~~~~~~AACCTGTAAACGGCGATTACGGCTAGGGTG
| TTCGGCAGGCCTCGTGCCCGGGTTCGGTTT~~~~~~~~~~~(ca.20)~~~~~~~~~~~CGAAGCTTCACATGAACCTGTAAACGGCGA
| GCCGTCTACCCCGCGCGAAGCTTCACATGA~~~~~~~~~~~(ca.20)~~~~~~~~~~~GCGATTACGGCTAGGGTGGCGCGGGGCCGG
| TTTTGAGCCGTCTACCCCGCGCGAAGCTTC~~~~~~~~~~~(ca.20)~~~~~~~~~~~AACGGCGATTACGGCTAGGGTGGCGCGGGG
| GGGACGTTTCGGCAGGCCTCGTGCCCGGGT~~~~~~~~~~~(ca.20)~~~~~~~~~~~TGTAAACGGCGATTACGGCTAGGGTGGCGC
| ACGTTTCGGCAGGCCTCGTGCCCGGGTTCG~~~~~~~~~~~(ca.20)~~~~~~~~~~~GCCGTCTACCGCGCGCGAAGCTTCACATGA
| AGCCGTCTACCCCGCGCGAAGCTTCACATG~~~~~~~~~~~(ca.20)~~~~~~~~~~~GGCGATTACGGCTAGGGTGGCGCGGGGCCG


R1 file stored in: /home/shima/Projects/BioLabSim/repo-biolabsim/notebooks/S

In [7]:

# Baseline: position the obtained R1 sequences at random and derive a consensus
# estimate from their overlap.
# NOTE(review): the original comment suggests rnd_assembler.apply(scafs) is the
# one-call alternative to these two steps -- confirm.
rnd_locseqs = rnd_assembler.apply_internal(scafs)
rnd_estseq = estimate_from_overlap(rnd_locseqs)

# Shannon entropy of the estimated sequence.
rnd_entropy = rnd_estseq.calc_shannon_entropy()
print("\nRandom Positioning - Shannon Entropy: {:.4f}".format(rnd_entropy))

# Compare the positioned sequences against the real genome.
print("\nRandom Positioning - Positioning <=> Real Genome:")
print_assembly_evaluation(rnd_locseqs, genome)

# Compare the estimated sequence against the real genome.
print("\nRandom Positioning - Estimation <=> Real Genome:")
print_estimation_evaluation(rnd_estseq, genome)

# Single evaluation value for the estimate.
rnd_score = evaluate_sequence(rnd_estseq, genome)
print("\nRandom Positioning - Evaluation: {:.4f}".format(rnd_score))



Random Positioning - Shannon Entropy: 1.3246

Random Positioning - Positioning <=> Real Genome:
-----------------------------------------------------------------------------------------------------------------------------------------------
[heatmap:A] ##····5442··2·12221·111111·2·112···3···22···12·22····62·11···4·2··2····35··22·224113··23133··3113·52422···3·6335····#·············
[heatmap:C] ··#3···4·2226512·13322332·522253261125762121·62222·222·43·14246644·84443··5··64425215325·515515315·2·45555·33335··#···············
[heatmap:G] ···366··4·4625622123531127335513335·852248735·4225722·86351642424482666·25522424214556415·3131133334442·2233··3·##·#··············
[heatmap:T] ···333522642···1452212333111212·3·35····2··53242222552··136·4···22·····325·56·2·211···2·3113131111122··52233·3····················
-----------------------------------------------------------------------------------------------------------------------------------------------
 [coverage] 1113334555555667788888888888898

In [11]:

# Assemble the scaffolds with the greedy assembler and derive a consensus
# estimate from the overlap of the positioned sequences.
# NOTE(review): the original comment suggests gca_assembler.apply(scafs) is the
# one-call alternative to these two steps -- confirm.
gca_locseqs = gca_assembler.apply_internal(scafs)
gca_estseq = estimate_from_overlap(gca_locseqs)

# Shannon entropy of the estimated sequence.
gca_entropy = gca_estseq.calc_shannon_entropy()
print("\nGreedy Contig Assembler - Shannon Entropy: {:.4f}".format(gca_entropy))

# Compare the positioned sequences against the real genome.
print("\nGreedy Contig Assembler - Positioning <=> Real Genome:")
print_assembly_evaluation(gca_locseqs, genome)

# Compare the estimated sequence against the real genome.
print("\nGreedy Contig Assembler - Estimation <=> Real Genome:")
print_estimation_evaluation(gca_estseq, genome)

# Single evaluation value for the estimate.
gca_score = evaluate_sequence(gca_estseq, genome)
print("\nGreedy Contig Assembler - Evaluation: {:.4f}".format(gca_score))





Greedy Contig Assembler - Shannon Entropy: 0.0000

Greedy Contig Assembler - Positioning <=> Real Genome:
-------------------------------------------------------------------------------------------------------------------------------------
[heatmap:A] ··················#·#·····#·#·····#··#···#·##·····························#·····#········#··#··####···#··#·#·#···#······
[heatmap:C] ··········#····#·····##··········#·····#··········#·#·#···####·#···##··#······#··#·#··#·#·#··#·····#·#··#·#·············
[heatmap:G] ············###·##·#····#··#·##····#····#·#·····##·#·#·###····#·#·#··##·##··##····#··#························#·········
[heatmap:T] ·········#·#···········#·#·····##···#·#······###·················#·········#···#····#··#···#··#·····#··#····#··##·####··
-------------------------------------------------------------------------------------------------------------------------------------
 [coverage] ·········11111123355556666888888899####999999877555544442333333232222334455677777