Example Script for Sequencing
===


In [5]:
from biolabsim import set_seed, FabricatedHost, Sequencer, print_scaffold, write_scaffolds_to_file, \
    RandomAssembler, estimate_from_overlap, print_assembly_evaluation, print_estimation_evaluation, \
    GreedyContigAssembler, EstimatedSequence, evaluate_sequence

# Fix the random seed before any simulation step runs. The value lives in a
# named constant so it can be changed in one place.
RANDOM_SEED = 20210121
set_seed(RANDOM_SEED)

In [2]:

# Fabricate a deliberately small host (genome_size=80, gc_content=0.6) to keep
# the example simple, then pull out its genome.
host = FabricatedHost(genome_size=80, gc_content=0.6)
genome = host.get_genome()

# Set up the paired-end sequencer and apply it to the genome; the sequencer
# fragments the genome and the result is kept in `scaffolds`.
shotgun = Sequencer(
    library_size_mean=50,
    library_size_sd=5,
    read_method='paired-end',
    read_length=20,
    average_coverage=5,
    call_error_beta=2.85,
)
scaffolds = shotgun.apply(genome)

In [3]:
# Print every scaffold, framed by blank lines. Iterate over the scaffolds
# directly instead of indexing through range(len(...)).
print("\n")
for scaffold in scaffolds:
    print_scaffold(scaffold)
print("\n")

# Store the scaffolds in two FASTQ files (paths are relative to the working directory).
write_scaffolds_to_file( scaffolds, "output/present-R1.fastq", "output/present-R2.fastq" )



| TATGGGTTCGCTAGTACCCA~~~~~~(ca.10)~~~~~~GTCGAGGCGCGTCACTATCT
| GGTTCGCTAGTACCCAGTAC~~~~~~(ca.10)~~~~~~CGGCAATTCAGTCGAGGCGC
| GCGATCCATCGGTGCCATAT~~~~~~(ca.10)~~~~~~GGTTCGCTAGTACCCAGTAC
| ACCGCGATCCATCGGTGCCA~~~~~~(ca.10)~~~~~~TAGTACCCAGTACCGGCAAT
| CGGTGCCATATATGGGTTCG~~~~~~(ca.10)~~~~~~AATTCAGTCGAGGCGCGTCA
| GTTCGCTAGTACCCAGTACC~~~~~~(ca.10)~~~~~~GCAATTCAGTCGAGGCGCGT
| CCATCGGTGCCATATATGGG~~~~~~(ca.10)~~~~~~TACCGGCAATTCAGTCGAGG
| GCTAGTACCCAGTACCGGCA~~~~~~(ca.10)~~~~~~GAGGCGCGTCACTATCTCTA
| ATATATGGGTTCGCTAGTAC~~~~~~(ca.10)~~~~~~GGCAATTCAGTCGAGGCGCG
| CGCGATCCATCGGTGCCATA~~~~~~(ca.10)~~~~~~CTAGTACCCAGTACCGGCAA


R1 file stored in: /home/shima/Documents/RWTH/BLS-BiotechSimPraktika/repo-biolabsim/output/present-R1.fastq
R2 file stored in: /home/shima/Documents/RWTH/BLS-BiotechSimPraktika/repo-biolabsim/output/present-R2.fastq


The scaffolds have been generated and are accessible through the `scaffolds` variable or in the two `.fastq` files.

In [9]:
#
# Student section: the estimate below is supplied by the student.
#

# One list per base. All four lists have 11 entries and every column sums to 1
# (presumably per-position base probabilities; confirm against EstimatedSequence).
estimated_sequence = EstimatedSequence({
    'A': [1, 0.2, 0,   0, 0, 0.1, 0,   0.7, 0, 0,   0.25],
    'C': [0, 0.4, 0.2, 0, 1, 0.9, 0.5, 0,   0, 0.3, 0.25],
    'G': [0, 0.4, 0,   0, 0, 0,   0.5, 0.3, 1, 0.1, 0.25],
    'T': [0, 0,   0.8, 1, 0, 0,   0,   0,   0, 0.6, 0.25],
})

# Compute and report the entropy of the estimate.
shannon_entropy = estimated_sequence.calc_shannon_entropy()
print("Shannon Entropy: {}".format(shannon_entropy))


Shannon Entropy: 0.7172367751666382


Now the exercise will evaluate the estimation and return a score.

In [10]:
# Compare the student's estimate against the real genome and print the evaluation.
print_estimation_evaluation(estimated_sequence, genome)

---------------------------------------------------------------------------------------------
[heatmap:A] ···································#2···1·7··2··································
[heatmap:C] ····································42·#95··32··································
[heatmap:G] ····································4····53#12··································
[heatmap:T] ·····································8#·····62··································
---------------------------------------------------------------------------------------------
[consensus]                                    ACTTCCCAGTA
---------------------------------------------------------------------------------------------
 [real-gen] TACCGCGATCCATCGGTGCCATATATGGGTTCGCTAGTACCCAGTACCGGCAATTCAGTCGAGGCGCGTCACTATCTCTA
---------------------------------------------------------------------------------------------
Final Score: 0.0894 (8.9%)


In [11]:
# Random Assembler example: the assembler is given the real genome length as
# the expected size.
rnd_assembler = RandomAssembler(expected_genome_size=len(genome))

# Run the internal step to get the located sequences, then derive an estimate
# from their overlap.
# NOTE(review): the original inline note suggests this two-step form mirrors
# rnd_assembler.apply(scaffolds); confirm before swapping one for the other.
rnd_locseqs = rnd_assembler.apply_internal(scaffolds)
rnd_estseq = estimate_from_overlap(rnd_locseqs)

print_assembly_evaluation(rnd_locseqs, genome)


------------------------------------------------------------------------------------------------
[heatmap:A] ············#··366·2·542··53222·2·1511221221222121134221··133251115···42222·5··5·#·
[heatmap:C] ··········#····6333·5··46111·33713221325331221332363··265313341263··444224·3·55····
[heatmap:G] ·········#···#6····25·4·26156·316232511·31233513131142·126313··1·3524622227·5·555··
[heatmap:T] ···········#··3···65·524211·13·1·42·24421222212221·1256·1·31·22411·82··442·6·5··5·#
------------------------------------------------------------------------------------------------
 [coverage] ·········11112333334445556668888998889988878988976665456766667776665555555432222211
------------------------------------------------------------------------------------------------
  [seq:000]                                                TATGGGTTCGCTAGTACCCA                 ( 30%)
  [seq:001]                                      GTCGAGGCGCGTCACTATCT                           ( 25%)
  [seq:002]            

In [13]:
# Greedy Contig Assembler example, built with its default settings.
gca_assembler = GreedyContigAssembler()

# Run the internal step to get the located sequences, then derive an estimate
# from their overlap.
# NOTE(review): the original inline note suggests this two-step form mirrors
# gca_assembler.apply(scaffolds); confirm before swapping one for the other.
gca_locseqs = gca_assembler.apply_internal(scaffolds)
gca_estseq = estimate_from_overlap(gca_locseqs)

print_assembly_evaluation(gca_locseqs, genome)

---------------------------------------------------------------------------------------------
[heatmap:A] ·#·····#···#········#·#·#··········#··#···#··#·····##···#····#········#··#·····#
[heatmap:C] ··##·#···##··#····##···········#·#·····###····##··#····#···#····#·#··#·#···#·#··
[heatmap:G] ····#·#·······##·#········###···#···#······#····##·······#··#·##·#·#············
[heatmap:T] ········#···#···#····#·#·#···##···#··#······#········##···#·········#···#·#·#·#·
---------------------------------------------------------------------------------------------
 [coverage] ·1123333344445555555655544467666778888887776777667787655566677776665433222222111
---------------------------------------------------------------------------------------------
  [seq:000]                            GGTTCGCTAGTACCCAGTAC                                  (100%)
  [seq:001]                            GGTTCGCTAGTACCCAGTAC                                  (100%)
  [seq:002]                             GTTCGCTAGTACC