In [3]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from custom_random_forest import RandomForestClassifierCustom
from bioinformatics_tools import (run_genscan, GenomicElement,
                                  DNASequence, RNASequence, AminoAcidSequence)
from bio_files_processor import OpenFasta

###  Showcase for custom random forest classifier

In [2]:
X, y = make_classification(n_samples=100000)
random_forest = RandomForestClassifierCustom(max_depth=30, n_estimators=10, 
                                             max_features=2, random_state=42)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [4]:
%%time

random_forest.fit(X_train, y_train, 1)

CPU times: total: 5.61 s
Wall time: 5.6 s


In [5]:
%%time

random_forest.fit(X_train, y_train, 2)

CPU times: total: 7.11 s
Wall time: 3.89 s


In [8]:
%%time

prob_1 = random_forest.predict_proba(X_test, 1)

CPU times: total: 15.6 ms
Wall time: 40.1 ms


In [9]:
%%time

prob_2 = random_forest.predict_proba(X_test, 2)

CPU times: total: 46.9 ms
Wall time: 25.7 ms


In [11]:
sum(prob_1 != prob_2)

array([0, 0])

###  Showcase for OpenFasta

In [3]:
fasta_file = "./data/example_fasta.fasta"
with OpenFasta(fasta_file) as fasta:
    for record in fasta:
        print(record)
        print()

FastaRecord:
id = GTD323452 
description = 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+) 
sequence = ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCTGGTGCTGTG

FastaRecord:
id = GTD678345 
description = 16S_rRNA NODE_80_length_720_cov_1.094737:313-719(+) 
sequence = TTGGCTTCTTAGAGGGACTTTTGATGTTTAATCAAAGGAAGTTTGAGGCAATAACAGGTCTGTGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACACTGAGCCCTTGGGAGTGGTCCATTTGAGCCGGCAACGGCACGTTTGGACTGCAAACTTGGGCAAACTTGGTCATTTAGAGGAAGTAAAAGTCGTAACAAGGT

FastaRecord:
id = GTD174893 
description = 16S_rRNA NODE_1_length_2558431_cov_75.185164:2153860-2155398(+) 
sequence = TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGACGAGTGGGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTT

FastaRecord:
id = GTD906783 
description = 16S_rRNA NODE_1_length_2558431_cov_75.185164:793941-795479(-) 
sequence = TTGAAGAGTTTGATCATGGC

###  Showcase for run_genscan

In [5]:
input_seq = "GTCCCTGGCGGCGGAGATGGCGGCGACAGCGGCGGAGGCTGTGGCCTCTGGCTCTGGAGAGCCCCGGGAGGAGGCTGGAGCCCTCGGCCCCGCCTGGGATGAATCCCAGTTGCGCAGTTATAGCTTCCCGACTAGGCCCA TTCCGCGTCTGAGTCAGAGCGACCCCCGGGCAGAGGAGCTTATTGAGAATGAGGTGGGGGGCGGGGCCGC GTCTAAAGGGAGAGGAGGAAGGTACCCGGTTGAGAGGTCAGAGGTGAATGGTGAAAGAGATGGGAGACTG GCAGGGCTCTAGACTAGGGCCTATGGGAAGATGGAGAGAAAGGCGAGGAGATACGGAACTTAGGGGTTCG TGCTGATTAACAGGAGAAATTATTCTGAAGTTGAGAAGTAGTTGGGATCATGGAGGCCAACCCACCGGCT CCTTGGCATTACTTCATTTGGTGTTTTTCACCTGTTGTTTTCATTTAGGAGCCTGTGGTGCTGACCGACA CAAATCTTGTGTATCCTGCCCTGAAATGGGACCTTGAATACCTGCAAGAGAATATTGGCAATGGAGACTT CTCTGTGTACAGTGCCAGCACCCACAAGTTCTTGTACTATGATGAGAAGAAGATGGCCAATTTCCAGAAC TTTAAGCCGAGGTCCAACAGGGAAGAAATGAAATTTCATGAGTTCGTTGAGAAACTGCAGGATATACAGC AGCGAGGAGGGGAAGAGAGGTAAATTCCGAAAGCTTAATTTTCTGTTGGATTTAGGACACTCATTTTTCT TTCCTATACTTGTTTGAAGGTGTGGCAGAATTGGGTCAATTAGAGATTGGCCTCTCCCATCTCTGGATTC CAGTTGAAGGAGTAGTCTGGTCTTCCAGCCTTTAAGGGGTTTGAGTTCTTAGTGGTTCTTGGTGCTACAG CCAGCTAGTTCTGTAGCTTGAGGGATGAGGGAATGATTAAACTTGAGCTATGAATCTCTGGGTTCCCATC TTCTTCCTCATAAAGAGTTAGTTGCCTAGAAGGGTCAGGCAGGTGAATGGAAGGAGTAATATCGTCTGTG TGAAATCAGTGATGAGGCTGAAGGCATAAGCCTTGTTCAAAGTGAGAAGCCAAGGCCGGCGGGGGTGGCT CATGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGTGGGAGGATTGCTTGAGTCCAGGAGTTCAAGACCA GCCTGGGAAACGTGGTGAGAGCTAGTCTCTGCAAAATTAAAAACTAAAACAAAAAACCATGGTGAGAGGT CATTACTTGACTCTGCTTGCTTCAGTTTTGGAGTGCAGTTCTAGGGTTGGGAGATATTCCGATTTTCAAG AGAAGTTCAAAATCAGGATTTTTATTTAAAATCTTTTGACTTTAAATGGAGCAACTAAATCAGATTTAAA GAAAAAAAGTTGCAGGGGGCCAAACAAAAATGTTTGTCACTGGATTTAGCTCTGAAAATGCCTGTTTGTA GCCATTGTCCTTAAGTATTGTGATGACTGAGGATAGTTCAACCTGAAAAGTTGTATAGTTTGTTGTGATT CTGCAGTAAAGAAGGTTAGGATTCTCTAGCTTAGGGAAGATCAGATTAAATTTGTTTTTCCTATCACAAC AGCAACACAAGACACACAAGACAGTGATTCTTCCTCTCTGCTTTTATTCCTGCTTATGAGGACAGCCTCT TATGTAGGTCCGAGACAGTATTCAGTCCCACACTATAACACTGCCTCTGGGGAATTGCTTTCGCTCAGTA CTTCACGGTGAATTACCCTCTCGCATTTGGTGTCCAACCCTGGGTTTCTCTCTAGTCTCTTGCTTCACTC ACTCCTGTTCCTGACTCCTTATCAGCAGTCTGTGGCTAAGTGAATGGATTTGAAATATCTTTTTTTCTTC TTCTTAACACTTAGCAGTTGTCATAATGATGGGATTTTTGACCAGAAGAGTTTCCAAGATATGAACTGTG GTCGATCTGTCCATGATGTCTTAATATATTGGTTCAGTTATCTCATGCATTAATTTTCTAAAATTTAACA CAGTGTGTGTGTGCACACTTCCATATACACACACCCAACATTGAGAGAAAGATTGATTTAAATGATGCTG TTGGAGTGTGTAATTCCATTGAATGAACATATACCCAAGAATAAGCATAAGATATGTGAATCTTGTGGTT CTTCAGTCGCTTGAAGTTTCTTGTGTCCCATAGTGAGAGCATTTTGCATAAGAATGAGTGTAATCTGTAC TTTATGGGTGATTAAAGAGTTTTCAAGGAAAATCTTTAATTGTTTGGAAGTTTTACCCTATCAGCTGACT TTAGCACCGAGCAGCTTCACCAGTCAGCTTCACTTCAACCTGATTGGACAGGTTGTGGCGACTATGGGGT GAAATGTATTAGGATCCTCTGAGCTATTTTGAGCATTAACCATTCTGGTGTAATCATAATATTGTTTAGA TTCATATATGGGCAGTATCACTGACTCTTTCAAAGTTACTTGCATATTTTTTGGGTGAATATATAATTAT TATCTTTGGGGAGCGTTGCATTGAGTCTTGAGGATTTCATGTTCCTACAAAGGTAGTGTTTCCAACTCAT CAAGATTTCCTATTAAGGCTGGGCATGGTAGCTCACACCTGTAATCTCAGCACTTCGGGAGGCAGATCAC CTGAGGTCAGGAGTTCGAGACCAGCGTGGCTAACGTGGTGAAACCCCGTCTCTACTAAAAATACAAAAAT TAGCTGGGCATGGTGGCACGTGCCTGTAGTCTCAGCTACTTGGGAGACTGAGGCAGGAGAATTGCTTGAA CTCAGGAGGCAGAGGTTGCAGTGAGCCAAGATTGCGCCACTGCACTCCAGCCTGGGTGACAGTGAGACTC CGTCTCAAAAAAAAAAAAAAAAAAGATTTCTGATTAACATTGCCTTAAACATTGTGATTCTAGGCAGTGA GATGAGATGCCTTTGAACCTTTCTTTTTTTTTTTTTTTTTTGAGACAGGGTCTTGCTCTGTCACCCAGGC TGGAGTGCAGTGGCATGATCTCAGCTTACTGCAACCTTCACCTCCCAGGCTCAAGCGATCCTCCCACGTC AGCCTGTCAAGTAGCAGGGACCACAGGCATGTGCTGCCACCACCATGCCCAGCTAATTTTGTGTATTTTT AATAACCAATTTATTTATTTAAATACATAAGGTTTTGCCATGTTGCCCAGGCTGGTCTCGAACTCCTGAG CTTAGGCAATCCACCTCCCTTGGCCTCTCCAAGTGCTGGGATTACAGGCGTGAGACACCGCACCTGGCCT CTTTGAGCCTTTTGAAATAATGATCTCAGGACCAGGACAGAGATGCTTCTGTCTTTTTTTTAGGTTTGCA CACTTATTAAATGTCTCTTGTGGGCTTGTTCAAGTGTGTCTGGGTTTTCATATCTCTTAAGTAACTTATG ACCTTCCTCAGGTATGTTAGGAGCCTGAGGCCTTTACCTAATAAGCTATCTTCCTTACTTTAGCTGTAGA AGGCTGCAACAGGATAGGTGGCTGAATCAGTTAAGATAAACTAGGCTTCAAGCCAGAGCTCGGGCTAGAA ACAACAGAGGCTTATTTCTCACTTACTACATGTTGATCATGGGTAATGTCATATTGTTCTTTACATCATC TTACTTGCTCTGGCACCCAGATTGCCCCAGCAGCTACTATCTAGAACTTTGGAGTTGCTGTGGCAGAAGG AAAGAGATGTGGTAGTGGTAACTTGTTCTTAAATTACAGTTGGAAACAACACAAGTCGCTTCTGTGGACA TTTCCTTGGCCAGAGCAGCGTAAATAAGCAAGCCTGGCTTCAAGGGGATGTGAAAGAAAACTCGCACACT GAGGGAACCAGCTCTTGGTGAACAGTAATACAATTTACCACAGTGGTCATCAAGGGCCAACGGAGATGTC TCAGGCTCTGCGAGTCTCTGGAGTGAGACTGATTCATCATTCTAGGGTCTGTGGATAGGCTTCTAGGAGT CTGAAATTGTGTGAAAGTTCTGGGTGTATGTGCATTTTTATGGGGAACCTCCGTGGATTCTTTTATTTAT TTATTTATGTATTTATATTTTTTGAGATGGTGTCTTGCTCTTGTCACCCAGGCTGGAGTGCAATGGCACG ATCTCAGCTCACTGCAACCTCTGCCTCCCAGGTTCAAGTGATTCTCCTGCCTCAGCCTCCTGAGTAGCTG GGACTTCTTCTCCAGGTTCTTTTTTTTTTTTTTAATTTTATTTTTCATCTCACCCTCCTACTCTTTGGAT TCTTAAAGGAGTCATTCTGTGACCCAAAAAAGGTTAAGCATCACTTCTTTAAAGAAAAAGTTTGGGGGCC AGATGCAGTGTCTCACACCTGTAATCCCAGCACTTTGGGAGGCAGAAGCAGGAGGATTGCTTGAGCCCAA GAGTTTAAGACCAACCTGGACAACATATTGAGACCTTGTTTCTTTAAACAAAACAAAGAAAAACCAGTAA ATTAAAAAATTTGGTGTTGAACATAACTCAATGAAGGATAAGTAAGAATTCAGTTGTATGAGAATCCATG TTGGCTAAAATTTGCCTGACTACATCTGTTCTCTGTGGGAGATGCTGGTATGGATTTGGGCTTGGATTTT CAGATGTGGAGAGGCCATGCCAGGGGCTGTGTGTGCATGCACTAATAGGATTTTCTTCTTGGGGAACATA GGTTGTATCTGCAGCAAACGCTCAATGACACTGTGGGCAGGAAGATTGTCATGGACTTCTTAGGTTTTAA CTGGAACTGGATTAATAAGCAACAGGGAAAGCGTGGCTGGGGGCAGCTTACCTCTAACCTGCTGCTCATT GGCATGGAAGGTAAGAAATCTTTCAAAAGCAGCATGGCTTGGAGTCATATGAACCTGATTTCTAATCCTG TCTTCACCGCTTATTAGCTCCATGACTTTGAACAAGCCACTTCACTTTCTTAAGCCTCAGCCTACTCAGC TCTAAAAGAAACATAATAGGGGCTGGATGCCGTGGCTCACGCCTGTAATCCCAGCAGTTTGGGAGGCCAA GGTGGGTGGATGATGAGGTCAGGAGATGGAGACCATCCCGGCCAACATCATGAAACCCCGTCTCTACTAA AATACAAAGAAAACAAATCAGCTGGGCATGGTGGTGCGTGCCTGTAGTCCCAGCTACTCGGGAGGCTGAG AGAGGGGAATTGCTTGAACCCAGGCGGTGGAGGTTGCAGTGAGCCAAGATCACGCCACTGCACTCCAGCC TGGCGACAGAGCAAGACTCTGTCTCAAAAGAAAGAAAAAGAAACATAATAGGGCCAGGCATGGTAGCTCA TATTTGTAATCCCAGCACTTTGGGAGGCCAAGGCAGGAGGATTGCTTGAAGCCAGAAGTTTGAGGCTACA GTGAACTATGATTGCACCACTGCACTGCAGCCTGGGCAACAGAGTGAGATCCTATTGCTTAAAAAAGGAA AAATGAAAAGAAATGAAACATGATAATGTCTTGTAAATAGTTGTGAAGTTGTGAAGATTAAATGAAATAA TGTATGTCGAGTCTGTTTTTTTTTGTTGTTGTTGTTTTGTTTTTTTGAGACAGAGTCTCACTCTGTTGCC TAGGCTGGAGTGCAGTTGTGTGGTCTCGGCTCACTGCAACCTCCGCCTCCCGTGTTCAAGTGATTCTCCT GCTTTAGCTCCCAAGTAGCTGACACTACAGGCGTGTGCCACCATACCCGGCTTATGTCGAGTCTTTAAGC TAGTTTCGTGTATTCTAAAAAGTTCCCTTTTCCCTTCCTCAGCTGTTATGTGAGTGTGTATACTGAGATA TTAGGGACCTCTCTGAAATACAGACATGTGATGCTTAATGTCAGGGATACGTTCTGAGAAATGTGTTGTT AGGTGACTTCATCATTGTGCAAACACCATAGAGTATAGTTACACAAACCTAGATTGTGTAGCCTTCTATA TGCCTAGGCTATATGGTGTAGCCTCTGACTAAACCATGTACTACATGGTGTTAGCATATTACTGTGCTGA ATATTATAGTCAATTCTAACACAATGGTAAGTATTTGTGTATCTAAACATACCTAAATATAGAAAAGGTA TAGTAAAAGTACGGTATCAGATACTGCAGGAAAAAGGTGACAAAAGAAAAAAGTATATAAAGGTTAAAAA ATGGTAGTTTGGGAATTTTTTTATTGCCAAAAAAAAAAAAGATAAAAAACGGTATACCTGTATAGGGCTG CTCCATCATAATCTTATGGGAGCACTATCGTAGATGCAATCTGTTGTCGACTAAAATGTCATCATGTGGC ACATGACTGTACCACTTTTCTCTACTAGATCTGTCTATATTCAGACAATTTTTGTCCCTGTGAAAGAGGA GAAACTTATTTTTTTAGTGTAAATTCAACTTTATTGTTTTTATTTATTTATTTATTTTTGAGACAGAGTC TCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTACAAGCTCCGCCTCCCAGGTTC ACGCCATTCTCCCACCTCAGCCTCCCAAGTAGCTGGGACTACGGGCGCCCGCCACCACACCTGGCTAATT TTTTTTGTACTTTTAGTAGAGATGGGGTTTCACCGTGTTAGCCAGGATGGTCTCGATATCCTGACCTCAT GATCCACCTGCCCCGGCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCGCGCCTGGCAAATTCAAC TTTATCTTTTAAATAGATAATACATTCATGGAGTTCGAATTCAAAAGGTGTAAATAGGTACATAGTGATA ACTACCCCTGTCTCTTAGCCTCCCAATTTCCTCCCTGGAGGTACTGCATATTATTAGTTGCTTGTGTTTT CTGCTAGAGAAATTTATGCATACACTAATAAATATGTGAATGTGTTTTTTTTTTTTTTTTTTTTTTTCTT TTGAATGAAAGGAGAGGCCCTTAAACTCCACTCAGGAGAATGAGAGCTTTTTATGAGGCAGGGTCATTGT CTCATTCTTATTTTTATTTCTCGCAGTAGCTCAATGCCAGGCCCATGGTAGGTACTTAGTAAATGGTGAA TAACCTGGGTTGAACAAGGAAACAGAGAAGTTATTTATTGTCATTCACCACTTAGCTCCTTGGTTTCTTC TTGTTTGATTGTCTGTTCTTCAGATTTGGATCCTTTGCTGCTGGTACTAATAAAGATTTCCGGCCATGTG GCTTGTGAATTGATACCACCTGCCTCTTTTTTTGTTTTTGTTTTTGTTTTTTATTGAGACAGTGTCTCAC TCTGTCACCCATGCTGGTGTGCAGTGGCATGATCAGGGCTCACTGCAGCCTTGACCTCCCAGTCTCAAAC GATCCTCCCACCTCACCCTTCCAAGTAGCTAGGACTACAGGCGTAAGCCACCATGCTTGGCTAATTTTTA AAGTATTTTTCTGTAGAGATGGGGGTCTCCCTATGTTGCCCGGACTGGTCTCGAACTCCTGGCCCCCTCT CACCTTGGCCCCCTAAGTGATCCTCTCACCTTGGCCCCCTAAAGTGTTGGGATTACAGACGTGAGCCACT GTGGCTAGTCTTACCTGCCCCTTTGAAACCAACTCTTGGTTGACCTGCCAAGAAAGAGCCAGTTCTGGTT TACTTGACTCTTTTTTTTGAGACAGAGTCTCGCTCTGTCGCCCAGACTGGAGTGCAATGGCGTGATCTCA GCTCACTGCAAGCTCTGCCTTCCGGTTCACACCATTCTTCTGCCTCAGCCTCCCAAGTAGCTGGGACTGT AGGCGCCCGCCACCACGCCCGGCTAATTGTTTTATAGTTATAATAGAGACGGAGTTTCACCGTGTTAGCC AGGATGGTCTCAATCTCCTGACCTCATGATCCCCCGCCTTGGCCTCCCAAAGTGCTGGCATTACAGGCGT GAGCCGCTGCGCCCGGCAGTTTACTTGACTCTTATGTCTTTCTGTTGCCTGCGTTTCCTGTGGTAGGGTG CATGGAAGTGGATCCCTCAGGTAGAGATGAGAGAGCAAGTCTGCTTTGAGGATGCATGTGGACTCTCATC ACTTAAGAAAGCATCTGTCTTGGCATACCCAGACAGAGCTGGGGAAGATTGTCTAAATTCCATGCCCTGC AAAAGTGTGTGTTCAACTCATGGACTGTGCAAAGGCCAAACAACACACGTGGTCCTAGTTAGAAAGGAGT AGATGGAACATACACTAGGGCACAACTAATTCAAGGTGGCATTTAAAATAATGTGCCATATTTAAATTAC ACAGCACTCTCTTCAGTCATCTGATATTCTTTCCTCCAATGAAAATCACTTGCACCAGTGAGCCCCAATG TTGCTTTAAGAAGTGTTGTCTTGGCTAGATGTGGTGGCTCACGCCTGTAATGCTAACACTTTGGGAGGCT GAGGTGGACAGGTCACTTGAGCCCAGAAGTTGGAGACCAGCCTGGGCAACGTGGTGAAACCCCATCTCTT AAAAAAACAAGTATTGCATTCGTTAGGTGTATTGAAGTGGGAGGAAAGGTTGAGAATCTTTGAGTTCCAC AGAGCTACCCAAAATAAATACAGAGCTACATAATCATACAACTTAGAATGACTGGTTAACTCCTCATGTA TATGTTGTTGATACCAGTCATTTCTGTTTTGCTTTCCTTGAATTCCTGCCGTAGTCAAGAAGAGTACACG GTGTACACAGTTGATTACCCTACCTGATTTTTGAGGATCATTTTGATGTGCTCTTGCTTCCAGTAAGGGC AGCAGCCTTCATTTAGGGAAGTTGACTTAGCTGATAGTGGACCCTTAACCAGTGATGTTTGCTAGAGTTA TTACCATACAAGCTTAACACCTAATTTTCTGAGAGTTGTAATTGGAATTTCCCTGATTCTGCCTAAATTT TGATTTGGAACTTTTAGCTGGGAGGGCAGGCTGGGTTTCTCTTTAGCACTGGGTCCTGTATTCCAGCTTA TGCAATAGAACCACTGCTCTGCATAAGAAAATGCCTTTCTTCTTACAGGAAATGTGACACCTGCTCACTA TGATGAGCAGCAGAACTTTTTTGCTCAGATAAAAGGTTACAAACGATGCATCTTATTCCCTCCGGATCAG TTCGAGTGCCTCTACCCATACCCTGTTCATCACCCATGTGACAGACAGAGCCAGGTGAGCTTGTGTGGTC TGAGAAGGGTATAGAACTCTAGATTCTAGTAATGCCTAGGCTGGAATGTCTTTCCCCTTCCCCGTAGTGA TATGCCAGGTTCGGATTACAGGCTGTTCTTGCCTTAACACACCAAGAGGGTAATTTCATTGTAGCTCACA TATTCCAGGCTCCCTGGAGTGGCCACCAGATGTTTGTAAGTATAACTAGCCCCACAGAGCTAATTCTCAG GACAGACACTGTCATTGATGCTGATGGACTCTGGTTACTCTGTTTGATTGGCTTATAAGTTTATGGAACA AAATGTGTATGTCTCCATTGGATTTAATCATTGTTTGCTTTTGTCATGGATGCTATTTCTTTTTGATAAG AAGGGTGAATAAAACTGAGCTATGTTGTTTAATGTATTTCCTAGATCTAAATGAGGTATGTGGTTAACTG TGGCCAGCCAAAGATATCCTTTCCTTTCTTCAGAAAGCTGTCTAGCTACTTCATGTGGGAGTCTTTGCTG GTTAGAATATGTTTTTGGGAAAAGTTTCTGAAGGTTTAGGTGGAGGATAATTCCTAATTCCTCATATCCT CAGCTGTGGAGTAGGTCGTCTGACCTCAGGAGCACTGAAGCCTAGGATTCTTTATCACTTAACATTCAAA TGTGGACTGTCTTATATTGTTCTCTCTCAGGGTTTAATGTGCATTATATCTTCTGCTGGATATAAATTCC AAATACTTTTTTTAAGGCATCGTTTTTTTTTTGTTTGTTTGTTTGTTTTTACTGCCAAACTGGCATATAG TAGTTTTACTGGTGAAGATTTGGTGATTGATATTTTTTTTCTTTCTCTTGAACCTCTTAGGTGGACTTTG ACAATCCCGACTACGAGAGGTTCCCTAATTTCCAAAATGTGGTTGGTTACGAAACAGTGGTTGGCCCTGG TGATGTTCTTTACATCCCAATGTACTGGTGAGAAGGGGGCTAGGGCTGGGGGCTTTTTGGGAGCTTTCTT TTCCATTCCCTGGAAAGCCTTGTCTGTGTCGCTTCTGAGTTATGGCATGATTGGTTTTATGTGGTCTATA GGAAGGTTGGCATATCCAGACATGAGCGTACTGAACCTGTGGGGGATAGTGGCGGGAATGATAGCCCTGT CTCTAGGACAGGATGACGGTTCTTCAAGTCCAAAGAGATTCCGGAGGCTTGAGCCTAGAAAAACTGTTCT TGGCTCTTTGGTATTTCGAGACTGAACACCAGTACCTCTAACCCCCAAGTGTGTGTTTTTTTCACAGGAT GAGTGTAAAGATATTGTCTTCACTATTTTCCGTAACTGGACTCCACCTAGTTTTTCAGCCCAACCCTGCC ACCCCCCCGCACTTCGCCCCTCCGCCCCCGCCAAAAGTATCAGTAGTAAACAGGAGCCTGTTTGTTTTCT TGCAGGTGGCATCACATAGAGTCATTACTAAATGGGGGGATTACCATCACTGTGAACTTCTGGTATAAGG TGAATATGGTTTGCTTTTTTTGTTTTTTCCAGATGGAACTGGCTCTTGGGTGAGGTAGGTATAATAAATG ATTTTGGATGGGATGGTTGACTATCCCTGGAAGTGATTGTCAGGTGTCTGGGGAGATGAAAGGGACAGGA TTGAATTGGGTTTCTTTTTTTTAAATATATTTTTTGAGACAGAGTCTTGCTCTATTCCCCAGGCTGGAGT GCAGTGGCGTGATCTCGGCTCACTGCAACTTCTGCCTCCCGGGTTCAAGTGATTCTCCTGCCTCAGCCTC CCGAGTAGTTGGGATTATAGGCACACACCACCATGCCTGGCTAATTTTTGTATCTTTAGTAGAGATGGGG TTTCACCATGTTGGCCAGGCTGGTCTCAAACTCCTGACCTCAAGTGATCCACCCGCCTCGGCCTCCCAAA GTGCTGGGATTATAGGCCTGAGTCACCGTGCCCGGCAAGAATTGGGTTTCTTAGAAGGGAGGATGGTACT AAGAACAGTAGAGTGACACTGACGCCTGCTGCTAAAGGCATTTCCTGAGCTACTGCTTCCTTTGTAGGGG GCTCCCACCCCTAAGAGAATTGAATATCCTCTCAAAGCTCATCAGAAAGTGGCCATAATGAGAAACATTG AGAAGATGCTTGGAGAGGCCTTGGGGAACCCACAAGAGGTAGGTGACTGCCCCAAGGTGGCTCAGTGGGT GGGTTGACCAAGGAAAGTCAGGATCAGAGCGACATTCTTTACAGCTTCTGTGTTTCAGTGTCTGGTGTCC TCTCTCTCCATAGAGGTTATGGGTATGTTCAGAAAAATTCTTGTTTTCTGGTGATAGCTGGGCTTGACTG GCAGATCAATTATAATTAGTTCCATGCTAGTAGTAATGTTCCCCATGGGTATTATGTACCATCTGGGGAC CCCCTGAGGGGATTCTTCTGGAACAGGAAGCGAGCACTAACACATCTCCTTGCACTTTCCCAGCTGTGCC AGGGCTGCTGTAGGTAATGTTGAGAGACCTGCTTCCCATGCAGGTGTGCCAGTGGCCCATTGGGTAAATG GGGGAGGTTTGTATTTGTAGGGTTCTTGCTTGCCTCTTGGAGGTAACCTCAGCATCCACAGACACCTGTG GTGTTGGGAGATAGCATCAGCCCCAGTTGCAGATGATCATTTTTTGAGGATATGGAGGCAGTTAACATTC TGGAGAAACATATAGACACTCTTGGGTGTTCGGGGATTATTAGCTTCTCTGACCATGTCTGTCCGTTTGG GGCAGATTACGGTAAGAGACGGTGTTGTGCGAGAGAAGAATAATAGTACAAGATGGACCCCTGAGGGAAA GACTATGTCCCAGAATACTGGGTTGACCTTCAGGAAGATAGGTGATTAGAGGATTCACAAGATTGTATGA AAACATTGTCTCTGTCCTTGGCTGGACATCTGATGTCTCCAGACACACCCTGTCCAATTCCAGGGCCAGG GAACAGCCAGTTCTGATTGGCTCCTCATGTTTCTTTACAGGTGGGGCCCTTGTTGAACACAATGATCAAG GGCCGATACAACTAGCCTGCCAGGGGTCAAGGCCTCCTGCCAGGTGACTGCTATCCCGTCCACACCGCTT CATTGATGAGGACAGGAGACTCCAAGCGCTAGTATTGCACGCTGCACTTAATGGACTGGACTCTTGCCAT GGCCCAGGAGTCAGGTGTTTGGAGCGAGGCAGGGCAGTTGGCACTCCACTCCTATTTGGAGGGACTTCAT ACCCTTGCCTCTTGTGCCCCAGCACCTTCTCTCTCTGCCCCCCGCCTAAAGTCCTGCATTCAGTGTGTGG AGTCCCAGCTTTTGGTTGTCATCATGTCTGTGTGTATGTTAGTCTGTCAACTTCGGAATGTGTGCGTGTG TGTGCATGCACACGCATGTATGTATCTGTTCCCTGTTCCTTCTGGGTCAGGCTGTCACTTCCGGCTCTCA GCCCTATCTCCTGCAACCTCAGTGCCTCAGCCTGAGAGAGAGATGAGATGCTCTTGGACTCCCCACTGCA TCTGGGCTGCAGGGCCAGAGCTAGTCTGACCATTAGGTCAGTCTGCCTCCTGACAGTTTTTGCGTAGTCA AGCTCTAGGCGGTATGGGAATGGCTACCGGGACTCTAATGGGGTGAAAGAGAGGGGAGGCTTGCCTTTGA GAGCCTATATAGCCTTCCTGTGAGAGAGGATTAGATAGGGTTCCAACTGGGCCTACAAGCTCAAGCCATA CATAAAAGGACCTTGGGACATAAGAACCAATGATTGTGCATAAGTTCTAAATTAGAGACACATATAGTTT CTCTCTTTCAGCACCAGCTCTTGCCCCTATGCTGGGTACCAAGGGAGTTCTCCTAGCTGTGGCTTCTCTA GGTTCTAGGGGTGCAAGCCTCTGTGTGTTTGTTTGTGTGTGTCTGTGTGTGCGTATCCACACTAGGGGTG CAAGCCTCTGGGTGTGTGTGTGTGTGTGCGTGCGTGTGTGTGTGTGTGTCCGTGTGTGTGTGTGTGTGTG TCCACACTGGCCAGCCTCCCTACTTACCAAGGTTCTCCACTGCTTACCTTTTCCAGTGGGACAGTACAGT GTGAGCCCCCGGGAAGTACTGCCTGACCTATCCTAAGCTTTTACACTTGGATTTTAGCCATCATATGTTG GCCAGGTCTCACTGCAGCCTGCCCGAGGCTAACTGGCTAGAGCCTCCAGGCCCTATGATGCTCCCTGCCC AGGCCATATCCTTTATTCCTGCTGAGCTTCCTGGCTGAATAGATGAAATGGGGTCAAGCCCAGGCAGCTC ATTCACTACCTGTGATCCACCTCAGGGCACGGGCAAACACATAGGCTTGCGTCTTAAAGCCAGCTCCTCT GCCAGACCCCGTTGTAATGTGCCACAACACCCTCAATAGTCAGGGCAACTGGTGGAGCATGGAAGTCGAA TTTCCTTTTCTGTTAGGAGCTACTCCTGGGAACCCCTCTCAGGGCTGCAGCTTACAGGTGGGCAGCTGTG ATTGCACAACTTGAAGGGCCATCATTCACATCTATTCAGTGGGAGTGGGGTCCCTGGGATTGGGCAGTGT GGTGGCCCTGTGTCTCCTCACCTCTGCTCCTGTCTTCATCACCTTCTCTCTGGAAGGGAAGAGGAGTTGG AAGGTCTCTGGTTTTCTTTTCTTTTTTTTTTTTTTTGCCAAAGGTTTACTTCCAGCATCTGAGCTCTGGC TCTCACCCCTGAAGCTCAGTTATAGTGCACTGATGAACTGAGAGGATGCGTGTGGATGTGTGTGCATGCC TGAGTGCGTTTTTTGGGGAGGGGTGTTTATTTTTAGTACCCCATTCTGGGGTTCTCTGATGCAGTGTGGA TGTGAAGATATGGTACCTTCTCAAGTGTAGCTCTTTCAAATATAGTCAATGCTGGGAAATGTGATTGCAG TGATCTCTATCTCTCCACTTCTTTTGGGAAAGAGGAGCACAGGAGCGGAGGAAAAACTTGGCCACAGTCA CACTTGGAAAGATAGTAGATTATTTTCGTTCTCCTCAGCAGGTCTGCTGTATTCCTCGCTCAGCGCTCAA AGATGTGGGTGATGTGTCAAGGAGGCATGCACTCAGCTGGTCCTTGCTGAGTTATTTGCTGACACAGGTC TCACTAAGGTGGCTGAGGGGTGGGAGGGAGAAGGCTAATCTTGGAACCTTCACAGGATTGGCCTTGATCC TTGGGTAACACAAGAGTGATTCCTGATTTTTAGACACCTAATATGTGCCTAGTGCTTTTCTCTTTGTTCC TCACTATCCTGCAAGGTAGGTATTCTCACTTACAGATGAATAAATGGGCTCAGAGAGATTAGGTGATTAG TGACAATCAGTTGTAACTTAGCACTGTTAGGCTGCAAACCCCTTGCTCCTTCCTTTCCACCAAACTATGT TGATTTTCCTTCCTTACCTCCCTCCTTGTACCACAGGTTCTGTCCTCAAGATATCACCCCACTGGGCAGT ATCTGAAGGCAGACCCAGCCCAAGATGGTGCAGAATATACAGCTCAGGGTGAAAGTTCTTCTGGGCTCCT GCTAACAAAGTCTTGGAAAAACTGCCTGGGAAGCCATATTATGAATTAGTGCTAGAAGAAGCTGTGGGGG GAACTGAGTTCTGTGTCTAATTGCATTACTTATGTGTCCTTGAATAGGACCCATAACTCCTCCCGGCATG CTTTTGCATCTATAGAACTGAAGCTGTAGTTCTAGATGGCACTAAGGGTCTTGGTTCATATTCAGCTGCT GCTTCCCCCCAAGGAAAAGATGAGGTTTTTCCCCTTCGATTACCTTTTGAGGGATGTTTAGGCTCAAATT CTGCTTGCCACAGAGGAAACATTTCTGCTGTAGTTGTTTACCTCCATCCTAGCTGGCAGTGTGGGACACA CACATACAGCAAGTGAAGGTGGGGGTTTGGGCCTGTATAGAGGTCAGCTACGCTGCAGGCTCATCTATTC ACACCAGCAGGTACAGAGAAAACTAGTGTTTTGTAAAGTCAAATATTTGTTGGGGGTTGGAGTTCTGGGG TTGGAGGAGGGGCTGTTCTGGGCATCAGTTGAGCAGATGCCCAGGATGCCTGGGGGAGACCAGCTTCCCC TACAAATCAGAGCTCTAAATTACAAGGTTTTTACCAACGTGAACAGTTGGGGGAAGTCGTCTGCTCTCAT TTGCGTAATGGTTTCTGTCACTGGTGATTAGACACAGGATGAAGGAAAAGAAATTTGACAATTAGGAATG AGCGATCATTAATCTGAATCTGTTAGGAGACAGAGAGGGGAAGGATCCTTATCAGTGGGCCAGCCCAGTA TGGGGAGACACTCCCTCCCTCTGCCCCCAGAGACTCCTGGGCTACATCCTCTTTCAGTATTGCCACAAGT GGGCAGAGCTTGTATTTCTTAACAAAGATTAGGGACCCAGTTGCCAGCCTGAGATGGATATAGGAACAGA CATCTTTGGGCATGAGCCAACAAAGATAGAAATAGATGAGTGTCAGGTACAAGCTATAAGGCAAAGAATA ACAGTGGAAAAGTTTTGACATGTGTTTGCCATTTGTGGAAAATGTTTTTGTAATAGTCAACACCCCTCAT AGCCCACCTGTGACAAGTACTTGTTACATTCCAGCACTGTAGGGCGTGAGGAAAATCTGGTCCTTATCAA ATCCCAGGAGCTTCTGCTTAGTTGGGGAAGAAATTACATGAAGCAACCAGAGGTTATAAGGCCACACTTG TATATCGTGCACCCTGTGTGGACAAGATTAGGGACTGTTGAGAGAGGAGGAAACCAGTAGAGAGCAAAGC TCTACCCAGGCTCCTTGTAAGCCTCTGGGCTCCCCCGAGAGGGCCTCGCTACTCTACGCTTCCTAGCAAC GTTGATGTCCCCACAACCCCACATCAGTGCAGCTGTGGCTGTGTGGAGGGGCTCTGAGGCCTCTGAGGCC AGATGTGTAAACAGTGCTGAGGTTCAGTAATAGGATGAGTCTTCAGGTGTGGAGCAGCCCACCTTGGCTC TTCCCATGTCTCTGTGTTACTTCTCATATTCTGCTGTCCTTTCAAACTTCAAGGACAGTATTAATTTATA CTAGTATTTCTTCCTCAGTTTTGTGACTTGAATGCAGTGAGTGCCTTAGAGGATCCAAGGATGAAGGAAT GCGGGTTGGTGGTTCTCTCTTTCAGAATGGGAACTTCCCAAAAATGGGGCTGCGTCTCGCCTCTCAGTAG GTTCCCTACCTCTGGGTCTTCCACCCTTCAAAATCTGGTACAGAATTTAGCAGGGGCTGCAGGGAATGAC CCTCAGGGACCAGTTTCACCCAGATGGGGTAGATGATGAGGTATACCTAGAAGTAGCTCAGGGCTCCAGA GTCTGGCCTTTCAGCTGGTGTCTTACTCCCCTTTGAGACCTTCAGCTGCTTGTCTTCCACTGCCTGCATA CTTCTCTGATGTCTTCTCAAACTGTTGGCTCTAAAGGGATGTTCCAGCCTCCCTCTCCAGAGCTGCCTGC AGTTTCCCTTATTCAGCTGTCTCCTGCCTAGTCTCTCCAGCGGCCTCCTTACCAAGACCCTTTTTCCTAA ACAGTTGTCCTTCCCTCAGAAACTCCATGGCTGCAAGTGTGAATAGACTTGTCTTTAGGATCCCTCTCTC TGGTCATCCCTGATCATCTTAGGGCCGTCTCCCTTTTCTGATTTTTGACAGCTCTGTTTATTTTGCCCAT GGCTAGTCCTCTTGTTTTGTGTCTGCTGTGTGTTCTTGCCCTTCCATTGTCAGTTCTGTACATCCTGGTT CAGCTTGTCTCACTGTGGTAGCCCCTTCTGCTTGTCTTCAGCCATCTTGGTTTAATCCTCATAGGTCATG GATGAATGGAGCCTTGCTGAGCACTTTGAGGGCATTTCTGTTGGGCACAACCTCAGAAACGGGAAGTCTT GAGGCCTGGGGGCGTGTGGTCAGTGAGGCCTCCAGAGGTGGGTTAGGGGTGGGCACCTACAAGGGCCAGT TGAGGGGAGAGGTGCAGGGATTGCTGAGGAGACTTTGTGAGAATGATGTCAGGCCCGGGAAGCTGGTGTG GAAGAACTGAGCTGTCAGAAGGAGATACCAATATCCCCTTTGCAGGGTGGTGATGGGCATCAAACATGAC GTGGAACATGTCCCCTTCATGGAGGGTTTCCATCTAAATGCATCTGTGTGCCTTAGGGATCAGTAGTTCA GGGGTGGTGGACTTTTTTGAGTGTCTCCTAATTTCTCTGAGGTGTCTTTGGCTCTGTGTCACCAAATGTG CCTGACCTCTGCATCCCAGTGGTTGCTGAAAGGGAAACAGTATATTGCGTAGTCAGATGTCAGGAGTAGA GATCTTAATGAGAAAGGCCCTGGCAGCTGGAGAGATGAGAGGACCTAGTGGTCAGCTTATACCCAGACTA TCTCTCCACTCCTCCAAGAGTTTGAGTCCCATAGCTGATGAAAGAATTTACATATGGGCTCCCCCCTGCT TTCCCACTGAAAATCTATTCTCATATTTTGGGAAGGAGCATATGTCCTTTAACTTTCAATGGTAAACAAT CTAGGATAGTGTTTAAAGGCATGGATTTTGGAGTCACATATAGTTGGTTTGAATTTCTAGCTTCACTACT TACCAGCTGGGTTTGACCTTGGTTGATTTACTTAACCTCTCTGAGCCTCAGTTTCCTCAACTATAATAAC ATGGGTGGAGGGAGGCATGTTGAAAATATCTACCTCAGGGTTGTTGGAATAACTGAAACAATGTTTGTAA AGCTTTAGCACAGTGCCTGGCAAGCACTTAATAAATGGCTGTGGTGGTGGTGGTTATATTTATTATGTGA TTTATTCAGAAACTTGGTTTCAGTGATCTCTAAACAGATTTCCAGTGGTTCTTTCTCTCTCAAGAAAAGC ATAGGGAAAAACGAGAAACACCTTAGATAAACCATATTGAGTTGAGGGTAGACAGAAGAGTCTAGTACCT AAACTCACCCTAGAGGAAGAGAAACAGGAAGTGTCTTATGCCTCTTCTGCCTGGGCAGCCAGACTTTACT GCTGTACTACCAGCCCAGGGCCTTGGGTGAGAAGGTTTCAGGTCTGAGCTCCCCTTGCACTGTGTGAGAG GCCGTGGAGCCGAATGAGTCCTTATTATACCTGGAGAAGCTGTTGTCTCCTCACACCATGAAATGTAATA GCAGTCAAATCAGGCCCCTCTTGATTCAATTTTAGTCCATGCCTTTCCATGTATTTAAATTATATAAGTG TTGGCCAGGTGTGGTGGCTCACACCTGTAATCCCAGCACTTCGGGAGGCTGAGGCAGGAGGATCACTTGA GCCCAGGAGTTCGAACCAACCTTGGGCAGCATAGTGAGACCCCTTCTCTTAATAATTTTTTTTGAAAATA ATTTTTTAAAAATTACAAATATGGACCAGACACCTGGGAGGATTACCTCACGCCTGTAATCCCAGCACTT TGGGAGGCTGAGGTGGGTGGATCACCTGAGGTCAGGAGTTTGAGACTATCCTGGCCAACATGGTGAAACC CTGTCTCTACTAAAAATACAAAAATTAGCCAGATGTGGTGGTGGGTACCTGTAATCCCAGCTACTCGGGA GGCTGAGACAGGAGAATCACTTGAACCTGGGAGGCGGAGGTTGCAGTGAGCTGAGATCACGTCATTGCAC TCCAGCCTGGGTGACTACAGTGAGACTCCCTCTCAAAAAAAAAAAAAAAAAATTATGAGTATGTTAAGAT TGTTCTAGGGGAGAAGGGCAATCAAAATTCAGAAAAATATGAGTGGGATGTTAATAGCTACCATTTGTTG AACACTTCTGTTCCAGGCACTGTATTAGTTCGTCCCCATGTGGTGCTCTCAGATCAGTAGGGGAGACAGG CAGGCAGTTAGAATTATGGTGTATAGAGGAGAAGCATTGCACTATGTCCAGTTTTTTATGGTAGGCTTCC TGCAGAAAGTGATAAGTCCAAGCTGATATGTGGAAGATGAATAATTAACCAGAGGTTGGGGGATTTCCCC CTTAGCTTCTCTTTCTTCCAGTCCAGCAGTCCAAAACAAACACTGCTTCCCAATGCCTAAGGGTTTTGTC GTACCTTACTCCAGGGAATGGGTGAGACAGTAGATCCCAAGAAGAAAACCCTTTGGTTAGTCACAGAAAT CTGAAAGTCTTGGTTGCTGGCTCAGATTTGTCCTGGAGTGGACGTAGATTGGACTTGCTTATTTGTAGCC CTCCACCGTTCTCATTTGTGTTCTGGCCTCTGGAGCCACCCTGCCAGACCCTACTTGGAAGAAATGTTGA ATCAAGCTACTCAGACTTCAGCCTGAGATGTATTATGGCTGGCTGGAATTCTCAGCCCTTTCCACATCAT GATCCCATTAATCTCTTGGGTCTATCCCCACGCCCAGCCACCTCTCCTCCTCACTATAAGCCAAGTCTCC CTGCAGTTCTCTATAAATACTCAGATCCTCTTGCCTCAGACCCTGGTCCCTAGCTTTGTGGGTGAAGAAG ATACCTCTTTAGCTTGTGCCAACCCATCCCTCTGGGCTACAGGCCATTCTGTAGTCACCTTGCAGTCACG TCTTTCCTTCTAGATAACTGGCTCTGGGCCTTGGAAACCTTCTCTTTGGCTCTAGCTAATTTTGCTGTTT TATTCCGTTTACTTAAAACCTATTCACTTGGTCATTGAGGTCAGAACTATCAGGAAAGCTGTGCTCCTTA ACCCATCATCTAAGCTAAGCTTGGGCCAGGTTTTTATTACCTCTTGAATATATATAGTGCAATATAGTGT AGTGGTTGAGAGTGGAGTCTGCATACCTGGATCTGCCATTCTGCATGACTTTGGGCAAATGGACAAACTG CCCGAAACTTGACTTGTTTATATGTATGGGGAAGACATCATTTACATTGTCAGGGTAAGGCTTAGCAGAA TGACCGGTCCATAGCAAGGGCTCAGTCTTTCTGTGTCTCTCAGTCTTTACCTCTCTTCCCCATCTAGCTT TCTGTCTAAGTTCCTGTTTCTAGTCTCAGGGCATATCTAGTTGTCAGCTCCTGAGCAGAGCAGAGGTATA AGTGTCAACTCATGCTTGTGTTACCAGTTCCTACCCCTGTCAGTAACCTCTGATGCTTACAAGGATACTA TCACCTATGCCATCAGTGGGGAATGACCTTCCAAAAATATTTTTAACAATACTAAAAATACCTGAAACAT ACTGGGTGAAAATAAGACATTGGAGCTGCTGCTCCCCAAGGAGATAAAAGGACAGTTCCTAAAGAGGCAT TGAAGGTTTGTATGAGCCAGCAGCACAGTCAGAAGGCAAGAAGGAGACCCTGGGATGTGCACAAGGATGC AGGAGACGAGATTGTCACAGAGGCTGTCATGCTAGTAAGAGTCAGAGGTTACTGATGGGTAGAGATTGTC AGAGTCTGCCTTGCTAGTCTAGAGGTAGGACCATGGATTTAAGAACGTTGCCTCCAAGTTTTTGAATTGT GAATTTTTGATCATATTTGAACAAAACCCCACCTACAGTCTGCATGGTCATTGTTCTCACAAGGGTTTGT GTGATGCACTGACAAGAACAGAGGCTTTGGAGGTGACTCCTGGGTTTGAATCACCATTTGCCACTAGCTA ATTCTAACCTTAGGTAAGTCAGTGTCTCTGGGTCTCAACCTCTTCCTCTGTGAGGGGTAGGAAATAGCAC ATAACTTGTAGCATTGTTATAAGGGCTCGTGATAATGTTTTTAAAACACCTGGCTCAAGCACTCAGGAAA ATGTTTTATTATGAAGACCAAGTGTCTCTGAAAAGTGTTTCCTTAGTGTCTGGAGAGACTACAGCAGCAA GGCTTTTCTGATCCTACTCTGACTCTCAAGATAATGGTGCTTGAAGTGAAATCACAGAAGAGATGACTGG GGCAGGAAAAGCAAGATGGAAAAGATAAAGTTGTGGAAGGGACTGAAAGTCCATGAAGTGCTGGGCTGGG GACAGAAAGGGTTTAATAGGGCAGGGTCTTGAGGAACAGGAACAGAGACCTTGGTTCTGGGTTGCCATCT GATTCCCAGAAGAGTAAAGGAAATGGGGCAGGAGGATGTCTGTGGAGGACACAGTGTGGCTGAGTGGAGA GAACACAAGCTTCTGGCCAGTGAACCTAAGTGAGAATGTCAGTGGACTTTGCACAGGCTGGGTGCCTTTG GGGGTGTTCATCTCGCTATGCTTCGGTCCCCTCATTTAAAAAAATTGGTTAATAATATGATGACCTGCCA ATGTGTGTGGAAATACTATGTAAACTGAGCACTATATAAATGCTGGTTGTTGAGATATTGGGAGTGGGGG TTTCGGGAGATTTTGTTTGTTTGTTTTTTTGAGATGGAGTTTTCGCTCTTGTTGCCCAGGCTGGAGTGCA GTGGCGTGATCTTGGCTCACTGCAACCTCCACCTCCTCCAGGTTCAAGCAATTCTGCCTCAGCCTCCCGA GTAGCTGGGACTCAGGCGTGCGCCACCATGCCTGGCTAATTTTGTATTTTTAGTAGAGATGGGGTTTCTC CATGTTGGTCAGGCTGGTCTCGAACTCCCGACCTCAGGTGATCTGCCTGCCTTGGCCTCCCAAAGTGCTG GGATTACAGGTGCGAGCTACCACACCTGGCCCTGGATTTCTTAATAGGTCCTCTTATGTGGCTATTGAGA ATGAATGAGACAACTCATTAACTAACTAAGTTAATTCAGAACTTAGTGCCTAGTAAGTGCTCAGTAAATG TTAGTTTCTTTTCCTCCATTTCTTCTTAGGAAGCATCGTTTACCCAGCATAAACAGGAGGACTCAACTGT GCATGTCTTAGTGTGAACAAGTAACATCAGATTTCCCTTTTCTGAGGAGAGATCAGTTCTGTAAGAGGGA GCATGAGTGTGAATGGGGAAGAATCATGGAGAAGGGAGCAGGACTCCACAGTACTGGAGATGGAGTAGGG ACCTCTGGGAATCGTTGGCATGGGGGTGTGGTGCAGAAAGGGTGATGCTAAGAGGTGAGAGTCTCAAGAA AGAAGATGGGTGTTCTAGAGGGGAAAGCTAAACGATGATAGGCAATGAGGAGCGGAGGACAAGACCCTGA ACTAAGGGTGAGGAGGCCAGCATTCTGTCAGGAGAACCTTAGGAATTGTGACCAGAAGTCAAAGTTGTTA CAGCAATCCTGCATTCAACCAGGTTTCATCTGAAAAACTTAAGAGTTGCTAGGCACAGTCGTGCACACTT GTAATCCCAACACTTTGGGAGGCTTGAGTCAGGAGGAAGCTTGAGGCCAGGAGTTTGAGATCAGCCTAGG CAGCATAGTGACACCCCATCTCTTGAAAAAACAAACACTCTTAAGAGCAGTGGTTCACCTAGAAATAAAA AGCAGAAATCATCAAGATGAGAAGACATTTTGTGTTTTGACAGCCAAGGAAATTTGCCACTGAGACAAGG TATCTGACAGCCTAGATGGGATTCTAGACTTCTGTGAGAAAGTGCAGTTAGGCCCTGAATAGCGTCAAAC TCAGGGATGTGAGCTGACTGACTGCCTTTTTATGTAATGGAGGCCATGTGATAGGGCTTTATGAGGCCAC AAATAGCACTTTCTGGGTCCTTTCACAGTCTAGTCAGTGGCCCAACCAACATATCTGGTTGTTCTGCAAT CTCAGTCATCTGAGAAGATAACCTCCAAAATACATGCAGTGGCATCAGGTGTTTCCTTGGTAGGTGTGCA TCTTTTAATGATGGAGGACTTAGGATGAAGAACAGGTGTGTTTAAAAACATCAAACCCCCACATCATTAC TATTTGCTGTAAGGATCTATGCAGTTTGTATTTTAAGTCCTTGACATATGGAAGAGCATTTGTGAGAGTA ACAGGTTTGCTTTGTAATGCCAACTAAAAAGTGTGTATTTATACTCGAATTTTAAGTTGAAATTTTGCTA AGTTTCTTTTTTTTTTTTTTGAGACGGAGTCTCACTCTGTTGCCCAGGCTGGAGTACAGTGGTACAATCA TTGCTAAGTCTTTCTGTAGTTTTAGAACATTTAGAACAACTTAAGATTTCCTGTATTTATAAACTTAATT TATTGGAAATATAAGTACTTGATCAGGTTTTGTCTTGGTCAGATAAATTAAGTACTTAAGGAAATCAAAT ACATTAATCATATAAATGCCATTTAAGTGTTTAAAGGAGTTTAGTTAAATGTAAGTTGGATCAAACATTA AACTTTTTTGTTTTATAAGCAAAATAATAAGATTTGTTAGCAGAACTTAAAAGCTTAGAAAAGAGAGGAT TTTGGATATGTAACTTTGAAGAGGTGAATAGTTTGCTCTTTTCTGAGACAGGGCATTGCTTTTCGACGCA GGCTGGAGTGCAGTGGGGCAAGCATGGCTCACTGCAGCCTCAACCTCCTGGACTCAAGTGATCCTCCTGC CTCAGCCTCCCATGTAGCTGGGATCACAGGCACATGCCACCACACCTGGCTAATTTTTTATTTTTTGTAG AGACAGGGGCCTCACTTCATTGCCCAGGCGGATCTCGAACACCTGGGCTCAAGCAGTCCTGCCTCAGCCT CCCAAAGTTCTGGGATTACAGGCATAAGCCACTGTGCCTGGCCTTGAATATATTTTTTAACCAAAGTAAT TTATGAATATAAATTGTTTCTATATAACCCAAAGTCACTCTCATTGACATTAAGAAACTCAGAGAGTGGG TTCTAAATGTAATATTACCACACACCCAAGTCACTTAGCCTCTACAAGTACCACGTAACTAGCAGCCGTT AAAAGATGGCCACCGCTAGTCCAGTCGGCCACAGACATGGTTTTGTTTGGCCAATACAAGTTTTTGAAAG TTTTAAAAAATTGACAACATTAAAGATCAGAAACTTGACCTAAAAA"
result = run_genscan(input_seq)
print("CDS:")
print(result.cds_list)
print()
print("Introns:")
print(result.intron_list)
print()
print("Exons:")
print(result.exon_list)

CDS:
MAATAAEAVASGSGEPREEAGALGPAWDESQLRSYSFPTRPIPRLSQSDPRAEELIENEEPVVLTDTNLVYPALKWDLEYLQENIGNGDFSVYSASTHKFLYYDEKKMANFQNFKPRSNREEMKFHEFVEKLQDIQQRGGEERLYLQQTLNDTVGRKIVMDFLGFNWNWINKQQGKRGWGQLTSNLLLIGMEGNVTPAHYDEQQNFFAQIKGYKRCILFPPDQFECLYPYPVHHPCDRQSQVDFDNPDYERFPNFQNVVGYETVVGPGDVLYIPMYWWHHIESLLNGGITITVNFWYKGAPTPKRIEYPLKAHQKVAIMRNIEKMLGEALGNPQEVGPLLNTMIKGRYN

Introns:
[GenomicElement(number=1, start=193, end=469), GenomicElement(number=2, start=719, end=4692), GenomicElement(number=3, start=4840, end=9009), GenomicElement(number=4, start=9154, end=10001), GenomicElement(number=5, start=10107, end=10576), GenomicElement(number=6, start=10639, end=11198), GenomicElement(number=7, start=11308, end=12151)]

Exons:
[GenomicElement(number=1, start=17, end=193), GenomicElement(number=2, start=469, end=719), GenomicElement(number=3, start=4692, end=4840), GenomicElement(number=4, start=9009, end=9154), GenomicElement(number=5, start=10001, end=10107), GenomicElement(number=6, start=10576, end=10639)

In [4]:
result = run_genscan(sequence_file='./data/BRCA1.fasta')
print("CDS:")
print(result.cds_list)
print()
print("Introns:")
print(result.intron_list)
print()
print("Exons:")
print(result.exon_list)

CDS:
['DYEVTFTEDKINALIKAASVNIETFWPGLFAKVLANVNIGSHICSVEGGKKTGLQPARATRILRALLVRHLPHWDASPEHCMAGPHGGSTQWLNTGKERALGVWTSKTWIGLGNLPTPFEWKCGLITHGGLADNCADLLLGSSMAAPSVELTFFLGILAAGKACGSARGLRSFWTEAEATAAPEKAFWLKVEVHGVRRTA', 'MRGDNVLAALARSRRLLGLGVHSGRAGGALQPATALWGPLSGLAEARAGSLCLRGSVEGEAGVGTGAARSARQPARVPASLLKPARPRIHQKEVTLNTSEHQKEQTLDTSSLRTVPLTVRVRNFILEVSETKNPPISDTRTSGFY', 'MNVEKAEFCNKSKQPGLARSQHNRWAGSKETCNDRRTPSTEKKVDLNADPLCERKEWNKQKLPCSENPRDTEDVPWITLNSSIQKVNEWFSRSDELLGSDDSHDGESESNAKVADVLDVLNEVDEYSGSSEKIDLLASDPHEALICKSERVHSKSVESNIEDKIFGKTYRKKASLPNLSHVTENLIIGAFVTEPQIIQERPLTNKLKRKRRPTSGLHPEDFIKKADLAVQKTPEMINQGTNQTEQNGQVMNITNSGHENKTKGDSIQNEKNPNPIESLEKESAFKTKAEPISSSISNMELELNIHNSKAPKKNRLRRKSSTRHIHALELVVSRNLSPPNCTELQIDSCSSSEEIKKKKYNQMPVRHSRNLQLMEGKEPATGAKKSNKPNEQTSKRHDSDTFPELKLTNAPGSFTKCSNTSELKEFVNPSLPREEKEEKLETVKVSNNAEDPKDLMLSGERVLQTERSVESSSISLVPGTDYGTQESISLLEVSTLGKAKTEPNKCVSQCAAFENPKGLIHGCSKDNRNDTEGFKYPLGHEVNHSRETSIEMEESELDAQYLQNTFKVSKRQSFAPFSNPGNAEEECATFSAHSGSLKKQSPKVTFECEQKEENQGKNESNIKPVQTVNITAGFPVVGQKD

###  Showcase for AminoAcidSequence, DNASequence, RNASequence

In [6]:
sequence = 'DYEVTFTEDKINAL'
peptide = AminoAcidSequence(sequence)
print(peptide.molecular_weight(), 'kDa')

1.9 kDa


In [12]:
sequence = 'ATGCCGTGCA'
dna = DNASequence(sequence)
print('Transcribed sequence', dna.transcribe())
print('GC conten', dna.gc_content())

Transcribed sequence AUGCCGUGCA
GC conten 0.6


In [15]:
sequence = 'AUGCCGUGCA'
rna = RNASequence(sequence)
print('Alphabet of RNA seq is', rna.alphabet)

Alphabet of RNA seq is {'C', 'U', 'G', 'A'}
