In [1]:
# pip install -r requirements.txt

In [2]:
from sklearn.datasets import make_classification
import time
from Bio import Entrez
from bioseq import DNASequence, RNASequence, AminoAcidSequence
from bioseq import run_genscan, GenscanOutput
from bio_files_processor import OpenFasta
from custom_random_forest import RandomForestClassifierCustom


## RandomForestClassifierCustom

In [3]:
# Synthetic classification dataset used by the fit/predict timing cells.
# random_state is fixed so that Restart & Run All regenerates the same X, y;
# without it the data (and therefore the timings and predictions) change on
# every run even though the forest itself is seeded.
X, y = make_classification(n_samples=1_000_000, random_state=42)

In [4]:
# Estimator shared by all timing cells; seeded so repeated runs build the
# same forest from the same data.
random_forest = RandomForestClassifierCustom(
    n_estimators=50,
    max_depth=20,
    max_features=2,
    random_state=42,
)

In [5]:
%%time
fit_1_job = random_forest.fit(X, y, n_jobs = 1)

CPU times: user 482 ms, sys: 1.16 s, total: 1.64 s
Wall time: 7min 13s


In [6]:
%%time
fit_2_jobs = random_forest.fit(X, y, n_jobs = 2)

CPU times: user 921 ms, sys: 1.75 s, total: 2.67 s
Wall time: 4min 5s


In [7]:
%%time
# Time prediction over the full training matrix with n_jobs=1; the result is
# kept so it can be compared with the n_jobs=2 predictions.
predict_1_job = random_forest.predict(X, n_jobs=1)

CPU times: user 4.96 s, sys: 7.06 s, total: 12 s
Wall time: 15.1 s


In [8]:
%%time
# Same prediction with n_jobs=2; the result is kept so it can be compared
# with the n_jobs=1 predictions.
predict_2_jobs = random_forest.predict(X, n_jobs=2)

CPU times: user 2.94 s, sys: 5.91 s, total: 8.85 s
Wall time: 8.18 s


In [10]:
# Predictions must not depend on the number of workers. Compare the two runs
# element by element, and require equal lengths first: zip() stops at the
# shorter input, so a truncated result would otherwise still compare equal.
result = len(predict_1_job) == len(predict_2_jobs) and all(
    single == parallel
    for single, parallel in zip(predict_1_job, predict_2_jobs)
)
print(result)

True


## RunGenscan

##### Simple case

In [11]:
# Contact address sent with every Entrez request; replace the placeholder
# with a real address before running.
Entrez.email = "your_email@example.com"

# Output FASTA file -> NCBI nucleotide accession downloaded into it.
fasta_downloads = {
    "HBB.fasta": "NG_059281.1",
    "BRCA2.fasta": "NG_012772.3",
}

for fasta_path, record_id in fasta_downloads.items():
    handle = Entrez.efetch(db="nucleotide", id=record_id, rettype="fasta", retmode="text")
    try:
        with open(fasta_path, "w") as out_file:
            out_file.write(handle.read())
    finally:
        # Always release the network handle, even if writing the file fails.
        handle.close()

In [12]:
# Run run_genscan on the HBB FASTA file with every other argument left at its
# default; the returned GenscanOutput is displayed as the cell result.
run_genscan(sequence_file="HBB.fasta")

GenscanOutput(status=200, cds_list=['MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH'], intron_list=[['1.01', 'Intron', 5205, 5334], ['1.02', 'Intron', 5558, 6407], ['1.03', 'Intron', 6537, 6643]], exon_list=[['1.01', 'Init', 5113, 5204], ['1.02', 'Intr', 5335, 5557], ['1.03', 'Term', 6408, 6536], ['1.04', 'PlyA', 6644, 6649]])

##### Complex case

In [13]:
# Run run_genscan on the BRCA2 FASTA file with an explicit exon_cutoff.
# NOTE(review): exon_cutoff=0.5 is presumably a threshold for reporting
# suboptimal exons -- confirm against run_genscan's documentation.
run_genscan(sequence_file="BRCA2.fasta", exon_cutoff=0.5)

GenscanOutput(status=200, cds_list=['MERFVRVPYGLYQGYGSTVPLGQPGLSGHKQPDWRQNMGPPTFLARPGLLVPANAPDYCIDPYKRAQLKAILSQMNPSLSPRLCKPNTKEVGVQVSPRVDKAVQCSLGPRTLSSCSPWDGRDPQEPLPACGVTSPATGRRGLIRLRRDGDEAESKALPGPAEASQPQPPSRRSGADRQEEPGQLEESGEKDAPCPQETKSKQVPGDAASEPLRRPNFQVDPLFPLLSPRPASSPPSFSIYGGNKSKIFSLFFHVLGEGEGALKICISRGLYVCFWNQNMAISTVKIVRPGGRVLTCGAFLERTX', 'MPDKEFPFATLRNTRSGPPRPDFRVVRVLRVASRRHVASAGLWRELLKLGGRGGAAVALLRLCCASGVFCGVALATFRVLNVASGTGLDSTAVKCSHPHNLGPISLNWFEELSSEAPPYNSEPAEESEHKNNNYEPNLFKTPQRKPSYNQLASTPIIFKEQGLTLPLYQSPVKELDKFKLDLVVCGSLFHTPKFVKGRQTPKHISESLGAEVDPDMSWSSSLATPPTLSSTVLIVRNEEASETVFPHDTTANVKSYFSNHDESLKKNDRFIASVTDSENTNQREAASHGFGKTSGNSFKVNSCKDHIGKSMPNVLEDEVYETVVDTSEEDSFSLCFSKCRTKNLQKVRTSKTRKKIFHEANADECEKSKNQVKEKYSFVSEVEPNDTDPLDSNVANQKPFESGSDKISKEVVPSLACEWSQLTLSGLNGAQMEKIPLLHISSCDQNISEKDLLDTENKRKKDFLTSENSLPRISSLPKSEKPLNEETVVNKRDEEQHLESHTDCILAVKQAISGTSPVASSFQGIKKSIFRIRESPKETFNASFSGHMTDPNFKKETEASESGLEIHTVCSQKEDSLCPNLIDNGSWPATTTQNSVALKNAGLISTLKKKTNKFIYAIHDETSYKGKKIPKDQKSELINCSAQFEANAFEAPLTF

##### Sequence passed directly as a string

In [14]:
run_genscan(sequence='AACGAATGAGTAAATGAGTAAATGAAGGAATGATTATTCCTTGCTTTAGAACTTCTGGAATTAGAGGACAATATTAATAATACCATCGCACAGTGTTTCTTTGTTGTTAATGCTACAACATACAAAGAGGAAGCATGCAGTAAACAACCGAACAGTTATTTCCTTTCTGATCATAGGAGTAATATTTTTTTCCTTGAGCACCATTTTTGCCATAGGTAAAATTAGAAGGATTTTTAGAACTTTCTCAGTTGTATACATTTTTAAAAATCTGTATTATATGCATGTTGATTAATTTTAAACTTACTTGAATACCTAAACAGAATCTGTTGTTTCCTTGTGTTTGAAAGTGCTTTCACAGTAACTCTGTCTGTACTGCCAGAATATACTGACAATGTGTTATAGTTAACTGTTTTGATCACAACATTTTGAATTGACTGGCAGCAGAAGCTCTTTTATATCCATGTGTTTTCCTTAAGTCATTATACATAGTAGGCACTGAGAACTCTTTATATCTGAATAAGATATTTAGGAACCACTGGTTTACATATCAGAAGCAGAGCTACTCAGGGCATTTTGGGGAAGATCACTTTCACATTCCTGAGCATAGGGAAGTTCTCATAAGAGTAAGATATTAAAAGGAGATACTTGTGTGGTATTCGAAAGACAGTAAGAGAGATTGTAGACCTTATGATCTTGATAGGGAAAACAAACTACATTCCTTTCTCCAAAAGTCAAAAAAAAAGAGCAAATATAGCTTACTATACCTTCTATTCCTACACCATTAGAAGTAGTCAGTGAGTCTAGGCAAGATGTTGGCCCTAAAAATCCAAATACCAGAGAATTCATGAGAACATCACCTGGATGGGACATGTGCCGAGCACACACAATTACTATATGCTAGGCATTGCTATCTTCATATTGAAGATGAGGAGGTCAAGAGATGAAAAAAGACTTGGCACCTTGTTGTTATATTAAAATTATTTGTTAGAGTAGAGCTTTTGTAAGAGTCTAGGAGTGTGGGAGCTAAATGATGATACACATGGACACAAAAAATAGATCAACAGACACCCAGGCCTACTTGAGGGTTGAGGGTGGGAAGAGGGAGACGATGAAAAAGAACCTATTGGGTATTAAGTTCATCACTGAGTGATGAAATAATCTGTACATCAAGACCCAGTGATATGCAATTTACCTATATAACTTGTACATGTACCCCCAAATTTAAAATGAAAGTTAAAACAAAGTATAGGAATGGAATTAATTCCTCAAGATTTGGCTTTAATTTTATTTGATAATTTATCAAATGGTTGTTTTTCTTTTCTCACTATGGCGTTGCTTTATAAACTATGTTCAGTATGTCTGAATGAAAGGGTGTGTGTGTGTGTGAAAGAGAGGGAGAGAGGAAGGGAAGAGAGGACGTAATAATGTGAATTTGAGTTCATGAAAATTTTTCAATAAAATAATTTAATGTCAGGAGAATTAAGCCTAATAGTCTCCTAAATCATCCATCTCTTGAGCTTCAGAGCAGTCCTCTGAATTAATGCCTACATGTTTGTAAAGGGTGTTCAGACTGAAGCCAAGATTCTACCTCTAAAGAGATGCAATCTCAAATTTATCTGAAGACTGTACCTCTGCTCTCCATAAATTGACACCATGGCCCACTTAATGAGGTTAAAAAAAAGCTAATTCTGAATGAAAATCTGAGCCCAGTGGAGGAAATATTAATGAACAAGGTGCAGACTGAAATATAAATTTTTCTGTAATAATTATGCATATACTTTAGCAAAGTTCTGTCTATGTTGACTTTATTGCTTTTTGGTAAGAAATACAACTTTTTAAAGTGAACTAAACTATCCTATTTCCAAACTATTTTGTGTGTGTGCGGTTTGTTTCTATGGGTTCTGGTTTTCTTGGAGCATTTTTATTTCATTTTAATTAATTAATTCTGAGAGCTGCTGAGTTGTGTTTACTGAGAGAT
TGTGTATCTGCGAGAGAAGTCTGTAGCAAGTAGCTAGACTGTGCTTGACCTAGGAACATATACAGTAGATTGCTAAAATGTCTCACTTGGGGAATTTTAGACTAAACAGTAGAGCATGTATAAAAATACTCTAGTCAAGTGCTGCTTTTGAAACAAATGATAAAACCACACTCCCATAGATGAGTGTCATGATTTTCATGGAGGAAGTTAATATTCATCCTCTAAGTATACCCAGACTAGGGCCATTCTGATATAAAACATTAGGACTTAAGAAAGATTAATAGACTGGAGTAAAGGAAATGGACCTCTGTCTCTCTCGCTGTCTCTTTTTTGAGGACTTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTTGTGGTCAGTGGGGCTGGAATAAAAGTAGAATAGACCTGCACCTGCTGTGGCATCCATTCACAGAGTAGAAGCAAGCTCACAATAGTGAAGATGTCAGTAAGCTTGAATAGTTTTTCAGGAACTTTGAATGCTGATTTAGATTTGAAACTGAGGCTCTGACCATAACCAAATTTGCACTATTTATTGCTTCTTGAAACTTATTTGCCTGGTATGCCTGGGCTTTTGATGGTCTTAGTATAGCTTGCAGCCTTGTCCCTGCAGGGTATTATGGGTAATAGAAAGAAAAGTCTGCGTTACACTCTAGTCACACTAAGTAACTACCATTGGAAAAGCAACCCCTGCCTTGAAGCCAGGATGATGGTATCTGCAGCAGTTGCCAACACAAGAGAAGGATCCATAGTTCATCATTTAAAAAAGAAAACAAAATAGAAAAAGGAAAACTATTTCTGAGCATAAGAAGTTGTAGGGTAAGTCTTTAAGAAGGTGACAATTTCTGCCAATCAGGATTTCAAAGCTCTTGCTTTGACAATTTTGGTCTTTCAGAATACTATAAATATAACCTATATTATAATTTCATAAAGTCTGTGCATTTTCTTTGACCCAGGATATTTGCAAAAGACATATTCAAACTTCCGCAGAACACTTTATTTCACATATACATGCCTCTTATATCAGGGATGTGAAACAGGGTCTTGAAAACTGTCTAAATCTAAAACAATGCTAATGCAGGTTTAAATTTAATAAAATAAAATCCAAAATCTAACAGCCAAGTCAAATCTGCATGTTTTAACATTTAAAATATTTTAAAGACGTCTTTTCCCAGGATTCAACATGTGAAATCTTTTCTCAGGGATACACGTGTGCCTAGATCCTCATTGCTTTAGTTTTTTACAGAGGAATGAATATAAAAAGAAAATACTTAAATTTTATCCCTCTTACCTCTATAATCATACATAGGCATAATTTTTTAACCTAGGCTCCAGATAGCCATAGAAGAACCAAACACTTTCTGCGTGTGTGAGAATAATCAGAGTGAGATTTTTTCACAAGTACCTGATGAGGGTTGAGACAGGTAGAAAAAGTGAGAGATCTCTATTTATTTAGCAATAATAGAGAAAGCATTTAAGAGAATAAAGCAATGGAAATAAGAAATTTGTAAATTTCCTTCTGATAACTAGAAATAGAGGATCCAGTTTCTTTTGGTTAACCTAAATTTTATTTCATTTTATTGTTTTATTTTATTTTATTTTATTTTATTTTGTGTAATCGTAGTTTCAGAGTGTTAGAGCTGAAAGGAAGAAGTAGGAGAAACATGCAAAGTAAAAGTATAACACTTTCCTTACTAAACCGACATGGGTTTCCAGGTAGGGGCAGGATTCAGGATGACTGACAGGGCCCTTAGGGAACACTGAGACCCTACGCTGACCTCATAAATGCTTGCTACCTTTGCTGTTTTAATTACATCTTTTAATAGCAGGAAGCAGAACTCTGCACTTCAAAAGTTTTTCCTCACCTGAGGAGTTAATTTAGTACAAGGGGAAAAAGTACAGGGGGATGGGAGAAAGGCGATCACGTTGGGAAGCTATAGAGAAAGAAGAGTAAATTTTAGTAAAGGAGGTTTAAAC
AAACAAAATATAAAGAGAAATAGGAACTTGAATCAAGGAAATGATTTTAAAACGCAGTATTCTTAGTGGACTAGAGGAAAAAAATAATCTGAGCCAAGTAGAAGACCTTTTCCCCTCCTACCCCTACTTTCTAAGTCACAGAGGCTTTTTGTTCCCCCAGACACTCTTGCAGATTAGTCCAGGCAGAAACAGTTAGATGTCCCCAGTTAACCTCCTATTTGACACCACTGATTACCCCATTGATAGTCACACTTTGGGTTGTAAGTGACTTTTTATTTATTTGTATTTTTGACTGCATTAAGAGGTCTCTAGTTTTTTATCTCTTGTTTCCCAAAACCTAATAAGTAACTAATGCACAGAGCACATTGATTTGTATTTATTCTATTTTTAGACATAATTTATTAGCATGCATGAGCAAATTAAGAAAAACAACAACAAATGAATGCATATATATGTATATGTATGTGTGTATATATACACACATATATATATATATTTTTTCTTTTCTTACCAGAAGGTTTTAATCCAAATAAGGAGAAGATATGCTTAGAACCGAGGTAGAGTTTTCATCCATTCTGTCCTGTAAGTATTTTGCATATTCTGGAGACGCAGGAAGAGATCCATCTACATATCCCAAAGCTGAATTATGGTAGACAAAACTCTTCCACTTTTAGTGCATCAACTTCTTATTTGTGTAATAAGAAAATTGGGAAAACGATCTTCAATATGCTTACCAAGCTGTGATTCCAAATATTACGTAAATACACTTGCAAAGGAGGATGTTTTTAGTAGCAATTTGTACTGATGGTATGGGGCCAAGAGATATATCTTAGAGGGAGGGCTGAGGGTTTGAAGTCCAACTCCTAAGCCAGTGCCAGAAGAGCCAAGGACAGGTACGGCTGTCATCACTTAGACCTCACCCTGTGGAGCCACACCCTAGGGTTGGCCAATCTACTCCCAGGAGCAGGGAGGGCAGGAGCCAGGGCTGGGCATAAAAGTCAGGGCAGAGCCATCTATTGCTTACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGTTGGTATCAAGGTTACAAGACAGGTTTAAGGAGACCAATAGAAACTGGGCATGTGGAGACAGAGAAGACTCTTGGGTTTCTGATAGGCACTGACTCTCTCTGCCTATTGGTCTATTTTCCCACCCTTAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGGTGAGTCTATGGGACGCTTGATGTTTTCTTTCCCCTTCTTTTCTATGGTTAAGTTCATGTCATAGGAAGGGGATAAGTAACAGGGTACAGTTTAGAATGGGAAACAGACGAATGATTGCATCAGTGTGGAAGTCTCAGGATCGTTTTAGTTTCTTTTATTTGCTGTTCATAACAATTGTTTTCTTTTGTTTAATTCTTGCTTTCTTTTTTTTTCTTCTCCGCAATTTTTACTATTATACTTAATGCCTTAACATTGTGTATAACAAAAGGAAATATCTCTGAGATACATTAAGTAACTTAAAAAAAAACTTTACACAGTCTGCCTAGTACATTACTATTTGGAATATATGTGTGCTTATTTGCATATTCATAATCTCCCTACTTTATTTTCTTTTATTTTTAATTGATACATAATCATTATACATATTTATGGGTTAAAGTGTAATGTTTTAATATGTGTACACATATTGACCAAATCAGGGT
AATTTTGCATTTGTAATTTTAAAAAATGCTTTCTTCTTTTAATATACTTTTTTGTTTATCTTATTTCTAATACTTTCCCTAATCTCTTTCTTTCAGGGCAATAATGATACAATGTATCATGCCTCTTTGCACCATTCTAAAGAATAACAGTGATAATTTCTGGGTTAAGGCAATAGCAATATCTCTGCATATAAATATTTCTGCATATAAATTGTAACTGATGTAAGAGGTTTCATATTGCTAATAGCAGCTACAATCCAGCTACCATTCTGCTTTTATTTTATGGTTGGGATAAGGCTGGATTATTCTGAGTCCAAGCTAGGCCCTTTTGCTAATCATGTTCATACCTCTTATCTTCCTCCCACAGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGCAATGATGTATTTAAATTATTTCTGAATATTTTACTAAAAAGGGAATGTGGGAGGTCAGTGCATTTAAAACATAAAGAAATGAAGAGCTAGTTCAAACCTTGGGAAAATACACTATATCTTAAACTCCATGAAAGAAGGTGAGGCTGCAAACAGCTAATGCACATTGGCAACAGCCCCTGATGCATATGCCTTATTCATCCCTCAGAAAAGGATTCAAGTAGAGGCTTGATTTGGAGGTTAAAGTTTTGCTATGCTGTATTTTACATTACTTATTGTTTTAGCTGTCCTCATGAATGTCTTTTCACTACCCATTTGCTTATCCTGCATCTCTCAGCCTTGACTCCACTCAGTTCTCTTGCTTAGAGATACCACCTTTCCCCTGAAGTGTTCCTTCCATGTTTTACGGCGAGATGGTTTCTCCTCGCCTGGCCACTCAGCCTTAGTTGTCTCTGTTGTCTTATAGAGGTCTACTTGAAGAAGGAAAAACAGGGGTCATGGTTTGACTGTCCTGTGAGCCCTTCTTCCCTGCCTCCCCCACTCACAGTGACCCGGAATCTGCAGTGCTAGTCTCCCGGAACTATCACTCTTTCACAGTCTGCTTTGGAAGGACTGGGCTTAGTATGAAAAGTTAGGACTGAGAAGAATTTGAAAGGCGGCTTTTTGTAGCTTGATATTCACTACTGTCTTATTACCCTGTCATAGGCCCACCCCAAATGGAAGTCCCATTCTTCCTCAGGATGTTTAAGATTAGCATTCAGGAAGAGATCAGAGGTCTGCTGGCTCCCTTATCATGTCCCTTATGGTGCTTCTGGCTCTGCAGTTATTAGCATAGTGTTACCATCAACCACCTTAACTTCATTTTTCTTATTCAATACCTAGGTAGGTAGATGCTAGATTCTGGAAATAAAATATGAGTCTCAAGTGGTCCTTGTCCTCTCTCCCAGTCAAATTCTGAATCTAGTTGGCAAGATTCTGAAATCAAGGCATATAATCAGTAATAAGTGATGATAGAAGGGTATATAGAAGAATTTTATTATATGAGAGGGTGAAACCCTCAAAATGAAATGAAATCAGACCCTTGTCTTACACCATAAACAAAAATAAATTTGAATGGGTTAAAGAATTAAACTAAGACCTAAAACCATAAAAATTTTTAAAGAAATCAAAAGAAGAAAATTCTAATATTCACGTTGCAGCCGTTTTTTGAATTTGATATGAGAAGCAAAGGCAACAAAAGGAAAAATAAAGAAGTGAGGCTACATCAAACTAAAAAATTTCCACACAAAAAACAAAACAATGAACAAATGAAAGGTGAACCATGAAATGGCATATTTGCAAACCAAATATTTCTTAAATATTTTGGTTAATATCC
AAAATATATAAGAAACACAGATGATTCAATAACAAACAAAAAATTAAAAATAGGAAAATAAAAAAATTAAAAAGAAGAAAATCCTGCCATTTATGGCAGAATTGATGAACCTGGAGGATGTAAAACTAAGAAAAATAAGCCTGACACAAAAAGACAAATACTACACAACCTTGCTCATATGTGAAACATAAAAAAGTCACTCTCATGGAAACAGACAGTAGAGGTATGGTTTCCAGGGGTTGGGGGTGGGAGAATCAGGAAACTATTACTCAAAGGGTATAAAATTTCAGTTATGTGGGATGAATAAATTCTAGATATCTAATGTACAGCATCGTGACTGTAGTTAATTGTACTGTAAGTATATTTAAAATTTGCAAAGAGAGTAGATTTTTTTTTTTTTTTAGATGGAGTTTTGCTCTTGTTGTCCAGGCTGGAGTGCAATGGCAAGATCTTGGCTCACTGCAACCTCCGCCTCCTGGGTTCAAGCAAATCTCCTGCCTCAGCCTCCCGAGTAGCTGGGATTACAGGCATGCGACACCATGCCCAGCTAATTTTGTATTTTTAGTAGAGACGGGGTTTCTCCATGTTGGTCAGGCTGATCCGCCTGCCTCGGCCACCCAAAGGGCTGGGATTACAGGCGTGAGCCACCGGGCCTGGCCGAGAGTAGATCTTAAAAGCATTTACCACAAGAAAAAGGTAACTATGTGAGATAATGGGTATGTTAATTAGCTTGATTGTGGTAATCATTTCACAAGGTATACATATATTAAAACATCATGTTGTACACCTTAAATATATACAATTTTTATTTGTGAATGATACCTCAATAAAGTTGAAGAATAATAAAAAAGAATAGACATCACATGAATTAAAAAACTAAAAAATAAAAAAATGCATCTTGATGATTAGAATTGCATTCTTGATTTTTCAGATACAAATATCCATTTGACTGTTTACTCTTTTCCAAAACAATACAATAAATTTTAGCACTTTATCTTCATTTTCCCCTTCCCAATCTATAATTATATATATATATATTTTAGATATTTTGTATAGTTTTACTCCCTAGATTTTCTAGTGTTATTATTAAATAGTGAAGAAATGTTTACACTTATGTACAAAATGTTTTGCATGCTTTTCTTCATTTCTAACATTCTCTCTAAGTTTATTCTATTTTTTTCTGATTATCCTTAATATTATCTCTTTCTGCTGGAAATACATTGTTACTTTTGGTTTATCTAAAAATGGCTTCATTTTCTTCATTCTAAAATCATGTTAAATTAATACCACTCATGTGTAAGTAAGATAGTGGAATAAATAGAAATCCAAAAACTAAATCTCACTAAAATATAATAATGTGATATATAAAAATATAGCTTTTAAATTTAGCTTGGAAATAAAAAACAAACAGTAATTGAACAACTATACTTTTTGAAAAGAGTAAAGTGAAATGCTTAACTGCATATACCACAATCGATTACACAATTAGGTGTGAAGGTAAAATTCAGTCACGAAAAAACTAGAATAAAAATATGGGAAGACATGTATATAATCTTAGAGATAACACTGTTATTTAATTATCAACCCAAAGTAGAAACTATCAAGGGAGAAATAAATTCAGTCAACAATAAAAGCATTTAAGAAGTTATTCTAGGCTGGGAGCGGTGGCTCACACCTGCAATTGCAGCACTTTGGGAGGCCTAGACAGGCGGATCACGACGTCAGGAGTTCAAGATCAGCCTGGCCAACATAGTGAAACCTCATCGCTACTAAAAATATAAAAACTTAGCCTGGCGTGGTGGCAGGCATGTGTAATCCCAGCAATTTGGGAGGCTGAGGCAGGAGAATCGCTTGATCCTGGGAGGCAGAGGTTGCAGTGAGCCAAGATTGTGCCACTGCATTCCAGCCCAGGTGACAGCATGAGACTCCGTCACAAAAAAAAAAGAAAAAAAAAAGGGGGGGGGGAGCGGTGGAGCCAAGATGACCGAATAGGAACAGCT
CCAGTCTATAGCTCCCATCGTGAGTGACGCAGAAGACGGGTGATTTCTGCATTTCCAACTGAGGTACCAGGTTCATCTCACAGGGAAGTGCCAGGCAGTGGGTGCAGGACAGTAGGTGCAGTGCACTG')

GenscanOutput(status=200, cds_list=['MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH'], intron_list=[['1.01', 'Intron', 5143, 5272], ['1.02', 'Intron', 5496, 6345], ['1.03', 'Intron', 6475, 6581]], exon_list=[['1.01', 'Init', 5051, 5142], ['1.02', 'Intr', 5273, 5495], ['1.03', 'Term', 6346, 6474], ['1.04', 'PlyA', 6582, 6587]])

## AminoAcidSequence/DNASequence/RNASequence

##### DNASequence

In [15]:
# Build a DNASequence and transcribe it; for this input the printed result
# is the same sequence with T replaced by U.
dna_sequence = DNASequence("ATCGAA")
rna_sequence = dna_sequence.transcribe()
print("Transcribed Sequence:", rna_sequence)

Transcribed Sequence: AUCGAA


##### RNASequence

In [16]:
# Build an RNASequence and reverse it; for this input the printed result is
# the characters in reverse order (no complementing).
# Note: this rebinds `rna_sequence`, which the DNASequence cell also assigns.
rna_sequence = RNASequence("AUGC")
reverse_rna = rna_sequence.reverse()
print("Reverse RNA Sequence:", reverse_rna)

Reverse RNA Sequence: CGUA


##### AminoAcidSequence

In [17]:
# Build an AminoAcidSequence and print its amino_acid_profile(); the printed
# value is a dict keyed by property class ('hydrophobic', 'polar',
# '- charged', '+ charged').
# NOTE(review): the values are presumably fractions of the sequence -- confirm.
amino_acid_sequence = AminoAcidSequence("GAVLI")
profile = amino_acid_sequence.amino_acid_profile()
print("Amino Acid Profile:", profile)

Amino Acid Profile: {'hydrophobic': 1.0, 'polar': 0.0, '- charged': 0.0, '+ charged': 0.0}


## OpenFasta

In [18]:
# Read records one at a time: each read_record() call is printed as soon as
# it returns, five calls in total.
with OpenFasta('data/example_fasta.fasta') as fasta_file:
    for record_index in range(5):
        print(fasta_file.read_record())

GTD323452 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+)
ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCC...

GTD678345 16S_rRNA NODE_80_length_720_cov_1.094737:313-719(+)
TTGGCTTCTTAGAGGGACTTTTGATGTTTAATCAAAGGAAGTTTGAGGCAATAACAGGTCTGTGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACA...

GTD174893 16S_rRNA NODE_1_length_2558431_cov_75.185164:2153860-2155398(+)
TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGACGAGT...

GTD906783 16S_rRNA NODE_1_length_2558431_cov_75.185164:793941-795479(-)
TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGACGAGT...

GTD129563 16S_rRNA NODE_4_length_428221_cov_75.638017:281055-282593(-)
CGGACGGGTGAGTAATGTCTGGGAAACTGCCTGATGGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAACGTCGCAAGACCAAAGAGGGGGA...



In [19]:
# Read with a single read_records() call and print the returned collection
# (displayed as a list of records).
with OpenFasta('data/example_fasta.fasta') as fasta_file:
    records = fasta_file.read_records()
    print(records)

[GTD323452 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+)
ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCC...
, GTD678345 16S_rRNA NODE_80_length_720_cov_1.094737:313-719(+)
TTGGCTTCTTAGAGGGACTTTTGATGTTTAATCAAAGGAAGTTTGAGGCAATAACAGGTCTGTGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACA...
, GTD174893 16S_rRNA NODE_1_length_2558431_cov_75.185164:2153860-2155398(+)
TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGACGAGT...
, GTD906783 16S_rRNA NODE_1_length_2558431_cov_75.185164:793941-795479(-)
TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGACGAGT...
, GTD129563 16S_rRNA NODE_4_length_428221_cov_75.638017:281055-282593(-)
CGGACGGGTGAGTAATGTCTGGGAAACTGCCTGATGGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAACGTCGCAAGACCAAAGAGGGGGA...
]


## Tests

In [20]:
# Run the project's pytest suite from inside the notebook; pytest discovers
# the tests from the current working directory.
%run -m pytest

platform linux -- Python 3.9.18, pytest-8.2.0, pluggy-1.5.0
rootdir: /mnt/c/users/uzunm/pocp/python/HW18_true/Bioinformatics_utilities
plugins: anyio-3.6.2
collected 8 items

test_modules.py [32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                                 [100%][0m

