In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from custom_random_forest import RandomForestClassifierCustom
from bio_files_processor import OpenFasta
from magos_biologis import run_genscan
from magos_biologis import DNASequence, RNASequence, AminoAcidSequence

## RandomForestClassifierCustom

In [2]:
# Single seed shared by every stochastic step so a fresh "Restart & Run All"
# reproduces the same data, split and model.
SEED = 36

# Synthetic classification problem: 1,000,000 samples with 40 features.
# random_state pins the generated dataset; without it every run drew
# different data even though the split and the forest were seeded.
classification = make_classification(n_samples=1_000_000, n_features=40, random_state=SEED)
X, y = classification

# Hold out 25% of the samples for the prediction timing cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=SEED)

In [3]:
# Build the custom forest with the hyperparameters used by the timing cells
# that follow; the shared SEED is passed as its random_state.
rf = RandomForestClassifierCustom(
    n_estimators=20,
    max_depth=5,
    max_features=10,
    random_state=SEED,
)

In [4]:
%%time
# Timing baseline: fit with n_jobs=1, to be compared with the n_jobs=3 run in the next cell.
rf.fit(X_train, y_train, n_jobs=1)

CPU times: user 379 ms, sys: 819 ms, total: 1.2 s
Wall time: 2min 5s


In [5]:
%%time
# Same fit call with n_jobs=3; compare the wall time with the n_jobs=1 run above.
# NOTE(review): this refits the same `rf` object — confirm that fit() discards
# any trees built by the previous call rather than appending to them.
rf.fit(X_train, y_train, n_jobs=3)

CPU times: user 845 ms, sys: 2.41 s, total: 3.26 s
Wall time: 1min 2s


In [6]:
%%time
# Timing baseline for inference: predict_proba on the held-out split with n_jobs=1.
probas = rf.predict_proba(X_test, n_jobs=1)

CPU times: user 210 ms, sys: 399 ms, total: 610 ms
Wall time: 1.39 s


In [7]:
%%time
# Same prediction with n_jobs=3; `probas` is overwritten with this result.
probas = rf.predict_proba(X_test, n_jobs=3)

CPU times: user 417 ms, sys: 902 ms, total: 1.32 s
Wall time: 1.88 s


## OpenFasta

In [8]:
# Pull the first three records one at a time through the context manager.
with OpenFasta('data/example_fasta.fasta') as file:
    for _ in range(3):
        print(file.read_record())

FastaRecord('ACGGCCATAG...')
FastaRecord('GAGCCCTTGG...')
FastaRecord('GAAGTAGGTA...')


In [9]:
# read_records() hands back the records as a list, printed here in one go.
with OpenFasta('data/example_fasta.fasta') as file:
    print(file.read_records())

[FastaRecord('ACGGCCATAG...'), FastaRecord('GAGCCCTTGG...'), FastaRecord('GAAGTAGGTA...'), FastaRecord('GAAGTAGGTA...'), FastaRecord('GAAGTAGGTA...')]


In [10]:
# Show the three attributes of a single record, one per line.
with OpenFasta('data/example_fasta.fasta') as file:
    record = file.read_record()
    print("\n".join(map(str, (record.id, record.description, record.sequence))))

GTD323452
5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+)
ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCTGGTGCTGTG


## Run genscan

In [11]:
# Run GENSCAN on the BRCA1 FASTA file; the returned GenscanOutput (with its
# `status` and `cds_list` fields) is displayed as the cell result.
run_genscan(sequence_file="data/Homo_sapiens_BRCA1_sequence.fa")

GenscanOutput(status='Success',
               cds_list=['DYEVTFTEDKINALIKAASVNIETFWPGLFAKVLANVNIGSHICSVEGGKKTGLQPARATRILRALLVRHLPHWDASPEHCMAGPHGGSTQWLNTGKERALGVWTSKTWIGLGNLPTPFEWKCGLITHGGLADNCADLLLGSSMAAPSVELTFFLGILAAGKACGSARGLRSFWTEAEATAAPEKAFWLKVEVHGVRRTA', 'MRGDNVLAALARSRRLLGLGVHSGRAGGALQPATALWGPLSGLAEARAGSLCLRGSVEGEAGVGTGAARSARQPARVPASLLKPARPRIHQKEVTLNTSEHQKEQTLDTSSLRTVPLTVRVRNFILEVSETKNPPISDTRTSGFY', 'MNVEKAEFCNKSKQPGLARSQHNRWAGSKETCNDRRTPSTEKKVDLNADPLCERKEWNKQKLPCSENPRDTEDVPWITLNSSIQKVNEWFSRSDELLGSDDSHDGESESNAKVADVLDVLNEVDEYSGSSEKIDLLASDPHEALICKSERVHSKSVESNIEDKIFGKTYRKKASLPNLSHVTENLIIGAFVTEPQIIQERPLTNKLKRKRRPTSGLHPEDFIKKADLAVQKTPEMINQGTNQTEQNGQVMNITNSGHENKTKGDSIQNEKNPNPIESLEKESAFKTKAEPISSSISNMELELNIHNSKAPKKNRLRRKSSTRHIHALELVVSRNLSPPNCTELQIDSCSSSEEIKKKKYNQMPVRHSRNLQLMEGKEPATGAKKSNKPNEQTSKRHDSDTFPELKLTNAPGSFTKCSNTSELKEFVNPSLPREEKEEKLETVKVSNNAEDPKDLMLSGERVLQTERSVESSSISLVPGTDYGTQESISLLEVSTLGKAKTEPNKCVSQCAAFENPKGLIHGCSKDNRNDTEGFKYPLGHEVNHSRETSIEMEESELDAQYLQNTFKVSKRQSFAPFSNPGNAEEECATF

## Biological Sequence

In [12]:
# Wrap a raw DNA string in DNASequence; `dna_seq` is reused by the following cells.
dna_seq = DNASequence("CACCGCATCCCGTCCGATCTGCGAAGTTAACC")
# With percentage=False the GC content comes back as a fraction (19 of 32 bases here).
dna_seq.gc_content(percentage=False)

0.59375

In [13]:
# Alphabet check on the DNA sequence built above; the boolean result is the cell output.
dna_seq.is_valid_alphabet()

True

In [14]:
# transcribe() yields a new object; printing its type shows it is an RNASequence,
# not a plain string.
rna_seq = dna_seq.transcribe()
print(rna_seq, type(rna_seq), sep="\n")

CACCGCAUCCCGUCCGAUCUGCGAAGUUAACC
<class 'magos_biologis.RNASequence'>


In [15]:
# complement() takes the base-pairing map as an explicit argument; the map
# stored on the RNASequence class is passed in here.
print(rna_seq.complement(RNASequence.complement_map))

GUGGCGUAGGGCAGGCUAGACGCUUCAAUUGG


In [16]:
# Protein sequence example; `aa` is reused by the next cell.
aa = AminoAcidSequence("DYEVTFTEDKINALIKAASVNIETFWPGLFAKVLANVNIGSH")
# Alphabet check on the amino-acid sequence; the boolean result is the cell output.
aa.is_valid_alphabet()

True

In [17]:
# count_aa() returns a dict mapping each residue letter to how often it occurs in `aa`.
aa.count_aa()

{'S': 2,
 'V': 4,
 'W': 1,
 'T': 3,
 'P': 1,
 'L': 3,
 'E': 3,
 'G': 2,
 'Y': 1,
 'K': 3,
 'N': 4,
 'D': 2,
 'I': 4,
 'H': 1,
 'A': 5,
 'F': 3}

In [18]:
# Run the project's pytest suite from inside the notebook (pytest executed as a module).
%run -m pytest

platform linux -- Python 3.11.7, pytest-8.2.0, pluggy-1.5.0
rootdir: /home/fuechsin/Documents/bioinformatics_institute/python/hw18/GenomeForge
plugins: anyio-4.2.0
collected 8 items

test_genome_forge_modules.py [32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                    [100%][0m

