In [1]:
from yet_another_bioinformatic_tool import (
    AminoAcidSequence,
    DNASequence,
    FastQFiltrator,
    NucleicAcidSequence,
)

### Задание 2: FastQ-фильтратор с помощью Biopython

Можно прочитать FASTQ

In [2]:
# Build a filtrator from the example FASTQ file; only the input path is passed here.
handler = FastQFiltrator("example_data/example_fastq.fastq")

Отфильтровать (тут сохранятся все риды, потому что фильтров нет...)

In [3]:
# Run the filter step; no bounds were passed to the constructor above.
handler.filter_fastq()

In [4]:
# Number of reads the handler holds after filtering.
len(handler.reads)

10

In [5]:
# No output path is given here; rewrite=True presumably permits overwriting an
# existing output file -- TODO confirm against FastQFiltrator.write_to_file.
handler.write_to_file(rewrite=True)

In [6]:
# Display the stored reads (rendered as Biopython SeqRecord objects).
# NOTE(review): for larger inputs prefer a slice such as handler.reads[:3].
handler.reads

[SeqRecord(seq=Seq('GTTGTACTTCGTTCAATCGGTAGGTGTTTAACCGGATGGTCACGCCTACCGTGA...GTC'), id='ee15a423-b008-44be-a4b2-5441d11b0b94', name='ee15a423-b008-44be-a4b2-5441d11b0b94', description='ee15a423-b008-44be-a4b2-5441d11b0b94 runid=fa1d76e661ac2bbb53a002e85e75a30e91827c51 sampleid=1 read=5087 ch=53 start_time=2019-10-18T22:14:05Z', dbxrefs=[]),
 SeqRecord(seq=Seq('CAAGCATACTTCATTCAGTCAGGCGAAATTATTGCCAGGTCGCCGCCTACCGTG...TGT'), id='105220b1-f48a-43b4-8e89-a3cf20afeb0d', name='105220b1-f48a-43b4-8e89-a3cf20afeb0d', description='105220b1-f48a-43b4-8e89-a3cf20afeb0d runid=fa1d76e661ac2bbb53a002e85e75a30e91827c51 sampleid=1 read=5559 ch=33 start_time=2019-10-18T22:15:11Z', dbxrefs=[]),
 SeqRecord(seq=Seq('GTTGTACTTCGTTCAGTCGGTGGTGTTTAACTGGGTCATCGCCTACCGTGACAA...GTC'), id='ccdb7b45-8b29-4105-b9ea-f434d0ffc14a', name='ccdb7b45-8b29-4105-b9ea-f434d0ffc14a', description='ccdb7b45-8b29-4105-b9ea-f434d0ffc14a runid=fa1d76e661ac2bbb53a002e85e75a30e91827c51 sampleid=1 read=5389 ch=124 start_time=2019-1

Риды можно не только читать из файла, но и передавать списком прямо из кода

In [7]:
# Second filtrator: fed with in-memory reads instead of a file path,
# this time with explicit GC-content and length bounds.
handler2 = FastQFiltrator(
    reads=handler.reads,
    gc_bounds=(0, 60),
    length_bounds=(80, 300),
)
# Bare last expression so the notebook shows the object's repr.
handler2

FastQFilter(
	path_to_input='None',
	gc_bounds=(0, 60),
	length_bounds=(80, 300),
	quality_threshold=0
)

In [8]:
# Run the filter step with the GC and length bounds given to the constructor.
handler2.filter_fastq()

In [9]:
# Number of reads left in handler2 after filtering.
len(handler2.reads)

8

In [10]:
# Write the filtered reads to the same path twice. The second call passes
# rewrite=False explicitly -- presumably to show that an existing file is not
# overwritten; TODO confirm against FastQFiltrator.write_to_file.
filtered_path = "example_data/my_filtered_example.fastq"
handler2.write_to_file(filtered_path)
handler2.write_to_file(filtered_path, rewrite=False)

### Задание 1: Abstract sequences for bioinformaticians of all kinds

In [11]:
# One example object per sequence class used in the cells below.
dna = DNASequence("ATGC")
protein = AminoAcidSequence("PUPUPUMDA")
# The RNA example is derived from the DNA one rather than built from a literal.
rna = dna.transcribe()

# Order matters: later cells slice off the trailing protein entry.
seqs = [dna, rna, protein]

In [12]:
# Display the reprs of the three example objects as a tuple.
dna, rna, protein

(DNASequence('ATGC'), RNASequence('AUGC'), AminoAcidSequence('PUPUPUMDA'))

- #### common methods

In [13]:
# Members shared by every sequence type: repr, length, alphabet, alphabet check.
for seq in seqs:
    # Use the built-in repr() rather than calling the dunder method directly.
    print(repr(seq), len(seq), seq.alphabet, seq.check_alphabet())

DNASequence('ATGC') 4 {'G', 'C', 'T', 'c', 'A', 'g', 't', 'a'} True
RNASequence('AUGC') 4 {'G', 'C', 'c', 'A', 'u', 'g', 'U', 'a'} True
AminoAcidSequence('PUPUPUMDA') 9 {'G', 'O', 'd', 'e', 'r', 'P', 'W', 'y', 'D', 'R', 'i', 'S', 'T', 'U', 'v', 'g', 'K', 'L', 'l', 'm', 'k', 'F', 's', 'p', 'o', 'E', 'q', 'M', 'Q', 'c', 'I', 'N', 't', 'V', 'h', 'a', 'H', 'C', 'f', 'A', 'u', 'Y', 'w', 'n'} True


- #### nucleic methods

In [14]:
# Nucleic-acid-only members: skip the last entry of `seqs`, which is the protein.
for seq in seqs[:-1]:
    complement = seq.complement()
    print(seq, complement, seq.gc_content)

ATGC TACG 0.25
AUGC UACG 0.25


In [15]:
# Instantiate NucleicAcidSequence directly; the next cell calls complement() on it.
na = NucleicAcidSequence("ATGC")

In [16]:
# complement() raises NotImplementedError on a plain NucleicAcidSequence;
# catch it and show only the message so the notebook keeps running.
try:
    na.complement()
except NotImplementedError as err:
    print(err)

This method is not implemented for 'NucleicAcidSequence'


- #### DNA method

In [17]:
# Transcribe the DNA example; the displayed result is an RNASequence.
dna.transcribe()

RNASequence('AUGC')

In [18]:
# Confirm the concrete class of the object returned by transcribe().
type(dna.transcribe())

yet_another_bioinformatic_tool.RNASequence

- #### protein method

In [19]:
# Per-residue counts for the protein example (shown as a defaultdict).
protein.count_aa()

defaultdict(int, {'P': 3, 'U': 3, 'M': 1, 'D': 1, 'A': 1})