# Getting statarted with biopython

## Manipulações de sequências
Em Biopython, sequências são declaradas como objetos e não como strings. 

In [1]:
# O módulo necessário para manipulação de sequências é o Bio.Seq.
from Bio.Seq import Seq

# Criando uma sequencia exemplo
seq_exemplo = Seq("ATGCGAGCTAGGAAAAAA")

In [2]:
# Sequencia reversa e reversa complementar
seq_complementar = seq_exemplo.complement() #TAC
seq_complementar_reversa = seq_exemplo.reverse_complement() #CAT

print(seq_complementar)
print(seq_complementar_reversa)

TACGCTCGATCCTTTTTT
TTTTTTCCTAGCTCGCAT


In [3]:
# Transcricao
seq_rna = seq_exemplo.transcribe() #AUG
seq_dna = seq_rna.back_transcribe() #ATG

print(seq_rna)
print(seq_dna)

AUGCGAGCUAGGAAAAAA
ATGCGAGCTAGGAAAAAA


In [4]:
# Traducao
seq_proteina_rna = seq_rna.translate() #M
seq_proteina_dna = seq_dna.translate() #M

print(seq_proteina_rna)
print(seq_proteina_dna)

MRARKK
MRARKK


## Módulo SeqIO 
Biopython apresenta o módulo SeqIO para leitura e manipulação de arquivos em diversos formatos.  
Pode-se utilizar o método SeqIO.parse para percorrer o arquivo e obter informações como: título do cabeçalho e sequência completa.

## The SeqRecord object  

The SeqRecord (Sequence Record) class is defined in the Bio.SeqRecord module. This class allows higher level features such as identifiers and features to be associated with a sequence (see Chapter ‍3), and is the basic data type for the Bio.SeqIO sequence input/output interface (see Chapter ‍5).  

The SeqRecord class itself is quite simple, and offers the following information as attributes:  

.seq – The sequence itself, typically a Seq object.  

.id – The primary ID used to identify the sequence – a string. In most cases this is something like an accession number.  

.name – A “common” name/id for the sequence – a string. In some cases this will be the same as the accession number, but it could also be a clone name. I think of this as being analogous to the LOCUS id in a GenBank record.  

.description – A human readable description or expressive name for the sequence – a string.  

.letter_annotations – Holds per-letter-annotations using a (restricted) dictionary of additional information about the letters in the sequence. The keys are the name of the information, and the information is contained in the value as a Python sequence (i.e. a list, tuple or string) with the same length as the sequence itself. This is often used for quality scores (e.g. Section ‍20.1.6) or secondary structure information (e.g. from Stockholm/PFAM alignment files).  

.annotations – A dictionary of additional information about the sequence. The keys are the name of the information, and the information is contained in the value. This allows the addition of more “unstructured” information to the sequence.  

.features – A list of SeqFeature objects with more structured information about the features on a sequence (e.g. position of genes on a genome, or domains on a protein sequence). The structure of sequence features is described below in Section ‍4.3.  

.dbxrefs - A list of database cross-references as strings.  

In [5]:
from Bio import SeqIO

for fasta in SeqIO.parse("ecoli_gene.fasta","fasta"):
    #imprime id do cabecalho
    print (fasta.id)

    # imprime nome da sequencia
    print(fasta.name)

    # imprime outras informações
    print(fasta.description)
    print(fasta.format)

    #imprime sequencia completa
    print (fasta.seq)


X81322.1
X81322.1
X81322.1 E.coli hpcC gene
<bound method SeqRecord.format of SeqRecord(seq=Seq('GAAGTAGAAGGCGTGGGCCGCCTGGTGAACCGAATTGTTGAGTGAGGAAACAGC...CCA'), id='X81322.1', name='X81322.1', description='X81322.1 E.coli hpcC gene', dbxrefs=[])>
GAAGTAGAAGGCGTGGGCCGCCTGGTGAACCGAATTGTTGAGTGAGGAAACAGCGAAATGAAAAAAGTAAATCATTGGATCAACGGCAAAAATGTTGCAGGTAACGACTACTTCCTGACCACCAATCCGGCAACGGGTGAAGTGCTGGCGGATGTGGCCTCTGGCGGTGAAGCGGAGATCAATCAGGCGGTAGCGACAGCGAAAGAGGCGTTCCCGAAATGGGCCAATCTGCCGATGAAAGAGCGTGCGCGCCTGATGCGCCGTCTGGGCGATCTGATCGACCAGAACGTGCCAGAGATCGCCGCGATGGAAACCGCGGACACGGGCCTGCCGATCCATCAGACCAAAAATGTGTTGATCCCACGCGCTTCTCACAACTTTGAATTTTTCGCGGAAGTCTGCCAGCAGATGAACGGCAAGACTTATCCGGTCGACGACAAGATGCTCAACTACACGCTGGTGCAGCCGGTAGGCGTTTGTGCACTGGTGTCACCGTGGAACGTGCCGTTTATGACCGCCACCTGGAAGGTCGCGCCGTGTCTGGCGCTGGGCATTACCGCGGTGCTGAAGATGTCCGAACTCTCCCCGCTGACCGCTGACCGCCTGGGTGAGCTGGCGCTGGAAGCCGGTATTCCGGCGGGCGTTCTGAACGTGGTACAGGGCTACGGCGCAACCGCAGGCGATGCGCTGGTCCGTCATCATGACGTGCGTGCCGTGTCGTTCACCGGCGGTACGGCGACCGGGCGCAATATC

In [6]:
seq1= fasta.seq
print(seq1)

GAAGTAGAAGGCGTGGGCCGCCTGGTGAACCGAATTGTTGAGTGAGGAAACAGCGAAATGAAAAAAGTAAATCATTGGATCAACGGCAAAAATGTTGCAGGTAACGACTACTTCCTGACCACCAATCCGGCAACGGGTGAAGTGCTGGCGGATGTGGCCTCTGGCGGTGAAGCGGAGATCAATCAGGCGGTAGCGACAGCGAAAGAGGCGTTCCCGAAATGGGCCAATCTGCCGATGAAAGAGCGTGCGCGCCTGATGCGCCGTCTGGGCGATCTGATCGACCAGAACGTGCCAGAGATCGCCGCGATGGAAACCGCGGACACGGGCCTGCCGATCCATCAGACCAAAAATGTGTTGATCCCACGCGCTTCTCACAACTTTGAATTTTTCGCGGAAGTCTGCCAGCAGATGAACGGCAAGACTTATCCGGTCGACGACAAGATGCTCAACTACACGCTGGTGCAGCCGGTAGGCGTTTGTGCACTGGTGTCACCGTGGAACGTGCCGTTTATGACCGCCACCTGGAAGGTCGCGCCGTGTCTGGCGCTGGGCATTACCGCGGTGCTGAAGATGTCCGAACTCTCCCCGCTGACCGCTGACCGCCTGGGTGAGCTGGCGCTGGAAGCCGGTATTCCGGCGGGCGTTCTGAACGTGGTACAGGGCTACGGCGCAACCGCAGGCGATGCGCTGGTCCGTCATCATGACGTGCGTGCCGTGTCGTTCACCGGCGGTACGGCGACCGGGCGCAATATCATGAAAAACGCCGGGCTGAAAAAATACTCCATGGAACTGGGCGGTAAATCGCCGGTGCTGATTTTTGAAGATGCCGATATTGAGCGCGCGCTGGACGCCGCCCTGTTCACCATCTTCTCGATCAACGGCGAGCGCTGCACCGCCGGTTCGCGCATCTTTATTCAACAAAGCATCTACCCGGAATTCGTGAAATTTGCCGAACGCGCCAACCGTGTGCGCGTGGGCGATCCGACCGATCCGAATACCC

In [7]:
type(seq1)

Bio.Seq.Seq

## Biopython. O módulo SeqIO também pode ser utilizado para manipular outros formatos, como o formato GenBank, que armazena informações de anotações de proteínas em genomas.

In [None]:
from Bio import SeqIO
for seq_record in SeqIO.parse("ls_orchid.gbk", "genbank"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))

In [9]:
## Assesando o NCBI usando o Bio.Blast
from Bio.Blast import NCBIWWW

In [10]:
# Realizando um blast de nucleotideos do gene de e.coli. o resulltado sai em xml 
blast_handle = NCBIWWW.qblast('blastn', 'nt', seq1)

In [None]:
# saving the result in a file:
with open('blast_ecoli.xml', 'w') as save_to:
    save_to.write(blast_handle.read())
    blast_handle.close()

In [11]:
# Convertendo o xml
from Bio.Blast import NCBIXML
blast_result = open('blast_ecoli.xml', 'r')
blast_records = NCBIXML.parse(blast_result)

## The Expect value (E)  
## Is a parameter that describes the number of hits one can "expect" to see by chance when searching a database of a particular size. It decreases exponentially as the Score (S) of the match increases. Essentially, the E value describes the random background noise.

In [12]:
# Visualizando os resultados do blast
'''BLAST finds islands of similarity between sequences. This is a local alignment in which only 
High-scoring Segment Pairs (HSPs) are reported.'''
E_value_thresh = 0.0001
count=0
for record in blast_records:
    for alignments in record.alignments:
        for hsp in alignments.hsps:
            count += 1
            if hsp.expect < E_value_thresh:
                print('******************************* Alignments ****************************************')
                print('Sequence:', alignments.title)
                print('Length:', alignments.length)
                print('E value:', hsp.expect)
                print('Score', hsp.score)
                print('Num. alignments', hsp.num_alignments)
                print('Query ==>', hsp.query[0:75] + '...')
                print('         ', hsp.match[0:75] + '...')
                print('Sbjct ==>', hsp.sbjct[0:75] + '...')
                print('-'*100)
print(f'Foram econtradas {count} alinhamentos correspondentes a sequencia fornecida')


******************************* Alignments ****************************************
Sequence: gi|587109|emb|X81322.1| E.coli hpcC gene
Length: 1499
E value: 0.0
Score 2998.0
Num. alignments None
Query ==> GAAGTAGAAGGCGTGGGCCGCCTGGTGAACCGAATTGTTGAGTGAGGAAACAGCGAAATGAAAAAAGTAAATCAT...
          |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
Sbjct ==> GAAGTAGAAGGCGTGGGCCGCCTGGTGAACCGAATTGTTGAGTGAGGAAACAGCGAAATGAAAAAAGTAAATCAT...
----------------------------------------------------------------------------------------------------
******************************* Alignments ****************************************
Sequence: gi|1815787944|gb|CP049050.1| Escherichia coli strain pV11-19-E11-025-038 chromosome
Length: 4621324
E value: 0.0
Score 2861.0
Num. alignments None
Query ==> GAAGTAGAAGGCGTGGGCCGCCTGGTGAACCGAATTGTTGAGTGAGGAAACAGCGAAATGAAAAAAGTAAATCAT...
          |||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||...
Sbjct ==> GAAGTAGA