# Getting statarted with biopython

## Sequence Manipulations
In Biopython, sequences are declared as objects and not as strings.

In [1]:
# The necessary module for sequence manipulation is Bio.Seq.
from Bio.Seq import Seq

# Creating a seq object
seq_exemple = Seq("ATGCGAGCTAGGAAAAAA")

In [2]:
# Revese sequence and reverse complement sequence
seq_complement = seq_exemple.complement() #TAC
seq_reverse_complement = seq_exemple.reverse_complement() #CAT

print(seq_complement)
print(seq_reverse_complement)

TACGCTCGATCCTTTTTT
TTTTTTCCTAGCTCGCAT


In [4]:
# Transcrition
seq_rna = seq_exemple.transcribe() #AUG
seq_dna = seq_rna.back_transcribe() #ATG

print(seq_rna)
print(seq_dna)

AUGCGAGCUAGGAAAAAA
ATGCGAGCTAGGAAAAAA


In [5]:
# Translation
seq_proteina_rna = seq_rna.translate() #M
seq_proteina_dna = seq_dna.translate() #M

print(seq_proteina_rna)
print(seq_proteina_dna)

MRARKK
MRARKK


## SeqIO Module
Biopython presents the SeqIO module for reading and manipulating files in different formats.
You can use the SeqIO.parse method to iterate through the file and get information such as: header title and complete sequence.

## The SeqRecord object  

The SeqRecord (Sequence Record) class is defined in the Bio.SeqRecord module. This class allows higher level features such as identifiers and features to be associated with a sequence, and is the basic data type for the Bio.SeqIO sequence input/output interface.  

The SeqRecord class itself is quite simple, and offers the following information as attributes:  

   .seq – The sequence itself, typically a Seq object.  

   .id – The primary ID used to identify the sequence – a string. In most cases this is something like an accession number.  

   .name – A “common” name/id for the sequence – a string. In some cases this will be the same as the accession number, but it could also be a clone name. I think of this as being analogous to the LOCUS id in a GenBank record.  

   .description – A human readable description or expressive name for the sequence – a string.  

   .letter_annotations – Holds per-letter-annotations using a (restricted) dictionary of additional information about the letters in the sequence. The keys are the name of the information, and the information is contained in the value as a Python sequence (i.e. a list, tuple or string) with the same length as the sequence itself. This is often used for quality scores or secondary structure information.  

   .annotations – A dictionary of additional information about the sequence. The keys are the name of the information, and the information is contained in the value. This allows the addition of more “unstructured” information to the sequence.  

   .features – A list of SeqFeature objects with more structured information about the features on a sequence (e.g. position of genes on a genome, or domains on a protein sequence). 

   .dbxrefs - A list of database cross-references as strings.  

In [6]:
from Bio import SeqIO

for fasta in SeqIO.parse("ecoli_gene.fasta","fasta"):
    # prints id of header
    print (fasta.id)

    # prints seq name
    print(fasta.name)

    # printing other info
    print(fasta.description)
    print(fasta.format)

    # print whole seq
    print (fasta.seq)


X81322.1
X81322.1
X81322.1 E.coli hpcC gene
<bound method SeqRecord.format of SeqRecord(seq=Seq('GAAGTAGAAGGCGTGGGCCGCCTGGTGAACCGAATTGTTGAGTGAGGAAACAGC...CCA'), id='X81322.1', name='X81322.1', description='X81322.1 E.coli hpcC gene', dbxrefs=[])>
GAAGTAGAAGGCGTGGGCCGCCTGGTGAACCGAATTGTTGAGTGAGGAAACAGCGAAATGAAAAAAGTAAATCATTGGATCAACGGCAAAAATGTTGCAGGTAACGACTACTTCCTGACCACCAATCCGGCAACGGGTGAAGTGCTGGCGGATGTGGCCTCTGGCGGTGAAGCGGAGATCAATCAGGCGGTAGCGACAGCGAAAGAGGCGTTCCCGAAATGGGCCAATCTGCCGATGAAAGAGCGTGCGCGCCTGATGCGCCGTCTGGGCGATCTGATCGACCAGAACGTGCCAGAGATCGCCGCGATGGAAACCGCGGACACGGGCCTGCCGATCCATCAGACCAAAAATGTGTTGATCCCACGCGCTTCTCACAACTTTGAATTTTTCGCGGAAGTCTGCCAGCAGATGAACGGCAAGACTTATCCGGTCGACGACAAGATGCTCAACTACACGCTGGTGCAGCCGGTAGGCGTTTGTGCACTGGTGTCACCGTGGAACGTGCCGTTTATGACCGCCACCTGGAAGGTCGCGCCGTGTCTGGCGCTGGGCATTACCGCGGTGCTGAAGATGTCCGAACTCTCCCCGCTGACCGCTGACCGCCTGGGTGAGCTGGCGCTGGAAGCCGGTATTCCGGCGGGCGTTCTGAACGTGGTACAGGGCTACGGCGCAACCGCAGGCGATGCGCTGGTCCGTCATCATGACGTGCGTGCCGTGTCGTTCACCGGCGGTACGGCGACCGGGCGCAATATC

In [7]:
seq1= fasta.seq
print(seq1)

GAAGTAGAAGGCGTGGGCCGCCTGGTGAACCGAATTGTTGAGTGAGGAAACAGCGAAATGAAAAAAGTAAATCATTGGATCAACGGCAAAAATGTTGCAGGTAACGACTACTTCCTGACCACCAATCCGGCAACGGGTGAAGTGCTGGCGGATGTGGCCTCTGGCGGTGAAGCGGAGATCAATCAGGCGGTAGCGACAGCGAAAGAGGCGTTCCCGAAATGGGCCAATCTGCCGATGAAAGAGCGTGCGCGCCTGATGCGCCGTCTGGGCGATCTGATCGACCAGAACGTGCCAGAGATCGCCGCGATGGAAACCGCGGACACGGGCCTGCCGATCCATCAGACCAAAAATGTGTTGATCCCACGCGCTTCTCACAACTTTGAATTTTTCGCGGAAGTCTGCCAGCAGATGAACGGCAAGACTTATCCGGTCGACGACAAGATGCTCAACTACACGCTGGTGCAGCCGGTAGGCGTTTGTGCACTGGTGTCACCGTGGAACGTGCCGTTTATGACCGCCACCTGGAAGGTCGCGCCGTGTCTGGCGCTGGGCATTACCGCGGTGCTGAAGATGTCCGAACTCTCCCCGCTGACCGCTGACCGCCTGGGTGAGCTGGCGCTGGAAGCCGGTATTCCGGCGGGCGTTCTGAACGTGGTACAGGGCTACGGCGCAACCGCAGGCGATGCGCTGGTCCGTCATCATGACGTGCGTGCCGTGTCGTTCACCGGCGGTACGGCGACCGGGCGCAATATCATGAAAAACGCCGGGCTGAAAAAATACTCCATGGAACTGGGCGGTAAATCGCCGGTGCTGATTTTTGAAGATGCCGATATTGAGCGCGCGCTGGACGCCGCCCTGTTCACCATCTTCTCGATCAACGGCGAGCGCTGCACCGCCGGTTCGCGCATCTTTATTCAACAAAGCATCTACCCGGAATTCGTGAAATTTGCCGAACGCGCCAACCGTGTGCGCGTGGGCGATCCGACCGATCCGAATACCC

In [8]:
type(seq1)

Bio.Seq.Seq

### The SeqIO module 
Is also used to manipulate other formats, such as the GenBank format, which stores information from protein annotations in genomes.

In [None]:
from Bio import SeqIO
for seq_record in SeqIO.parse("ls_orchid.gbk", "genbank"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))