In [1]:
!pip install biopython --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting biopython
  Downloading biopython-1.81-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.81


In [None]:
# https://biopython.org/docs/1.76/api/Bio.Entrez.html
import os

from Bio import Entrez

# Contact address sent along with every Entrez request made by this notebook.
# It can be overridden with the ENTREZ_EMAIL environment variable so the notebook
# can be shared without editing code; the original address stays the default,
# so existing runs behave exactly as before.
Entrez.email = os.environ.get('ENTREZ_EMAIL') or 'taniagmangolini@gmail.com'

dict_keys(['Database', 'Query', 'CorrectedQuery', 'SpelledQuery'])


## Databases

In [None]:
# Ask EInfo for the names of all Entrez databases. The context manager closes
# the network handle once the reply is parsed, matching the cells in this
# notebook that call handle.close() explicitly.
with Entrez.einfo() as handle:
  record = Entrez.read(handle)
record["DbList"]

['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'proteinclusters', 'pcassay', 'protfam', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']

## Espell function

In [None]:
# Query ESpell with a deliberately misspelled term and print each field of the reply.
records = Entrez.read(Entrez.espell(db='pmc', term='biobbython'))
for field in records:
  print(f'{field} : {records[field]}')

Database : pmc
Query : biobbython
CorrectedQuery : biopython
SpelledQuery : ['', 'biopython']


## Esearch function
Permite fazer consultas no NCBI.

ESearch returns information about the query performed, which database was searched, and the UIDs (unique IDs) of the records that match the query.

In [None]:
# Search PubMed Central for the term and show the raw ESearch reply
# (the recorded output shows the hit count, the returned UIDs and the query translation).
search_handle = Entrez.esearch(db='pmc', term='biopython')
records = Entrez.read(search_handle)
print(records)

{'Count': '2715', 'RetMax': '20', 'RetStart': '0', 'IdList': ['9898917', '9883684', '9886943', '9896449', '9894515', '9882952', '9877618', '9876871', '9872083', '9710682', '9837158', '9864840', '9861041', '9855806', '9850167', '9845338', '9826903', '9835423', '9837567', '9836094'], 'TranslationSet': [], 'TranslationStack': [{'Term': 'biopython[All Fields]', 'Field': 'All Fields', 'Count': '2733', 'Explode': 'N'}, 'GROUP'], 'QueryTranslation': 'biopython[All Fields]'}


In [None]:
# Search the gene database for P53 in human, limiting the reply to 10 UIDs.
record = Entrez.read(
    Entrez.esearch(db="gene", term="P53 AND Homo Sapiens [organism]", retmax="10")
)
print(record)

{'Count': '2595', 'RetMax': '10', 'RetStart': '0', 'IdList': ['7157', '128385374', '1956', '348', '7124', '7422', '3569', '7040', '4524', '2064'], 'TranslationSet': [{'From': 'Homo Sapiens[organism]', 'To': '"Homo sapiens"[Organism]'}], 'TranslationStack': [{'Term': 'P53[All Fields]', 'Field': 'All Fields', 'Count': '24210', 'Explode': 'N'}, {'Term': '"Homo sapiens"[Organism]', 'Field': 'Organism', 'Count': '307665', 'Explode': 'Y'}, 'AND'], 'QueryTranslation': 'P53[All Fields] AND "Homo sapiens"[Organism]'}


In [None]:
# Display how many nucleotide records match the matK gene within Cypripedioideae.
record = Entrez.read(
    Entrez.esearch(db="nucleotide", term="Cypripedioideae[Orgn] AND matK[Gene]")
)
record["Count"]

'830'

## Esummary function
ESummary retrieves a brief summary, or Document Summary (DocSum), for each record. ESummary requires an input of which database to access and which records to retrieve, identified either by a list of UIDs passed directly in the request or by a set of UIDs stored on the Entrez History server.

In [None]:
# Search PMC for the term and keep the UIDs of the matching papers.
with Entrez.esearch(db='pmc', term='biopython') as search_handle:
  records = Entrez.read(search_handle)
papers = records['IdList']

# ESummary accepts a comma-separated list of UIDs, so the first ten document
# summaries are fetched in one request instead of one HTTP round trip per paper.
# Skip the request entirely when the search returned nothing.
if papers:
  with Entrez.esummary(db='pmc', id=','.join(papers[:10])) as summary_handle:
    summaries = Entrez.read(summary_handle)
  for summary in summaries:
    print(summary)


{'Item': [], 'Id': '9898917', 'PubDate': '2023 Feb 3', 'EPubDate': '2023 Feb 3', 'Source': 'BMC Biol', 'AuthorList': ['Sternke-Hoffmann R', 'Pauly T', 'Norrild RK', 'Hansen J', 'Tucholski F', 'Høie MH', 'Marcatili P', 'Dupré M', 'Duchateau M', 'Rey M', 'Malosse C', 'Metzger S', 'Boquoi A', 'Platten F', 'Egelhaaf SU', 'Chamot-Rooke J', 'Fenk R', 'Nagel-Steger L', 'Haas R', 'Buell AK'], 'Title': 'Widespread amyloidogenicity potential of multiple myeloma patient-derived immunoglobulin light chains', 'Volume': '21', 'Issue': '', 'Pages': '21', 'ArticleIds': {'pubmed': [], 'medline': [], 'pmid': '0', 'doi': '10.1186/s12915-022-01506-w', 'pmcid': 'PMC9898917'}, 'DOI': '10.1186/s12915-022-01506-w', 'FullJournalName': 'BMC Biology', 'SO': '2023 Feb 3;21:21'}
{'Item': [], 'Id': '9883684', 'PubDate': '2019 Aug 22', 'EPubDate': '2019 Aug 22', 'Source': 'Bioinformatics', 'AuthorList': ['Hugoson E', 'Lam WT', 'Guy L'], 'Title': 'miComplete: weighted quality evaluation of assembled microbial genomes

In [None]:
# Resolve the assembly accession to an Entrez UID (None when nothing matches).
assemblies = Entrez.read(Entrez.esearch(db='assembly', term='GCA_003713205.1'))
assembly = None
if assemblies['IdList']:
  assembly = assemblies['IdList'][0]
print(f'assembly: {assembly}')

if assembly:
  # The assembly DocSum is nested under DocumentSummarySet -> DocumentSummary.
  summary = Entrez.read(Entrez.esummary(db='assembly', id=assembly))
  summary_data = summary['DocumentSummarySet']['DocumentSummary'][0]
  shown_fields = ('Taxid', 'Organism', 'SpeciesTaxid', 'SpeciesName', 'AssemblyStatus')
  print(*(summary_data[field] for field in shown_fields))


assembly: 2070231
49369 Coffea eugenioides (eudicots) 49369 Coffea eugenioides Chromosome


## Efetch
Retrieves records in the requested format from a list of one or more primary IDs.
To perform the sequence analysis, you need to get the full GenBank record for each sequence. EFetch retrieves full records from Entrez databases. EFetch requires an input of a database and a list of UIDs.

Additionally, EFetch can return the output in different formats. You can specify which output format (i.e. GenBank (gb), FASTA) and file format (i.e. text, ASN.1, XML) you want through the rettype and retmode parameters, respectively.

https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/

In [None]:
from Bio import Entrez, SeqIO

# Search the nucleotide database for SARS-CoV-2 records annotated with gene S.
# Every handle is opened in a `with` block so it is closed once it has been read.
with Entrez.esearch(db="nucleotide",retmax="10",term="S [Gene Name] AND Sars-Cov_2 [organism]") as handle:
  search_result = Entrez.read(handle)
# Alternative query kept from the original notebook (adds a Refseq term):
#handle = Entrez.esearch(db="nucleotide",retmax="10",term="S [Gene Name] AND Sars-Cov_2 [organism] AND Refseq")
print(search_result)

if search_result["IdList"]:
  # get first
  first_id = search_result["IdList"][0]

  with Entrez.esummary(db='nucleotide', id=first_id) as handle:
    summary = Entrez.read(handle)[0]
  print(summary)

  # Fetch the full GenBank flat file for that UID and parse it. The search reply
  # and the parsed record now have distinct names instead of both being `record`.
  with Entrez.efetch(db="nucleotide", id=first_id, rettype="gb", retmode="text") as genbank_record:
    record = SeqIO.read(genbank_record, "genbank")
  print(record)
else:
  # Previously an empty result crashed with IndexError on IdList[0].
  print('No records found')


{'Count': '3337424', 'RetMax': '10', 'RetStart': '0', 'IdList': ['2438585312', '2438585299', '2438585287', '2438585274', '2438585261', '2438585249', '2438585236', '2438585223', '2438585210', '2438585198'], 'TranslationSet': [{'From': 'Sars-Cov_2[organism]', 'To': '"Severe acute respiratory syndrome coronavirus 2"[Organism]'}], 'TranslationStack': [{'Term': 'S[Gene Name]', 'Field': 'Gene Name', 'Count': '3398891', 'Explode': 'N'}, {'Term': '"Severe acute respiratory syndrome coronavirus 2"[Organism]', 'Field': 'Organism', 'Count': '6690456', 'Explode': 'Y'}, 'AND'], 'QueryTranslation': 'S[Gene Name] AND "Severe acute respiratory syndrome coronavirus 2"[Organism]'}
{'Item': [], 'Id': '2438585312', 'Caption': 'OQ365040', 'Title': 'Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/VA-CDC-LC0997448/2023 ORF1ab polyprotein (ORF1ab), ORF1a polyprotein (ORF1ab), surface glycoprotein (S), ORF3a protein (ORF3a), envelope protein (E), membrane glycoprotein (M), ORF6 pro

In [None]:
from Bio import Entrez, SeqIO
# idtype: Specifies the type of identifier to return for sequence databases (nuccore, popset, protein).
# By default, ESearch returns GI numbers in its output. If idtype is set to 'acc', ESearch will return accession.version identifiers rather than GI numbers.
with Entrez.esearch(db="nuccore",retmax="1", term="P53 AND Homo Sapiens [organism]", idtype='acc') as handle:
  search_result = Entrez.read(handle)
print(search_result)

if search_result["IdList"]:
  # `acc` is kept as a notebook-level name because the next cell fetches the
  # same accession in FASTA format.
  acc = search_result["IdList"][0]
  print('id', acc)
  # Use the accession extracted above and request the flat file explicitly as
  # text, consistent with the other efetch calls in this notebook.
  with Entrez.efetch(db="nucleotide", id=acc, rettype="gb", retmode="text") as handle:
    record = SeqIO.read(handle, "genbank")
  print(record)
else:
  # Previously an empty result crashed with IndexError on IdList[0].
  print('No records found')

{'Count': '7209', 'RetMax': '1', 'RetStart': '0', 'IdList': ['NG_033079.1'], 'TranslationSet': [{'From': 'Homo Sapiens[organism]', 'To': '"Homo sapiens"[Organism]'}], 'TranslationStack': [{'Term': 'P53[All Fields]', 'Field': 'All Fields', 'Count': '56999', 'Explode': 'N'}, {'Term': '"Homo sapiens"[Organism]', 'Field': 'Organism', 'Count': '28312156', 'Explode': 'Y'}, 'AND'], 'QueryTranslation': 'P53[All Fields] AND "Homo sapiens"[Organism]'}
id NG_033079.1
ID: NG_033079.1
Name: NG_033079
Description: Homo sapiens killin, p53 regulated DNA replication inhibitor (KLLN), RefSeqGene (LRG_1087) on chromosome 10
Number of features: 12
/molecule_type=DNA
/topology=linear
/data_file_division=PRI
/date=30-JAN-2023
/accessions=['NG_033079']
/sequence_version=1
/keywords=['RefSeq', 'RefSeqGene']
/source=Homo sapiens (human)
/organism=Homo sapiens
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplor

In [None]:
# Fetch the accession stored in `acc` (set by the previous cell) as FASTA and
# display the parsed SeqRecord.
fasta_handle = Entrez.efetch(db="nucleotide", id=acc, rettype="fasta")
fasta = SeqIO.read(fasta_handle, 'fasta')
fasta

SeqRecord(seq=Seq('TTTCTACAGAAAAAAAAAAATACATTTCTTAAGCTATGTTAATATTACATTTTC...GGT'), id='NG_033079.1', name='NG_033079.1', description='NG_033079.1 Homo sapiens killin, p53 regulated DNA replication inhibitor (KLLN), RefSeqGene (LRG_1087) on chromosome 10', dbxrefs=[])

In [None]:
from Bio import Entrez, SeqIO

# Download one GenBank record by its numeric UID, parse it, then print it.
# The handle is closed when the `with` block ends.
with Entrez.efetch(db="nucleotide", id="186972394",rettype="gb", retmode="text") as handle:
  record = SeqIO.read(handle, "genbank")
print(record)

ID: EU490707.1
Name: EU490707
Description: Selenipedium aequinoctiale maturase K (matK) gene, partial cds; chloroplast
Number of features: 3
/molecule_type=DNA
/topology=linear
/data_file_division=PLN
/date=26-JUL-2016
/accessions=['EU490707']
/sequence_version=1
/keywords=['']
/source=chloroplast Selenipedium aequinoctiale
/organism=Selenipedium aequinoctiale
/taxonomy=['Eukaryota', 'Viridiplantae', 'Streptophyta', 'Embryophyta', 'Tracheophyta', 'Spermatophyta', 'Magnoliopsida', 'Liliopsida', 'Asparagales', 'Orchidaceae', 'Cypripedioideae', 'Selenipedium']
/references=[Reference(title='Phylogenetic utility of ycf1 in orchids: a plastid gene more variable than matK', ...), Reference(title='Direct Submission', ...)]
Seq('ATTTTTTACGAACCTGTGGAAATTTTTGGTTATGACAATAAATCTAGTTTAGTA...GAA')


## NCBI Global Search
Busca global em todos os databases.




In [None]:
# Run the term against every Entrez database at once.
handle = Entrez.egquery(term="biopython")
record = Entrez.read(handle)
handle.close()

# Keep every database whose count is not the string '0'. As the recorded output
# shows, this also keeps rows whose Count is 'Error'.
reported_rows = [row for row in record["eGQueryResult"] if row["Count"] != '0']
for row in reported_rows:
  print(row)

{'DbName': 'pubmed', 'MenuName': 'PubMed', 'Count': '41', 'Status': 'Ok'}
{'DbName': 'pmc', 'MenuName': 'PubMed Central', 'Count': '2715', 'Status': 'Ok'}
{'DbName': 'books', 'MenuName': 'Books', 'Count': '2', 'Status': 'Ok'}
{'DbName': 'pubmedhealth', 'MenuName': 'PubMed Health', 'Count': 'Error', 'Status': 'Database Error'}
{'DbName': 'ncbisearch', 'MenuName': 'Site Search', 'Count': '1', 'Status': 'Ok'}
{'DbName': 'nuccore', 'MenuName': 'Nucleotide', 'Count': '2', 'Status': 'Ok'}
{'DbName': 'sra', 'MenuName': 'SRA', 'Count': '1377', 'Status': 'Ok'}
{'DbName': 'biosystems', 'MenuName': 'BioSystems', 'Count': 'Error', 'Status': 'Database Error'}
{'DbName': 'gds', 'MenuName': 'GEO DataSets', 'Count': '17', 'Status': 'Ok'}
{'DbName': 'bioproject', 'MenuName': 'BioProject', 'Count': '1', 'Status': 'Ok'}


# Writing and Converting Sequence File Formats

In [None]:
from Bio import SeqIO

# Rewrite a FASTA file in the 'fasta-2line' format (per the recorded output:
# one header line followed by the whole sequence on a single line).
file = '/content/HBB-human.fasta'
file2lines = '/content/HBB-human.fasta2line'

with open(file2lines, mode='w+') as two_line_out:
  SeqIO.convert(in_file=file, in_format='fasta', out_file=two_line_out, out_format='fasta-2line')

In [None]:
# Print the converted file to check the two-line layout.
!cat /content/HBB-human.fasta2line

>NM_000518.5 Homo sapiens hemoglobin subunit beta (HBB), mRNA
ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGCAA


In [None]:

# Convert every record of a GenBank file into two-line FASTA
# (header line + full sequence on one line).
file = '/content/HBB.gb'
file2lines = '/content/HBB.fasta2line'

source_format, target_format = 'gb', 'fasta-2line'
with open(file2lines, 'w+') as two_line_out:
  SeqIO.convert(file, source_format, two_line_out, target_format)

In [None]:
# Peek at the first lines of the GenBank input file.
!head /content/HBB.gb

LOCUS       NM_001168847             444 bp    mRNA    linear   PRI 27-APR-2020
DEFINITION  Papio anubis hemoglobin, beta (HBB), mRNA.
ACCESSION   NM_001168847
VERSION     NM_001168847.1
KEYWORDS    RefSeq.
SOURCE      Papio anubis (olive baboon)
  ORGANISM  Papio anubis
            Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
            Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
            Catarrhini; Cercopithecidae; Cercopithecinae; Papio.


In [None]:
# Peek at the first lines of the two-line FASTA file written from HBB.gb.
!head /content/HBB.fasta2line

>NM_001168847.1 Papio anubis hemoglobin, beta (HBB), mRNA
ATGGTGCATCTGACTCCTGAGGAGAAGAATGCCGTTACCGCCCTGTGGGGCAAAGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGTTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGATTCCTTTGGGGATCTGTCCTCTCCTGCTGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTTGGTGCCTTTAGTGATGGCCTGAATCACCTGGACAACCTCAAGGGCACCTTTGCCCAGCTCAGTGAGCTGCACTGTGACAAGCTGCATGTGGATCCTGAGAACTTCAAGCTCCTGGGCAACGTGCTGGTGTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCGCAAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTACCACTAA
>NM_033234.1 Rattus norvegicus hemoglobin subunit beta (Hbb), mRNA
TGCTTCTGACATAGTTGTGTTGACTCACAAACTCAGAAACAGACACCATGGTGCACCTGACTGATGCTGAGAAGGCTGCTGTTAATGGCCTGTGGGGAAAGGTGAACCCTGATGATGTTGGTGGCGAGGCCCTGGGCAGGCTGCTGGTTGTCTACCCTTGGACCCAGAGGTACTTTGATAGCTTTGGGGACCTGTCCTCTGCCTCTGCTATCATGGGTAACCCTAAGGTGAAGGCCCATGGCAAGAAGGTGATAAACGCCTTCAATGATGGCCTGAAACACTTGGACAACCTCAAGGGCACCTTTGCTCATCTGAGTGAACTCCACTGTGACAAGCTGCATGTGGATCCTGAGAACTTCAGGCTCCTGGGCAATATGATTGTGATTGTGTTGGGCCACCACCTGGGCAAGGAATTCACCCCCTGTGCACA

# Parse Fasta

In [None]:
from Bio import SeqIO

# Stream the records of the FASTA file; for each one show the header fields
# followed by the sequence itself.
multifasta = SeqIO.parse('/content/HBB-human.fasta', 'fasta')

for fasta in multifasta:
  header_line = f'id: {fasta.id}, name: {fasta.name}, description: {fasta.description}'
  print(header_line)
  print(fasta.seq)

id: NM_000518.5, name: NM_000518.5, description: NM_000518.5 Homo sapiens hemoglobin subunit beta (HBB), mRNA
ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGCAA


In [None]:
from Bio import SeqIO

# List the header fields of every record in the multi-record two-line FASTA file.
multifasta = SeqIO.parse('/content/HBB.fasta2line', 'fasta')

header_lines = (
    f'id: {fasta.id}, name: {fasta.name}, description: {fasta.description}'
    for fasta in multifasta
)
for header_line in header_lines:
  print(header_line)

id: NM_001168847.1, name: NM_001168847.1, description: NM_001168847.1 Papio anubis hemoglobin, beta (HBB), mRNA
id: NM_033234.1, name: NM_033234.1, description: NM_033234.1 Rattus norvegicus hemoglobin subunit beta (Hbb), mRNA
id: NM_000518.5, name: NM_000518.5, description: NM_000518.5 Homo sapiens hemoglobin subunit beta (HBB), mRNA
id: NM_001304110.1, name: NM_001304110.1, description: NM_001304110.1 Esox lucius hemoglobin subunit beta (LOC105010112), mRNA
id: NM_001303935.1, name: NM_001303935.1, description: NM_001303935.1 Esox lucius hemoglobin subunit beta (LOC105013372), mRNA
id: NM_001303868.1, name: NM_001303868.1, description: NM_001303868.1 Esox lucius hemoglobin, beta (LOC105028448), mRNA
id: NM_001303858.1, name: NM_001303858.1, description: NM_001303858.1 Esox lucius hemoglobin subunit beta (LOC105028446), mRNA
id: NM_001246752.1, name: NM_001246752.1, description: NM_001246752.1 Cricetulus griseus beta major globin chain (Hbb), mRNA
id: NM_001086273.2, name: NM_00108627

# Parse Genbank Files (Gb)

In [None]:
from Bio import SeqIO

# SeqIO.parse yields records lazily and can only be walked once, so materialize
# the records: the first one is inspected and all of them are listed below.
multigb = list(SeqIO.parse('/content/HBB.gb', 'gb'))

# Originally dir(gb) ran before the loop below had bound `gb`, so the cell only
# worked when a stale `gb` was left over in the kernel (NameError on a fresh
# run). Inspect the first parsed record instead.
if multigb:
  print(dir(multigb[0]))
print('\n')
for gb in multigb:
  print(f'id: {gb.id}, name: {gb.name}, organism: {gb.annotations.get("organism")} , taxonomy: {gb.annotations.get("taxonomy")}')

['__add__', '__bool__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_per_letter_annotations', '_seq', '_set_per_letter_annotations', '_set_seq', 'annotations', 'count', 'dbxrefs', 'description', 'features', 'format', 'id', 'islower', 'isupper', 'letter_annotations', 'lower', 'name', 'reverse_complement', 'seq', 'translate', 'upper']


id: NM_001168847.1, name: NM_001168847, organism: Papio anubis , taxonomy: ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Cercopithecidae', 'Cercopithecinae', 'Papio']
id: NM_033

In [None]:
# Genbank annotations
from Bio import SeqIO

multigb = SeqIO.parse('/content/HBB.gb', 'gb')

for gb in multigb:
  # Dump every annotation of the record as "key : value".
  for key in gb.annotations:
    print(f'{key} : {gb.annotations[key]}')

  print('--------------------------------------------------------------------')

  # For CDS and exon features, show the location and the slice of the sequence
  # between the feature's start and end coordinates.
  selected_features = [item for item in gb.features if item.type in ('CDS', 'exon')]
  for item in selected_features:
    start, end = item.location.start, item.location.end
    print(f'{item.type}, {item.location}')
    print(gb.seq[start:end])

  # Only the first record of the file is inspected.
  break

molecule_type : mRNA
topology : linear
data_file_division : PRI
date : 27-APR-2020
accessions : ['NM_001168847']
sequence_version : 1
keywords : ['RefSeq']
source : Papio anubis (olive baboon)
organism : Papio anubis
taxonomy : ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Cercopithecidae', 'Cercopithecinae', 'Papio']
references : [Reference(title='Strand symmetry around the beta-globin origin of replication in primates', ...)]
comment : INFERRED REFSEQ: This record is predicted by genome sequence
analysis and is not yet supported by experimental evidence. The
reference sequence was derived from DP000505.1.
structured_comment : defaultdict(<class 'dict'>, {'Evidence-Data': {'Transcript exon combination': 'EY275508.1, EY275701.1 [ECO:0000332]', 'RNAseq introns': 'single sample supports all introns SAMN02045698, SAMN02401340 [ECO:0000348]'}})
----------------------------

# Create a Fasta File

In [None]:
from Bio import SeqIO
from Bio.Seq import *
from Bio.SeqRecord import *

# Build a new SeqRecord from each GenBank record and print it as FASTA text.
# The GenBank `name` is used as the FASTA id (the recorded output shows the
# accession without its version suffix).
record = SeqIO.parse('/content/HBB-human.gb', 'genbank')

for item in record:
  sequence = Seq(str(item.seq))
  fasta = SeqRecord(sequence, id=item.name, description=item.description)
  print(format(fasta, 'fasta'))

>NM_000518 Homo sapiens hemoglobin subunit beta (HBB), mRNA
ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATC
TGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAG
TTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTG
AGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTC
ATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGG
GCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACT
TCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCA
CCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCC
ACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCC
CTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCC
TAATAAAAAACATTTATTTTCATTGCAA



# Sequence Analysis


### Central Dogma

In a DNA molecule, the two antiparallel strands (positive or 5’-> 3’ and negative or 3’-> 5’) are complementary to each other.

As per standard notation, we write the DNA sequence in 5’-> 3’ orientation. Therefore, to represent a sequence of the negative strand in 5’-> 3’ orientation, we need to take the reverse complement of the positive strand.
https://www.reneshbedre.com/blog/reverse-complementary.html

In [None]:
from Bio import SeqIO
from Bio.Seq import *

record = SeqIO.parse('/content/HBB-human.fasta', 'fasta')
for item in record:
  # Sequence as stored in the file, read 5'->3'.
  dna = str(item.seq)
  print(f'DNA {dna}')

  # Base-by-base complement, i.e. the opposite strand read 3'->5'.
  complementary_strand = complement(dna)
  print(f'Complementary strand {complementary_strand}')

  # Reversing the complement by hand puts the opposite strand in 5'->3' order...
  reversed_complementary_strand = complementary_strand[::-1]
  print(f'Reversed complementary strand {reversed_complementary_strand}')
  # ...which is what the library helper computes in a single call.
  reversed_complementary_strand_lib = reverse_complement(dna)
  print(f'Reversed complementary strand with lib {reversed_complementary_strand_lib}')

  # Transcription (DNA -> RNA) and the way back (RNA -> DNA).
  rna = transcribe(dna)
  print(f'RNA {rna}')
  reversed_rna = back_transcribe(rna)
  print(f'Reverse Transcription {reversed_rna}')

  # Translate the three forward reading frames (offsets 0, 1 and 2).
  # translate() uses the default genetic code table; to use another table
  # specify the param table. Example: table=2
  translation_first_frame, translation_second_frame, translation_third_frame = (
      translate(rna[offset:]) for offset in range(3)
  )
  print(f'Translation first frame {translation_first_frame}')
  print(f'Translation second frame {translation_second_frame}')
  print(f'Translation third frame {translation_third_frame}')

DNA ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGCAA
Complementary strand TGTAAACGAAGACTGTGTTGACACAAGTGATCGTTGGAGTTTGTCTGTGGTACCACGTAGACTGAGGACTCCTCTTCAGACGGCAATGACGGGACACCCCGTTCCACTTGCACCTACTTCAACCACCACTCCGGGACCCGTCCGACGACCACCAGATGGGAACCTGGGTCTCCAAGAAACTCAGGAAACCCCTAGACAGGTGAGGACTACGACAATACCCGTTGGGATTCCACTTCCGAGTACCGTTCTTTCACGAGCCACGGAAATCACTACCGGACCGAGTGGACCTGTTGGAGTTCCCGTGGAAACGGTGTGACTCACTCGACGTGACACTGTTCGACGTGCA

In [None]:
# https://biopython.org/docs/1.75/api/Bio.SeqUtils.html
from Bio import SeqIO
from Bio.SeqUtils import *
from Bio.Seq import *

record = SeqIO.parse('/content/HBB-human.fasta', 'fasta')
for item in record:
  # `dna` stays a notebook-level name: the following cell slices it again.
  dna = str(item.seq)
  rna = transcribe(dna)

  # Translate the hard-coded slice [50:494]; in the recorded output it yields a
  # protein that starts with M and ends at a stop codon.
  coding_region = dna[50:494]
  protein = translate(coding_region)
  print('Protein aminoacids 1 letter', protein)
  print('Protein aminoacids 3 letters', seq3(protein))

  # nt_search: Search for a DNA subseq in sequence, return list of [subseq, positions].
  # NOTE: despite the labels below, these two lines print hit positions, not counts.
  atg_hits = nt_search(dna, 'ATG')
  print('count start codon ATG', atg_hits)
  c_hits = nt_search(dna, 'C')
  print('C content', c_hits)

  dna_weight = molecular_weight(dna, 'DNA')
  print('Molecular weight', dna_weight)


Protein aminoacids 1 letter MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH*
Protein aminoacids 3 letters MetValHisLeuThrProGluGluLysSerAlaValThrAlaLeuTrpGlyLysValAsnValAspGluValGlyGlyGluAlaLeuGlyArgLeuLeuValValTyrProTrpThrGlnArgPhePheGluSerPheGlyAspLeuSerThrProAspAlaValMetGlyAsnProLysValLysAlaHisGlyLysLysValLeuGlyAlaPheSerAspGlyLeuAlaHisLeuAspAsnLeuLysGlyThrPheAlaThrLeuSerGluLeuHisCysAspLysLeuHisValAspProGluAsnPheArgLeuLeuGlyAsnValLeuValCysValLeuAlaHisHisPheGlyLysGluPheThrProProValGlnAlaAlaTyrGlnLysValValAlaGlyValAlaAsnAlaLeuAlaHisLysTyrHisTer
count start codon ATG ['ATG', 50, 114, 207, 215, 240, 270, 468, 570]
C content ['C', 1, 7, 10, 14, 16, 19, 26, 28, 32, 35, 36, 38, 42, 46, 48, 49, 56, 59, 63, 65, 66, 78, 81, 82, 87, 90, 91, 92, 100, 109, 132, 133, 134, 139, 143, 146, 154, 157, 158, 159, 165, 166, 167, 175, 183, 184, 194, 198, 199, 201, 203, 204, 210, 220, 223, 224, 225, 237, 239,

In [None]:
# Six-frame translation report for the same [50:494] slice used in the previous
# cell; relies on `dna` and the SeqUtils wildcard import from that cell.
six_frame_report = six_frame_translations(dna[50:494])
print('Six frame translations', six_frame_report)

Six frame translations GC_Frame: a:88 t:107 g:136 c:113
Sequence: atggtgcatc ... gtatcactaa, 444 nt, 56.08 %GC


1/1
  G  A  S  D  S  *  G  E  V  C  R  Y  C  P  V  G  Q  G  E  R
 W  C  I  *  L  L  R  R  S  L  P  L  L  P  C  G  A  R  *  T
M  V  H  L  T  P  E  E  K  S  A  V  T  A  L  W  G  K  V  N
atggtgcatctgactcctgaggagaagtctgccgttactgccctgtggggcaaggtgaac   56 %
taccacgtagactgaggactcctcttcagacggcaatgacgggacaccccgttccacttg
T  C  R  V  G  S  S  F  D  A  T  V  A  R  H  P  L  T  F  T
 H  H  M  Q  S  R  L  L  L  R  G  N  S  G  Q  P  A  L  H  V
  P  A  D  S  E  Q  P  S  T  Q  R  *  Q  G  T  P  C  P  S  R

61/21
  G  *  S  W  W  *  G  P  G  Q  A  A  G  G  L  P  L  D  P  E
 W  M  K  L  V  V  R  P  W  A  G  C  W  W  S  T  L  G  P  R
V  D  E  V  G  G  E  A  L  G  R  L  L  V  V  Y  P  W  T  Q
gtggatgaagttggtggtgaggccctgggcaggctgctggtggtctacccttggacccag   63 %
cacctacttcaaccaccactccgggacccgtccgacgaccaccagatgggaacctgggtc
S  S  T  P  P  S  A  R  P  L  S  S  T  T  *  G  Q  V  W  L
 H  I  F  N  T  T  



# Sequence Alignment

Pairwise sequence alignment compares two sequences at a time and reports the best possible alignment between them. It relies on a dynamic programming algorithm. Biopython provides the `Bio.pairwise2` module, which computes pairwise alignments this way and returns the best-scoring alignments under the chosen scoring scheme.

https://biopython.org/docs/1.75/api/Bio.pairwise2.html

https://www.geeksforgeeks.org/biopython-pairwise-alignment/ 

https://rob-p.github.io/CSE549F17/lectures/Lec08.pdf

https://qcbsciprolab2019.readthedocs.io/en/latest/docs/Practical10.pdf

### Global

Compara sequências bastante similares, com algumas divergências.

In [None]:
from Bio import SeqIO
from Bio.SeqUtils import *
from Bio.Seq import *
from Bio import pairwise2
from Bio.pairwise2 import *

human = ''
rat = ''

# Keep the sequence of the last record found in each file (empty string if none).
for record in SeqIO.parse('/content/HBB-human.fasta', 'fasta'):
  human = str(record.seq)
for record in SeqIO.parse('/content/HBB-rat.fasta', 'fasta'):
  rat = str(record.seq)

print(human)
print(rat)
#print(help(pairwise2))
global_aligment = pairwise2.align.globalxx(human, rat) # returns a list with all comparisons
print(len(global_aligment))
print(global_aligment[0])

# The original cell printed alignment number 431 unconditionally, which raises
# IndexError whenever fewer alignments are returned. Print it only if it exists.
sample_index = 431
if len(global_aligment) > sample_index:
  print(global_aligment[sample_index])

print(format_alignment(*global_aligment[0]))


ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGCAA
TGCTTCTGACATAGTTGTGTTGACTCACAAACTCAGAAACAGACACCATGGTGCACCTGACTGATGCTGAGAAGGCTGCTGTTAATGGCCTGTGGGGAAAGGTGAACCCTGATGATGTTGGTGGCGAGGCCCTGGGCAGGCTGCTGGTTGTCTACCCTTGGACCCAGAGGTACTTTGATAGCTTTGGGGACCTGTCCTCTGCCTCTGCTATCATGGGTAACCCTAAGGTGAAGGCCCATGGCAAGAAGGTGATAAACGCCTTCAATGATGGCCTGAAACACTTGGACAACCTCAAGGGCACCTTTGCTCATCTGAGTGAACTCCACTGTGACAAGCTGCATGTGGATCCTGAGAACTTCAGGCTCCTGGGC

### Local

Compara sequências bastante divergentes que apresentam algumas regiões de similaridade.

In [None]:
from Bio import SeqIO
from Bio.SeqUtils import *
from Bio.Seq import *
from Bio import pairwise2
from Bio.pairwise2 import *

human = ''
rat = ''

# Keep the sequence of the last record found in each file (empty string if none).
for record in SeqIO.parse('/content/HBB-human.fasta', 'fasta'):
  human = str(record.seq)
for record in SeqIO.parse('/content/HBB-rat.fasta', 'fasta'):
  rat = str(record.seq)

# The function localxx aligns the sequences with no gap penalties, but if you do want to penalize gaps use localxs instead.

print(human)
print(rat)
#print(help(pairwise2.align.localxx))
local_aligment = pairwise2.align.localxx(human, rat) # returns a list with all comparisons
print(len(local_aligment))
print(local_aligment[0])
# Fixed: this used to format global_aligment[0] left over from the previous
# cell, so the local result was never shown (and a fresh kernel hit NameError).
print(format_alignment(*local_aligment[0]))

# Extended gap
#print(help(pairwise2.align.localxs))
# localxs takes gap-open and gap-extend penalties as its last two arguments.
local_aligment = pairwise2.align.localxs(human, rat, -10, -0.1) # returns a list with all comparisons
print(len(local_aligment))
print(local_aligment[0])
# Fixed: same copy-paste slip as above.
print(format_alignment(*local_aligment[0]))

ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGCAA
TGCTTCTGACATAGTTGTGTTGACTCACAAACTCAGAAACAGACACCATGGTGCACCTGACTGATGCTGAGAAGGCTGCTGTTAATGGCCTGTGGGGAAAGGTGAACCCTGATGATGTTGGTGGCGAGGCCCTGGGCAGGCTGCTGGTTGTCTACCCTTGGACCCAGAGGTACTTTGATAGCTTTGGGGACCTGTCCTCTGCCTCTGCTATCATGGGTAACCCTAAGGTGAAGGCCCATGGCAAGAAGGTGATAAACGCCTTCAATGATGGCCTGAAACACTTGGACAACCTCAAGGGCACCTTTGCTCATCTGAGTGAACTCCACTGTGACAAGCTGCATGTGGATCCTGAGAACTTCAGGCTCCTGGGC

In [None]:
# Toy example: align a short query against a longer target, first without
# gap penalties (localxx) and then with open/extend penalties of -1 (localxs).
seq_a = "PREFIX"
seq_b = "PREFIXPART_LOOONGSUFFIXPART"

local_no_penalties = pairwise2.align.localxx(seq_a, seq_b)
local_with_penalties = pairwise2.align.localxs(seq_a, seq_b, -1, -1)

# Print every alignment found, unpenalized results first.
for alignments in (local_no_penalties, local_with_penalties):
  for res in alignments:
    print(res)

Alignment(seqA='PREFI-----------------X----', seqB='PREFIXPART_LOOONGSUFFIXPART', score=6.0, start=0, end=23)
Alignment(seqA='PREFIX---------------------', seqB='PREFIXPART_LOOONGSUFFIXPART', score=6.0, start=0, end=6)


### Multiple

#### Clustal

In [None]:
# Download the Clustal Omega 1.2.4 source tarball into the current directory
!wget http://www.clustal.org/omega/clustal-omega-1.2.4.tar.gz

--2023-02-21 22:12:25--  http://www.clustal.org/omega/clustal-omega-1.2.4.tar.gz
Resolving www.clustal.org (www.clustal.org)... 137.43.93.206
Connecting to www.clustal.org (www.clustal.org)|137.43.93.206|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1170516 (1.1M) [application/x-gzip]
Saving to: ‘clustal-omega-1.2.4.tar.gz’


2023-02-21 22:12:26 (2.07 MB/s) - ‘clustal-omega-1.2.4.tar.gz’ saved [1170516/1170516]



In [None]:
# Unpack the Clustal Omega source tarball
!tar -xf clustal-omega-1.2.4.tar.gz

In [None]:
# Clustal dependency: download the argtable2 source tarball
!wget http://prdownloads.sourceforge.net/argtable/argtable2-13.tar.gz

--2023-02-21 22:20:30--  http://prdownloads.sourceforge.net/argtable/argtable2-13.tar.gz
Resolving prdownloads.sourceforge.net (prdownloads.sourceforge.net)... 204.68.111.105
Connecting to prdownloads.sourceforge.net (prdownloads.sourceforge.net)|204.68.111.105|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.sourceforge.net/project/argtable/argtable/argtable-2.13/argtable2-13.tar.gz [following]
--2023-02-21 22:20:31--  http://downloads.sourceforge.net/project/argtable/argtable/argtable-2.13/argtable2-13.tar.gz
Resolving downloads.sourceforge.net (downloads.sourceforge.net)... 204.68.111.105
Reusing existing connection to prdownloads.sourceforge.net:80.
HTTP request sent, awaiting response... 302 Found
Location: http://pilotfiber.dl.sourceforge.net/project/argtable/argtable/argtable-2.13/argtable2-13.tar.gz [following]
--2023-02-21 22:20:31--  http://pilotfiber.dl.sourceforge.net/project/argtable/argtable/argtable-2.13/argtable2

In [None]:
# Unpack the argtable2 source tarball
!tar -xf argtable2-13.tar.gz

In [None]:
%%bash
# Build and install the argtable2 library (the Clustal dependency) from source.
cd argtable2-13
./configure
make
sudo make install
# Remove the build artifacts from the source tree once installed.
make clean

checking build system type... x86_64-unknown-linux-gnu
checking host system type... x86_64-unknown-linux-gnu
checking target system type... x86_64-unknown-linux-gnu
checking for a BSD-compatible install... /usr/bin/install -c
checking whether build environment is sane... yes
checking for a thread-safe mkdir -p... /usr/bin/mkdir -p
checking for gawk... no
checking for mawk... mawk
checking whether make sets $(MAKE)... yes
checking for gcc... gcc
checking whether the C compiler works... yes
checking for C compiler default output file name... a.out
checking for suffix of executables... 
checking whether we are cross compiling... no
checking for suffix of object files... o
checking whether we are using the GNU C compiler... yes
checking whether gcc accepts -g... yes
checking for gcc option to accept ISO C89... none needed
checking for style of include used by make... GNU
checking dependency style of gcc... gcc3
checking for a sed that does not truncate output... /usr/bin/sed
checking for g

arg_int.c: In function ‘strtol0X’:
   60 |     while (isspace(*ptr))
      |            ^~~~~~~
   89 |    if (toupper(*ptr++)!=toupper(X))
      |        ^~~~~~~
ar: `u' modifier ignored since `D' is the default (see `U')


In [None]:
%%bash
# Build and install Clustal Omega from the unpacked source tree.

cd clustal-omega-1.2.4
./configure
make
sudo make install

configure:

    Welcome to Clustal Omega - version 1.2.4 (AndreaGiacomo)

                       +NMMMMMMMMMS=
                    MMMMM?      :MMMMM8
                  IMMMMS           MMMMM,
                ~MMMMN              ,MMMMM
               =MMMMM                 MMMMM
              ?MMMMM                  +MMMMM
              MMMMMS                   MMMMM
              MMMMM                    MMMMM:
              MMMMM                    MMMMM+
              MMMMM                    MMMMM~
              MMMMM                    MMMMM
              MMMMMZ                   MMMMM
               MMMMM                  ,MMMMZ
                MMMM8                 MMMM+
                 MMMMS               MMMM,
                   MMMMO           MMMM
            ,N       SMMM        MMM8       +8
            ,M7         M+       M+        :M8
            ,MMMMMMMMMMMMI       MMMMMMMMMMMM8
            ,MMMMMMMMMMMM8       MMMMMMMMMMMM8
            ,MMMMMMMMMMMMN       MMMMMMMMM

ar: `u' modifier ignored since `D' is the default (see `U')
ar: `u' modifier ignored since `D' is the default (see `U')
ar: `u' modifier ignored since `D' is the default (see `U')
ar: `u' modifier ignored since `D' is the default (see `U')
ar: `u' modifier ignored since `D' is the default (see `U')
/usr/bin/ld: ./.libs/libclustalo.a(clustal-omega.o): in function `AlnToHHMFile':


In [None]:
# Alternative is to use the binary instead of install through the source code.
#!wget http://www.clustal.org/omega/clustalo-1.2.4-Ubuntu-x86_64
#!sudo chmod +x /content/clustalo-1.2.4-Ubuntu-x86_64
!./clustalo-1.2.4-Ubuntu-x86_64 --help

Clustal Omega - 1.2.4 (AndreaGiacomo)

If you like Clustal-Omega please cite:
 Sievers F, Wilm A, Dineen D, Gibson TJ, Karplus K, Li W, Lopez R, McWilliam H, Remmert M, Söding J, Thompson JD, Higgins DG.
 Fast, scalable generation of high-quality protein multiple sequence alignments using Clustal Omega.
 Mol Syst Biol. 2011 Oct 11;7:539. doi: 10.1038/msb.2011.75. PMID: 21988835.
If you don't like Clustal-Omega, please let us know why (and cite us anyway).

Check http://www.clustal.org for more information and updates.

Usage: clustalo-1.2.4-Ubuntu-x86_64 [-hv] [-i {<file>,-}] [--hmm-in=<file>]... [--hmm-batch=<file>] [--dealign] [--profile1=<file>] [--profile2=<file>] [--is-profile] [-t {Protein, RNA, DNA}] [--infmt={a2m=fa[sta],clu[stal],msf,phy[lip],selex,st[ockholm],vie[nna]}] [--distmat-in=<file>] [--distmat-out=<file>] [--guidetree-in=<file>] [--guidetree-out=<file>] [--pileup] [--full] [--full-iter] [--cluster-size=<n>] [--clustering-out=<file>] [--trans=<n>] [--posterior-out=<fi

Usage: clustalo-1.2.4-Ubuntu-x86_64 [-hv] [-i {<file>,-}] [--hmm-in=<file>]... [--hmm-batch=<file>] [--dealign] [--profile1=<file>] [--profile2=<file>] [--is-profile] [-t {Protein, RNA, DNA}] [--infmt={a2m=fa[sta],clu[stal],msf,phy[lip],selex,st[ockholm],vie[nna]}] [--distmat-in=<file>] [--distmat-out=<file>] [--guidetree-in=<file>] [--guidetree-out=<file>] [--pileup] [--full] [--full-iter] [--cluster-size=<n>] [--clustering-out=<file>] [--trans=<n>] [--posterior-out=<file>] [--use-kimura] [--percent-id] [-o {file,-}] [--outfmt={a2m=fa[sta],clu[stal],msf,phy[lip],selex,st[ockholm],vie[nna]}] [--residuenumber] [--wrap=<n>] [--output-order={input-order,tree-order}] [--iterations=<n>] [--max-guidetree-iterations=<n>] [--max-hmm-iterations=<n>] [--maxnumseq=<n>] [--maxseqlen=<l>] [--auto] [--threads=<n>] [--pseudo=<file>] [-l <file>] [--version] [--long-version] [--force] [--MAC-RAM=<n>]

In [None]:
from Bio.Align.Applications import ClustalwCommandline
# help() prints the documentation itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line.
help(ClustalwCommandline)

Help on class ClustalwCommandline in module Bio.Align.Applications._Clustalw:

class ClustalwCommandline(Bio.Application.AbstractCommandline)
 |  ClustalwCommandline(cmd='clustalw', **kwargs)
 |  
 |  Command line wrapper for clustalw (version one or two).
 |  
 |  http://www.clustal.org/
 |  
 |  Notes
 |  -----
 |  Last checked against versions: 1.83 and 2.1
 |  
 |  References
 |  ----------
 |  Larkin MA, Blackshields G, Brown NP, Chenna R, McGettigan PA,
 |  McWilliam H, Valentin F, Wallace IM, Wilm A, Lopez R, Thompson JD,
 |  Gibson TJ, Higgins DG. (2007). Clustal W and Clustal X version 2.0.
 |  Bioinformatics, 23, 2947-2948.
 |  
 |  Examples
 |  --------
 |  >>> from Bio.Align.Applications import ClustalwCommandline
 |  >>> in_file = "unaligned.fasta"
 |  >>> clustalw_cline = ClustalwCommandline("clustalw2", infile=in_file)
 |  >>> print(clustalw_cline)
 |  clustalw2 -infile=unaligned.fasta
 |  
 |  You would typically run the command line with clustalw_cline() or via
 |  the

In [None]:
from Bio.Align.Applications import ClustalwCommandline

# Options for the run: input FASTA, dendrogram (.dnd) output showing the
# clustering used to build the alignment, sequence type, and the aligned
# FASTA output file.
clustal_options = {
  'infile': '/content/HBB.fasta',
  'newtree': 'clustal.dnd',
  'type': 'DNA',
  'output': 'FASTA',
  'outfile': 'HBB.aligned.fasta',
}

# NOTE: the executable is the Clustal Omega binary, driven here through
# Biopython's ClustalW command-line wrapper.
cmd = ClustalwCommandline('/content/clustalo-1.2.4-Ubuntu-x86_64', **clustal_options)

# Calling the command object runs the program and returns (stdout, stderr).
std_out, std_err = cmd()
for stream in (std_out, std_err):
  print(stream)

Using 2 threads
Read 24 sequences (type: DNA) from /content/HBB.fasta
not more sequences (24) than cluster-size (100), turn off mBed
Calculating pairwise ktuple-distances...
Ktuple-distance calculation progress: 0 % (0 out of 300)
Ktuple-distance calculation progress: 12 % (39 out of 300)
Ktuple-distance calculation progress: 13 % (40 out of 300)
Ktuple-distance calculation progress: 22 % (67 out of 300)
Ktuple-distance calculation progress: 29 % (88 out of 300)
Ktuple-distance calculation progress: 31 % (93 out of 300)
Ktuple-distance calculation progress: 37 % (111 out of 300)
Ktuple-distance calculation progress: 44 % (134 out of 300)
Ktuple-distance calculation progress: 46 % (139 out of 300)
Ktuple-distance calculation progress: 51 % (155 out of 300)
Ktuple-distance calculation progress: 57 % (174 out of 300)
Ktuple-distance calculation progress: 60 % (182 out of 300)
Ktuple-distance calculation progress: 68 % (205 out of 300)
Ktuple-distance calculation progress: 72 % (216 out of

In [None]:
# Alignments: count the lines of the aligned FASTA and preview its first 45 lines
!wc -l /content/HBB.aligned.fasta
!head -n45 /content/HBB.aligned.fasta

648 /content/HBB.aligned.fasta
>NM_001168847.1 Papio anubis hemoglobin, beta (HBB), mRNA
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
----------------------------ATGGTGCATCTGACTCCTGAGGAGAAGAATGC
CGTTACCGCCCTGTGGGGCAAAGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAG
GTTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGATTCCTTTGGGGATCTGTCCTC
TCCTGCTGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTTGGTGC
CTTTAGTGATGGCCTGAATCACCTGGACAACCTCAAGGGCACCTTTGCCCAGCTCAGTGA
GCTGCACTGTGACAAGCTGCATGTGGATCCTGAGAACTTCAAGCTCCTGGGCAACGTGCT
GGTGTGTGTGCTGGCCCATCACTTTGGCAAA---GAATTCACCCCGCAAGTGCAGGCTGC
CTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTACCACTAA-----
------------------------------------------------------------
------------------------------------------------------------
---------------------------------------------------------

In [None]:
# Dendrogram: show the tree file (clustal.dnd) produced by the Clustal run
!cat /content/clustal.dnd

(
(
NM_001086273.2:0.26111
,
(
NM_001246752.1:0.152875
,
(
NM_001279263.1:0.142892
,
(
NM_033234.1:0.135371
,
(
(
(
NM_001314043.1:0
,
NM_001082260.3:0
):0.0960043
,
(
NM_001164018.1:0.0683066
,
(
NM_001304885.1:0.0795199
,
(
NM_001304883.1:0.0793379
,
(
NM_000518.5:0.0228062
,
(
NM_001329918.1:0.0114199
,
(
NM_001168847.1:0.00675676
,
(
NM_001283367.1:0.00424628
,
NM_001164428.1:0.00424628
):0.00251047
):0.0046631
):0.0113864
):0.0565317
):0.00018198
):-0.0112133
):0.0276977
):-0.00375915
,
(
NM_001144841.1:0.0791354
,
(
NM_001097648.1:0.033105
,
NM_173917.2:0.033105
):0.0460303
):0.0131098
):0.0431257
):0.0075209
):0.0099837
):0.108235
):0.0174499
,
(
NM_001201019.1:0.245115
,
(
NM_131020.3:0.20879
,
(
NM_001304110.1:0.182458
,
(
NM_001123666.1:0.135102
,
(
NM_001303858.1:0.113895
,
(
NM_001303935.1:0.0906149
,
NM_001303868.1:0.0906149
):0.0232801
):0.0212071
):0.0473559
):0.0263323
):0.0363242
):0.0334455
)
;


#### Muscle

In [None]:
# MUSCLE é frequentemente utilizado como um substituto para o Clustal

#from Bio.Align.Applications import MuscleCommandline
#muscle_exe = r"C:\Program Files\Aligments\muscle3.8.31_i86win32.exe"
#in_file = r"C:\My Documents\unaligned.fasta"
#out_file = r"C:\My Documents\aligned.fasta"
#muscle_cline = MuscleCommandline(muscle_exe, input=in_file, out=out_file)
#print(muscle_cline)

# Phylogenetic Trees

### Reading a Phylip file

PHYLogeny Inference Package (PHYLIP) is a free computational phylogenetics package of programs for inferring evolutionary trees (phylogenies).

In [None]:
from Bio import AlignIO

filename = '/content/HBB.phy'
# Pass the path straight to AlignIO.read so Biopython opens and closes the
# file itself, instead of leaving a handle from open() dangling.
read_file = AlignIO.read(filename, 'phylip')
print(read_file)

# Show only the first record of the alignment.
for seq in read_file:
  print(seq)
  break

Alignment with 24 rows and 1511 columns
--------------------------------------------...--- NM_033234.
--------------------------------------------...--- NM_0012467
--------------------------------------------...--- NM_0012833
--------------------------------------------...--- NM_0013299
--------------------------------------------...--- NM_000518.
--------------------------------------------...--- NM_0011644
--------------------------------------------...--- NM_0011688
ACTGCTGGTATGGGTCTGGGAGATACATAGAAGGAAGGCTGAGT...--- NM_0013140
ACTGCTGGTATGGGTCTGGGAGATACATAGAAGGAAGGCTGAGT...--- NM_0010822
--------------------------------------------...--- NM_0010976
--------------------------------------------...--- NM_173917.
--------------------------------------------...--- NM_0011448
--------------------------------------------...--- NM_0012792
--------------------------------------------...--- NM_0013048
--------------------------------------------...--- NM_0013048
------------------------------

In [None]:
from Bio import Phylo

# The Newick tree format represents graph-theoretical trees with edge lengths
# using parentheses and commas.
newick_path = '/content/clustal.dnd'
read_tree = Phylo.read(newick_path, 'newick')

# Textual dump of the tree, two blank lines, then the ASCII-art rendering.
print(read_tree, end='\n\n\n')
Phylo.draw_ascii(read_tree)

Tree(rooted=False, weight=1.0)
    Clade()
        Clade(branch_length=0.0174499)
            Clade(branch_length=0.26111, name='NM_001086273.2')
            Clade(branch_length=0.108235)
                Clade(branch_length=0.152875, name='NM_001246752.1')
                Clade(branch_length=0.0099837)
                    Clade(branch_length=0.142892, name='NM_001279263.1')
                    Clade(branch_length=0.0075209)
                        Clade(branch_length=0.135371, name='NM_033234.1')
                        Clade(branch_length=0.0431257)
                            Clade(branch_length=-0.00375915)
                                Clade(branch_length=0.0960043)
                                    Clade(branch_length=0.0, name='NM_001314043.1')
                                    Clade(branch_length=0.0, name='NM_001082260.3')
                                Clade(branch_length=0.0276977)
                                    Clade(branch_length=0.0683066, name='NM_001164018.1'

# Blast

https://biopython.org/docs/1.75/api/Bio.Blast.NCBIWWW.html

Veja também: http://ncbi.github.io/blast-cloud/dev/api.html

https://diegomariano.com/blast/

In [12]:
from Bio import SeqIO
from Bio.Blast import NCBIWWW

record = SeqIO.read("/content/HBB-human.fasta", "fasta")

# Submit the human HBB sequence to NCBI's online BLAST (blastn against the
# `nt` database) with a strict E-value threshold. This is a network call.
command = NCBIWWW.qblast(program='blastn',
                         database='nt',
                         expect=2e-95,
                         sequence=record.seq)

# Save the raw result (XML) so later cells can parse it without querying NCBI
# again. The file is only written here, so plain 'w' mode is enough.
with open('blast_results.txt', 'w') as w:
  w.write(command.read())
# Release the result handle once its content is on disk.
command.close()

In [6]:
# Preview the first 50 lines of the saved BLAST output
!head -n50 blast_results.txt

<?xml version="1.0"?>
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">
<BlastOutput>
  <BlastOutput_program>blastn</BlastOutput_program>
  <BlastOutput_version>BLASTN 2.13.0+</BlastOutput_version>
  <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&amp;auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>
  <BlastOutput_db>nt</BlastOutput_db>
  <BlastOutput_query-ID>Query_37727</BlastOutput_query-ID>
  <BlastOutput_query-def>No definition line</BlastOutput_query-def>
  <BlastOutput_query-len>628</BlastOutput_query-len>
  <BlastOutput_param>
    <Parameters>
      <Parameters_expect>10</Parameters_expect>
      <Parameters_sc-match>2</Parameters_sc-match>
      <Parameters_sc-mismatch>-3</Parameters_sc-mismatch>
      

In [None]:
from Bio.Blast import NCBIXML
# Parse the saved BLAST XML; the context manager closes the file after parsing.
with open('/content/blast_results.txt') as blast_xml:
  record = NCBIXML.read(blast_xml)

# List the attributes available on a description entry.
print(dir(record.descriptions[0]))

# One summary per hit description.
for ind, align in enumerate(record.descriptions):
  print('Alignment', ind)
  print('Accession', align.accession)
  print('Title', align.title)
  print('Score', align.score)
  print('E-value', align.e)
  print('Num aligments', align.num_alignments)
  print('\n')

In [16]:
from Bio.Blast import NCBIXML
# Parse the saved BLAST XML; the context manager closes the file after parsing.
with open('/content/blast_results.txt') as blast_xml:
  record = NCBIXML.read(blast_xml)

# List the attributes available on a description entry.
print(dir(record.descriptions[0]))

# Detail the first alignment only (the loop stops after one iteration).
for ind, align in enumerate(record.alignments):
  print('Alignment', ind)
  print('Title', align.title)
  print('Length', align.length)
  # HSPs (High-scoring Segment Pairs) are the high-scoring local alignments reported for this hit.
  print('HSPs', align.hsps)
  for hsp in align.hsps:
    print('HSP score, bits, evalue, identities, gaps, strand, frame', hsp.score,  hsp.bits, hsp.expect, hsp.identities, hsp.gaps, hsp.strand, hsp.frame)
    print('HSP Query start-end', hsp.query_start, hsp.query_end)
    print('HSP Subject start-end', hsp.sbjct_start, hsp.sbjct_end)
    print('HSP Alignment length', hsp.align_length)
    # Query line, match line and subject line of the aligned segment.
    print(hsp.query[:])
    print(hsp.match[:])
    print(hsp.sbjct[:])
  print('\n')
  break

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'accession', 'bits', 'e', 'num_alignments', 'score', 'title']
Alignment 0
Title gi|1401724401|ref|NM_000518.5| Homo sapiens hemoglobin subunit beta (HBB), mRNA
Length 628
HSPs [<Bio.Blast.Record.HSP object at 0x7f0c7324a0a0>]
HSP score, bits, evalue, identities, gaps, strand, frame 1256.0 1133.8 0.0 628 0 ('Plus', 'Plus') (1, 1)
HSP Query start-end 1 628
HSP Subject start-end 1 628
HSP Alignment length 628
ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGC