In [18]:
# Import de módulos e bibliotecas

from Bio import Entrez
from Bio import SeqIO

# Análise de sequências e features no NCBI

Funções para aceder, extrair e guardar informação de sequências do NCBI, nomeadamente as suas annotations, features e qualifiers.

In [19]:
# Contact address attached to the Entrez requests made below; NCBI asks callers
# to identify themselves, so replace this placeholder with a real address.
Entrez.email = "...@gmail.com" 
def access_ncbi_seq(accession : str, database : str, format : str, filename : str):
    """
    Fetch one sequence record from NCBI, save it to disk and return it.

    Parameters
    ----------
    accession : str
        NCBI identifier of the sequence; passed to ``Entrez.efetch`` as ``id``.
    database : str
        Entrez database name (e.g. 'nucleotide' or 'protein'); passed to
        ``Entrez.efetch`` as ``db`` without modification.
    format : str
        Desired record format: 'gb' / 'genbank' for GenBank, or
        'fasta' / 'faa' for FASTA (case-insensitive).
    filename : str
        Path of the file the record is written to.

    Returns
    -------
    The record obtained by reading ``filename`` back with ``SeqIO.read``.

    Raises
    ------
    ValueError
        If ``format`` is not one of the accepted names.
    """
    # 'faa' is a file extension rather than a format name understood by
    # efetch/SeqIO, so every accepted spelling is mapped to the canonical name
    # that is used both as the efetch rettype and as the SeqIO format.
    format_aliases = {"gb": "gb", "genbank": "gb", "fasta": "fasta", "faa": "fasta"}
    seq_format = format_aliases.get(str(format).lower())
    if seq_format is None:
        raise ValueError(
            f"Not accepted format {format!r}: only accepts 'gb' or 'genbank' or 'faa' or 'fasta'"
        )

    handle = Entrez.efetch(db=database, id=accession, rettype=seq_format, retmode="text")
    try:
        record = SeqIO.read(handle, seq_format)
    finally:
        # Close the network handle even when parsing fails.
        handle.close()

    SeqIO.write(record, filename, seq_format)

    # Read the file back so the returned record reflects what was saved.
    return SeqIO.read(filename, seq_format)


In [21]:
def seq_annotations(record):
    """
    Print the record-level annotations of a sequence record.

    ``record`` is a sequence record such as the one returned by
    ``access_ncbi_seq``. The id, name, description, sequence length and the
    general annotations are printed; external database references are printed
    only when the record has any. Nothing is returned.
    """
    def summary_rows():
        # Yield lazily so each attribute is read just before it is printed.
        yield "ID: ", record.id
        yield "Name:", record.name
        yield "Description: ", record.description
        yield "Sequence length: ", len(record)
        yield "General annotations:\n", record.annotations

    for label, value in summary_rows():
        print(label, value)

    cross_refs = record.dbxrefs
    if len(cross_refs) > 0:
        print("External Databases References:", cross_refs)
In [23]:
def seq_features_qualifiers(record):
    """
    Print the features of a sequence record and their qualifiers.

    ``record`` is a sequence record such as the one returned by
    ``access_ncbi_seq``. The output has three parts: the number of features,
    the type and location of every feature, and then the qualifiers of every
    feature in the same order. Nothing is returned.
    """
    features = record.features

    print(len(features), "features\n")

    print("Type and Location:")
    for feat in features:
        print(feat.type, feat.location)

    print("\nQualifiers:")
    for feat in features:
        print(feat.qualifiers)

Análise das sequências de mRNA

Anotações das sequências de mRNA

In [22]:
# Fetch record 1732746205 from the nucleotide database in GenBank format,
# save it to the file "flg_mrna" and print its annotations.
seq_annotations(access_ncbi_seq("1732746205", "nucleotide", "gb", "flg_mrna")) # annotations flg_mrna

ID:  NM_002016.2
Name: NM_002016
Description:  Homo sapiens filaggrin (FLG), mRNA
Sequence length:  12793
General annotations:
 {'molecule_type': 'mRNA', 'topology': 'linear', 'data_file_division': 'PRI', 'date': '18-NOV-2023', 'accessions': ['NM_002016'], 'sequence_version': 2, 'keywords': ['RefSeq', 'MANE Select'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'references': [Reference(title='Filaggrin gene polymorphisms in Indian children with atopic dermatitis: A cross-sectional multicentre study', ...), Reference(title='Filaggrin gene variants among Saudi patients with ichthyosis vulgaris', ...), Reference(title='Deep palmar phenotyping in atopic eczema: patterns associated with filaggrin variants, disease severity and barrier function in a South Asian population', ...), 

In [None]:
#seq_annotations(access_ncbi_seq("", "nucleotide", "gb", "_mrna")) # annotations 

In [None]:
#seq_annotations(access_ncbi_seq("", "nucleotide", "gb", "_mrna")) # annotations

In [None]:
#seq_annotations(access_ncbi_seq("", "nucleotide", "gb", "_mrna")) # annotations 

Features e qualifiers das sequências de mRNA

In [24]:
# Fetch record 1732746205 again (same database, format and output file as in
# the annotations cell) and print its features and their qualifiers.
seq_features_qualifiers(access_ncbi_seq("1732746205", "nucleotide", "gb", "flg_mrna"))

10 features

Type and Location:
source [0:12793](+)
gene [0:12793](+)
exon [0:51](+)
exon [51:210](+)
CDS [72:12258](+)
misc_feature [345:708](+)
misc_feature [834:11985](+)
exon [210:12793](+)
regulatory [12772:12778](+)
polyA_site [12792:12793](+)

Qualifiers:
OrderedDict([('organism', ['Homo sapiens']), ('mol_type', ['mRNA']), ('db_xref', ['taxon:9606']), ('chromosome', ['1']), ('map', ['1q21.3'])])
OrderedDict([('gene', ['FLG']), ('gene_synonym', ['ATOD2; FLG-1; FLG1']), ('note', ['filaggrin']), ('db_xref', ['GeneID:2312', 'HGNC:HGNC:3748', 'MIM:135940'])])
OrderedDict([('gene', ['FLG']), ('gene_synonym', ['ATOD2; FLG-1; FLG1']), ('inference', ['alignment:Splign:2.1.0'])])
OrderedDict([('gene', ['FLG']), ('gene_synonym', ['ATOD2; FLG-1; FLG1']), ('inference', ['alignment:Splign:2.1.0'])])
OrderedDict([('gene', ['FLG']), ('gene_synonym', ['ATOD2; FLG-1; FLG1']), ('note', ['epidermal filaggrin']), ('codon_start', ['1']), ('product', ['filaggrin']), ('protein_id', ['NP_002007.1']), ('

In [None]:
#seq_features_qualifiers(access_ncbi_seq("", "nucleotide", "gb", "_mrna"))

In [None]:
#seq_features_qualifiers(access_ncbi_seq("", "nucleotide", "gb", "_mrna"))

In [None]:
#seq_features_qualifiers(access_ncbi_seq("", "nucleotide", "gb", "_mrna"))

Análise das sequências de proteína

Anotações das sequências de proteína

Features e qualifiers das sequências de proteína

# BLAST