# Análise de sequências e features no NCBI

In [1]:
# Import de módulos e bibliotecas

from Bio import Entrez
from Bio import SeqIO

Funções para aceder, extrair e guardar informação de sequências do NCBI, nomeadamente as suas annotations, features e qualifiers.

In [2]:
# Entrez requests are sent with this contact e-mail. Surrounding whitespace
# from typing or pasting is stripped so it is not sent as part of the address.
Entrez.email = input("Insere o teu e-mail").strip()
def access_ncbi_seq(accession: str, database: str, format: str, filename: str):
    """
    Fetch a single sequence record from NCBI, save it to disk and return it.

    Parameters
    ----------
    accession : str
        Identifier of the sequence, passed to Entrez as ``id``.
    database : str
        Entrez database name, passed as ``db`` (this notebook uses
        'nucleotide' and 'protein'). It is forwarded without validation.
    format : str
        Desired format: 'gb' / 'genbank' or 'fasta' / 'faa' (case-insensitive).
        These aliases are translated to the names Entrez and SeqIO expect;
        any other value is forwarded unchanged to both.
    filename : str
        Path of the file the record is written to.

    Returns
    -------
    The record obtained by reading ``filename`` back with ``SeqIO.read``,
    i.e. exactly what was stored on disk.

    Raises
    ------
    Whatever ``Entrez.efetch``, ``SeqIO.read`` or ``SeqIO.write`` raise, e.g.
    when the request fails or the response does not parse as one record.
    """
    # alias -> (Entrez rettype, SeqIO format name). 'faa' is only a file
    # extension, not a SeqIO format name, so it has to be mapped to 'fasta'.
    format_aliases = {
        "gb": ("gb", "genbank"),
        "genbank": ("gb", "genbank"),
        "fasta": ("fasta", "fasta"),
        "faa": ("fasta", "fasta"),
    }
    rettype, seqio_format = format_aliases.get(format.lower(), (format, format))

    handle = Entrez.efetch(db=database, id=accession, rettype=rettype, retmode="text")
    try:
        record = SeqIO.read(handle, seqio_format)
    finally:
        # Release the connection even when parsing fails.
        handle.close()

    SeqIO.write(record, filename, seqio_format)

    # Re-read the saved file so the returned record mirrors the file content.
    return SeqIO.read(filename, seqio_format)


In [3]:
def seq_annotations(record):
    """
    Print the record-level annotations of a sequence record.

    ``record`` must expose ``id``, ``name``, ``description``, ``annotations``
    and ``dbxrefs`` and support ``len()`` (for example the value returned by
    ``access_ncbi_seq``). Nothing is returned; everything goes to stdout.
    """
    labelled_values = (
        ("ID:", record.id),
        ("Name:", record.name),
        ("Description", record.description),
        ("Sequence length", len(record)),
        ("General annotations:\n", record.annotations),
    )
    for label, value in labelled_values:
        print(label, value)

    # Cross-references are only shown when the record actually has some.
    external_refs = record.dbxrefs
    if len(external_refs) > 0:
        print("External Databases References:", external_refs)

In [4]:
def seq_features_qualifiers(record):
    """
    Print the features of a sequence record and their qualifiers.

    ``record`` must expose a ``features`` sequence whose items have ``type``,
    ``location`` and ``qualifiers`` attributes (for example the value returned
    by ``access_ncbi_seq``). The output has three parts: the feature count,
    one "type location" line per feature, then one qualifiers line per
    feature in the same order. Nothing is returned.
    """
    all_features = record.features

    print(len(all_features), "features\n")

    print("Type and Location:")
    for feat in all_features:
        print(feat.type, feat.location)

    print("\nQualifiers:")
    for feat in all_features:
        print(feat.qualifiers)

Análise das sequências de mRNA

Anotações das sequências de mRNA

In [5]:
# Annotations of the C11ORF30 nucleotide record, saved to "C11ORF30_mrna".
# NOTE(review): the fetched record reports molecule_type 'DNA', so the
# "_mrna" label may not match this accession; confirm the intended id.
seq_annotations(
    access_ncbi_seq(
        accession="444739122",
        database="nucleotide",
        format="gb",
        filename="C11ORF30_mrna",
    )
)

ID: HF584130.1
Name: HF584130
Description Homo sapiens C11orf30 gene for alternative protein C11orf30, isolate 110393
Sequence length 141
General annotations:
 {'molecule_type': 'DNA', 'topology': 'linear', 'data_file_division': 'PRI', 'date': '25-SEP-2013', 'accessions': ['HF584130'], 'sequence_version': 1, 'keywords': [''], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'references': [Reference(title='Direct detection of alternative open reading frames translation products in human significantly expands the proteome', ...), Reference(title='Direct Submission', ...)]}


In [6]:
#seq_annotations(access_ncbi_seq("444739122", "nucleotide", "gb", "C11ORF30_mrna")) # annotations C11ORF30_mrna

In [7]:
#seq_annotations(access_ncbi_seq("", "nucleotide", "gb", "_mrna")) # annotations

In [8]:
#seq_annotations(access_ncbi_seq("", "nucleotide", "gb", "_mrna")) # annotations 

Features e qualifiers das sequências de mRNA

In [9]:
# Features and qualifiers of the C11ORF30 nucleotide record ("C11ORF30_mrna").
seq_features_qualifiers(
    access_ncbi_seq(
        accession="444739122",
        database="nucleotide",
        format="gb",
        filename="C11ORF30_mrna",
    )
)

3 features

Type and Location:
source [0:141](+)
gene [0:141](+)
CDS [0:141](+)

Qualifiers:
{'organism': ['Homo sapiens'], 'mol_type': ['genomic DNA'], 'db_xref': ['taxon:9606']}
{'gene': ['C11orf30']}
{'gene': ['C11orf30'], 'codon_start': ['1'], 'product': ['alternative protein C11orf30'], 'protein_id': ['CCQ43627.1'], 'db_xref': ['UniProtKB/TrEMBL:L8ECI5'], 'translation': ['MLLLMQLSSIMHLFQCLQKQEARKWFAIPTQVPRQPQPLPLFQVAA']}


In [10]:
#seq_features_qualifiers(access_ncbi_seq("", "nucleotide", "gb", "_mrna"))

In [11]:
#seq_features_qualifiers(access_ncbi_seq("", "nucleotide", "gb", "_mrna"))

In [12]:
#seq_features_qualifiers(access_ncbi_seq("", "nucleotide", "gb", "_mrna"))

Análise das sequências de proteína

Anotações das sequências de proteína

In [13]:
# Annotations of the C11ORF30 protein record.
# The file name is "C11ORF30_protein" (it was "C11ORF_protein"), matching the
# features/qualifiers cell so the same record is not saved under two names.
seq_annotations(
    access_ncbi_seq(
        accession="47605660",
        database="protein",
        format="gb",
        filename="C11ORF30_protein",
    )
)

ID: Q7Z589.2
Name: EMSY_HUMAN
Description RecName: Full=BRCA2-interacting transcriptional repressor EMSY
Sequence length 1322
General annotations:
 {'topology': 'linear', 'data_file_division': 'PRI', 'date': '08-NOV-2023', 'accessions': ['Q7Z589'], 'sequence_version': 2, 'keywords': ['3D-structure', 'Alternative splicing', 'Chromatin regulator', 'DNA damage', 'DNA repair', 'Glycoprotein', 'Nucleus', 'Phosphoprotein', 'Reference proteome', 'Repressor', 'Transcription', 'Transcription regulation'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'db_source': 'UniProtKB: locus EMSY_HUMAN, accession Q7Z589; class: standard. extra accessions:B7ZKT8,B7ZKU0,B7ZKU2,Q17RM7,Q4G109,Q8NBU6,Q8TE50,Q9H8I9, Q9NRH0 created: May 24, 2004. sequence updated: May 24, 2004. annotation updated: Nov 



Features e qualifiers das sequências de proteína

In [14]:
# Features and qualifiers of the C11ORF30 protein record ("C11ORF30_protein").
seq_features_qualifiers(
    access_ncbi_seq(
        accession="47605660",
        database="protein",
        format="gb",
        filename="C11ORF30_protein",
    )
)

52 features

Type and Location:
source [0:1322]
gene [0:1322]
Protein [0:1322]
Region [0:1322]
Region [0:478]
Region [6:9]
Region [12:38]
Region [15:100]
Region [16:86]
Region [42:55]
Region [59:71]
Region [73:84]
Region [81:82]
Region [81:82]
Region [82:1322]
Region [89:97]
Region [97:103]
Site [99:102]
Region [103:108]
Site [105:106]
Region [107:110]
Region [111:116]
Region [121:124]
Region [139:140]
Region [147:178]
Region [191:215]
Site [206:207]
Site [208:209]
Site [212:213]
Site [227:228]
Site [235:236]
Site [237:238]
Site [270:271]
Region [416:444]
Region [416:438]
Site [500:501]
Site [505:506]
Site [556:557]
Region [697:736]
Region [732:747]
Region [747:852]
Site [817:818]
Site [820:821]
Region [838:852]
Region [<949:1121]
Region [1090:1257]
Site [1119:1120]
Site [1135:1136]
Region [1204:1231]
Region [1204:1223]
Region [1289:1322]
Region [1306:1322]

Qualifiers:
{'organism': ['Homo sapiens'], 'db_xref': ['taxon:9606']}
{'gene': ['EMSY'], 'locus_tag': ['GL002'], 'gene_synonym': 