# Análise de sequências e features no NCBI

In [47]:
# Import de módulos e bibliotecas

from Bio import Entrez
from Bio import SeqIO

Funções para aceder, extrair e guardar informação de sequências do NCBI, nomeadamente as suas annotations, features e qualifiers.

In [48]:
# Prompt the user for an e-mail address and store it on Bio.Entrez, so the
# Entrez requests issued by the functions below carry it.
# NOTE(review): input() blocks until answered, so a non-interactive
# "Restart & Run All" will stall on this line.
Entrez.email = input("Insere o teu e-mail") 
def access_ncbi_seq(accession: str, database: str, format: str, filename: str):
    """
    Download a single sequence record from NCBI, save it to disk and return it.

    Parameters
    ----------
    accession : str
        Identifier of the sequence, passed to ``Entrez.efetch`` as ``id``.
    database : str
        Entrez database to query, e.g. 'nucleotide' or 'protein'.
    format : str
        Format name used both as the efetch ``rettype`` and as the Bio.SeqIO
        format for parsing and writing, e.g. 'gb' or 'fasta'.
    filename : str
        Path of the file the downloaded record is written to.

    Returns
    -------
    The record obtained by reading ``filename`` back with ``SeqIO.read``.

    Raises
    ------
    Any exception raised by ``Entrez.efetch``, ``SeqIO.read`` or
    ``SeqIO.write`` is propagated unchanged (``SeqIO.read`` expects the
    response to contain exactly one record).
    """
    handle = Entrez.efetch(db=database, id=accession, rettype=format, retmode="text")
    try:
        record = SeqIO.read(handle, format)
    finally:
        # Release the network handle even when parsing the response fails.
        handle.close()

    SeqIO.write(record, filename, format)

    # Re-read the saved file so the caller gets the record as stored on disk.
    return SeqIO.read(filename, format)


In [49]:
def seq_annotations(record):
    """
    Print the identification fields and general annotations of a record.

    ``record`` is a sequence record such as the one returned by
    ``access_ncbi_seq``; it must expose ``id``, ``name``, ``description``,
    ``annotations`` (a mapping), ``dbxrefs`` and support ``len()``.
    Nothing is returned: everything is written to standard output.
    """
    # Label / attribute pairs, printed in this exact order.
    labelled_fields = (
        ("ID:", "id"),
        ("\nName:", "name"),
        ("\nDescription:", "description"),
    )
    for label, attribute in labelled_fields:
        print(label, getattr(record, attribute))
    print("\nSequence length:", len(record))

    print("\nGeneral annotations:")
    for key, value in record.annotations.items():
        print(f">{key}: {value}")

    # Cross-references are only shown when the record carries any.
    if len(record.dbxrefs) > 0:
        print("External Databases References:", record.dbxrefs)

In [50]:
def seq_features_qualifiers(record):
    """
    Print every feature of a record, followed by each feature's qualifiers.

    ``record`` is a sequence record such as the one returned by
    ``access_ncbi_seq``; it must expose ``features``, a sequence whose items
    have ``type``, ``location`` and ``qualifiers`` attributes.
    Nothing is returned: everything is written to standard output.
    """
    features = record.features

    print(f"{len(features)} features\n")

    # First pass: one line per feature with its type and location.
    print("Type and Location:")
    for feat in features:
        print(f">{feat.type}, {feat.location}")

    # Second pass: the qualifiers mapping of each feature, in the same order.
    print("\nQualifiers:")
    for feat in features:
        print(f">{feat.qualifiers}")

## Gene FLG

### Análise da sequência de mRNA

#### Anotações da sequência de mRNA

In [51]:
# Print the annotations of the FLG mRNA record (saved to the file "flg_mrna").
seq_annotations(
    access_ncbi_seq(
        accession="1732746205",
        database="nucleotide",
        format="gb",
        filename="flg_mrna",
    )
)

ID: NM_002016.2

Name: NM_002016

Description: Homo sapiens filaggrin (FLG), mRNA

Sequence length: 12793

General annotations:
>molecule_type: mRNA
>topology: linear
>data_file_division: PRI
>date: 22-JAN-2024
>accessions: ['NM_002016']
>sequence_version: 2
>keywords: ['RefSeq', 'MANE Select']
>source: Homo sapiens (human)
>organism: Homo sapiens
>taxonomy: ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']
>references: [Reference(title='Temporal relationships between Staphylococcus aureus colonization, filaggrin expression, and pediatric atopic dermatitis', ...), Reference(title='Filaggrin gene polymorphisms in Indian children with atopic dermatitis: A cross-sectional multicentre study', ...), Reference(title='Filaggrin gene variants among Saudi patients with ichthyosis vulgaris', ...), Reference(title='Deep palmar phenotyping in atopic eczema: pattern

#### Features e qualifiers da sequência de mRNA

In [52]:
# Print the features and qualifiers of the FLG mRNA record (saved to "flg_mrna").
seq_features_qualifiers(
    access_ncbi_seq(
        accession="1732746205",
        database="nucleotide",
        format="gb",
        filename="flg_mrna",
    )
)

10 features

Type and Location:
>source, [0:12793](+)
>gene, [0:12793](+)
>exon, [0:51](+)
>exon, [51:210](+)
>CDS, [72:12258](+)
>misc_feature, [345:708](+)
>misc_feature, [834:11985](+)
>exon, [210:12793](+)
>regulatory, [12772:12778](+)
>polyA_site, [12792:12793](+)

Qualifiers:
>OrderedDict([('organism', ['Homo sapiens']), ('mol_type', ['mRNA']), ('db_xref', ['taxon:9606']), ('chromosome', ['1']), ('map', ['1q21.3'])])
>OrderedDict([('gene', ['FLG']), ('gene_synonym', ['ATOD2; FLG-1; FLG1']), ('note', ['filaggrin']), ('db_xref', ['GeneID:2312', 'HGNC:HGNC:3748', 'MIM:135940'])])
>OrderedDict([('gene', ['FLG']), ('gene_synonym', ['ATOD2; FLG-1; FLG1']), ('inference', ['alignment:Splign:2.1.0'])])
>OrderedDict([('gene', ['FLG']), ('gene_synonym', ['ATOD2; FLG-1; FLG1']), ('inference', ['alignment:Splign:2.1.0'])])
>OrderedDict([('gene', ['FLG']), ('gene_synonym', ['ATOD2; FLG-1; FLG1']), ('note', ['epidermal filaggrin']), ('codon_start', ['1']), ('product', ['filaggrin']), ('protein_

### Análise da sequência da proteína

#### Anotações da sequência da proteína do registo NCBI

In [53]:
# Print the annotations of the FLG protein record (saved to "flg_protein").
seq_annotations(
    access_ncbi_seq(
        accession="60097902",
        database="protein",
        format="gb",
        filename="flg_protein",
    )
)

ID: NP_002007.1

Name: NP_002007

Description: filaggrin [Homo sapiens]

Sequence length: 4061

General annotations:
>topology: linear
>data_file_division: PRI
>date: 22-JAN-2024
>accessions: ['NP_002007']
>sequence_version: 1
>keywords: ['RefSeq', 'MANE Select']
>source: Homo sapiens (human)
>organism: Homo sapiens
>taxonomy: ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']
>references: [Reference(title='Temporal relationships between Staphylococcus aureus colonization, filaggrin expression, and pediatric atopic dermatitis', ...), Reference(title='Filaggrin gene polymorphisms in Indian children with atopic dermatitis: A cross-sectional multicentre study', ...), Reference(title='Filaggrin gene variants among Saudi patients with ichthyosis vulgaris', ...), Reference(title='Deep palmar phenotyping in atopic eczema: patterns associated with filaggrin vari

#### Features e qualifiers da sequência da proteína do registo NCBI

In [54]:
# Print the features and qualifiers of the FLG protein record (saved to "flg_protein").
seq_features_qualifiers(
    access_ncbi_seq(
        accession="60097902",
        database="protein",
        format="gb",
        filename="flg_protein",
    )
)

31 features

Type and Location:
>source, [0:4061]
>Protein, [0:4061]
>Region, [1:88]
>Site, order{[1:19], [24:27], [35:37], [38:41], [68:72], [73:75], [76:80], [81:88]}
>Site, order{[18:19], [23:24], [26:27], [31:33], [61:62], [63:64], [65:66], [67:68], [69:70], [72:73]}
>Region, [91:212]
>Region, [254:3971]
>Region, [353:>590]
>Region, [372:428]
>Region, [564:>950]
>Region, [696:753]
>Region, [898:955]
>Region, [984:>1368]
>Region, [1021:1077]
>Region, [1224:1279]
>Region, [1345:1401]
>Region, [1546:1603]
>Region, [1606:>1826]
>Region, [1670:1725]
>Region, [1993:2050]
>Region, [2196:2252]
>Region, [2232:>2533]
>Region, [2318:2374]
>Region, [2642:2698]
>Region, [<2648:>2961]
>Region, [2966:3022]
>Region, [<2972:>3252]
>Region, [3290:3346]
>Region, [3605:>3832]
>Region, [3614:3670]
>CDS, [0:4061]

Qualifiers:
>OrderedDict([('organism', ['Homo sapiens']), ('db_xref', ['taxon:9606']), ('chromosome', ['1']), ('map', ['1q21.3'])])
>OrderedDict([('product', ['filaggrin']), ('note', ['epiderm

#### Anotações da sequência da proteína do registo UniProt no NCBI

In [55]:
# Print the annotations of the UniProt FLG protein record (saved to "flg_protein_uniprot").
seq_annotations(
    access_ncbi_seq(
        accession="84028206",
        database="protein",
        format="gb",
        filename="flg_protein_uniprot",
    )
)

ID: P20930.3

Name: FILA_HUMAN

Description: RecName: Full=Filaggrin

Sequence length: 4061

General annotations:
>topology: linear
>data_file_division: PRI
>date: 08-NOV-2023
>accessions: ['P20930']
>sequence_version: 3
>keywords: ['3D-structure', 'Calcium', 'Citrullination', 'Coiled coil', 'Developmental protein', 'Direct protein sequencing', 'Ichthyosis', 'Metal-binding', 'Phosphoprotein', 'Reference proteome', 'Repeat']
>source: Homo sapiens (human)
>organism: Homo sapiens
>taxonomy: ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']
>references: [Reference(title='The DNA sequence and biological annotation of human chromosome 1', ...), Reference(title='Characterization of the human epidermal profilaggrin gene. Genomic organization and identification of an S-100-like calcium binding domain at the amino terminus', ...), Reference(title='Characterizatio

#### Features e qualifiers da sequência da proteína do registo UniProt no NCBI

In [56]:
# Print the features and qualifiers of the UniProt FLG protein record
# (saved to "flg_protein_uniprot").
seq_features_qualifiers(
    access_ncbi_seq(
        accession="84028206",
        database="protein",
        format="gb",
        filename="flg_protein_uniprot",
    )
)

225 features

Type and Location:
>source, [0:4061]
>gene, [0:4061]
>Protein, [0:4061]
>Region, [0:4061]
>Region, [1:88]
>Site, order{[1:19], [24:27], [35:37], [38:41], [68:72], [73:75], [76:80], [81:88]}
>Region, [3:19]
>Region, [5:43]
>Site, order{[18:19], [23:24], [26:27], [31:33], [61:62], [63:64], [65:66], [67:68], [69:70], [72:73]}
>Region, [29:39]
>Region, [42:46]
>Region, [48:84]
>Region, [49:53]
>Region, [53:61]
>Region, [70:85]
>Region, [91:212]
>Region, [108:212]
>Region, [185:216]
>Region, [254:3971]
>Region, [257:306]
>Region, [262:285]
>Region, [285:355]
>Region, [331:332]
>Region, [353:>590]
>Region, [355:452]
>Region, [372:428]
>Region, [373:428]
>Region, [443:444]
>Region, [453:454]
>Region, [459:494]
>Region, [477:478]
>Region, [526:574]
>Region, [564:>950]
>Region, [574:595]
>Region, [578:630]
>Region, [595:627]
>Region, [647:679]
>Region, [679:700]
>Region, [696:753]
>Region, [697:753]
>Region, [700:715]
>Region, [715:750]
>Region, [724:725]
>Region, [741:742]
>Regio

In [None]:
# TODO: add the full analysis for the C11ORF30 gene, following the same order as above
# TODO: add the full analysis for the TSLP gene, following the same order as above