## Resultados NCBI + Uniprot

Depois de obtermos os resultados do NCBI e da Uniprot para cada uma das proteínas, resolvemos compilar os resultados de ambas as bases de dados numa única tabela e compará-los. Os resultados da Uniprot permitiram complementar a informação obtida no NCBI, uma vez que, para algumas proteínas, dados como a função das mesmas estão especificados com um maior nível de detalhe. Além disso, detetámos diferenças em 22 dos resultados entre as bases de dados, os quais estão destacados a azul. Optámos por considerar os resultados da Uniprot mais corretos, uma vez que as entradas se encontram revistas nesses casos.

In [1]:
import os, sys, inspect, math
import pandas as pd
from IPython.core.display import display, HTML

def import_modules():
    """
    Make the modules developed for this project importable.

    Resolves the directory of the file in which this function is defined
    and puts that directory's parent at the front of ``sys.path``.
    """
    this_file = inspect.getfile(inspect.currentframe())
    here = os.path.dirname(os.path.abspath(this_file))
    # Index 0 so the project's modules win over same-named installed ones.
    sys.path.insert(0, os.path.dirname(here))

def itemize(l):
    """
    Build an HTML unordered list (``<ul>``) from a value.

    Parameters:
        l: the value to render. Accepted shapes:
            * ``None`` or a float NaN -> ``""`` (missing value);
            * a dict -> delegated to ``itemize_dict``;
            * a single string -> rendered as a one-item list;
            * any other iterable -> one ``<li>`` per element, where dict
              elements are rendered with ``itemize_dict`` and everything
              else is converted with ``str``.

    Returns:
        The HTML markup as a string. Values are inserted as-is, without
        HTML escaping.
    """
    # Missing values: NaN is what pandas uses for absent cells, and None is
    # treated the same way as in pretty_print.
    if l is None or (isinstance(l, float) and math.isnan(l)):
        return ""
    if isinstance(l, dict):
        return itemize_dict(l)
    # Iterating a bare string would emit one <li> per character.
    if isinstance(l, str):
        l = [l]

    items = []
    for entry in l:
        if isinstance(entry, dict):
            body = itemize_dict(entry)
        else:
            # str() so numeric entries do not break the concatenation.
            body = str(entry)
        items.append("<li>" + body + "</li>")
    return "<ul>" + "".join(items) + "</ul>"

def itemize_dict(d):
    """
    Build an HTML unordered list from a dictionary.

    Each entry becomes one ``<li>`` with the key in bold followed by the
    value converted with ``str``. Keys are concatenated directly, so they
    must be strings.
    """
    rows = [
        "<li><strong>" + key + ":</strong> " + str(value) + "</li>"
        for key, value in d.items()
    ]
    return "<ul>" + "".join(rows) + "</ul>"

def pretty_print(v):
    """
    Replace missing values with an empty string.

    Returns ``""`` when ``v`` is ``None`` or a float NaN; any other value
    is returned unchanged.
    """
    is_nan = isinstance(v, float) and math.isnan(v)
    return "" if (v is None or is_nan) else v

def background_it(v):
    """
    Wrap a value in a ``<div>`` with a powder-blue background.

    The value is converted with ``str`` before being embedded.
    """
    opening = "<div style=\"background-color:powderblue;\">"
    closing = "</div>"
    return "".join((opening, str(v), closing))

def shorten_it(v, length=10):
    """
    Abbreviate a string for display.

    Parameters:
        v: the string to abbreviate.
        length: maximum number of leading characters to keep (default 10).

    Returns:
        ``v`` unchanged when it has at most ``length`` characters;
        otherwise its first ``length`` characters followed by ``"..."``.
    """
    # Only mark the value as truncated when something was actually cut,
    # so empty or short values are not shown as "..." / "ABC...".
    if len(v) <= length:
        return v
    return v[:length] + "..."

def main():
    """
    Render the merged NCBI + Uniprot results as one HTML table.

    Reads ``files/.ncbi_uniprot.json`` (the merged records; its top-level
    keys become the row labels after the transpose) and
    ``files/.ncbi_uniprot_diff.json`` (for each row label, the columns to
    highlight). List/dict columns are turned into HTML lists, missing
    values are blanked, the translation is abbreviated and the cells named
    in the diff file get a blue background. The selected columns are then
    displayed as unescaped HTML.
    """
    import_modules()
    import util.rw as rw

    # Show every row.
    pd.options.display.max_rows = 250

    # Do not truncate cell contents. None is the "no limit" value since
    # pandas 1.0 (and the only one accepted by pandas 2.x); older releases
    # only accept integers and used -1 for the same purpose.
    try:
        pd.set_option('display.max_colwidth', None)
    except ValueError:
        pd.set_option('display.max_colwidth', -1)

    ncbi = rw.read_json("files/.ncbi_uniprot.json")
    diff = rw.read_json("files/.ncbi_uniprot_diff.json")

    # Columns rendered as HTML lists.
    columns_to_itemize = [
        "location",
        "accessions",
        "cofactors",
        "biological_processes",
        "comment_functions",
        "locations",
        "molecular_functions",
        "pdbs"
    ]

    # Columns whose NaN/None values are replaced by an empty string.
    columns_to_pp = [
        "gene",
        "EC_number",
        "uniprot_id",
        "protein_id",
        "organism",
        "length",
        "mass",
        "translation"
    ]

    # Columns displayed, in display order.
    columns_to_show = [
        "short_name",
        "product",
        "gene",
        "EC_number",
        "accessions",
        "status",
        "type",
        "uniprot_id",
        "protein_id",
        "organism",
        "location",
        "length",
        "mass",
        "comment_functions",
        "molecular_functions",
        "biological_processes",
        "locations",
        "cofactors",
        "pdbs",
        "translation"
    ]

    df = pd.DataFrame(ncbi).transpose()

    for p in columns_to_itemize:
        df[p] = df[p].apply(itemize)

    for p in columns_to_pp:
        df[p] = df[p].apply(pretty_print)

    # Must run after pretty_print so missing translations are already "".
    df["translation"] = df["translation"].apply(shorten_it)

    for tag in diff:
        for p in diff[tag]:
            # Single-step label access: chained ``df[p][tag] = ...`` may
            # write to a temporary copy and leave ``df`` untouched.
            df.at[tag, p] = background_it(df.at[tag, p])

    # escape=False so the generated <ul>/<div> markup is rendered, not shown.
    display(HTML(df[columns_to_show].to_html(escape=False)))

    
# Build and display the table when this cell is executed.
main()

Unnamed: 0,short_name,product,gene,EC_number,accessions,status,type,uniprot_id,protein_id,organism,location,length,mass,comment_functions,molecular_functions,biological_processes,locations,cofactors,pdbs,translation
lpg0232,Q5ZYX9_LEGPH,"Transcriptional regulator np20, Fur family",np20,,Q5ZYX9,unreviewed,mRNA,Q5ZYX9,YP_094286.1,Legionella pneumophila subsp. pneumophila (strain Philadelphia 1 / ATCC 33152 / DSM 7513),strand: 1; end: 270569; start: 270036,177.0,20453.0,,"DNA binding; transcription factor activity, sequence-specific DNA binding",,,,,MIGCCLIIFP...
lpg0233,Q5ZYX8_LEGPH,Benzoylformate decarboxylase,mdlC,4.1.1.7,Q5ZYX8,unreviewed,mRNA,Q5ZYX8,YP_094287.1,Legionella pneumophila subsp. pneumophila (strain Philadelphia 1 / ATCC 33152 / DSM 7513),strand: -1; end: 272278; start: 270686,530.0,58497.0,,benzoylformate decarboxylase activity; magnesium ion binding; thiamine pyrophosphate binding,,,,,MKKTGSDVLK...
lpg0234,Q5ZYX7_LEGPH,SidE,sidE,,Q5ZYX7,unreviewed,mRNA,Q5ZYX7,YP_094288.1,Legionella pneumophila subsp. pneumophila (strain Philadelphia 1 / ATCC 33152 / DSM 7513),strand: -1; end: 277121; start: 272577,1514.0,171651.0,,,,,,,MLIFKSQILI...
lpg0235,Q5ZYX6_LEGPH,Uncharacterized protein,,,Q5ZYX6,unreviewed,mRNA,Q5ZYX6,YP_094289.1,Legionella pneumophila subsp. pneumophila (strain Philadelphia 1 / ATCC 33152 / DSM 7513),strand: -1; end: 277987; start: 277484,167.0,19095.0,,carbon-sulfur lyase activity,metabolic process,,,,MKKAFRIMAT...
lpg0236,Q5ZYX5_LEGPH,Uncharacterized protein,,,Q5ZYX5,unreviewed,mRNA,Q5ZYX5,YP_094290.1,Legionella pneumophila subsp. pneumophila (strain Philadelphia 1 / ATCC 33152 / DSM 7513),strand: -1; end: 280039; start: 278060,659.0,77097.0,,,,,,,MRYTNIELLK...
lpg0237,Q5ZYX4_LEGPH,Lipolytic enzyme,mhpC,,Q5ZYX4,unreviewed,mRNA,Q5ZYX4,YP_094291.1,Legionella pneumophila subsp. pneumophila (strain Philadelphia 1 / ATCC 33152 / DSM 7513),strand: 1; end: 281114; start: 280320,264.0,29403.0,,,,,,,MATLKINGVD...
lpg0238,Q5ZYX3_LEGPH,Glycine betaine aldehyde dehydrogenase,gbsA,1.2.1.8,Q5ZYX3,unreviewed,mRNA,Q5ZYX3,YP_094292.1,Legionella pneumophila subsp. pneumophila (strain Philadelphia 1 / ATCC 33152 / DSM 7513),strand: 1; end: 282597; start: 281131,488.0,52945.0,,betaine-aldehyde dehydrogenase activity,,,,,MEIYKMYIDG...
lpg0239,Q5ZYX2_LEGPH,4-aminobutyrate aminotransferase,gabT,2.6.1.19,Q5ZYX2,unreviewed,mRNA,Q5ZYX2,YP_094293.1,Legionella pneumophila subsp. pneumophila (strain Philadelphia 1 / ATCC 33152 / DSM 7513),strand: 1; end: 283924; start: 282572,450.0,49049.0,,4-aminobutyrate transaminase activity; pyridoxal phosphate binding,gamma-aminobutyric acid metabolic process,,,,MKHQLVGTKL...
lpg0240,Q5ZYX1_LEGPH,DNA repair protein,recN,,Q5ZYX1,unreviewed,mRNA,Q5ZYX1,YP_094294.1,Legionella pneumophila subsp. pneumophila (strain Philadelphia 1 / ATCC 33152 / DSM 7513),strand: 1; end: 284787; start: 284008,259.0,29552.0,,,,,,,MNDIMWYQNI...
lpg0241,GLSA_LEGPH,Glutaminase,,3.5.1.2,Q5ZYX0,reviewed,mRNA,Q5ZYX0,YP_094295.1,Legionella pneumophila subsp. pneumophila (strain Philadelphia 1 / ATCC 33152 / DSM 7513),strand: 1; end: 285979; start: 285047,310.0,33970.0,,glutaminase activity,glutamine metabolic process,,,,MSSKLLTIQL...
