In [1]:
import sys
import zipfile
import pandas as pd
from pprint import pprint
from datetime import datetime
from collections import defaultdict, Counter
from IPython.display import display

import matplotlib.pyplot as plt
plt.style.use('ggplot')

try:
    import ncbi.datasets
except ImportError:
    print('ncbi.datasets module not found. To install, run `pip install ncbi-datasets-pylib`.')

# Como baixar genomas automaticamente do NCBI?

## Documentation for API Endpoints

All URIs are relative to *https://api.ncbi.nlm.nih.gov/datasets/v1alpha*

Class | Method | HTTP request | Description
------------ | ------------- | ------------- | -------------
*GenomeApi* | [**assembly_descriptors_by_accessions**](docs/GenomeApi.md#assembly_descriptors_by_accessions) | **GET** /genome/accession/{accessions} | Get genome metadata by accession
*GenomeApi* | [**assembly_descriptors_by_bioproject**](docs/GenomeApi.md#assembly_descriptors_by_bioproject) | **GET** /genome/bioproject/{accessions} | Get genome metadata by bioproject accession
*GenomeApi* | [**assembly_descriptors_by_taxon**](docs/GenomeApi.md#assembly_descriptors_by_taxon) | **GET** /genome/taxon/{taxon} | Get genome metadata by taxonomic identifier
*GenomeApi* | [**check_assembly_availability**](docs/GenomeApi.md#check_assembly_availability) | **GET** /genome/accession/{accessions}/check | Check the validity of genome accessions
*GenomeApi* | [**check_assembly_availability_post**](docs/GenomeApi.md#check_assembly_availability_post) | **POST** /genome/check | Check the validity of many genome accessions in a single request
*GenomeApi* | [**download_assembly_package**](docs/GenomeApi.md#download_assembly_package) | **GET** /genome/accession/{accessions}/download | Get a genome dataset by accession
*GenomeApi* | [**download_assembly_package_post**](docs/GenomeApi.md#download_assembly_package_post) | **POST** /genome/download | Get a genome dataset by post
*GenomeApi* | [**genome_download_summary**](docs/GenomeApi.md#genome_download_summary) | **GET** /genome/accession/{accessions}/download_summary | Preview genome dataset download
*GenomeApi* | [**genome_download_summary_by_post**](docs/GenomeApi.md#genome_download_summary_by_post) | **POST** /genome/download_summary | Preview genome dataset download by POST
*GenomeApi* | [**genome_tax_name_query**](docs/GenomeApi.md#genome_tax_name_query) | **GET** /genome/taxon_suggest/{taxon_query} | Get a list of taxonomy names and IDs found in the assembly dataset given a partial taxonomic name.
*GenomeApi* | [**genome_tax_tree**](docs/GenomeApi.md#genome_tax_tree) | **GET** /genome/taxon/{taxon}/tree | Get a taxonomic subtree by taxonomic identifier

In [3]:
# Create the GenomeApi controller object, wrapping an ApiClient built with no arguments.
api_instance = ncbi.datasets.GenomeApi(ncbi.datasets.ApiClient())

In [None]:
# Request assembly descriptors for taxon 2, passing limit=100000 to the API call.
# NOTE(review): taxon 2 is presumably the NCBI Taxonomy ID for Bacteria — confirm.
genome_summary = api_instance.assembly_descriptors_by_taxon(taxon = 2, limit=100000)

In [5]:
# Tally assemblies by the first three characters of their accession:
# 'GCA' is counted as GenBank and 'GCF' as RefSeq; any other prefix is ignored.
rotulo_por_prefixo = {'GCA': 'GenBank', 'GCF': 'RefSeq'}

assm_counter = Counter()
for descritor in genome_summary.assemblies:
    assembly = descritor.assembly
    rotulo = rotulo_por_prefixo.get(assembly.assembly_accession[:3])
    if rotulo is not None:
        assm_counter[rotulo] += 1
print(assm_counter)

Counter({'RefSeq': 25237, 'GenBank': 24763})


---

In [None]:
import seaborn as sn
import numpy as np
import customfunc as cf
import matrix

**CLASSES**

**FUNÇÕES**

In [None]:
def acessar_coordenadas(dataframe):
    """Keep only the Brazilian records that carry collection coordinates.

    Rows whose ``colection_date``, ``coordenates`` or ``country`` value equals
    the placeholder string ``'Na'`` are discarded, and of the remainder only
    the rows whose ``country`` contains ``"Brazil"`` are kept. The ``lat`` and
    ``lon`` columns are then prepared with ``customfunc.SplitCoords`` and
    converted element-wise with ``customfunc.NumCoord``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table with at least the columns ``colection_date``, ``coordenates``
        and ``country``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh integer index. ``country`` becomes the first
        column (a consequence of the set_index/reset_index round trip), and
        ``lat``/``lon`` hold whatever ``customfunc.NumCoord`` returns.
    """
    # Local import keeps the function usable regardless of how (or whether)
    # the notebook has bound the module at top level.
    import customfunc

    com_coordenadas = (
        dataframe
        .query("colection_date != 'Na' ")
        .query("coordenates != 'Na'")
        .query("country != 'Na'")
        # Moving `country` into the index lets `filter(like=...)` do a
        # substring match on it; reset_index turns it back into a column.
        .set_index("country")
        .filter(like="Brazil", axis=0)
        .reset_index()
    )

    # NOTE(review): SplitCoords' return value is unused and `lat`/`lon` are
    # read right after, so it presumably adds those columns in place — confirm
    # against customfunc.
    for eixo in ("lat", "lon"):
        customfunc.SplitCoords(com_coordenadas, eixo)

    # Both columns are split before either one is converted.
    for eixo in ("lat", "lon"):
        com_coordenadas[eixo] = com_coordenadas[eixo].apply(customfunc.NumCoord)

    return com_coordenadas

def hash_argannot(data):
    """Build a lookup from each gene to the organism recorded on the same row.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with ``gene`` and ``organism`` columns.

    Returns
    -------
    dict
        ``{gene: organism}``. When a gene appears on several rows the last
        row wins; an empty table yields an empty dict.
    """
    # Pair the two columns row by row. The previous implementation stored the
    # whole ``organism`` column under every gene instead of that row's value.
    return dict(zip(data["gene"], data["organism"]))
    

**IMPORTAR DADOS**

In [None]:
# Catalog of the species of the genomes we downloaded.
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
catalogo_bacterias_br = pd.read_csv("/home/tiago/documents/github/ARG-Sentinel/spreadsheets/catalogo_teste.csv")

**LIMPEZA DE DADOS**

In [None]:
# Keep only the genomes whose collection location mentions Brazil.
# na=False treats a missing country as "no match"; without it a NaN in the
# column yields a non-boolean mask and the row selection raises.
catalogo_bacterias_br = catalogo_bacterias_br[
    catalogo_bacterias_br["country"].str.contains("Brazil", na=False)
]

In [None]:
# Column names assigned to the tab-separated ARG-ANNOT results table.
argannot_colunas = [
    'file', 'sequence', 'start', 'end', 'strand', 'gene', 'coverage',
    'coverage_map', 'gaps', 'coverage_pec', 'identity_perc', 'database',
    'accession', 'product', 'resistance',
]
argannot_database = pd.read_csv(
    "/home/tiago/documents/github/ARG-Sentinel/spreadsheets/argannot_br.csv",
    sep="\t",
    names=argannot_colunas,
)

# Reduce every entry of `file` to its last "/"-separated component.
argannot_database['file'] = argannot_database['file'].map(
    lambda caminho: caminho.rsplit("/", 1)[-1]
)

# Adds the species of each genome (per the original author's note; cf.filldf
# is defined outside this notebook).
argannot_database = cf.filldf(argannot_database, catalogo_bacterias_br)

In [None]:
# Run the annotated hits through acessar_coordenadas (defined earlier in this notebook).
argannot_com_coordenadas = acessar_coordenadas(argannot_database)

In [None]:
# Preview the first rows of the filtered table.
argannot_com_coordenadas.head()

**Grafos**

Encontrar relação entre as bactérias que compartilham genes de resistência

In [None]:
import networkx as nx

In [None]:
# File-by-gene count matrix: one row per `file`, one column per gene label,
# each cell holding how many rows of that file carry that gene.
genes_por_arquivo = argannot_com_coordenadas.set_index("file")["gene"]
argannot_adj_matrix = (
    genes_por_arquivo
    .str.get_dummies()
    .groupby(level=0)
    .sum()
    .sort_index()
)

In [None]:
# Graph linking each genome file to every gene with a non-zero entry in its
# row of the matrix. add_edge creates missing endpoints itself (file first,
# then gene), so no separate add_node calls are needed.
grafo_genomas_br = nx.Graph()
for file, contagens in argannot_adj_matrix.iterrows():
    for gene in contagens[contagens != 0].index:
        print(f"{file} <----> {gene}")
        grafo_genomas_br.add_edge(file, gene)

In [None]:
# spring_layout starts from random positions; fixing the seed makes the
# drawing identical on every run (the value 42 itself is arbitrary).
nx.draw(grafo_genomas_br, pos = nx.spring_layout(grafo_genomas_br, seed = 42))

In [None]:
# Build the gene-keyed dict from the annotation table via hash_argannot.
teste_dic = hash_argannot(argannot_database)

In [None]:
# Show the keys (gene names) of the dict built above.
teste_dic.keys()