In [5]:
import requests
import json
import os
import pandas as pd

In [6]:
# Data directories, all under ./data: raw API dumps, processed tables,
# and source files.
RAW_DATA_PATH, PROCESSED_DATA_PATH, SOURCE_DATA_PATH = (
    os.path.join("data", stage) for stage in ("raw", "processed", "source")
)

Get the response list from the genome search

In [7]:
# Endpoint that receives the POSTed genome dataset report query.
urls_genome_reports = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/dataset_report"

# JSON body of the query: a single taxon id, up to 1000 reports per page,
# and no page token (i.e. the first page).
payload = dict(
    filters=dict(
        exclude_paired_reports=True,
        assembly_version="current",
    ),
    page_size=1000,
    page_token=None,
    returned_content="COMPLETE",
    sort=[],
    taxons=["33208"],
)

In [8]:
# POST the query. Without a timeout, requests waits indefinitely on a
# stalled connection, which would hang the whole notebook run.
res_genome_reports = requests.post(url=urls_genome_reports, json=payload, timeout=120)

# Stop here on a 4xx/5xx answer so an error body is never saved as data.
res_genome_reports.raise_for_status()

# NOTE(review): only a single page (page_size=1000, page_token=None) is
# requested. If the taxon has more reports than that, the rest are not
# fetched -- confirm whether following the API's page token is needed.

In [9]:
genome_reports_filename = os.path.join(RAW_DATA_PATH, "genome_reports.json")

# Make sure the target directory exists; open(..., 'w') does not create it.
os.makedirs(RAW_DATA_PATH, exist_ok=True)

# Decode the response body before opening the file: opening in 'w' mode
# truncates it, so a decoding failure afterwards would wipe an earlier dump.
genome_reports_payload = res_genome_reports.json()

with open(genome_reports_filename, 'w', encoding='utf8') as json_file:
    json.dump(genome_reports_payload, json_file)

Get the list of taxonomy names and accession IDs for downloading FASTA files

In [10]:
# Location of the saved genome reports JSON inside the raw-data directory.
genome_reports_filename = os.path.join(
    RAW_DATA_PATH,
    "genome_reports.json",
)

In [11]:
# Placeholder so the name exists (as None) even if reading the file fails.
genome_raw_json = None

# Read the saved JSON text and parse it.
with open(genome_reports_filename, mode='r', encoding='utf8') as json_file:
    genome_raw_json = json.loads(json_file.read())

In [12]:
# Keep only the value stored under the top-level "reports" key.
genome_reports_json = genome_raw_json["reports"]

In [13]:
# Show how many reports were loaded (the request payload asked for page_size 1000).
len(genome_reports_json)

1000

In [14]:
# Pretty-print the report at index 1 (4-space indent) to inspect which fields a report carries.
print(json.dumps(genome_reports_json[1], indent=4))

{
    "accession": "GCF_000001405.40",
    "current_accession": "GCF_000001405.40",
    "paired_accession": "GCA_000001405.29",
    "source_database": "SOURCE_DATABASE_REFSEQ",
    "organism": {
        "tax_id": 9606,
        "organism_name": "Homo sapiens",
        "common_name": "human"
    },
    "assembly_info": {
        "assembly_level": "Chromosome",
        "assembly_status": "current",
        "paired_assembly": {
            "accession": "GCA_000001405.29",
            "status": "current",
            "only_genbank": "4 unlocalized and unplaced scaffolds."
        },
        "assembly_name": "GRCh38.p14",
        "assembly_type": "haploid-with-alt-loci",
        "bioproject_lineage": [
            {
                "bioprojects": [
                    {
                        "accession": "PRJNA31257",
                        "title": "The Human Genome Project, currently maintained by the Genome Reference Consortium (GRC)"
                    }
                ]
           

In [15]:
def _summarize_report(report):
    """Reduce one genome report to its accession, tax_id and organism_name.

    Args:
        report: A single report dict from the genome reports list.

    Returns:
        A dict with the keys "accession", "tax_id" and "organism_name".
        Any field absent from the report is None.
    """
    # `or {}` covers both a missing "organism" key and an explicit null, so
    # the lookups below cannot raise AttributeError on None.
    organism = report.get('organism') or {}
    return {
        "accession": report.get('accession'),
        "tax_id": organism.get('tax_id'),
        "organism_name": organism.get('organism_name'),
    }


# One summary record per report, in the same order as the reports.
data_genomes = [_summarize_report(data_genome) for data_genome in genome_reports_json]

    

In [16]:
# Name the columns explicitly so the frame (and the CSV written from it)
# keeps the same schema even when data_genomes is empty.
data_genomes_df = pd.DataFrame(
    data_genomes,
    columns=["accession", "tax_id", "organism_name"],
)

In [17]:
# Display the table of accession / tax_id / organism_name records for a visual check.
data_genomes_df

Unnamed: 0,accession,tax_id,organism_name
0,GCF_000002985.6,6239,Caenorhabditis elegans
1,GCF_000001405.40,9606,Homo sapiens
2,GCF_000001635.27,10090,Mus musculus
3,GCF_000001215.4,7227,Drosophila melanogaster
4,GCF_000002035.6,7955,Danio rerio
...,...,...,...
995,GCA_927399485.1,1371681,Calamotropha paludella
996,GCA_929112965.1,987983,Mythimna albipuncta
997,GCA_905220435.1,689058,Amphipyra tragopoginis
998,GCA_907165235.1,1101072,Zeuzera pyrina


In [18]:
# Make sure the output directory exists; to_csv does not create it.
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

# Write the accession/taxon table without the positional index column.
data_genomes_df.to_csv(os.path.join(PROCESSED_DATA_PATH, "data_accession_taxon.csv"), index=False)