In [1]:
import requests
import json
import os
import pandas as pd

In [2]:
# Folder layout used by this notebook: one sub-folder of "data" per pipeline
# stage (downloaded API pages, derived tables, source files).
RAW_DATA_PATH, PROCESSED_DATA_PATH, SOURCE_DATA_PATH = (
    os.path.join("data", stage) for stage in ("raw", "processed", "source")
)

Fetch the paginated genome dataset reports from the NCBI Datasets API

In [3]:
# Download every page of genome dataset reports for the requested taxon and
# store each page as data/raw/genome_reports_<n>.json (n starts at 1).
urls_genome_reports = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/dataset_report"

# Upper bound (seconds) for a single request so a stalled connection cannot
# hang the notebook indefinitely.
request_timeout_seconds = 120

# The raw-data folder may not exist on a fresh checkout.
os.makedirs(RAW_DATA_PATH, exist_ok=True)

index_file = 1
# The first request is sent with a null page token, i.e. "start from page one".
response = {"next_page_token": None}

while True:
    payload = {
        "filters": {
            "exclude_paired_reports": True,
            "assembly_version": "current"
        },
        "page_size": 1000,
        "page_token": response.get("next_page_token"),
        "returned_content": "COMPLETE",
        "sort": [],
        "taxons": [
            "40674"
        ]
    }

    res_genome_reports = requests.post(
        url=urls_genome_reports,
        json=payload,
        timeout=request_timeout_seconds,
    )
    # Fail loudly on HTTP errors instead of saving an error payload as a page.
    res_genome_reports.raise_for_status()
    # Parse the body once and reuse it for both paging and persistence.
    response = res_genome_reports.json()

    genome_reports_filename = os.path.join(RAW_DATA_PATH, "genome_reports_{index_file}.json".format(index_file=index_file))
    index_file += 1

    with open(genome_reports_filename, 'w', encoding ='utf8') as json_file:
        json.dump(response, json_file)

    # Stop when the token is absent *or* empty/null; re-sending a null token
    # would restart from the first page and loop forever.
    if not response.get("next_page_token"):
        break


Get the list of taxonomy names and accession IDs needed to download the FASTA files

In [4]:
# Load every saved API page (data/raw/genome_reports_<n>.json) back into memory.
# The files are discovered on disk rather than hard-coded, because the number
# of pages written by the download step depends on how many reports the API
# returns.
# NOTE(review): pages left over from an earlier, longer run are picked up too;
# clear the raw folder before re-downloading if that matters.
genome_raw_json = []

page_prefix = "genome_reports_"
page_suffix = ".json"

indexed_page_names = []
for name in os.listdir(RAW_DATA_PATH):
    page_index = name[len(page_prefix):-len(page_suffix)]
    if name.startswith(page_prefix) and name.endswith(page_suffix) and page_index.isdigit():
        indexed_page_names.append((int(page_index), name))

# Sort numerically so page 10 follows page 9 (a plain string sort would put it
# right after page 1) and reports keep the order in which they were downloaded.
genome_reports_filename = [
    os.path.join(RAW_DATA_PATH, name) for _, name in sorted(indexed_page_names)
]

if not genome_reports_filename:
    raise FileNotFoundError(
        "No genome_reports_<n>.json files found in {path}".format(path=RAW_DATA_PATH)
    )

for filename in genome_reports_filename:
    with open(filename, 'r', encoding ='utf8') as json_file:
        genome_raw_json.append(json.load(json_file))

In [5]:
# Sanity check: number of page files loaded into genome_raw_json.
len(genome_raw_json)

3

In [6]:
# Flatten the per-page "reports" lists into one list of report dicts,
# keeping page order and the order of reports within each page.
genome_reports_json = [
    report
    for page in genome_raw_json
    for report in page['reports']
]

In [7]:
# Sanity check: total number of genome reports across all loaded pages.
len(genome_reports_json)

2858

In [8]:
# Pretty-print the report at index 1 to inspect the structure of one record.
print(json.dumps(genome_reports_json[1], indent=4))

{
    "accession": "GCF_000001635.27",
    "current_accession": "GCF_000001635.27",
    "paired_accession": "GCA_000001635.9",
    "source_database": "SOURCE_DATABASE_REFSEQ",
    "organism": {
        "tax_id": 10090,
        "organism_name": "Mus musculus",
        "common_name": "house mouse",
        "infraspecific_names": {
            "strain": "C57BL/6J"
        }
    },
    "assembly_info": {
        "assembly_level": "Chromosome",
        "assembly_status": "current",
        "paired_assembly": {
            "accession": "GCA_000001635.9",
            "status": "current"
        },
        "assembly_name": "GRCm39",
        "assembly_type": "haploid",
        "bioproject_lineage": [
            {
                "bioprojects": [
                    {
                        "accession": "PRJNA20689",
                        "title": "Genome sequence finishing for Mus musculus, currently maintained by the Genome Reference Consortium (GRC)"
                    }
                

In [9]:
# Reduce each genome report to the four fields needed downstream:
# the assembly accession plus the organism's tax id, scientific and common name.
data_genomes = []

for data_genome in genome_reports_json:

    # A report without an "organism" entry (missing or null) must not crash the
    # loop; fall back to an empty dict so the organism fields become None.
    organism = data_genome.get('organism') or {}

    data_list = {
        "accession" : data_genome.get('accession'),
        "tax_id" : organism.get('tax_id'),
        "organism_name" : organism.get('organism_name'),
        "common_name" : organism.get('common_name'),
    }
    data_genomes.append(data_list)

# Build the frame once from the full list of records.
data_genomes_df = pd.DataFrame(data_genomes)

In [10]:
# Build the accession/taxonomy table from the collected records.
# NOTE(review): the preceding cell already ends with this exact assignment,
# so this cell looks redundant — confirm and consider removing it.
data_genomes_df = pd.DataFrame(data_genomes)

In [11]:
# Display the table (the notebook renders a truncated head/tail preview).
data_genomes_df

Unnamed: 0,accession,tax_id,organism_name,common_name
0,GCF_000001405.40,9606,Homo sapiens,human
1,GCF_000001635.27,10090,Mus musculus,house mouse
2,GCF_019923935.1,89462,Bubalus bubalis,water buffalo
3,GCA_023701655.1,2918886,Ovis ammon polii x Ovis aries,
4,GCA_032405125.1,34865,Capricornis sumatraensis,Sumatran serow
...,...,...,...,...
2853,GCA_016894425.1,9606,Homo sapiens,human
2854,GCA_023065335.1,9823,Sus scrofa,pig
2855,GCA_023065355.1,9823,Sus scrofa,pig
2856,GCA_030378505.1,9913,Bos taurus,cattle


In [12]:
# Persist the accession/taxonomy table as CSV without the positional index.
# The output folder is created first because to_csv does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)
data_genomes_df.to_csv(os.path.join(PROCESSED_DATA_PATH, "data_accession_taxon.csv"), index=False)