In [1]:
import requests
import json
import os
import pandas as pd
import re
from tqdm import tqdm

# Register tqdm's pandas integration; this provides the `progress_apply`
# method used later when building the download URLs.
tqdm.pandas()

In [2]:
# Relative paths (resolved against the current working directory) of the
# project's data folders.
RAW_DATA_PATH = os.path.join("data", "raw")
PROCESSED_DATA_PATH = os.path.join("data", "processed")

Load the Accession and Taxon CSV Data

In [3]:
# Load the accession/taxon table from the processed-data folder.
data_accession_taxon = pd.read_csv(os.path.join(PROCESSED_DATA_PATH, "data_accession_taxon.csv"))

In [4]:
data_accession_taxon.sample(5)

Unnamed: 0,accession,tax_id,organism_name
480,GCF_000757795.1,693986,Methylobacterium oryzae CBMB20
613,GCF_013466785.1,419476,Nocardioides marinisabuli
430,GCF_900475915.1,28264,Arcanobacterium haemolyticum
49,GCF_004799605.1,2242,Halobacterium salinarum
752,GCF_008245045.1,1031542,Campylobacter volucris


In [5]:
data_accession_taxon[data_accession_taxon.organism_name.str.contains("ixo")]

Unnamed: 0,accession,tax_id,organism_name
987,GCF_030316605.1,1276219,Spiroplasma ixodetis Y32


URL Download FASTA

In [6]:
# URL template for the `download_summary` endpoint of one genome accession.
# `{accession}` is filled in with `str.format`; the query string asks for the
# GENOME_FASTA annotation type only.
urls_summary = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/{accession}/download_summary?include_annotation_type[0]=GENOME_FASTA"

In [7]:
urls_summary.format(accession="GCF_030316605.1")

'https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_030316605.1/download_summary?include_annotation_type[0]=GENOME_FASTA'

In [8]:
# Fetch the download summary for a single example accession.
# `timeout` keeps the cell from hanging forever if the server never answers,
# and `raise_for_status` surfaces HTTP errors here instead of as a confusing
# KeyError/JSON error in a later cell.
summary_response = requests.get(
    urls_summary.format(accession="GCF_030316605.1"),
    timeout=30,
)
summary_response.raise_for_status()

In [9]:
print(json.dumps(summary_response.json(), indent = 4))

{
    "record_count": 1,
    "resource_updated_on": "2023-10-14T03:40:16Z",
    "hydrated": {
        "estimated_file_size_mb": 1,
        "url": "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_030316605.1/download?include_annotation_type=DEFAULT,GENOME_FASTA",
        "cli_download_command_line": "datasets download genome accession GCF_030316605.1 --include genome"
    },
    "dehydrated": {
        "estimated_file_size_mb": 1,
        "url": "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_030316605.1/download?hydrated=DATA_REPORT_ONLY&include_annotation_type=DEFAULT,GENOME_FASTA",
        "cli_download_command_line": "datasets download genome accession GCF_030316605.1 --include genome --dehydrated",
        "cli_rehydrate_command_line": "datasets rehydrate --help"
    },
    "available_files": {
        "all_genomic_fasta": {
            "file_count": 1,
            "size_mb": 0.5104723
        },
        "genome_gff": {
            "file_count

In [10]:
# Pull the download URL listed under the "hydrated" entry of the summary payload.
urls_download = summary_response.json()["hydrated"]["url"]

In [11]:
urls_download

'https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_030316605.1/download?include_annotation_type=DEFAULT,GENOME_FASTA'

List All Download URLs

In [12]:
data_accession_taxon

Unnamed: 0,accession,tax_id,organism_name
0,GCF_000006945.2,99287,Salmonella enterica subsp. enterica serovar Ty...
1,GCF_000195955.2,83332,Mycobacterium tuberculosis H37Rv
2,GCF_000009045.1,224308,Bacillus subtilis subsp. subtilis str. 168
3,GCF_000005845.2,511145,Escherichia coli str. K-12 substr. MG1655
4,GCF_000008865.2,386585,Escherichia coli O157:H7 str. Sakai
...,...,...,...
995,GCF_001042695.1,1150468,Scardovia inopinata JCM 12537
996,GCF_003339775.1,1276259,Spiroplasma phoeniceum P40
997,GCF_000319575.2,1260251,Spiribacter salinus M19-40
998,GCF_000020145.1,445932,Elusimicrobium minutum Pei191


In [13]:
# URL template for downloading a genome package; `{accession}` selects the
# genome and `{filename}` sets the name of the returned archive.
urls_download_template = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/{accession}/download?include_annotation_type=DEFAULT,GENOME_FASTA&filename={filename}"

def get_fasta_url_download(row):
    """Build the genome download URL for one record.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``"accession"`` and ``"organism_name"``.

    Returns
    -------
    str
        ``urls_download_template`` filled in with the row's accession and a
        ``.zip`` file name derived from the organism name.

    Notes
    -----
    The file name keeps only ASCII letters, digits and dots from the organism
    name. If the name is missing (not a string) or nothing usable remains,
    the accession is used as the file name stem instead.
    """
    accession = row["accession"]
    organism_name = row["organism_name"]

    if isinstance(organism_name, str):
        # Raw string avoids the invalid "\." escape; the class deliberately
        # drops ALL whitespace (spaces and newlines) so nothing that needs
        # URL-encoding can end up in the query string.
        stem = re.sub(r"[^a-zA-Z0-9.]", "", organism_name)
    else:
        # e.g. NaN from a missing CSV cell
        stem = ""

    if not stem:
        # Avoid producing a bare ".zip" file name.
        stem = str(accession)

    filename = stem + ".zip"

    return urls_download_template.format(accession=accession, filename=filename)


# Apply the URL builder to every row (axis=1) with a progress bar and store
# the result as a new `fasta_url_download` column on the existing frame.
data_accession_taxon["fasta_url_download"] = data_accession_taxon.progress_apply(get_fasta_url_download, axis=1)

100%|██████████| 1000/1000 [00:00<00:00, 100143.35it/s]


In [14]:
data_accession_taxon

Unnamed: 0,accession,tax_id,organism_name,fasta_url_download
0,GCF_000006945.2,99287,Salmonella enterica subsp. enterica serovar Ty...,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
1,GCF_000195955.2,83332,Mycobacterium tuberculosis H37Rv,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
2,GCF_000009045.1,224308,Bacillus subtilis subsp. subtilis str. 168,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
3,GCF_000005845.2,511145,Escherichia coli str. K-12 substr. MG1655,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
4,GCF_000008865.2,386585,Escherichia coli O157:H7 str. Sakai,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
...,...,...,...,...
995,GCF_001042695.1,1150468,Scardovia inopinata JCM 12537,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
996,GCF_003339775.1,1276259,Spiroplasma phoeniceum P40,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
997,GCF_000319575.2,1260251,Spiribacter salinus M19-40,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
998,GCF_000020145.1,445932,Elusimicrobium minutum Pei191,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...


In [15]:
data_accession_taxon.iloc[0].fasta_url_download

'https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_000006945.2/download?include_annotation_type=DEFAULT,GENOME_FASTA&filename=Salmonellaentericasubsp.entericaserovarTyphimuriumstr.LT2.zip'

In [16]:
# Persist the table, now including the download-URL column, to the
# processed-data folder; index=False leaves the row index out of the file.
data_accession_taxon.to_csv(os.path.join(PROCESSED_DATA_PATH, "data_accession_taxon_fasta.csv"), index=False)