In [1]:
import requests
import json
import os
import pandas as pd
import re
from tqdm import tqdm

# Register tqdm's pandas integration so that `progress_apply` is available on DataFrames.
tqdm.pandas()

In [2]:
# Relative directories for the raw and the processed data sets.
RAW_DATA_PATH, PROCESSED_DATA_PATH = (
    os.path.join("data", stage) for stage in ("raw", "processed")
)

Load Data CSV Taxon and Accession

In [3]:
# Read the accession/taxon table from the processed-data directory.
data_accession_taxon = pd.read_csv(
    os.path.join(PROCESSED_DATA_PATH, "data_accession_taxon.csv"),
)

In [4]:
# Show five randomly chosen rows as a quick look at the loaded table.
data_accession_taxon.sample(5)

Unnamed: 0,accession,tax_id,organism_name
114,GCF_905237065.1,8030,Salmo salar
569,GCA_903992545.1,59474,Pipistrellus pipistrellus
693,GCA_947568825.1,987436,Sphinx pinastri
973,GCA_929108735.1,1411666,Andrena dorsata
841,GCF_017654505.1,7913,Polyodon spathula


In [5]:
# Case-insensitive substring search on organism names, so "ixo" also matches
# capitalised genus names such as "Ixodes". na=False treats missing names as
# non-matches instead of breaking the boolean mask.
data_accession_taxon[
    data_accession_taxon.organism_name.str.contains("ixo", case=False, na=False)
]

Unnamed: 0,accession,tax_id,organism_name


FASTA Download URL

In [6]:
# Download-summary endpoint template; "{accession}" is substituted per genome.
urls_summary = (
    "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/"
    "{accession}/download_summary"
    "?include_annotation_type[0]=GENOME_FASTA"
)

In [7]:
# Preview the summary URL for one sample accession.
urls_summary.format(accession="GCF_030316605.1")

'https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_030316605.1/download_summary?include_annotation_type[0]=GENOME_FASTA'

In [8]:
# Fetch the download summary for a sample accession. The timeout keeps the cell
# from hanging indefinitely on a stalled connection, and raise_for_status()
# surfaces HTTP errors here rather than as a JSON/KeyError in later cells.
summary_response = requests.get(
    urls_summary.format(accession="GCF_030316605.1"),
    timeout=30,
)
summary_response.raise_for_status()

In [9]:
# Pretty-print the summary payload to inspect its structure.
print(
    json.dumps(
        summary_response.json(),
        indent=4,
    )
)

{
    "record_count": 1,
    "resource_updated_on": "2023-10-14T03:40:16Z",
    "hydrated": {
        "estimated_file_size_mb": 1,
        "url": "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_030316605.1/download?include_annotation_type=DEFAULT,GENOME_FASTA",
        "cli_download_command_line": "datasets download genome accession GCF_030316605.1 --include genome"
    },
    "dehydrated": {
        "estimated_file_size_mb": 1,
        "url": "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_030316605.1/download?hydrated=DATA_REPORT_ONLY&include_annotation_type=DEFAULT,GENOME_FASTA",
        "cli_download_command_line": "datasets download genome accession GCF_030316605.1 --include genome --dehydrated",
        "cli_rehydrate_command_line": "datasets rehydrate --help"
    },
    "available_files": {
        "all_genomic_fasta": {
            "file_count": 1,
            "size_mb": 0.5104723
        },
        "genome_gff": {
            "file_count

In [10]:
# Take the download URL from the "hydrated" entry of the summary payload.
urls_download = summary_response.json()["hydrated"]["url"]

In [11]:
# Display the extracted download URL.
urls_download

'https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_030316605.1/download?include_annotation_type=DEFAULT,GENOME_FASTA'

List All Download URLs

In [12]:
# Display the table before the download-URL column is added.
data_accession_taxon

Unnamed: 0,accession,tax_id,organism_name
0,GCF_000002985.6,6239,Caenorhabditis elegans
1,GCF_000001405.40,9606,Homo sapiens
2,GCF_000001635.27,10090,Mus musculus
3,GCF_000001215.4,7227,Drosophila melanogaster
4,GCF_000002035.6,7955,Danio rerio
...,...,...,...
995,GCA_927399485.1,1371681,Calamotropha paludella
996,GCA_929112965.1,987983,Mythimna albipuncta
997,GCA_905220435.1,689058,Amphipyra tragopoginis
998,GCA_907165235.1,1101072,Zeuzera pyrina


In [13]:
# Download endpoint template: "{accession}" selects the genome and "{filename}"
# names the returned archive.
urls_download_template = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/{accession}/download?include_annotation_type=DEFAULT,GENOME_FASTA&filename={filename}"


def get_fasta_url_download(row):
    """Build the FASTA download URL for one accession row.

    The archive filename is derived from the organism name by keeping only
    ASCII letters, digits and dots, then appending ``.zip``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide string entries under ``"accession"`` and
        ``"organism_name"``.

    Returns
    -------
    str
        ``urls_download_template`` filled in with the accession and the
        sanitised filename.
    """
    accession = row["accession"]
    # Raw-string pattern: every character other than an ASCII letter, digit or
    # dot (spaces, newlines, punctuation) is dropped, so the filename is safe
    # to embed in the query string.
    filename = re.sub(r"[^a-zA-Z0-9.]", "", row["organism_name"]) + ".zip"
    return urls_download_template.format(accession=accession, filename=filename)


# Row-wise apply with a tqdm progress bar: one download URL per table row.
data_accession_taxon["fasta_url_download"] = data_accession_taxon.progress_apply(
    get_fasta_url_download,
    axis=1,
)

100%|██████████| 1000/1000 [00:00<00:00, 29787.61it/s]


In [14]:
# Display the table again, now with the fasta_url_download column.
data_accession_taxon

Unnamed: 0,accession,tax_id,organism_name,fasta_url_download
0,GCF_000002985.6,6239,Caenorhabditis elegans,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
1,GCF_000001405.40,9606,Homo sapiens,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
2,GCF_000001635.27,10090,Mus musculus,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
3,GCF_000001215.4,7227,Drosophila melanogaster,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
4,GCF_000002035.6,7955,Danio rerio,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
...,...,...,...,...
995,GCA_927399485.1,1371681,Calamotropha paludella,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
996,GCA_929112965.1,987983,Mythimna albipuncta,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
997,GCA_905220435.1,689058,Amphipyra tragopoginis,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
998,GCA_907165235.1,1101072,Zeuzera pyrina,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...


In [15]:
# Show the full (untruncated) URL generated for the first row.
data_accession_taxon.iloc[0].fasta_url_download

'https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_000002985.6/download?include_annotation_type=DEFAULT,GENOME_FASTA&filename=Caenorhabditiselegans.zip'

In [16]:
# Write the table to the processed-data directory, omitting the index column.
data_accession_taxon.to_csv(
    os.path.join(PROCESSED_DATA_PATH, "data_accession_taxon_fasta.csv"),
    index=False,
)