In [1]:
import requests
import json
import os
import pandas as pd
import re
from tqdm import tqdm

# Register tqdm's pandas integration so that `progress_apply` (used further
# down when building the URL column) is available on DataFrames.
tqdm.pandas()

In [2]:
# Data directories, expressed relative to the current working directory.
# RAW_DATA_PATH is not referenced by the cells visible in this notebook.
RAW_DATA_PATH = os.path.join("data", "raw")
# PROCESSED_DATA_PATH is where the input CSV is read from and the result is written to.
PROCESSED_DATA_PATH = os.path.join("data", "processed")

Load Data CSV Taxon and Accession

In [3]:
# Load the accession/taxon table from the processed-data directory.
# NOTE(review): the displayed frame carries an "Unnamed: 0" column, which
# suggests the CSV was saved together with its index — confirm whether it
# should be read with index_col=0 or dropped.
data_accession_taxon = pd.read_csv(os.path.join(PROCESSED_DATA_PATH, "data_accession_taxon.csv"))

In [4]:
# Number of distinct taxonomy IDs; dropna=False keeps the count identical to
# len(unique()), which would count a missing value as one entry.
data_accession_taxon["tax_id"].nunique(dropna=False)

823

In [5]:
# Peek at five random rows. The fixed seed makes the preview identical on every
# Restart & Run All instead of showing different rows each time.
data_accession_taxon.sample(5, random_state=42)

Unnamed: 0,accession,tax_id,organism_name,common_name
725,GCA_026262465.1,9915,Bos indicus,zebu cattle
1443,GCA_015145955.1,9606,Homo sapiens,human
2460,GCA_900008165.2,9606,Homo sapiens,human
2509,GCA_904813355.1,9305,Sarcophilus harrisii,Tasmanian devil
2440,GCA_900008175.2,9606,Homo sapiens,human


In [6]:
# Ad-hoc check: rows whose organism name contains the literal substring "ixo".
# na=False treats a missing name as a non-match (otherwise the mask would hold
# NaN and boolean indexing would raise); regex=False makes this a plain
# substring test rather than a regular-expression search.
data_accession_taxon[
    data_accession_taxon["organism_name"].str.contains("ixo", regex=False, na=False)
]

Unnamed: 0,accession,tax_id,organism_name,common_name


URL Download FASTA

In [7]:
# URL template for the NCBI Datasets v2alpha `download_summary` endpoint;
# `{accession}` is substituted with a genome assembly accession via str.format.
urls_summary = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/{accession}/download_summary?include_annotation_type[0]=GENOME_FASTA"

In [8]:
# Preview the fully formatted summary URL for one example accession.
urls_summary.format(accession="GCF_030316605.1")

'https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_030316605.1/download_summary?include_annotation_type[0]=GENOME_FASTA'

In [9]:
# Query the download summary for one example accession.
# A timeout is set because requests waits indefinitely by default, which would
# hang the notebook if the API stalls.
summary_response = requests.get(
    urls_summary.format(accession="GCF_030316605.1"),
    timeout=30,
)
# Fail here on an HTTP error status rather than later, when the cells below
# try to read keys from an error payload.
summary_response.raise_for_status()

In [10]:
# Pretty-print the decoded summary payload with four-space indentation.
print(
    json.dumps(
        summary_response.json(),
        indent=4,
    )
)

{
    "record_count": 1,
    "resource_updated_on": "2023-11-22T05:35:23Z",
    "hydrated": {
        "estimated_file_size_mb": 1,
        "url": "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_030316605.1/download?include_annotation_type=DEFAULT,GENOME_FASTA",
        "cli_download_command_line": "datasets download genome accession GCF_030316605.1 --include genome"
    },
    "dehydrated": {
        "estimated_file_size_mb": 1,
        "url": "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_030316605.1/download?hydrated=DATA_REPORT_ONLY&include_annotation_type=DEFAULT,GENOME_FASTA",
        "cli_download_command_line": "datasets download genome accession GCF_030316605.1 --include genome --dehydrated",
        "cli_rehydrate_command_line": "datasets rehydrate --help"
    },
    "available_files": {
        "all_genomic_fasta": {
            "file_count": 1,
            "size_mb": 0.5104723
        },
        "genome_gff": {
            "file_count

In [11]:
# Pull the download URL out of the "hydrated" entry of the summary payload.
urls_download = summary_response.json()["hydrated"]["url"]

In [12]:
# Display the download URL extracted from the summary response.
urls_download

'https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_030316605.1/download?include_annotation_type=DEFAULT,GENOME_FASTA'

List All Download URLs

In [13]:
# Display the accession/taxon table before the URL column is added.
data_accession_taxon

Unnamed: 0,accession,tax_id,organism_name,common_name
0,GCF_000001405.40,9606,Homo sapiens,human
1,GCF_000001635.27,10090,Mus musculus,house mouse
2,GCF_019923935.1,89462,Bubalus bubalis,water buffalo
3,GCA_023701655.1,2918886,Ovis ammon polii x Ovis aries,
4,GCA_032405125.1,34865,Capricornis sumatraensis,Sumatran serow
...,...,...,...,...
2853,GCA_016894425.1,9606,Homo sapiens,human
2854,GCA_023065335.1,9823,Sus scrofa,pig
2855,GCA_023065355.1,9823,Sus scrofa,pig
2856,GCA_030378505.1,9913,Bos taurus,cattle


In [14]:
# URL template for the download endpoint: `{accession}` goes into the path and
# `{filename}` is sent as the `filename` query parameter.
# NOTE(review): the download_summary response shown earlier in this notebook
# advertises `include_annotation_type=DEFAULT,GENOME_FASTA`, whereas this
# template sends only `DEFAULT` — confirm the resulting archive still contains
# the genomic FASTA.
urls_download_template = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/{accession}/download?include_annotation_type=DEFAULT&filename={filename}"

# Matches every character that is NOT an ASCII letter, digit or dot. Written as
# a raw string so the pattern contains no invalid escape sequence, and without
# whitespace in the allowed set so spaces and line breaks are stripped in one pass.
_FILENAME_UNSAFE_RE = re.compile(r"[^a-zA-Z0-9.]")


def get_fasta_url_download(row):
    """Build the download URL for a single accession.

    Parameters
    ----------
    row : mapping
        Anything indexable by key that provides "accession" and
        "organism_name" (a DataFrame row passed by ``apply(axis=1)``, or a
        plain dict).

    Returns
    -------
    str
        ``urls_download_template`` filled in with the accession and a filename
        of the form ``<SanitizedOrganismName>.zip``. The organism name is
        reduced to ASCII letters, digits and dots. When the name is missing
        (not a string, e.g. NaN) or nothing is left after sanitizing, the
        accession is used as the filename stem instead.
    """
    accession = row["accession"]
    organism_name = row["organism_name"]

    # pandas represents a missing name as float NaN, which the regex engine
    # rejects with TypeError; treat any non-string as "no usable name".
    if isinstance(organism_name, str):
        stem = _FILENAME_UNSAFE_RE.sub("", organism_name)
    else:
        stem = ""

    # Never emit a bare ".zip": fall back to the accession as the stem.
    if not stem:
        stem = str(accession)

    return urls_download_template.format(accession=accession, filename=stem + ".zip")


# Build one download URL per row (with a tqdm progress bar) and store the
# result in a new "fasta_url_download" column.
data_accession_taxon["fasta_url_download"] = data_accession_taxon.progress_apply(
    get_fasta_url_download,
    axis=1,
)

100%|██████████| 2858/2858 [00:00<00:00, 114986.29it/s]


In [15]:
# Display the table again, now including the fasta_url_download column.
data_accession_taxon

Unnamed: 0,accession,tax_id,organism_name,common_name,fasta_url_download
0,GCF_000001405.40,9606,Homo sapiens,human,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
1,GCF_000001635.27,10090,Mus musculus,house mouse,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
2,GCF_019923935.1,89462,Bubalus bubalis,water buffalo,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
3,GCA_023701655.1,2918886,Ovis ammon polii x Ovis aries,,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
4,GCA_032405125.1,34865,Capricornis sumatraensis,Sumatran serow,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
...,...,...,...,...,...
2853,GCA_016894425.1,9606,Homo sapiens,human,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
2854,GCA_023065335.1,9823,Sus scrofa,pig,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
2855,GCA_023065355.1,9823,Sus scrofa,pig,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...
2856,GCA_030378505.1,9913,Bos taurus,cattle,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/...


In [16]:
# Show the untruncated URL generated for the first row.
data_accession_taxon["fasta_url_download"].iloc[0]

'https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_000001405.40/download?include_annotation_type=DEFAULT&filename=Homosapiens.zip'

In [17]:
# Write the table, including the new URL column, to the processed-data
# directory; index=False keeps the DataFrame index out of the file.
data_accession_taxon.to_csv(os.path.join(PROCESSED_DATA_PATH, "data_accession_taxon_fasta.csv"), index=False)