# In [1]:
"""
Created on Mon Mar  3 17:10:54 2025

@author: Sachithra Yaddehige
"""

'''This tool downloads GBFF files from RefSeq, given a .txt file that lists the GCF accession numbers of the genomes, one per line.'''

import os
import re
import requests
import gzip
import shutil
from pathlib import Path
from urllib.parse import urljoin

# Helper function to extract triplets
def extract_triplets(gcf_number):
    """Build the slash-separated digit-group path for a GCF accession.

    The run of digits between ``GCF_`` and the following ``.`` is cut into
    consecutive groups of up to three characters, which are then joined
    with ``/`` (for example ``GCF_000005845.2`` gives ``000/005/845``).

    Args:
        gcf_number: Accession string containing ``GCF_<digits>.``.

    Returns:
        The joined groups as a string, or ``None`` (after printing a
        message) when the accession does not contain the expected pattern.
    """
    found = re.search(r'GCF_(\d+)\.', gcf_number)
    if found is None:
        print(f"Invalid GCF number format: {gcf_number}")
        return None

    number = found.group(1)
    pieces = []
    start = 0
    # Walk the digit string in steps of three; the last piece may be shorter.
    while start < len(number):
        pieces.append(number[start:start + 3])
        start += 3
    return '/'.join(pieces)

# Function to download and extract genome
def download_genome(gcf_number, output_dir="downloads", timeout=60):
    """Download ``<gcf_number>_genomic.gbff.gz`` from the NCBI server and unpack it.

    The archive and the extracted file are first written under temporary
    ``.part`` names and only renamed to their final names once complete.
    If anything goes wrong (network error, corrupt archive, Ctrl-C), every
    file created by this call is removed, so a later run is not fooled by
    the "already exists" check into skipping a truncated file.

    Args:
        gcf_number: Accession used both for the triplet directory path and
            as the directory / file-name prefix on the server.
        output_dir: Directory that receives the extracted ``.gbff`` file;
            created if missing.
        timeout: Seconds to wait for the server to connect/send data before
            giving up (passed to ``requests``).

    Returns:
        None. Progress and skip reasons are reported with ``print``.

    Raises:
        requests.RequestException: If the download request fails (including
            the ``HTTPError`` raised by ``raise_for_status``) or times out.
        OSError / EOFError: If writing or decompressing the file fails.
    """
    triplet_path = extract_triplets(gcf_number)
    if not triplet_path:
        return

    base_url = f"https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/{triplet_path}/{gcf_number}/"
    genome_file = f"{gcf_number}_genomic.gbff"
    genome_gz_file = genome_file + ".gz"

    output_gbff_path = Path(output_dir) / genome_file
    output_gz_path = Path(output_dir) / genome_gz_file

    if output_gz_path.exists() or output_gbff_path.exists():
        print(f"File already exists for {gcf_number}, skipping.")
        return

    genome_url = urljoin(base_url, genome_gz_file)
    # Cheap existence probe before starting a streamed download.
    response = requests.head(genome_url, timeout=timeout)
    if response.status_code != 200:
        print(f"File not found for {gcf_number}")
        return

    os.makedirs(output_dir, exist_ok=True)

    # Staging names: final names only ever refer to complete files.
    partial_gz_path = output_gz_path.with_name(output_gz_path.name + ".part")
    partial_gbff_path = output_gbff_path.with_name(output_gbff_path.name + ".part")

    try:
        with requests.get(genome_url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_gz_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_gz_path, output_gz_path)
        print(f"Downloaded: {output_gz_path}")

        with gzip.open(output_gz_path, 'rb') as f_in:
            with open(partial_gbff_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.replace(partial_gbff_path, output_gbff_path)
    except BaseException:
        # BaseException so that Ctrl-C also cleans up. Remove everything this
        # call created that is not a finished .gbff, then re-raise unchanged.
        for leftover in (partial_gz_path, output_gz_path, partial_gbff_path):
            try:
                os.remove(leftover)
            except OSError:
                pass
        raise

    os.remove(output_gz_path)
    print(f"Extracted: {output_gbff_path}")

# Set the input file and output directory (manually, not via argparse).
# The typed name is stripped so a stray leading/trailing space (easy to add
# when pasting) does not make open() fail.
input_file = input("please type your file name as <name.txt> and hit enter ").strip()
output_dir = "bacteria_downloads"

# Read GCF numbers (one per line, blank lines ignored) and process them
with open(input_file, 'r') as file:
    gcf_list = [line.strip() for line in file if line.strip()]

# Accessions whose download/extraction raised; reported at the end.
failed_gcfs = []

for gcf in gcf_list:
    try:
        download_genome(gcf, output_dir)
    except Exception as err:
        # Top-level boundary of a batch job: one bad accession or a transient
        # network error should not abort the remaining downloads.
        # KeyboardInterrupt is not an Exception subclass, so Ctrl-C still stops the run.
        print(f"Failed to process {gcf}: {err}")
        failed_gcfs.append(gcf)

if failed_gcfs:
    print(f"{len(failed_gcfs)} of {len(gcf_list)} accessions failed: {', '.join(failed_gcfs)}")


# KeyboardInterrupt: Interrupted by user