# Pre-download frequently used genomes

Download the genome annotations (GTF) for the species listed in the orthology table and convert each of them to a sorted BED file.

**Input**: Orthology table (.xlsx/.xls, .csv, .tsv), version (release) of Ensembl Compara used, path to the directory in which to save the genome files

In [1]:
import pandas as pd
import numpy as np
import pathlib
import urllib.request
import gget
import requests
import sys
import json

In [2]:
# Enter the desired Ensembl release; it is passed to gget.ref when looking up
# the GTF download links.
ensembl_release = 109

In [3]:
# Enter the path to the genomes directory; the downloaded GTF annotations and
# the BED files generated from them are stored here.
downloaded_genomes_dir = pathlib.Path("genomes/")

In [4]:
# Enter the path to the orthology table; it is passed to get_orthology_table.
orthology_table_file = pathlib.Path("/home/vecerkok/orthology_table/orthology_table-68species_ensembl_ids.csv")

In [5]:
def get_orthology_table(table_file: "str | pathlib.Path"):
    """Load the orthology table and rename its columns to Ensembl GTF file names.

    The file format is chosen from the file extension. The first column of the
    table is used as the index and the first row as the header; the remaining
    column labels are species names.

    Args:
        table_file: Path to the orthology table (.csv, .tsv, .xlsx or .xls).

    Returns:
        A ``(table_df, species_names)`` tuple: the table with renamed columns
        and the list of those column names, in table order.

    Raises:
        ValueError: If the file extension is not one of the accepted formats.
    """
    # Derive the format from the path itself (case-insensitively).
    file_type = pathlib.Path(table_file).suffix.lower()

    if file_type == ".csv":
        table_df = pd.read_csv(table_file, sep=",", header=0, index_col=0, low_memory=False)
    elif file_type == ".tsv":
        table_df = pd.read_csv(table_file, sep="\t", header=0, index_col=0, low_memory=False)
    elif file_type in (".xlsx", ".xls", ".xsls", ".xsl"):
        # ".xsls"/".xsl" are kept only for backward compatibility with the
        # previously documented (misspelled) extensions.
        # read_excel has no `low_memory` option, so it is not passed here.
        table_df = pd.read_excel(table_file, header=0, index_col=0)
    else:
        raise ValueError(
            f"Accepted file formats: .csv, .tsv, .xlsx, .xls! Got: {file_type!r}"
        )

    # Species whose Ensembl GTF file name differs from the plain species name.
    # Exact-match lookup keeps the renaming idempotent: a column that is
    # already in GTF format is not suffixed a second time.
    gtf_name_overrides = {
        "heterocephalus_glaber": "heterocephalus_glaber_female",
        "gorilla_gorilla_gorilla": "gorilla_gorilla",
        "cricetulus_griseus": "cricetulus_griseus_chok1gshd",
        "ovis_aries": "ovis_aries_rambouillet",
    }

    # modify species names to gtf filename format
    species_names = []
    for column in table_df.columns:
        name = column.lower().replace(" ", "_")
        species_names.append(gtf_name_overrides.get(name, name))

    table_df.columns = species_names
    return table_df, species_names

In [6]:
# Load the orthology table; the result is unpacked into the table itself and
# the list of species names.
orthology_table_df, species_names = get_orthology_table(orthology_table_file)

In [7]:
# Check that the assemblies in the orthology table correspond to assemblies in the Ensembl db.
# The configured `ensembl_release` is used (not a hard-coded number) so that this
# filter always refers to the same release the annotations are downloaded from.
species_in_db_release = gget.ref(species=None, list_species=True, release=ensembl_release)  # list all species in release
species_names = [x for x in species_names if x in species_in_db_release]  # intersection, in table order

Sun Apr 23 18:23:20 2023 INFO Fetching available genomes (GTF and FASTAs present) from Ensembl release 109.


In [8]:
def is_genome_annotation_downloaded(organism_name: str):
    """Tell whether ``<organism_name>.gtf`` exists in the genomes directory.

    Args:
        organism_name: Species name used as the GTF file stem.

    Returns:
        True if the decompressed annotation file is present, False otherwise.
    """
    gtf_filename = organism_name + ".gtf"
    return downloaded_genomes_dir.joinpath(gtf_filename).exists()

In [9]:
def download_genome_annotation(organism_name: str):
    """Download and decompress the Ensembl GTF annotation of one species.

    The FTP link is looked up with ``gget.ref`` for the release configured in
    ``ensembl_release``. The archive is stored as ``<organism_name>.gtf.gz``
    in ``downloaded_genomes_dir`` and unpacked to ``<organism_name>.gtf``;
    the archive is removed afterwards, as ``gzip -d`` would do.

    Args:
        organism_name: Species name as used by gget/Ensembl file names
            (e.g. ``homo_sapiens``).
    """
    # Function-scope imports keep this cell self-contained.
    import gzip
    import shutil

    # get genome annotation gtf
    gtf_ftp = gget.ref(organism_name, which=["gtf"], release=ensembl_release)[organism_name]["annotation_gtf"]["ftp"]
    print(gtf_ftp)

    # urlretrieve does not create missing directories
    downloaded_genomes_dir.mkdir(parents=True, exist_ok=True)

    # download gtf genome annotation
    genome_file = downloaded_genomes_dir / (organism_name + ".gtf.gz")
    urllib.request.urlretrieve(gtf_ftp, genome_file)

    # Decompress the genome annotation in Python rather than through an
    # unquoted shell command, so paths containing spaces work and the code
    # also runs outside IPython.
    gtf_file = downloaded_genomes_dir / (organism_name + ".gtf")
    partial_file = downloaded_genomes_dir / (organism_name + ".gtf.part")
    with gzip.open(genome_file, "rb") as compressed, open(partial_file, "wb") as unpacked:
        shutil.copyfileobj(compressed, unpacked)

    # Rename only after a complete decompression, so an interrupted run never
    # leaves a truncated .gtf that would later be treated as "already downloaded".
    partial_file.replace(gtf_file)
    genome_file.unlink()

In [10]:
def get_genome_annotation(organism_name: str):
    """Make sure the GTF annotation of a species is available locally.

    Downloads and decompresses the annotation only when it is not yet present;
    a status message is printed in either case.

    Args:
        organism_name: Species name used as the GTF file stem.
    """
    if not is_genome_annotation_downloaded(organism_name):
        print("Downloading and then decompressing the genome annotation!")
        download_genome_annotation(organism_name)
        return

    print("Genome annotation already downloaded and decompressed!")

In [11]:
# Download the annotations one species at a time; each species name is printed
# first so the status message that follows can be attributed to it.
for species in species_names:
    print(species)
    get_genome_annotation(species)

homo_sapiens
Genome annotation already downloaded and decompressed!
cricetulus_griseus_chok1gshd
Genome annotation already downloaded and decompressed!
mesocricetus_auratus
Genome annotation already downloaded and decompressed!
rattus_norvegicus
Genome annotation already downloaded and decompressed!
cavia_porcellus
Genome annotation already downloaded and decompressed!
octodon_degus
Genome annotation already downloaded and decompressed!
heterocephalus_glaber_female
Genome annotation already downloaded and decompressed!
maylandia_zebra
Genome annotation already downloaded and decompressed!
pteropus_vampyrus
Genome annotation already downloaded and decompressed!
monodelphis_domestica
Genome annotation already downloaded and decompressed!
pelodiscus_sinensis
Genome annotation already downloaded and decompressed!
anolis_carolinensis
Genome annotation already downloaded and decompressed!
pundamilia_nyererei
Genome annotation already downloaded and decompressed!
vicugna_pacos
Genome annotati

In [12]:
def gtf_to_bed(assembly_name):
    """Convert the GTF annotation of one assembly to a BED file with ``gtf2bed``.

    Reads ``<assembly_name>.gtf`` from ``downloaded_genomes_dir`` and writes
    ``sorted_<assembly_name>.bed`` to the same directory.
    NOTE(review): the "sorted_" prefix relies on gtf2bed (BEDOPS) sorting its
    output by default - confirm for the installed version.

    Args:
        assembly_name: Species/assembly name used as the GTF file stem.

    Raises:
        FileNotFoundError: If the GTF file or the ``gtf2bed`` executable is missing.
        subprocess.CalledProcessError: If ``gtf2bed`` exits with a non-zero status.
    """
    # Function-scope import keeps this cell self-contained.
    import subprocess

    # convert gtf file to bed format
    gtf_file = downloaded_genomes_dir / (assembly_name + ".gtf")
    bed_file = downloaded_genomes_dir / ("sorted_" + assembly_name + ".bed")

    # Feed the files through stdin/stdout without a shell: paths with spaces
    # work, and a failing conversion raises instead of silently leaving an
    # empty or partial BED file behind.
    with open(gtf_file, "rb") as gtf_in, open(bed_file, "wb") as bed_out:
        subprocess.run(["gtf2bed"], stdin=gtf_in, stdout=bed_out, check=True)

In [13]:
# Generate the sorted BED files from the genome annotations, one species at a
# time; the species name is printed as a progress indicator.
for species in species_names:
    print(species)
    gtf_to_bed(species)

homo_sapiens
cricetulus_griseus_chok1gshd
mesocricetus_auratus
rattus_norvegicus
cavia_porcellus
octodon_degus
heterocephalus_glaber_female
maylandia_zebra
pteropus_vampyrus
monodelphis_domestica
pelodiscus_sinensis
anolis_carolinensis
pundamilia_nyererei
vicugna_pacos
otolemur_garnettii
takifugu_rubripes
neolamprologus_brichardi
chinchilla_lanigera
sorex_araneus
ictidomys_tridecemlineatus
geospiza_fortis
jaculus_jaculus
myotis_lucifugus
taeniopygia_guttata
ficedula_albicollis


chlorocebus_sabaeus
nomascus_leucogenys
gasterosteus_aculeatus
petromyzon_marinus
latimeria_chalumnae
lepisosteus_oculatus
danio_rerio
microtus_ochrogaster
astyanax_mexicanus
gadus_morhua
xiphophorus_maculatus
oryzias_latipes
oreochromis_niloticus
haplochromis_burtoni
xenopus_tropicalis
chrysemys_picta_bellii
gallus_gallus
meleagris_gallopavo
ornithorhynchus_anatinus
sarcophilus_harrisii
notamacropus_eugenii
dasypus_novemcinctus
erinaceus_europaeus
echinops_telfairi
callithrix_jacchus


macaca_fascicularis
macaca_mulatta
gorilla_gorilla
pan_troglodytes
pongo_abelii
canis_lupus_familiaris
ailuropoda_melanoleuca
mustela_putorius_furo
felis_catus
tursiops_truncatus
loxodonta_africana
equus_caballus
bos_taurus
capra_hircus
ovis_aries_rambouillet
ochotona_princeps
oryctolagus_cuniculus
tetraodon_nigroviridis
