# Pre-download frequently used genomes

In [19]:
import gzip
import json
import pathlib
import shutil
import sys
import urllib.request

import gget
import numpy as np
import pandas as pd
import requests

In [20]:
# Ensembl release number; passed to gget.ref() when looking up each species' GTF download URL.
ensembl_release = 109

In [21]:
# Directory (relative to the current working directory) that holds the per-species
# .gtf annotations and the BED files generated from them.
downloaded_genomes_dir = pathlib.Path("genomes/")

In [22]:
# CSV orthology table loaded by get_orthology_table().
# NOTE(review): absolute, user-specific path -- the notebook will not run on another
# machine unless this is adjusted; consider making it relative or configurable.
orthology_table_file = pathlib.Path("/home/vecerkok/orthology_table/orthology_table-68species_ensembl_ids.csv")

In [23]:
def get_orthology_table(table_path):
    """Load the orthology table and rename its columns to GTF file-name style.

    Parameters
    ----------
    table_path : str or path-like
        Location of the orthology table CSV. The file is expected to contain an
        ``"Unnamed: 0"`` column (a saved pandas index), which is dropped.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The table with renamed columns, and the list of those column names.
        Names are lower-cased, spaces become underscores, and a few species are
        mapped to the assembly-specific names listed below.
    """
    # Read from the path the caller passed in rather than a notebook-level global.
    orthology_table_df = pd.read_csv(table_path, low_memory=False).drop(columns="Unnamed: 0")

    # Species whose name in the table differs from the assembly-specific name
    # used for the annotation files.
    assembly_specific_names = {
        "heterocephalus_glaber": "heterocephalus_glaber_female",
        "gorilla_gorilla_gorilla": "gorilla_gorilla",
        "cricetulus_griseus": "cricetulus_griseus_chok1gshd",
        "ovis_aries": "ovis_aries_rambouillet",
    }

    # modify names to gtf filename format
    species_names = [x.lower().replace(" ", "_") for x in orthology_table_df.columns]

    # Exact-match lookup: a substring replace would also rewrite names that merely
    # contain one of the keys (including names that were already converted).
    species_names = [assembly_specific_names.get(x, x) for x in species_names]

    orthology_table_df.columns = species_names
    return orthology_table_df, species_names

In [24]:
# Load the table once; species_names drives the download and GTF-to-BED loops below.
orthology_table_df, species_names = get_orthology_table(orthology_table_file)

In [25]:
# check if assemblies in orthology table correspond to assemblies in Ensembl db
# Query the same release the downloads use (ensembl_release) so the two cannot drift apart.
species_in_db_release = gget.ref(species=None, list_species=True, release=ensembl_release) # list all species in release
species_in_db_release
#species_names = [x for x in species_names if x in species_in_db_release] # intersect of the two lists

Mon Feb 20 20:52:10 2023 INFO Fetching available genomes (GTF and FASTAs present) from Ensembl release 109.


['acanthochromis_polyacanthus',
 'accipiter_nisus',
 'ailuropoda_melanoleuca',
 'amazona_collaria',
 'amphilophus_citrinellus',
 'amphiprion_ocellaris',
 'amphiprion_percula',
 'anabas_testudineus',
 'anas_platyrhynchos',
 'anas_platyrhynchos_platyrhynchos',
 'anas_zonorhyncha',
 'anolis_carolinensis',
 'anser_brachyrhynchus',
 'anser_cygnoides',
 'aotus_nancymaae',
 'apteryx_haastii',
 'apteryx_owenii',
 'apteryx_rowi',
 'aquila_chrysaetos_chrysaetos',
 'astatotilapia_calliptera',
 'astyanax_mexicanus',
 'astyanax_mexicanus_pachon',
 'athene_cunicularia',
 'balaenoptera_musculus',
 'betta_splendens',
 'bison_bison_bison',
 'bos_grunniens',
 'bos_indicus_hybrid',
 'bos_mutus',
 'bos_taurus',
 'bos_taurus_hybrid',
 'bubo_bubo',
 'buteo_japonicus',
 'caenorhabditis_elegans',
 'cairina_moschata_domestica',
 'calidris_pugnax',
 'calidris_pygmaea',
 'callithrix_jacchus',
 'callorhinchus_milii',
 'camarhynchus_parvulus',
 'camelus_dromedarius',
 'canis_lupus_dingo',
 'canis_lupus_familiaris'

```
# Tally which Ensembl species/assembly the sheep gene IDs in the orthology table
# resolve to, using the Ensembl REST lookup endpoint (first 100 genes only).
ovis_aries = "ovis_aries"
ovis_aries_score = 0
ovis_aries_rambouillet = "ovis_aries_rambouillet"
ovis_aries_rambouillet_score = 0

server = "https://rest.ensembl.org"

# Keep the gene list under its own name: reusing `ovis_aries` for it overwrote the
# species label, so the comparison below could never match.
# NOTE(review): get_orthology_table() renames this column to "ovis_aries_rambouillet";
# the key below only exists on a table that has not gone through that rename -- confirm.
sheep_table_genes = orthology_table_df["ovis_aries"].values.tolist()
sheep_table_genes = [x for x in sheep_table_genes if str(x) != "nan"]
for gene in sheep_table_genes[:100]:

    ext = f"/lookup/id/{gene}"
    response = requests.get(server+ext, headers={"Content-Type": "application/json"})
    species = response.json()["species"]
    if species == ovis_aries:
        ovis_aries_score += 1
    elif species == ovis_aries_rambouillet:
        ovis_aries_rambouillet_score += 1
    else:
        # species that is neither of the two candidates
        print(species)
        
print(f"{ovis_aries}: {ovis_aries_score}")
print(f"{ovis_aries_rambouillet}: {ovis_aries_rambouillet_score}")
```

```
# Tally which of the three Chinese hamster assemblies in Ensembl the gene IDs in the
# orthology table resolve to, using the Ensembl REST lookup endpoint.
chok1gshd = "cricetulus_griseus_chok1gshd"
chok1gshd_score = 0
crigri = "cricetulus_griseus_crigri"
crigri_score = 0
picr = "cricetulus_griseus_picr"
picr_score = 0

server = "https://rest.ensembl.org"

# NOTE(review): get_orthology_table() renames columns to lower-case/underscore form;
# the key below only exists on a table that has not gone through that rename -- confirm.
chinese_hamster_table_genes = orthology_table_df["Cricetulus griseus"].values.tolist()
chinese_hamster_table_genes = [x for x in chinese_hamster_table_genes if str(x) != "nan"]
for gene in chinese_hamster_table_genes:

    ext = f"/lookup/id/{gene}"
    response = requests.get(server+ext, headers={"Content-Type": "application/json"})
    species = response.json()["species"]
    if species == chok1gshd:
        chok1gshd_score += 1
    elif species == crigri:
        crigri_score += 1
    elif species == picr:
        picr_score += 1
    else:
        # species that is none of the three candidates
        print(species)
        
print(f"{chok1gshd}: {chok1gshd_score}")
print(f"{crigri}: {crigri_score}")
print(f"{picr}: {picr_score}")
```

```
# TODO find naked mole rat assembly
# Tally whether the naked mole rat gene IDs in the orthology table resolve to the
# female or the male assembly, using the Ensembl REST lookup endpoint.
female = "heterocephalus_glaber_female"
female_score = 0
male = "heterocephalus_glaber_male"
male_score = 0

server = "https://rest.ensembl.org"

# NOTE(review): get_orthology_table() renames columns to lower-case/underscore form;
# the key below only exists on a table that has not gone through that rename -- confirm.
naked_mole_rat_table_genes = orthology_table_df["Heterocephalus glaber"].values.tolist()
naked_mole_rat_table_genes = [x for x in naked_mole_rat_table_genes if str(x) != "nan"]
for i, gene in enumerate(naked_mole_rat_table_genes):

    ext = f"/lookup/id/{gene}"
    response = requests.get(server+ext, headers={"Content-Type": "application/json"})
    species = response.json()["species"]
    if species == female:
        female_score += 1
    elif species == male:
        male_score += 1
    else:
        # species that is neither of the two candidates
        print(species)
        
    # running totals every 500 genes as a progress indicator
    if (i % 500) == 0:
        print(f"{female}: {female_score}")
        print(f"{male}: {male_score}")
        
print(f"{female}: {female_score}")
print(f"{male}: {male_score}")
```

In [26]:
def is_genome_annotation_downloaded(organism_name: str):
    """Return True when ``<organism_name>.gtf`` is present in downloaded_genomes_dir."""
    return downloaded_genomes_dir.joinpath(organism_name + ".gtf").exists()

In [27]:
# def is_genome_annotation_unziped(organism_name: str):
#     organism_genome_annotation = downloaded_genomes_dir / (organism_name + ".gtf")
#     return organism_genome_annotation.exists()

In [28]:
def download_genome_annotation(organism_name: str):
    """Download and decompress the Ensembl GTF annotation for one species.

    The download URL is looked up with gget.ref() for the notebook's
    ``ensembl_release``. The result is written to
    ``downloaded_genomes_dir / "<organism_name>.gtf"``; the intermediate
    ``.gtf.gz`` archive is removed once decompression succeeds.

    Parameters
    ----------
    organism_name : str
        Species name as used by Ensembl / gget (e.g. ``"homo_sapiens"``).
    """
    # get genome annotation gtf
    gtf_ftp = gget.ref(organism_name, which=["gtf"], release=ensembl_release)[organism_name]["annotation_gtf"]["ftp"]
    print(gtf_ftp)

    # urlretrieve does not create missing directories
    downloaded_genomes_dir.mkdir(parents=True, exist_ok=True)

    # download gtf genome annotation
    compressed_file = downloaded_genomes_dir / (organism_name + ".gtf.gz")
    urllib.request.urlretrieve(gtf_ftp, compressed_file)

    # decompress the genome annotation
    # Write to a temporary name first and rename at the end, so an interrupted run
    # never leaves a truncated .gtf that is_genome_annotation_downloaded() would accept.
    genome_file = downloaded_genomes_dir / (organism_name + ".gtf")
    partial_file = downloaded_genomes_dir / (organism_name + ".gtf.part")
    with gzip.open(compressed_file, "rb") as compressed, open(partial_file, "wb") as decompressed:
        shutil.copyfileobj(compressed, decompressed)
    partial_file.replace(genome_file)

    # same end state as `gzip -d`: only the decompressed file remains
    compressed_file.unlink()

In [29]:
def get_genome_annotation(organism_name: str):
    """Make sure the species' GTF annotation is available locally.

    Downloads (and decompresses) it when it is not there yet; otherwise only
    reports that nothing needs to be done.
    """
    if not is_genome_annotation_downloaded(organism_name):
        print("Downloading and then decompressing the genome annotation!")
        download_genome_annotation(organism_name)
        return

    print("Genome annotation already downloaded and decompressed!")

In [30]:
# Download (and decompress) the GTF annotation of every species in the orthology
# table, one at a time; get_genome_annotation() skips species whose .gtf already exists.
for species in species_names:
    print(species)
    get_genome_annotation(species)

homo_sapiens
Genome annotation already downloaded and decompressed!
cricetulus_griseus_chok1gshd
Genome annotation already downloaded and decompressed!
mesocricetus_auratus
Genome annotation already downloaded and decompressed!
rattus_norvegicus
Genome annotation already downloaded and decompressed!
cavia_porcellus
Genome annotation already downloaded and decompressed!
octodon_degus
Genome annotation already downloaded and decompressed!
heterocephalus_glaber_female
Genome annotation already downloaded and decompressed!
maylandia_zebra
Genome annotation already downloaded and decompressed!
pteropus_vampyrus
Genome annotation already downloaded and decompressed!
monodelphis_domestica
Genome annotation already downloaded and decompressed!
pelodiscus_sinensis
Genome annotation already downloaded and decompressed!
anolis_carolinensis
Genome annotation already downloaded and decompressed!
pundamilia_nyererei
Genome annotation already downloaded and decompressed!
vicugna_pacos
Genome annotati

Mon Feb 20 20:52:13 2023 INFO Fetching reference information for ovis_aries_rambouillet from Ensembl release: 109.


http://ftp.ensembl.org/pub/release-109/gtf/ovis_aries_rambouillet/Ovis_aries_rambouillet.Oar_rambouillet_v1.0.109.gtf.gz
ochotona_princeps
Genome annotation already downloaded and decompressed!
oryctolagus_cuniculus
Genome annotation already downloaded and decompressed!
tetraodon_nigroviridis
Genome annotation already downloaded and decompressed!


In [31]:
def gtf_to_bed(organism_name):
    # convert gtf to bed format
    gtf_file = downloaded_genomes_dir / (organism_name + ".gtf")
    bed_file = downloaded_genomes_dir / ("sorted_" + organism_name + ".bed")
    !gtf2bed < $gtf_file > $bed_file

In [32]:
# generate sorted bed files from genome annotations
# Runs the conversion for every species on each execution; there is no "already
# converted" check, so existing BED files are regenerated and overwritten.
for species in species_names:
    print(species)
    gtf_to_bed(species)

homo_sapiens
cricetulus_griseus_chok1gshd
mesocricetus_auratus
rattus_norvegicus
cavia_porcellus
octodon_degus
heterocephalus_glaber_female
maylandia_zebra
pteropus_vampyrus
monodelphis_domestica
pelodiscus_sinensis
anolis_carolinensis
pundamilia_nyererei
vicugna_pacos
otolemur_garnettii
takifugu_rubripes
neolamprologus_brichardi
chinchilla_lanigera
sorex_araneus
ictidomys_tridecemlineatus
geospiza_fortis
jaculus_jaculus
myotis_lucifugus
taeniopygia_guttata
ficedula_albicollis


chlorocebus_sabaeus
nomascus_leucogenys
gasterosteus_aculeatus
petromyzon_marinus
latimeria_chalumnae
lepisosteus_oculatus
danio_rerio
microtus_ochrogaster
astyanax_mexicanus
gadus_morhua
xiphophorus_maculatus
oryzias_latipes
oreochromis_niloticus
haplochromis_burtoni
xenopus_tropicalis
chrysemys_picta_bellii
gallus_gallus
meleagris_gallopavo
ornithorhynchus_anatinus
sarcophilus_harrisii
notamacropus_eugenii
dasypus_novemcinctus
erinaceus_europaeus
echinops_telfairi
callithrix_jacchus


macaca_fascicularis
macaca_mulatta
gorilla_gorilla
pan_troglodytes
pongo_abelii
canis_lupus_familiaris
ailuropoda_melanoleuca
mustela_putorius_furo
felis_catus
tursiops_truncatus
loxodonta_africana
equus_caballus
bos_taurus
capra_hircus
ovis_aries_rambouillet
ochotona_princeps
oryctolagus_cuniculus
tetraodon_nigroviridis
