-
Notifications
You must be signed in to change notification settings - Fork 4
/
genome.py
66 lines (59 loc) · 2.7 KB
/
genome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import os
from util import remove_safely, check_call
class Genome(object):
all = dict()
by_accid = dict()
def __init__(self, category, organism, lineage, versioned_accession_ids, genome_assembly_URL=None):
self.category = category
self.organism = organism
self.lineage = lineage
assert ["subspecies", "species", "genus", "family"] == [l[0] for l in lineage]
self.subspecies_taxid, self.species_taxid, self.genus_taxid, self.family_taxid = [(l[1] or 0) for l in lineage]
self.taxid = self.subspecies_taxid or self.species_taxid
self.genome_assembly_URL = genome_assembly_URL
self.versioned_accession_ids = [Genome.ensure_versioned(vaccid) for vaccid in versioned_accession_ids]
self.key = f"{category}__{organism}__{self.taxid}"
self.filename = f"{self.key}.fasta"
# The size (number of bases) is filled in by fetch_all()
self.size = None
Genome.all[self.key] = self
for vaccid in self.versioned_accession_ids:
assert vaccid not in Genome.by_accid, "Accession ID {vaccid} should not occur in multiple genomes."
Genome.by_accid[vaccid] = self
@staticmethod
def ensure_versioned(vaccid):
if "." not in vaccid:
vaccid += ".1"
print(f"INFO: Changed {vaccid[:-2]} to {vaccid} in order to match ISS-generated read headers.")
return vaccid
@staticmethod
def fetch_versioned_accession_id(vaccid): # e.g., "NC_004325.2"
output_file = f"{vaccid}.fa"
if os.path.isfile(output_file):
print(f"{output_file} already exists, nice")
else:
try:
command = f"ncbi-acc-download --format fasta {vaccid} -e all"
check_call(command)
except:
remove_safely(output_file)
raise
return output_file
@staticmethod
def fetch_all():
for g in Genome.all.values():
remove_safely(g.filename)
accession_fas = []
for f in (g.versioned_accession_ids):
af = Genome.fetch_versioned_accession_id(f)
accession_fas.append(af)
accession_fastas = " ".join(accession_fas)
command = f"cat {accession_fastas} > {g.filename}"
check_call(command)
assert os.path.isfile(g.filename), f"Failed to download genome {g.filename}"
command = f"grep -v '^>' {g.filename} | tr -d '\n' | wc > {g.key}.size"
check_call(command)
with open(f"{g.key}.size") as f:
line = f.readline().rstrip()
g.size = int(line.split()[2])
print(f"Genome {g.key} size {g.size} bases.")