Skip to content

Commit

Permalink
Merge 95b2440 into 3b8f98b
Browse files Browse the repository at this point in the history
  • Loading branch information
iskandr committed Nov 16, 2020
2 parents 3b8f98b + 95b2440 commit dac9807
Show file tree
Hide file tree
Showing 8 changed files with 305 additions and 74 deletions.
82 changes: 68 additions & 14 deletions mhcgnomes/data/allele_aliases.yaml
Original file line number Diff line number Diff line change
@@ -1,16 +1,70 @@
####################################################
#
# Names which map to null "~" are those that don't yet have a known name in
# the "standard" format
#
####################################################

####################################################
#
# Swine allele aliases manually scraped from:
# https://www.ebi.ac.uk/ipd/mhc/
#
####################################################
SLA:
# treating "HB" as a locus
1*HB01: 1*HB:01
1*HB02: 1*HB:02
1*HB03: 1*HB:03
1*HB04: 1*HB:04
# provisional allele names
# Allele names from NetMHCpan
# many seem to be derived from Chinese publications that I can't read,
#
1*CHANGDA: ~
1*CDY.AA: 1*CDY
2*CDY.AA: 2*CDY
3*CDY.AA: 3*CDY
1*CDY: ~
2*CDY: ~
3*CDY: ~
1*HB01: ~
1*HB02: ~
1*HB03: ~
1*HB04: ~
2*HB01: ~
3*HB01: ~
1*LWH: ~
2*LWH: ~
3*LWH: ~
1*TPK.AA: 1*TPK
2*TPK.AA: 2*TPK
3*TPK.AA: 3*TPK
1*TPK: ~
2*TPK: ~
3*TPK: ~
1*YC.AA: 1*YC
2*YC.AA: 2*YC
3*YC.AA: 3*YC
1*YC: ~
2*YC: ~
3*YC: ~
1*YDL.AA: 1*YDL
2*YDL.AA: 2*YDL
3*YDL.AA: 3*YDL
1*YDL: ~
2*YDL: ~
3*YDL: ~
1*YDY.AA: 1*YDY
2*YDY.AA: 2*YDY
3*YDY01.AA: 3*YDY01
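# NOTE(review): this alias targets 3*YDY01 (same as 3*YDY01.AA) although a
# separate 3*YDY02 entry exists -- possibly a copy-paste slip; confirm.
3*YDY02.AA: 3*YDY01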
1*YDY: ~
2*YDY: ~
3*YDY01: ~
3*YDY02: ~
1*YTH.AA: 1*YTH
2*YTH.AA: 2*YTH
3*YTH.AA: 3*YTH
1*YTH: ~
2*YTH: ~
3*YTH: ~

# provisional allele names mapped to their permanent names
1*01rh28: 1*01:03
1*02we02: 1*02:03
1*04gx01: 1*04:03
Expand Down Expand Up @@ -235,13 +289,13 @@ Patr:
DRB*W903: DRB3*0703
BoLA:
# Source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4225172/
HD6: BoLA-6*013:01
JSP.1: BoLA-3*002:01
D18.4: BoLA-1*023:01
AW10: BoLA-3*001:01
T2A: BoLA-2*012:01
T2B: BoLA-6*041:01
# DRA: BoLA-DRA*01:01
T2C: ~
HD6: 6*013:01
JSP.1: 3*002:01
D18.4: 1*023:01
AW10: 3*001:01
T2a: 2*012:01
T2b: 6*041:01
T2c: ~
T7: ~
T5: ~
# DRA: DRA*01:01
185 changes: 136 additions & 49 deletions mhcgnomes/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ def __init__(
map_species_group_to_top_species: bool = MAP_SPECIES_GROUP_TO_TOP_SPECIES,
collapse_singleton_haplotypes: bool = COLLAPSE_SINGLETON_HAPLOTYPES,
collapse_singleton_serotypes: bool = COLLAPSE_SINGLETON_SEROTYPES,
gene_seps: Sequence[str] = GENE_SEPS):
gene_seps: Sequence[str] = GENE_SEPS,
verbose=False):
"""
map_allele_aliases : bool
Convert old allele aliases to newer names. For example,
Expand All @@ -80,12 +81,16 @@ def __init__(
collapse_singleton_serotypes : bool
If a serotype contains just one allele, return that instead of
the Serotype object containing it.
verbose : bool
Print the parse candidates for every distinct token
"""
self.map_allele_aliases = map_allele_aliases
self.map_species_group_to_top_species = map_species_group_to_top_species
self.collapse_singleton_haplotypes = collapse_singleton_haplotypes
self.collapse_singleton_serotypes = collapse_singleton_serotypes
self.gene_seps = gene_seps
self.verbose = verbose

def parse_species_from_prefix(self, name: str):
"""
Expand Down Expand Up @@ -522,22 +527,13 @@ def parse_allele_or_gene_candidates(
return []
candidate_results = []
if str_after_species in species.allele_aliases:
alias = species.allele_aliases[str_after_species]

if alias is None:
name = species.allele_aliases.original_key(str_after_species)
allele = AlleleWithoutGene.get(
species=species,
name=name,
raw_string=str_after_species)
candidate_results.append(allele)
elif self.map_allele_aliases:
parsed_alias = self.parse(
alias,
default_species=species,
raise_on_error=False)
if parsed_alias is not None:
candidate_results.append(parsed_alias)
original = species.allele_aliases.original_key(str_after_species)
if "*" not in original:
candidate_results.append(
AlleleWithoutGene.get(
species=species,
name=original,
raw_string=str_after_species))

for gene, allele_name in self.parse_gene_candidates(
species, str_after_species):
Expand All @@ -564,16 +560,11 @@ def parse_allele_with_gene(self, gene, str_after_gene):
if contains_whitespace(str_after_gene):
return None

str_after_gene = self.strip_extra_chars(str_after_gene)

species = gene.species
gene_name = gene.name
if self.map_allele_aliases:
# if the remaining string is an allele string which has
# been renamed or deprecated, then get its new/canonical form
# TODO: make this an optional transformation after parsing
new_allele_name = species.allele_aliases.get(
"%s*%s" % (gene_name, str_after_gene))
if new_allele_name:
gene_name, str_after_gene = new_allele_name.split("*")

if species.is_mouse:
if str_after_gene.isalnum() and not str_after_gene.isnumeric():
# mouse alleles can be a mixture of numbers and letters
Expand All @@ -584,8 +575,8 @@ def parse_allele_with_gene(self, gene, str_after_gene):
elif species.is_rat:
return Allele.get_with_gene(gene, str_after_gene.lower())
elif species.is_pig:
# parse e.g. "SLA-1-HB03" or "SLA-3-US#11"
if str_after_gene[:2] == "HB" or "#" in str_after_gene:
# parse e.g. "SLA-3-US#11"
if "#" in str_after_gene:
return Allele.get_with_gene(
gene,
str_after_gene.upper())
Expand Down Expand Up @@ -839,30 +830,107 @@ def parse_and_apply_mutations(
gene_to_mutations)


def transform_parse_candidates(
self,
parse_candidates: Sequence[Result],
raw_string: str):
def adjust_raw_string_and_transform_parse_candidates(
self, candidates: Sequence[Result], raw_string: str):
"""
Annotate every ParseResult in a list with its `raw_string` field
updated to `raw_string`.
Also perform optional transformations such as collapsing singleton
serotypes and haplotypes.
Returns
-------
List of Result objects
"""
results = []
for parse_candidate in parse_candidates:
if parse_candidate is None:
continue
t = type(parse_candidate)
if ((self.collapse_singleton_haplotypes and t is Haplotype) or
(self.collapse_singleton_serotypes and t is Serotype)):
simpler_result = parse_candidate.collapse_if_possible()
if simpler_result:
parse_candidate = simpler_result
for parse_candidate in candidates:
parse_candidate = parse_candidate.copy(raw_string=raw_string)
assert parse_candidate is not None
results.append(parse_candidate)
return self.transform_parse_candidates(results)

def transform_parse_candidate(self, parse_candidate : Result):
    """
    Apply the optional post-parse rewrites to a single Result.

    Dispatch is on the exact type of `parse_candidate`:

    - Haplotype / Serotype: replaced by `collapse_if_possible()` when the
      matching `collapse_singleton_*` flag is set on the parser.
    - Allele / AlleleWithoutGene: rewritten according to
      `species.allele_aliases` (see the two private helpers below).
    - any other type: left alone.

    Returns
    -------
    None if `parse_candidate` is None; otherwise the rewritten object, or
    `parse_candidate` itself when no rewrite produced a non-None value.
    """
    if parse_candidate is None:
        return None

    result_type = type(parse_candidate)
    replacement = None
    if result_type is Haplotype:
        if self.collapse_singleton_haplotypes:
            replacement = parse_candidate.collapse_if_possible()
    elif result_type is Serotype:
        if self.collapse_singleton_serotypes:
            replacement = parse_candidate.collapse_if_possible()
    elif result_type is Allele:
        replacement = self._alias_rewrite_for_allele(parse_candidate)
    elif result_type is AlleleWithoutGene:
        replacement = self._alias_rewrite_for_allele_without_gene(
            parse_candidate)

    return parse_candidate if replacement is None else replacement

def _alias_rewrite_for_allele(self, allele):
    """
    Alias handling for an Allele; returns the replacement object or None.

    Every gene name in `species.reverse_gene_aliases[gene.name]` is tried
    as "<gene>*<allele.name>" against `species.allele_aliases`; the first
    hit wins.
    """
    species = allele.species
    own_gene_name = allele.gene.name
    current_name = allele.name

    listed_name = alias_target = None
    for candidate_gene_name in species.reverse_gene_aliases[own_gene_name]:
        lookup_key = "%s*%s" % (candidate_gene_name, current_name)
        if lookup_key in species.allele_aliases:
            # part after "*" of whatever original_key() reports for this key
            listed_name = \
                species.allele_aliases.original_key(lookup_key).split("*")[1:]
            # keep the original's strict two-way unpacking semantics
            (listed_name,) = listed_name
            alias_target = species.allele_aliases[lookup_key]
            break

    replacement = None
    if listed_name and listed_name != current_name:
        replacement = allele.copy(allele_fields=listed_name.split(":"))

    if not (self.map_allele_aliases and alias_target):
        return replacement
    # only targets of the form "<gene>*<allele>" are followed
    if alias_target.count("*") != 1:
        return replacement

    target_gene_name, target_allele_name = alias_target.split("*")
    if target_allele_name == current_name:
        return replacement

    # the allele has been renamed or deprecated: switch to the new
    # gene / allele fields
    target_gene = Gene.get(
        species,
        target_gene_name)
    return allele.copy(
        gene=target_gene,
        allele_fields=target_allele_name.split(":"))

def _alias_rewrite_for_allele_without_gene(self, allele):
    """
    Alias handling for an AlleleWithoutGene; returns the replacement
    object or None.
    """
    species = allele.species
    current_name = allele.name

    listed_name = alias_target = None
    if current_name in species.allele_aliases:
        listed_name = species.allele_aliases.original_key(current_name)
        alias_target = species.allele_aliases[current_name]

    if self.map_allele_aliases and alias_target:
        if alias_target.count("*") != 1:
            # target is not "<gene>*<allele>": just rename
            return allele.copy(name=alias_target)
        target_gene_name, target_allele_name = alias_target.split("*")
        target_gene = Gene.get(
            species,
            target_gene_name)
        return Allele.get(
            species,
            target_gene,
            *target_allele_name.split(":"),
            raw_string=allele.raw_string)

    if listed_name and listed_name != current_name:
        return allele.copy(name=listed_name)
    return None

def transform_parse_candidates(
        self,
        parse_candidates: Sequence[Result]):
    """
    Run `transform_parse_candidate` on each element of `parse_candidates`
    in order, discard falsy outcomes, and return `unique(...)` of the
    remaining results.
    """
    transformed = (
        self.transform_parse_candidate(candidate)
        for candidate in parse_candidates)
    return unique([outcome for outcome in transformed if outcome])

def parse_gene_without_species(
Expand Down Expand Up @@ -924,6 +992,9 @@ def parse_single_token_to_multiple_candidates(
Returns list of result objects for a single token string which
should not contain any whitespace.
"""
if self.verbose:
print(f""">>> Parser.parse_single_token_to_multiple_candidates(
{token}, {default_species})""")
seq = token.seq
raw_string = token.raw_string

Expand All @@ -938,9 +1009,18 @@ def parse_single_token_to_multiple_candidates(
self.parse_gene_without_species,
self.parse_allele_without_species,
]
if self.verbose:
print("=== Functions without required species argument ===")
for fn in fns_without_species:
result = fn(seq, default_species=default_species)

if self.verbose:
print("%s('%s', default_species=%s) = %s" % (
fn.__qualname__,
seq,
('%s' % default_species if type(default_species) is str
else default_species),
('%s' % result if type(result) is str else result)
))
if type(result) in (list, tuple):
parse_candidates.extend(result)
elif isinstance(result, Result):
Expand All @@ -955,6 +1035,8 @@ def parse_single_token_to_multiple_candidates(
if len(str_after_species) == 0:
parse_candidates.append(species)
else:
if self.verbose:
print("=== Functions with required species argument ===")
# all of these functions are expected to take two arguments
# (Species, str_after_species) and return either a parsed
# representation or None
Expand All @@ -972,7 +1054,13 @@ def parse_single_token_to_multiple_candidates(
result = fn(
species,
str_after_species)

if self.verbose:
print("%s(%s, '%s') = %s" % (
fn.__qualname__,
species,
seq,
"None" if not result else '%s' % result
))
if not result:
continue
if type(result) in (list, tuple):
Expand All @@ -986,10 +1074,9 @@ def parse_single_token_to_multiple_candidates(

# update all the objects to set their raw_string field to raw_string
# and also perform optional transformations
parse_candidates = self.transform_parse_candidates(
return self.adjust_raw_string_and_transform_parse_candidates(
parse_candidates,
raw_string=raw_string)
return parse_candidates

def restrict_result_type_if_possible(
self,
Expand Down Expand Up @@ -1268,7 +1355,7 @@ def parse_tokens_to_multiple_candidates(
if isinstance(second_result, ResultWithSpecies):
if second_result.species == first_result:
candidates.append(second_result)
return unique(candidates)
return self.transform_parse_candidates(candidates)

def select_species_from_optional_attributes(
self,
Expand Down Expand Up @@ -1355,8 +1442,8 @@ def parse(
- Allele
- Class2Pair
"""
candidates = self.parse_multiple_candidates\
(name, default_species=default_species)
candidates = self.parse_multiple_candidates(
name, default_species=default_species)

if required_result_types:
if type(required_result_types) not in (list, set, tuple):
Expand Down

0 comments on commit dac9807

Please sign in to comment.