Skip to content

Commit

Permalink
Merge pull request #7 from til-unc/add-human-allele-aliases
Browse files Browse the repository at this point in the history
Copied human allele aliases from IMGT
  • Loading branch information
iskandr committed Nov 23, 2020
2 parents 9988a9a + 35afb7e commit b5d8581
Show file tree
Hide file tree
Showing 19 changed files with 661 additions and 198 deletions.
2 changes: 1 addition & 1 deletion mhcgnomes/allele.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def num_allele_fields(self):

@property
def name(self):
return ":".join(self.allele_fields)
return ":".join(self.allele_fields) + "".join(self.annotations)

def restrict_num_allele_fields(
self,
Expand Down
2 changes: 1 addition & 1 deletion mhcgnomes/allele_without_gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def to_string(
"""
Return allele strings like "BoLA-T2C"
"""
species_str = self.species_str.to_string(
species_str = self.species.to_string(
include_species=include_species,
use_old_species_prefix=use_old_species_prefix)
return "%s-%s" % (species_str, self.name)
Expand Down
16 changes: 13 additions & 3 deletions mhcgnomes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,14 @@ def unique(xs : Iterable):
"""
if type(xs) is list and len(xs) == 0 or len(xs) == 1:
return xs
return list(set(xs))
result = []
unique_set = set()
for xi in xs:
if xi in unique_set:
continue
result.append(xi)
unique_set.add(xi)
return result

def arg_to_cache_key(x):
if type(x) in {list, tuple}:
Expand Down Expand Up @@ -49,14 +56,17 @@ def cached_fn(*args, **kwargs):
return cache[key]
return cached_fn

def normalize_string(name, chars_to_remove="-_'"):
def normalize_string(name, chars_to_remove="-_':"):
"""
Return uppercase string without any surrounding whitespace and
without any characters such as '-', '_' or "'"
without any characters such as '-', '_', ':' or "'"
"""
if name is None:
return None
if type(name) in (float, int):
name = str(name)


if not isinstance(name, str):
return name

Expand Down
6 changes: 6 additions & 0 deletions mhcgnomes/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@ def load(
normalize_first_level_keys=True,
normalize_second_level_keys=True)

# Dictionary mapping species -> gene -> list of known allele names
known_alleles = load(
"known_alleles.yaml",
normalize_first_level_keys=True,
normalize_second_level_keys=True)

# Dictionary mapping species to haplotype name to list of alleles
haplotypes = load(
"haplotypes.yaml",
Expand Down
234 changes: 195 additions & 39 deletions mhcgnomes/data/allele_aliases.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,66 +8,34 @@
####################################################
#
# Swine allele aliases manually scraped from:
# https://www.ebi.ac.uk/ipd/mhc/
# - https://www.ebi.ac.uk/ipd/mhc/
#
####################################################
SLA:
# Allele names from NetMHCpan
# many seem to be derived from Chinese publications that I can't read.
#
1*CHANGDA: ~
1*CDY.AA: 1*CDY
2*CDY.AA: 2*CDY
3*CDY.AA: 3*CDY
1*CDY: ~
2*CDY: ~
3*CDY: ~
1*HB01: ~
1*HB02: ~
1*HB03: ~
1*HB04: ~
2*HB01: ~
3*HB01: ~
1*LWH.AA: 1*LWH
2*LWH.AA: 2*LWH
3*LWH.AA: 3*LWH
1*LWH: ~
2*LWH: ~
3*LWH: ~
1*TPK.AA: 1*TPK
2*TPK.AA: 2*TPK
3*TPK.AA: 3*TPK
1*TPK: ~
2*TPK: ~
3*TPK: ~
1*YC.AA: 1*YC
2*YC.AA: 2*YC
3*YC.AA: 3*YC
1*YC: ~
2*YC: ~
3*YC: ~
1*YDL.AA: 1*YDL
2*YDL.AA: 2*YDL
3*YDL.AA: 3*YDL
1*YDL: ~
2*YDL: ~
3*YDL: ~
1*YDL01: ~
2*YDL02: ~
1*YDY.AA: 1*YDY
2*YDY.AA: 2*YDY
3*YDY01.AA: 3*YDY01
3*YDY02.AA: 3*YDY01
1*YDY: ~
2*YDY: ~
3*YDY01: ~
3*YDY02: ~
1*YTH.AA: 1*YTH
2*YTH.AA: 2*YTH
3*YTH.AA: 3*YTH
1*YTH: ~
2*YTH: ~
3*YTH: ~

# provisional allele names mapped to their permanent names
1*01rh28: 1*01:03
Expand Down Expand Up @@ -289,9 +257,9 @@ SLA:
DRB3*01bL85: DRB3*05:01
Patr:
# footnote in Table 1 of 10.1016/j.molimm.2009.09.003
DRB*W901: DRB3*0701
DRB*W902: DRB3*0702
DRB*W903: DRB3*0703
DRB*W901: DRB3*07:01
DRB*W902: DRB3*07:02
DRB*W903: DRB3*07:03
BoLA:
# Source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4225172/
HD6: 6*013:01
Expand All @@ -300,8 +268,196 @@ BoLA:
AW10: 3*001:01
T2a: 2*012:01
T2b: 6*041:01
T2c: ~
T7: ~
T5: ~
# TODO: What to do with the invariant DR alpha genes of many species?
# DRA: DRA*01:01
####################################################
#
# Human deleted and renamed alleles copied from:
# - https://www.ebi.ac.uk/cgi-bin/ipd/imgt/hla/deleted.cgi
#
####################################################
HLA:
# renamed to entirely different alleles
A*01:05N: A*01:04:01:01N
A*01:34N: A*01:01:38L
A*02:01:16: A*02:134
A*02:01:20: A*02:01:18
A*02:23: A*02:22:01
A*02:98: A*02:96
A*02:01:82: A*02:01:84
A*02:17:01: A*02:17:02:01
A*03:194: A*03:213
A*03:200Q: A*03:266N
A*03:260: A*03:284N
A*11:28: A*11:15:02
A*11:53: A*11:02:01:01
A*23:69: A*23:01:01
A*24:12: A*24:08
A*24:16: A*31:08
A*24:65: A*24:13:02
A*24:211: A*24:135:02
A*26:03:02: A*26:111
A*26:44: A*26:43:02
A*30:05: A*30:04:01:01
A*30:21: A*30:11:02
A*31:011: A*31:01:02
A*33:02: A*33:03:01
A*33:38: A*33:44
B*08:06: B*08:20:01
B*13:05: B*13:04
B*13:24: B*13:22:02
B*15:01:05: B*15:120
B*15:22: B*35:43
B*15:41: B*15:39:01:01
B*15:59: B*35:44
B*18:16: B*18:14
B*27:051: B*27:05:02
B*27:22: B*27:06:01:01
B*27:215N: B*27:05:02:01
B*35:73: B*35:08:03
B*35:43:02: B*35:185
B*39:012: B*39:01:01:01
B*39:21: B*39:24
B*40:17: B*40:16:01:01
B*40:41: B*40:40
B*44:01: B*44:02:01:01
B*45:26: B*45:01:01:01
B*47:01:01:01: B*47:01:01:03
B*49:15: B*49:01:01
B*50:03: B*50:02:01:01
B*51:25: B*51:22
B*51:47: B*51:09:02
B*55:06: B*55:04
B*56:55:01:01: B*56:55:01:02
B*58:30: B*58:01:01
B*78:01:01:01: B*78:01:01:02
B*79:01: B*15:18:01
B*95:30: B*15:27:02
C*01:63:01: C*01:141
C*02:02:05: C*02:10:06
C*02:16:01: C*02:137
C*03:12: C*03:19
C*03:99:01: C*01:169:01
C*03:99:02: C*01:169:02
C*04:01:01:30: C*04:01:01:27
C*08:64: C*08:33:03
C*12:274:02N: C*12:329N
C*12:299N: C*12:03:01:01
C*15:20: C*15:27
C*17:01:01:01: C*17:01:01:02
C*01:01: C*01:02:01
C*02:01: C*02:02:02
C*02:02:04: C*02:10
C*02:16:03: C*02:16:02
C*03:01: C*03:04:01:01
C*04:02: C*04:01:01:01
C*04:21: C*04:15:02
C*04:22: Cw*04:21
C*05:02: C*05:09
C*06:01: C*06:02:01:01
C*06:02:02: C*06:17
C*07:34: C*07:27:02
C*12:01: C*12:02:02
C*14:01: C*14:02:01
C*15:01: C*15:02:01
C*15:14: C*15:10:02
C*16:03: C*14:03:01:01
C*16:042: C*16:04:01
C*16:05: C*16:04:01
DPA1*01:01: DPA1*01:03:01
DPA1*01:02: DPA1*01:03:01
DPA1*02:02:01: DPA1*02:07:01
DPA1*02:02:02:06Q: DPA1*02:02:02:01
DPB1*02:011: DPB1*02:01:02
DPB1*35:01:02: DPB1*621:01
DPB1*42:01: DPB1*31:01:01:01
DPB1*43:01: DPB1*28:01
DPB1*793:01N: DPB1*786:01:02N
DQA1*01:01:01:04: DQA1*01:01:01:01
DQA1*02:06: DQA1*02:01:01:01
DQA1*03:012: DQA1*03:02
DQA1*03:01:01:02: DQA1*03:01:01:01
DQA1*03:02:01:03: DQA1*03:02:01:02
DQA1*05:013: DQA1*05:05
DQB1*02:02:06:01: DQB1*02:02:06:02
DQB1*03:031: DQB1*03:03:02
DQB1*03:01:01:13: DQB1*03:01:01:07
DQB1*03:325: DQB1*03:317:02
DQB1*04:02:01:02: DQB1*04:02:01:01
DQB1*04:02:01:03: DQB1*04:02:01:01
DQB1*06:220: DQB1*06:217
DRB1*03:11:02: DRB1*03:81
DRB1*04:05:12: DRB1*04:05:14
DRB1*04:94:02N: DRB1*04:212N
DRB1*07:02: DRB1*07:01:01:01
DRB1*08:031: DRB1*08:03:02
DRB1*08:01:03: DRB1*08:01:01
DRB1*09:011: DRB1*09:01:02
DRB1*11:71: DRB1*11:02:01:01
DRB1*11:11:02: DRB1*11:11:01
DRB1*12:031: DRB1*12:01:01
DRB1*14:66: DRB1*14:32:02
DRB1*16:06: DRB1*16:05:01
DRB3*01:01:01: DRB3*01:01:02:01
DRB4*01:011:02N: DRB4*01:03:01:02N
DRB5*02:01: DRB5*02:02
E*01:02: E*01:01:01:01
E*01:21:02N: E*01:117N
F*01:01:03:01: F*01:01:01:09
F*01:01:03:02: F*01:01:01:10
F*01:01:03:03: F*01:01:01:11
F*01:01:03:04: F*01:01:01:12
G*01:01:10: G*01:04:04
MICA*007:02: MICA*111:01:01
MICA*007:04: MICA*112:01:01
MICA*008:07: MICA*008:01:10
MICA*021: MICA*012:03
MICA*071: MICA*017
MICB*017: MICB*005:05

# alleles with suffix changes
A*01:159: A*01:159Q
A*01:281: A*01:281Q
A*01:301: A*01:301Q
A*02:01:14: A*02:01:14Q
A*02:437: A*02:437Q
A*02:581: A*02:581Q
A*02:728: A*02:728Q
A*02:795: A*02:795Q
A*11:50: A*11:50Q
A*11:52: A*11:52Q
A*23:19Q: A*23:19N
A*24:02:01:17: A*24:02:01:17Q
A*24:329: A*24:329Q
A*24:378: A*24:378Q
A*24:447: A*24:447Q
A*24:450: A*24:450Q
A*33:03:03: A*33:03:03Q
B*07:44: B*07:44N
B*13:08Q: B*13:08
B*18:01:01:12: B*18:01:01:12Q
B*18:106: B*18:106Q
B*27:185: B*27:185Q
B*3716: B*37:16Q
B*38:68Q: B*38:68L
B*41:56: B*41:56Q
B*44:02:01:13: B*44:02:01:13Q
B*44:142Q: B*44:142
B*44:160: B*44:160Q
C*02:67: C*02:67Q
C*03:23: C*03:23N
C*03:169: C*03:169Q
C*04:338: C*04:338Q
C*04:382: C*04:382Q
C*06:74: C*06:74Q
C*07:226: C*07:226Q
C*07:235: C*07:235Q
C*07:432: C*07:432Q
C*07:713: C*07:713Q
C*08:70: C*08:70Q
C*12:139: C*12:139Q
Cw*16:16: C*16:16Q
DRB4*01:14: DRB4*01:14N
DRB5*02:26: DRB5*02:26N
DQA1*01:07: DQA1*01:07Q
DPB1*939:01N: DPB1*939:01
45 changes: 45 additions & 0 deletions mhcgnomes/data/known_alleles.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
SLA:
"1":
- CDY
- CHANGDA
- HB01
- HB02
- HB03
- HB04
- LWH
- TPK
- YC
- YTH
- YDY
- YDL
- YDL01
"2":
- CDY
- HB01
- LWH
- TPK
- YC
- YTH
- YDY
- YDL
- YDL02
"3":
- CDY
- HB01
- LWH
- TPK
- YC
- YTH
- YDY01
- YDY02
- YDL
BoLA:
~:
- HD6
- JSP.1
- D18.4
- AW10
- T2b
- T2c
- T5
- T7

0 comments on commit b5d8581

Please sign in to comment.