In [1]:

from tqdm import tqdm
import json
import requests
import pandas as pd
import numpy as np


def filter_alleles(alleles: list[dict]) -> list[str]:
    filtered_alleles = []
    for item in alleles:
        if item["confirmation_status.confirmed"] and item["sequence_status.full"]:
            filtered_alleles.append(item["name"])
    return filtered_alleles


def get_alleles_by_ipd_accession(ipd_accession):
    """
    Retrieve a list of alleles that start with the given IPD accession.

    Args:
        ipd_accession (str): The IPD accession (e.g., "B*08")

    Returns:
        list: List of allele names that match the IPD accession
    """
    base_url = "https://www.ebi.ac.uk/cgi-bin/ipd/api/allele"
    params = {
        "query": f'startsWith(name, "{ipd_accession}")',
        "fields": "name, sequence_status.full, confirmation_status.confirmed",
        "limit": 1000,  # Adjust limit as needed
    }

    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()  # Raise exception for HTTP errors

        alleles = response.json().get("data", [])
        alleles = filter_alleles(alleles)
        return alleles

    except requests.exceptions.RequestException as e:
        print(f"Request failed: {str(e)}")
        return []


def get_protein_sequence_by_allele(allele_name):
    """
    Retrieve protein sequence for a specific HLA allele from IPD-IMGT/HLA API.

    Args:
        allele_name (str): The allele name (e.g., "B*08:02")

    Returns:
        str: Protein sequence or error message
    """
    base_url = "https://www.ebi.ac.uk/cgi-bin/ipd/api/allele/download"

    params = {"project": "HLA", "type": "protein", "query": f'eq(name,"{allele_name}")'}
    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()  # Raise exception for HTTP errors

        if response.text:
            return response.text.split()[-1]
        else:
            return "No sequence found for this allele"

    except requests.exceptions.RequestException as e:
        return f"Request failed: {str(e)}"


def save_to_json(allele_sequences, filename="allele_sequences.json"):
    """Save allele sequences to a JSON file."""
    with open(filename, "w") as json_file:
        json.dump(allele_sequences, json_file, indent=4)


input_alleles = [
    "HLA-A*02:256",
    "HLA-B*35:42",
    "HLA-B*57:06",
    "HLA-A*02:266",
    "HLA-B*37:01",
    "HLA-A*08:01",
    "HLA-A*24:01",
    "HLA-B*12:01",
    "HLA-B*07:01",
    "HLA-C*14:02",
]


# if __name__ == "__main__":
#     paired_TCR = pd.read_csv("data/external/TCR/VDJB/paired_TCR.csv")
#     data = list(set(paired_TCR["MHC A"].unique()).union(paired_TCR["MHC B"].unique()))
#     data.remove("B2M")
#     ipd_accessions = list(map(lambda x: x.split("HLA-")[1], data))

#     allele_sequences = {}

#     for ipd_accession in tqdm(ipd_accessions, total=len(ipd_accessions)):
#         alleles = get_alleles_by_ipd_accession(ipd_accession)
#         if alleles:
#             allele_sequences["HLA-" + ipd_accession] = {}

#         for allele in alleles:
#             protein_sequence = get_protein_sequence_by_allele(allele)

#             if (
#                 "Request failed" not in protein_sequence
#                 and "No sequence found" not in protein_sequence
#             ):
#                 allele_sequences["HLA-" + ipd_accession][allele] = protein_sequence

#     for key in allele_sequences:
#         if len(allele_sequences[key]) == 0:
#             del allele_sequences[key]

#     save_to_json(allele_sequences, "/home/user11/data/mhcii/allele_sequences.json")


In [2]:
import pandas as pd
from collections import Counter


def format_name(name):
    if name == '':
        return ''
    elif name == 'HLA-DRA0101':
        return 'DRA*01:01:01:01'
    else:
        _, name = name.split('-')
        first_part = name[:4]
        second_part = name[4:6]
        third_part = name[6:8]
        return f'{first_part}*{second_part}:{third_part}'


path = '/home/user11/data/data_processed/data.tsv'
df = pd.read_csv(path, sep='\t', header=None)
counter = Counter(df[2]).most_common()
ids = set(df[2])

ids_table = []
for i in ids:
    line = dict()
    if not i.startswith('HLA-'):
        if 'A' in i:
            a_chain = f"{i.replace('_', '')}"
            b_chain = f'{a_chain[:2]}B'
        if 'B' in i:
            b_chain = f"{i.replace('_', '')}"
            a_chain = f'{b_chain[:2]}A0101'
    else:
        _, a_chain, b_chain = i.split('-')
    line['name_A'] = f"HLA-{a_chain}"
    line['name_B'] = f"HLA-{b_chain}"
    ids_table.append(line)

table = pd.DataFrame(ids_table)
table['allele_A'] = table['name_A'].apply(format_name)
table['allele_B'] = table['name_B'].apply(format_name)
ipd_accessions = list(set(table["allele_A"].unique()).union(table["allele_B"].unique()))

In [4]:
counter

[('DRB1_0101', 10280),
 ('DRB1_0701', 6237),
 ('DRB1_0401', 6188),
 ('DRB1_1101', 5900),
 ('DRB1_0301', 5223),
 ('DRB5_0101', 5053),
 ('DRB1_1501', 4766),
 ('DRB3_0101', 4585),
 ('DRB1_1302', 4440),
 ('DRB1_0802', 4433),
 ('DRB1_0901', 4280),
 ('DRB1_0405', 3923),
 ('DRB4_0101', 3903),
 ('DRB1_0404', 3601),
 ('HLA-DQA10501-DQB10301', 3546),
 ('DRB3_0202', 3315),
 ('HLA-DQA10301-DQB10302', 3063),
 ('HLA-DQA10101-DQB10501', 2913),
 ('HLA-DQA10401-DQB10402', 2858),
 ('HLA-DQA10501-DQB10201', 2778),
 ('HLA-DPA10103-DPB10401', 2696),
 ('HLA-DQA10102-DQB10602', 2676),
 ('HLA-DPA10301-DPB10402', 2611),
 ('HLA-DPA10201-DPB10501', 2440),
 ('HLA-DPA10201-DPB10101', 2417),
 ('DRB1_1201', 2360),
 ('HLA-DPA10201-DPB11401', 2285),
 ('DRB1_1001', 2053),
 ('DRB1_1602', 1686),
 ('HLA-DPA10103-DPB10301', 1550),
 ('DRB1_1301', 1007),
 ('HLA-DQA10201-DQB10202', 936),
 ('DRB1_0801', 931),
 ('HLA-DQA10104-DQB10503', 877),
 ('DRB3_0301', 877),
 ('DRB4_0103', 840),
 ('HLA-DQA10501-DQB10302', 838),
 ('HLA-DQA1

In [3]:
counter['HLA-DRB1*04:02']

TypeError: list indices must be integers or slices, not str

In [20]:
from collections import defaultdict

ipd_accessions = list(set(table["allele_A"].unique()).union(table["allele_B"].unique()))

allele_sequences = defaultdict(dict)

index = 0 
for ipd_accession in tqdm(ipd_accessions, total=len(ipd_accessions)):
    alleles = get_alleles_by_ipd_accession(ipd_accession)

    for allele in alleles:
        protein_sequence = get_protein_sequence_by_allele(allele)

        if (
            "Request failed" not in protein_sequence
            and "No sequence found" not in protein_sequence
        ):
            allele_sequences["HLA-" + ipd_accession][allele] = protein_sequence

    for key in allele_sequences:
        if len(allele_sequences[key]) == 0:
            del allele_sequences[key]

save_to_json(allele_sequences, "/home/user11/data/mhcii/allele_sequences.json")


100%|██████████| 73/73 [08:52<00:00,  7.29s/it]


## Для тестовой выборки

In [8]:
import pandas as pd
from collections import Counter


def format_name(name):
    if name == '':
        return ''
    elif name == 'HLA-DRA0101':
        return 'DRA*01:01:01:01'
    else:
        _, name = name.split('-')
        first_part = name[:4]
        second_part = name[4:6]
        third_part = name[6:8]
        return f'{first_part}*{second_part}:{third_part}'


path = '/home/user11/data/embeddings_test/test_dataset.csv'
df = pd.read_csv(path, sep='\t')
counter = Counter(df['allele']).most_common()
ids = set(df['allele'])

ids_table = []
for i in ids:
    line = dict()
    if not i.startswith('HLA-'):
        if 'A' in i:
            a_chain = f"{i.replace('_', '')}"
            b_chain = f'{a_chain[:2]}B'
        if 'B' in i:
            b_chain = f"{i.replace('_', '')}"
            a_chain = f'{b_chain[:2]}A0101'
    else:
        _, a_chain, b_chain = i.split('-')
    line['name_A'] = f"HLA-{a_chain}"
    line['name_B'] = f"HLA-{b_chain}"
    ids_table.append(line)

table = pd.DataFrame(ids_table)
# table['allele_A'] = table['name_A'].apply(format_name)
# table['allele_B'] = table['name_B'].apply(format_name)
table['allele_A'] = [i.split('-')[1] for i in table['name_A']]
table['allele_B'] = [i.split('-')[1] for i in table['name_B']]

ipd_accessions = list(set(table["allele_A"].unique()).union(table["allele_B"].unique()))

In [14]:
table

Unnamed: 0,name_A,name_B,allele_A,allele_B
0,HLA-DRA1*01:01,HLA-DRB1*10:01,DRA1*01:01,DRB1*10:01
1,HLA-DQA1*01:02,HLA-DQB1*04:02,DQA1*01:02,DQB1*04:02
2,HLA-DPA1*02:01,HLA-DPB1*05:01,DPA1*02:01,DPB1*05:01
3,HLA-DRA1*01:01,HLA-DRB1*11:04,DRA1*01:01,DRB1*11:04
4,HLA-DRA1*01:01,HLA-DRB3*02:02,DRA1*01:01,DRB3*02:02
5,HLA-DRA1*01:01,HLA-DRB1*15:02,DRA1*01:01,DRB1*15:02
6,HLA-DPA1*01:03,HLA-DPB1*02:01,DPA1*01:03,DPB1*02:01
7,HLA-DRA1*01:01,HLA-DRB1*04:04,DRA1*01:01,DRB1*04:04
8,HLA-DPA1*01:03,HLA-DPB1*03:01,DPA1*01:03,DPB1*03:01
9,HLA-DPA1*02:01,HLA-DPB1*14:01,DPA1*02:01,DPB1*14:01


In [26]:
from collections import defaultdict

ipd_accessions = list(set(table["allele_A"].unique()).union(table["allele_B"].unique()))

allele_sequences = defaultdict(dict)

index = 0 
for ipd_accession in tqdm(ipd_accessions, total=len(ipd_accessions)):
    if 'DRA1*01:01'== ipd_accession:
        alleles = get_alleles_by_ipd_accession('DRA*01:01:01:01')
    else:
        alleles = get_alleles_by_ipd_accession(ipd_accession)

    for allele in alleles:
        protein_sequence = get_protein_sequence_by_allele(allele)

        if (
            "Request failed" not in protein_sequence
            and "No sequence found" not in protein_sequence
        ):
            allele_sequences["HLA-" + ipd_accession][allele] = protein_sequence

    for key in allele_sequences:
        if len(allele_sequences[key]) == 0:
            del allele_sequences[key]

save_to_json(allele_sequences, "/home/user11/data/mhcii/allele_sequences_test.json")


100%|██████████████████████████████████████████████████████████| 46/46 [07:18<00:00,  9.52s/it]


In [25]:
ipd_accession

'DRB1*08:02'

In [21]:
get_protein_sequence_by_allele('DRA*01:01:01:01')

'MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGEFMFDFDGDEIFHVDMAKKETVWRLEEFGRFASFEAQGALANIAVDKANLEIMTKRSNYTPITNVPPEVTVLTNSPVELREPNVLICFIDKFTPPVVNVTWLRNGKPVTTGVSETVFLPREDHLFRKFHYLPFLPSTEDVYDCRVEHWGLDEPLLKHWEFDAPSPLPETTENVVCALGLTVGLVGIIIGTIFIIKGVRKSNAAERRGPL'

In [24]:
for i in allele_sequences.keys():
    print(i)

HLA-DRB1*07:01
HLA-DRB1*01:01
HLA-DQB1*04:02
HLA-DRB1*10:01
HLA-DRB1*11:01
HLA-DRB1*14:02
HLA-DRB1*13:01
HLA-DPB1*05:01
HLA-DRB1*04:02
HLA-DPB1*02:01
HLA-DPB1*03:01
HLA-DRB1*12:01
HLA-DQB1*03:01
HLA-DRB3*02:02
HLA-DPB1*04:01
HLA-DQB1*02:01
HLA-DQA1*05:01
HLA-DRB5*01:01
HLA-DRB1*08:01
HLA-DPA1*02:01
HLA-DRB1*04:05
HLA-DRB1*11:04
HLA-DRB1*12:02
HLA-DQA1*03:01
HLA-DRB1*16:02
HLA-DPA1*01:03
HLA-DRB1*15:01
HLA-DRB1*04:04
HLA-DRB4*01:01
HLA-H
HLA-DRB1*13:02
HLA-DQA1*01:01
HLA-DQB1*05:01
HLA-DRB3*01:01
HLA-DPA1*03:01
HLA-DPB1*14:01
HLA-DRB1*15:02
HLA-DRB1*04:01
HLA-DRB1*03:01
HLA-DQB1*03:02
HLA-DQA1*01:02
HLA-DRB1*09:01
HLA-DQB1*06:02
HLA-DPB1*04:02
HLA-DRB1*08:02


In [None]:
# QEFFIASGAAVDAIM WLFLECYDLQRATYHVGFT
# QEFFIASGAAVDAIM
# 9 11 22 24 31 52 53 58 59 61 65 66 68 72 73

allele_sequences['HLA-DRB1*01:01']

{'DRB1*01:01:01:01': 'MVCLKLPGGSCMTALTVTLMVLSSPLALAGDTRPRFLWQLKFECHFFNGTERVRLLERCIYNQEESVRFDSDVGEYRAVTELGRPDAEYWNSQKDLLEQRRAAVDTYCRHNYGVGESFTVQRRVEPKVTVYPSKTQPLQHHNLLVCSVSGFYPGSIEVRWFRNGQEEKAGVVSTGLIQNGDWTFQTLVMLETVPRSGEVYTCQVEHPSVTSPLTVEWRARSESAQSKMLSGVGGFVLGLLFLGAGLFIYFRNQKGHSGLQPTGFLS',
 'DRB1*01:01:02': 'MVCLKLPGGSCMTALTVTLMVLSSPLALAGDTRPRFLWQLKFECHFFNGTERVRLLERCIYNQEESVRFDSDVGEYRAVTELGRPDAEYWNSQKDLLEQRRAAVDTYCRHNYGVGESFTVQRRVEPKVTVYPSKTQPLQHHNLLVCSVSGFYPGSIEVRWFRNGQEEKAGVVSTGLIQNGDWTFQTLVMLETVPRSGEVYTCQVEHPSVTSPLTVEWRARSESAQSKMLSGVGGFVLGLLFLGAGLFIYFRNQKGHSGLQPTGFLS',
 'DRB1*01:01:07': 'MVCLKLPGGSCMTALTVTLMVLSSPLALAGDTRPRFLWQLKFECHFFNGTERVRLLERCIYNQEESVRFDSDVGEYRAVTELGRPDAEYWNSQKDLLEQRRAAVDTYCRHNYGVGESFTVQRRVEPKVTVYPSKTQPLQHHNLLVCSVSGFYPGSIEVRWFRNGQEEKAGVVSTGLIQNGDWTFQTLVMLETVPRSGEVYTCQVEHPSVTSPLTVEWRARSESAQSKMLSGVGGFVLGLLFLGAGLFIYFRNQKGHSGLQPTGFLS',
 'DRB1*01:01:01:02': 'MVCLKLPGGSCMTALTVTLMVLSSPLALAGDTRPRFLWQLKFECHFFNGTERVRLLERCIYNQEESVRFDSDVGEYRAVTELGRPDAEYWNSQKDLLEQRRAAVDTYCRHN

In [None]:
from Bio import AlignIO
from Bio.Align.Applications import ClustalwCommandline
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# # Путь к исполняемому файлу ClustalW
# clustalw_exe = "clustalw2"  # Укажите путь, если требуется

# # Входной файл с последовательностями
# input_file = "hla_sequences.fasta"
# output_file = "aligned_hla.aln"

# # Создание команды ClustalW
# clustalw_cline = ClustalwCommandline(clustalw_exe, infile=input_file, outfile=output_file, align=True)

# # Выполнение выравнивания
# stdout, stderr = clustalw_cline()

# # Чтение выровненных последовательностей
# alignment = AlignIO.read(output_file, "clustal")

# # Вывод выравнивания
# print(alignment)

# # Анализ позиций
# for record in alignment:
#     print(f"ID: {record.id}, Sequence: {record.seq}")

In [141]:
receptors = dict()
df = pd.read_csv('/home/user11/data/mhcii/tcr_full_v3.csv', low_memory=False, header=[0, 1])
df = df.drop_duplicates(subset=[('Assay', 'MHC Allele Names')])
df.columns
# for index, line in df.iterrows():
#     receptors[line[()]]

MultiIndex([( 'Receptor',             'Group IRI'),
            ( 'Receptor',      'IEDB Receptor ID'),
            ( 'Receptor',        'Reference Name'),
            ( 'Receptor',                  'Type'),
            ('Reference',              'IEDB IRI'),
            (  'Epitope',              'IEDB IRI'),
            (  'Epitope',                  'Name'),
            (  'Epitope',       'Source Molecule'),
            (  'Epitope',       'Source Organism'),
            (    'Assay',                  'Type'),
            (    'Assay',              'IEDB IDs'),
            (    'Assay',      'MHC Allele Names'),
            (  'Chain 1',                  'Type'),
            (  'Chain 1',          'Organism IRI'),
            (  'Chain 1',   'Nucleotide Sequence'),
            (  'Chain 1',        'Curated V Gene'),
            (  'Chain 1',     'Calculated V Gene'),
            (  'Chain 1',        'Curated D Gene'),
            (  'Chain 1',     'Calculated D Gene'),
            

In [136]:
seq = df[df[('Assay', 'MHC Allele Names')] == 'HLA-DRA*01:01/DRB1*01:01'][(  'Chain 1',      'Protein Sequence')].to_list()

#df.drop_duplicates(subset=[('Assay', 'MHC Allele Names')])

In [142]:
seq = df[(  'Chain 1',      'Protein Sequence')].to_list()

#df.drop_duplicates(subset=[('Assay', 'MHC Allele Names')])

In [138]:
seq = [s for s in seq if isinstance(s,str)]

In [134]:
# QEFFIASGAAVDAIM WLFLECYDLQRATYHVGFT
# YAFFMFSGGAILNTL YGQFEYFAIEKVRVHLDVT
# YAFFMFSGGAILNTL YGQFEYFAIEKVRVHLDVT
# 9 11 22 24 31 52 53 58 59 61 65 66 68 72 73

seq

['ADVTQTPRNRITKTGKRIMLECSQTKGHDRMYWYRQDPGLGLRLIYYSFDVKDINKGEISDGYSVSRQAQAKFSLSLESAIPNQTALYFCATSDESYGYTFGSGTRLTVVEDLNKVSPPEVAVFEPSEAEISHTQKATLVCLATGFYPDHVELSWWVNGKEVHSGVCTDPQPLKEQPALNDSRYSLSSRLRVSATFWQNPRNHFRCQVQFYGLSENDEWTQDRAKPVTQIVSAEAWGRAD',
 'ADVTQTPRNRITKTGKRIMLECSQTKGHDRMYWYRQDPGLGLRLIYYSFDVKDINKGEISDGYSVSRQAQAKFSLSLESAIPNQTALYFCATSDESYGYTFGSGTRLTVVEDLNKVSPPEVAVFEPSEAEISHTQKATLVCLATGFYPDHVELSWWVNGKEVHSGVCTDPQPLKEQPALNDSRYSLSSRLRVSATFWQNPRNHFRCQVQFYGLSENDEWTQDRAKPVTQIVSAEAWGRAD',
 'ADVTQTPRNRITKTGKRIMLECSQTKGHDRMYWYRQDPGLGLRLIYYSFDVKDINKGEISDGYSVSRQAQAKFSLSLESAIPNQTALYFCATSDESYGYTFGSGTRLTVVEDLNKVSPPEVAVFEPSEAEISHTQKATLVCLATGFYPDHVELSWWVNGKEVHSGVCTDPQPLKEQPALNDSRYSLSSRLRVSATFWQNPRNHFRCQVQFYGLSENDEWTQDRAKPVTQIVSAEAWGRAD']

In [135]:
seq[0][8]

'N'

In [16]:
set(df[('Assay', 'MHC Allele Names')])
#df[('Chain 2',           'Type')]

{'Class II, allele undetermined',
 'Gaga-BF2*015:01',
 'H2 class I',
 'H2 class II',
 'H2 class II, H2-b class I',
 'H2 class II, H2-b class II',
 'H2 class II, HLA-DRB1*15:01',
 'H2-Db',
 'H2-Db, H2-Db Y159F mutant',
 'H2-IAb',
 'H2-IAb I67F, R70Q, T71K mutant',
 'H2-IAb, H2-a class II',
 'H2-IAb, mouse CD1d',
 'H2-IAd',
 'H2-IAg7',
 'H2-IAk',
 'H2-IAq',
 'H2-IAs',
 'H2-IAu',
 'H2-IEd',
 'H2-IEd, H2-d class II',
 'H2-IEk',
 'H2-Kb',
 'H2-Kb D77S, K89A mutant',
 'H2-Kb Y22F, M23I, E24S, D30N mutant',
 'H2-Kb, H2-Kb Y22F, M23I, E24S, D30N mutant',
 'H2-Kb, H2-Kb Y84A mutant, H2-Kb Y84C mutant',
 'H2-Kd',
 'H2-Ld',
 'H2-Q9',
 'H2-Qa-1b',
 'H2-b class I',
 'H2-b class II',
 'H2-d class II',
 'H2-s class I',
 'H2-u class II',
 'HLA class I',
 'HLA class II',
 'HLA-A*01:01',
 'HLA-A*02:01',
 'HLA-A*02:01 K66A mutant',
 'HLA-A*02:01 K66A, E63Q mutant',
 'HLA-A*02:01 T163A mutant',
 'HLA-A*02:01 W167A mutant',
 'HLA-A*02:01, HLA-A*02:01 A150P mutant',
 'HLA-A*02:01, HLA-A*02:01 A69G mutant, H

In [102]:

ipd_accessions = list(set(table["allele_A"].unique()).union(table["allele_B"].unique()))

allele_sequences = {}

index = 0 
for ipd_accession in tqdm(ipd_accessions, total=len(ipd_accessions)):
    alleles = get_alleles_by_ipd_accession(ipd_accession)
    index += 1
    if index == 10:
        break


 12%|█▏        | 9/73 [00:06<00:49,  1.29it/s]


In [11]:
get_alleles_by_ipd_accession('DRA*01:01')

['DRA*01:01:01:01',
 'DRA*01:01:01:02',
 'DRA*01:01:01:03',
 'DRA*01:01:01:05',
 'DRA*01:01:01:04',
 'DRA*01:01:01:10',
 'DRA*01:01:01:11',
 'DRA*01:01:01:08',
 'DRA*01:01:01:07',
 'DRA*01:01:01:06',
 'DRA*01:01:01:12',
 'DRA*01:01:01:16',
 'DRA*01:01:01:13',
 'DRA*01:01:01:14',
 'DRA*01:01:01:15',
 'DRA*01:01:01:18',
 'DRA*01:01:01:20',
 'DRA*01:01:03',
 'DRA*01:01:01:19',
 'DRA*01:01:01:25',
 'DRA*01:01:01:23',
 'DRA*01:01:01:22',
 'DRA*01:01:01:21',
 'DRA*01:01:01:24',
 'DRA*01:01:04']

In [1]:
seq = 'MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQEGPEYWDQETRNMKAHSQTDRANLGTLRGYYNQSEDGSHTIQIMYGCDVGPDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAVHAAEQRRVYLEGRCVDGLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEITLTWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWELSSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRKSSDRKGGSYTQAASSDSAQGSDVSLTACKV'
seq[52]

'D'

In [5]:
seq[8]

'L'

In [74]:
pd.DataFrame(ids_table)

Unnamed: 0,name_A,name_B
0,,HLA-DRB11302
1,HLA-DPA10103,HLA-DPB10301
2,,HLA-DRB10901
3,,HLA-DRB10301
4,HLA-DQA10303,HLA-DQB10402
...,...,...
67,HLA-DQA10301,HLA-DQB10302
68,HLA-DQA10201,HLA-DQB10303
69,HLA-DQA10201,HLA-DQB10301
70,,HLA-DRB11301


In [19]:
import xmltodict

path = '/home/user11/data/mhcii/MhcAlleleNameList.xml'
with open(path) as file:
    xml = file.readlines()

xml_str = ''.join(xml)
xml_dict = xmltodict.parse(xml_str)


In [47]:
xml_dict['MhcAlleleNameList'].keys()

dict_keys(['@xsi:noNamespaceSchemaLocation', '@xmlns:xsi', 'MhcAlleleName'])

In [51]:
len(xml_dict['MhcAlleleNameList']['MhcAlleleName'])

22697

In [86]:
name = 'HLA-DRB11302'
for record in xml_dict['MhcAlleleNameList']['MhcAlleleName']:
    # if 'DisplayedRestriction' in record:
    #     if name == record['DisplayedRestriction']:
    #         print(record)
    if 'Synonyms' in record:
        if name in record['Synonyms'].split('|'):
            print(record)


{'MhcAlleleRestrictionId': '425', 'DisplayedRestriction': 'HLA-DRB1*13:02', 'Synonyms': 'HLA-DRB1*1302|HLA-DRB1*130201|HLA-DRB113:02|HLA-DRB11302', 'Includes': 'HLA-DRB1*130201', 'RestrictionLevel': 'partial molecule', 'Organism': 'Homo sapiens (human)', 'OrganismNcbiTaxId': '9606', 'Class': 'II', 'Locus': 'DR', 'Molecule': 'HLA-DRB1*13:02', 'Chain1Name': 'HLA-DRA', 'Chain2Name': 'HLA-DRB1*13:02'}


In [115]:
#allele_name = 'HLA00066'
allele_name = 'HLA-DPA10103-DPB10601'
container = []
for i in ids:
    if 'HLA-' in i:
        allele_name = i
    else:
        allele_name = 'HLA-' + i
    out = get_alleles_by_ipd_accession(allele_name)
    container.append(out)

In [106]:
#allele_name = 'DQA*1:05:01'
allele_name = 'DQB1*03:02'
out = get_alleles_by_ipd_accession(allele_name)

In [107]:
out

['DQB1*03:02:01:01',
 'DQB1*03:02:02:01',
 'DQB1*03:02:03',
 'DQB1*03:02:09',
 'DQB1*03:02:11',
 'DQB1*03:02:12',
 'DQB1*03:02:01:02',
 'DQB1*03:02:21',
 'DQB1*03:02:01:04',
 'DQB1*03:02:01:05',
 'DQB1*03:02:32',
 'DQB1*03:02:01:10',
 'DQB1*03:02:01:12',
 'DQB1*03:02:02:02',
 'DQB1*03:02:37',
 'DQB1*03:02:39',
 'DQB1*03:02:01:25',
 'DQB1*03:02:01:29']

In [None]:
# YAFFMFSGGAILNTL YGQFEYFAIEKVRVHLDVT
# YAFFMFSGGAILNTL YGQFEYFAIEKVRVHLDVT
# YAFFQFSGGAILNTL
# 9 11 22 24 31 52 53 58 59 61 65 66 68 72 73
# 82 - 52 = 30

# zero based 
# 39, 41, 52, 54, 61, 82, 83, 88, 89, 91, 95, 96, 98, 102, 103 

s1 = 'MRPEDRMFHIRAVILRALSLAFLLSLRGAGAIKADHVSTYAAFVQTHRPTGEFMFEFDEDEQFYVDLDKKETVWHLEEFGRAFSFEAQGGLANIAILNNNLNTLIQRSNHTQAANDPPEVTVFPKEPVELGQPNTLICHIDRFFPPVLNVTWLCNGEPVTEGVAESLFLPRTDYSFHKFHYLTFVPSAEDVYDCRVEHWGLDQPLLKHWEAQEPIQMPETTETVLCALGLVLGLVGIIVGTVLIIKSLRSGHDPRAQGPL'
s2 = 'MRPEDRMFHIRAVILRALSLAFLLSLRGAGAIKADHVSTYAAFVQTHRPTGEFMFEFDEDEQFYVDLDKKETVWHLEEFGRAFSFEAQGGLANIAILNNNLNTLIQRSNHTQAANDPPEVTVFPKEPVELGQPNTLICHIDRFFPPVLNVTWLCNGEPVTEGVAESLFLPRTDYSFHKFHYLTFVPSAEDVYDCRVEHWGLDQPLLKHWEAQEPIQMPETTETVLCALGLVLGLVGIIVGTVLIIKSLRSGHDPRAQGPL'


In [None]:
'MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGEFMFDFDGDEIFHVDMAKKETVWRLEEFGRFASFEAQGALANIAVDKANLEIMTKRSNYTPITNVPPEVTVLTNSPVELREPNVLICFIDKFTPPVVNVTWLRNGKPVTTGVSETVFLPREDHLFRKFHYLPFLPSTEDVYDCRVEHWGLDEPLLKHWEFDAPSPLPETTENVVCALGLTVGLVGIIIGTIFIIKGVRKSNAAERRGPL'

In [122]:
s3 = 'MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQEGPEYWDQETRNMKAHSQTDRANLGTLRGYYNQSEDGSHTIQIMYGCDVGPDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAVHAAEQRRVYLEGRCVDGLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEITLTWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWELSSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRKSSDRKGGSYTQAASSDSAQGSDVSLTACKV'

In [121]:
positions = [39, 41, 52, 54, 61, 82, 83, 88, 89, 91, 95, 96, 98, 102, 103 ]

site = ''
for p in positions:
    site += s2[p]
print(site)


YAFFQFSGGAILNTL


In [117]:
s2[30+22]

'F'

In [111]:
s1.find('FS')

82

In [112]:
s1[82]

'F'