In [72]:
# !pip install datasets huggingface_hub pandas biopython

In [65]:
import glob
import re
from collections import Counter
import pandas as pd
from Bio import SeqIO
from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import notebook_login

# Load IEDB dataset
# url_train = "https://raw.githubusercontent.com/s7776d/CapsNet-MHC/refs/heads/main/dataset/IEDB_dataset/training_set.txt"
# df_train_raw = pd.read_csv(url_train, sep="\t")  # Adjust delimiter if needed
# url_test = "https://raw.githubusercontent.com/s7776d/CapsNet-MHC/refs/heads/main/dataset/IEDB_dataset/testing_set.txt"
# df_test_raw = pd.read_csv(url_test, sep="\t")  # Adjust delimiter if needed
# dfs = []
# for df_raw in [df_train_raw, df_test_raw]:
#     df = df_raw.copy()
#     df = df[df['species'] == 'human'] if 'species' in df else df
#     # Convert "inequality" column
#     def convert_meas(row):
#         if row["inequality"] == "<":
#             return row["meas"] / 2  # Assign a smaller estimate
#         elif row["inequality"] == ">":
#             return row["meas"] * 2  # Assign a larger estimate
#         else:
#             return row["meas"]

#     df["processed_meas"] = df.apply(convert_meas, axis=1) if 'inequality' in df else df["Measurement value"]
#     print(len(df))
#     dfs.append(df.copy())

# Load Anthem dataset
urls = ['https://raw.githubusercontent.com/s7776d/CapsNet-MHC/refs/heads/main/dataset/Anthem_dataset/train_data.txt',
        'https://raw.githubusercontent.com/s7776d/CapsNet-MHC/refs/heads/main/dataset/Anthem_dataset/test_data.txt']
dfs_raw = [pd.read_csv(url, sep="\t") for url in urls]

In [37]:
dfs_raw[1]

Unnamed: 0,HLA,peptide,Label,Length
0,HLA-A*01:01,LFGRDLSY,1,8
1,HLA-A*01:01,TDKKTHLY,1,8
2,HLA-A*01:01,RSDTPLIY,1,8
3,HLA-A*01:01,NSDLVQKY,1,8
4,HLA-A*01:01,LSDLLDWK,1,8
...,...,...,...,...
172575,HLA-C*06:02,AAEQLVKTGWRSWH,0,14
172576,HLA-C*06:02,GCCMTGAAFGAMNG,0,14
172577,HLA-C*06:02,QYQLRNDSAEYKVI,0,14
172578,HLA-C*06:02,FFAGLVKYMHSGPV,0,14


In [69]:
# Function to extract HLA mapping from FASTA files
def parse_fasta(fasta_files):
    hla_dict = {}
    
    for file in fasta_files:
        for record in SeqIO.parse(file, "fasta"):
            header_parts = record.description.split()
            if len(header_parts) > 1:
                hla_id = header_parts[1]  # Extract HLA type (e.g., A*01:01:01:01)
                normalized_hla = re.sub(r":\d+$", "", hla_id)  # Normalize (e.g., A*01:01:01)
                normalized_hla = "HLA-" + re.sub(r":\d+$", "", normalized_hla)  # Normalize (e.g., A*01:01)
                hla_dict[normalized_hla] = str(record.seq)  # Store sequence
    
    return hla_dict

# Load HLA nucleotide sequences from FASTA files https://github.com/ANHIG/IMGTHLA/blob/Latest/fasta/A_nuc.fasta
fasta_files = glob.glob("../data/hla/*_nuc.fasta")  # Load all relevant FASTA files
hla_sequences = parse_fasta(fasta_files)

dfs = []
unknown_hla = set()
for df_raw, ds in zip(dfs_raw, ['train', 'test']):
    df = df_raw.copy()
    # Map sequences to HLA types
    df.loc[:, "Sequence"] = df["HLA"].map(hla_sequences)
    print(f'Sequences in {ds}: {len(df)}')
    un_hla = set(df[df['Sequence'] != df['Sequence']]['HLA'])
    unknown_hla = unknown_hla.union(un_hla)
    df.dropna(subset=['Sequence'], inplace=True)
    print(f'Sequences in {ds} after removing unmatched HLA: {len(df)}', Counter(df['Label']))
    unique_hla = len(df['HLA'].unique())
    unique_pep = len(df['peptide'].unique())
    print(f'Unique HLA: {unique_hla}, Unique peptide: {unique_pep}')
    dfs.append(df)

print('Unmatched HLA types: ', unknown_hla)

Sequences in train: 539019
Sequences in train after removing unmatched HLA: 510896 Counter({0: 255537, 1: 255359})
Unique HLA: 88, Unique peptide: 435153
Sequences in test: 172580
Sequences in test after removing unmatched HLA: 163120 Counter({0: 81715, 1: 81405})
Unique HLA: 88, Unique peptide: 151402
Unmatched HLA types:  {'HLA-B*27:20', 'HLA-A*24:06', 'HLA-A*26:02', 'HLA-B*15:42', 'HLA-A*02:04', 'HLA-A*32:15', 'HLA-A*24:13', 'HLA-B*15:11', 'HLA-B*27:04', 'HLA-A*02:20', 'HLA-B*27:01', 'HLA-A*02:12', 'HLA-B*27:08', 'HLA-A*68:23', 'HLA-A*02:50', 'HLA-B*15:09', 'HLA-A*02:16', 'HLA-A*02:19', 'HLA-B*83:01', 'HLA-A*32:07', 'HLA-B*27:07', 'HLA-B*27:03', 'HLA-B*27:09', 'HLA-B*45:06'}


In [75]:
# Log in to Hugging Face
notebook_login()

# Create a dataset dictionary (train/test split)
dataset = DatasetDict({
    "train": Dataset.from_pandas(dfs[0]),
    "test": Dataset.from_pandas(dfs[1])
})
# Push dataset to Hugging Face Hub
dataset.push_to_hub("vladak/anthem_hla_seq")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/256 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/256 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/164 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/vladak/anthem_hla_seq/commit/c673bd50f5d2226bb6628ba7699c5ffbd708a9dd', commit_message='Upload dataset', commit_description='', oid='c673bd50f5d2226bb6628ba7699c5ffbd708a9dd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/vladak/anthem_hla_seq', endpoint='https://huggingface.co', repo_type='dataset', repo_id='vladak/anthem_hla_seq'), pr_revision=None, pr_num=None)