In [1]:
# One-time environment setup; uncomment to install into the kernel's environment.
# numpy and scikit-learn are imported below as well, so they are listed here too.
# %pip install datasets huggingface_hub pandas numpy biopython scikit-learn

In [57]:
import glob
import re
from collections import Counter

import pandas as pd
import numpy as np
from Bio import SeqIO
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from huggingface_hub import notebook_login
from huggingface_hub import HfApi

In [58]:
# Function to extract HLA mapping from FASTA files
def parse_fasta(fasta_files):
    """Map two-field HLA allele names to sequences read from FASTA files.

    The allele name is taken from the second whitespace-separated token of
    each record description (e.g. ``A*01:01:01:01``), truncated to its first
    two colon-separated fields and prefixed with ``"HLA-"`` (giving
    ``HLA-A*01:01``). Records whose description has no second token are
    skipped.

    Several full-resolution alleles collapse onto the same two-field name.
    The first record seen for a name is kept; later ones are ignored, so the
    result no longer depends on whichever record happens to come last.

    Args:
        fasta_files: Iterable of FASTA file paths (or handles) accepted by
            ``SeqIO.parse``.

    Returns:
        dict mapping ``"HLA-<gene>*<field1>:<field2>"`` to the sequence as a
        plain string.
    """
    hla_dict = {}

    for file in fasta_files:
        for record in SeqIO.parse(file, "fasta"):
            header_parts = record.description.split()
            if len(header_parts) < 2:
                continue  # no allele name in the header

            # Reduce e.g. A*01:01:01:01 to A*01:01; shorter names are unchanged.
            allele = ":".join(header_parts[1].split(":")[:2])
            normalized_hla = "HLA-" + allele

            # NOTE(review): presumably the first record of each two-field
            # group is the reference allele in the IMGT/HLA files - confirm.
            if normalized_hla not in hla_dict:
                hla_dict[normalized_hla] = str(record.seq)

    return hla_dict

# Load HLA nucleotide sequences from FASTA files https://github.com/ANHIG/IMGTHLA/blob/Latest/fasta/A_nuc.fasta
# glob returns paths in arbitrary order; sort so every run parses the files in the same order.
fasta_files = sorted(glob.glob("../data/hla/*_nuc.fasta"))
if not fasta_files:
    # An empty mapping would silently drop every row in the cells below.
    raise FileNotFoundError(
        "No *_nuc.fasta files found under ../data/hla; download them before running this notebook."
    )
hla_sequences = parse_fasta(fasta_files)

# Anthem dataset

In [59]:
# Anthem dataset: tab-separated train and test files, fetched in that order
urls = [
    'https://raw.githubusercontent.com/s7776d/CapsNet-MHC/refs/heads/main/dataset/Anthem_dataset/train_data.txt',
    'https://raw.githubusercontent.com/s7776d/CapsNet-MHC/refs/heads/main/dataset/Anthem_dataset/test_data.txt',
]
dfs_raw = []
for url in urls:
    dfs_raw.append(pd.read_csv(url, sep="\t"))

In [60]:
# Preview the raw Anthem test split (second entry of `dfs_raw`, read from test_data.txt)
dfs_raw[1]

Unnamed: 0,HLA,peptide,Label,Length
0,HLA-A*01:01,LFGRDLSY,1,8
1,HLA-A*01:01,TDKKTHLY,1,8
2,HLA-A*01:01,RSDTPLIY,1,8
3,HLA-A*01:01,NSDLVQKY,1,8
4,HLA-A*01:01,LSDLLDWK,1,8
...,...,...,...,...
172575,HLA-C*06:02,AAEQLVKTGWRSWH,0,14
172576,HLA-C*06:02,GCCMTGAAFGAMNG,0,14
172577,HLA-C*06:02,QYQLRNDSAEYKVI,0,14
172578,HLA-C*06:02,FFAGLVKYMHSGPV,0,14


In [63]:
# Attach the HLA nucleotide sequence to every Anthem row, split by split,
# and collect allele names that have no entry in `hla_sequences`.
dfs = []
unknown_hla = set()
for split_name, raw_frame in zip(['train', 'test'], dfs_raw):
    # Alleles missing from the mapping come back as NaN in "Sequence"
    mapped = raw_frame.assign(Sequence=raw_frame["HLA"].map(hla_sequences))
    print(f'Sequences in {split_name}: {len(mapped)}')

    missing_mask = mapped['Sequence'].isna()
    unknown_hla = unknown_hla | set(mapped.loc[missing_mask, 'HLA'])

    matched = mapped.dropna(subset=['Sequence'])
    print(f'Sequences in {split_name} after removing unmatched HLA: {len(matched)}', Counter(matched['Label']))

    unique_hla = matched['HLA'].nunique(dropna=False)
    unique_pep = matched['peptide'].nunique(dropna=False)
    print(f'Unique HLA: {unique_hla}, Unique peptide: {unique_pep}')

    # Published column names and order
    renamed = matched.rename(columns={'peptide': 'Peptide'})
    dfs.append(renamed[['Peptide', 'Sequence', 'HLA', 'Length', 'Label']])

print('Unmatched HLA types: ', unknown_hla)

Sequences in train: 539019
Sequences in train after removing unmatched HLA: 539019 Counter({0: 269645, 1: 269374})
Unique HLA: 112, Unique peptide: 453277
Sequences in test: 172580
Sequences in test after removing unmatched HLA: 172580 Counter({0: 86470, 1: 86110})
Unique HLA: 112, Unique peptide: 159082
Unmatched HLA types:  set()


In [65]:
# `dfs` is rebound by the IEDB section further down, so running this cell out of
# order would publish the wrong frames. Check for the Anthem columns before uploading.
anthem_columns = ['Peptide', 'Sequence', 'HLA', 'Length', 'Label']
for split_df in dfs[:2]:
    if list(split_df.columns) != anthem_columns:
        raise ValueError(
            f"Expected Anthem columns {anthem_columns}, got {list(split_df.columns)}; "
            "re-run the Anthem processing cell before pushing."
        )

# Log in to Hugging Face
notebook_login()

# Create a dataset dictionary (train/test split).
# preserve_index=False keeps the pandas index out of the published columns even
# if rows were dropped and the index is no longer a plain range.
dataset = DatasetDict({
    "train": Dataset.from_pandas(dfs[0], preserve_index=False),
    "test": Dataset.from_pandas(dfs[1], preserve_index=False),
})
# Push dataset to Hugging Face Hub
dataset.push_to_hub("vladak/anthem_hla_seq")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/270 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/270 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/173 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/5.63k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/vladak/anthem_hla_seq/commit/96727f5ee31513810c8e24b81e416c1779b60a0c', commit_message='Upload dataset', commit_description='', oid='96727f5ee31513810c8e24b81e416c1779b60a0c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/vladak/anthem_hla_seq', endpoint='https://huggingface.co', repo_type='dataset', repo_id='vladak/anthem_hla_seq'), pr_revision=None, pr_num=None)

# IEDB dataset

In [66]:
# Load IEDB dataset: http://tools.iedb.org/mhci/download/  binding_data_2013.zip
# Both files are tab-separated.
url_train = "https://raw.githubusercontent.com/s7776d/CapsNet-MHC/refs/heads/main/dataset/IEDB_dataset/training_set.txt"
df_train_raw = pd.read_csv(url_train, sep="\t")
url_test = "https://raw.githubusercontent.com/s7776d/CapsNet-MHC/refs/heads/main/dataset/IEDB_dataset/testing_set.txt"
df_test_raw = pd.read_csv(url_test, sep="\t")


def log_transform_meas(frame):
    """Return log10 of ``meas`` after adjusting for the ``inequality`` flag.

    Rows flagged ``"<"`` use half the measured value, rows flagged ``">"``
    use double, and all other rows use the value as-is. The result is rounded
    to 5 decimals and returned as a NumPy array aligned with ``frame``'s rows.
    """
    inequality = frame["inequality"]
    meas = frame["meas"]
    adjusted = np.where(
        inequality == "<",
        meas / 2,  # assign a smaller estimate
        np.where(inequality == ">", meas * 2, meas),  # assign a larger estimate, else unchanged
    )
    return np.round(np.log10(adjusted), 5)


dfs = []
for df_raw in [df_train_raw, df_test_raw]:
    # Keep human entries when the file has a species column. Copy after
    # filtering so the column assignment below writes to an independent frame.
    df = df_raw[df_raw['species'] == 'human'].copy() if 'species' in df_raw else df_raw.copy()

    if 'inequality' in df:
        df["processed_log_meas"] = log_transform_meas(df)
    else:
        # NOTE(review): this fallback copies "Measurement value" unchanged, with no
        # log10 transform - confirm it is already on the same scale.
        df["processed_log_meas"] = df["Measurement value"]

    print(len(df))
    dfs.append(df.copy())

157325
8855


In [67]:
# Only the first (training) frame is used from here on: the provided test set is
# skipped and the training data is re-split into train/test further down.
dfs[0]

Unnamed: 0,species,mhc,peptide_length,sequence,inequality,meas,processed_log_meas
5009,human,HLA-A*01:01,8,ASFCGSPY,=,51.400000,1.71096
5010,human,HLA-A*01:01,8,LTDFGLSK,=,739.385479,2.86887
5011,human,HLA-A*01:01,8,FTSFFYRY,=,1285.000000,3.10890
5012,human,HLA-A*01:01,8,KSVFNSLY,=,1466.000000,3.16613
5013,human,HLA-A*01:01,8,RDWAHNSL,=,1804.675523,3.25640
...,...,...,...,...,...,...,...
162329,human,HLA-E*01:03,9,SMADRAENL,>,50000.000000,5.00000
162330,human,HLA-E*01:03,9,SMAGRAGQL,>,50000.000000,5.00000
162331,human,HLA-E*01:03,9,VEAFRTRPL,>,50000.000000,5.00000
162332,human,HLA-E*01:03,9,VSNLRTGKL,>,50000.000000,5.00000


In [69]:
# Select the columns to publish from the IEDB training frame, rename them,
# and attach the HLA nucleotide sequence for each allele.
df = (
    dfs[0][['mhc', 'sequence', 'processed_log_meas', 'peptide_length']]
    .rename(columns={
        'mhc': 'HLA',
        'sequence': 'Peptide',
        'processed_log_meas': 'Log_meas',
        'peptide_length': 'Length',
    })
    .assign(Sequence=lambda frame: frame['HLA'].map(hla_sequences))
    .loc[:, ['Peptide', 'Sequence', 'HLA', 'Length', 'Log_meas']]
)

# Alleles with no entry in `hla_sequences` come back as NaN; record them, then drop those rows.
unknown_hla = set(df.loc[df['Sequence'].isna(), 'HLA'])
df = df.dropna(subset=['Sequence'])
print(f'Sequences after removing unmatched HLA: {len(df)}')
unique_hla = len(df['HLA'].unique())
unique_pep = len(df['Peptide'].unique())
print(f'Unique HLA: {unique_hla}, Unique peptide: {unique_pep}')
# Report what was dropped, as the Anthem section does, so the row loss is visible.
print('Unmatched HLA types: ', sorted(map(str, unknown_hla)))

Sequences after removing unmatched HLA: 156921
Unique HLA: 102, Unique peptide: 29089


In [70]:
# Inspect the IEDB frame after mapping allele names to sequences and dropping unmatched rows
df

Unnamed: 0,Peptide,Sequence,HLA,Length,Log_meas
5009,ASFCGSPY,ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGG...,HLA-A*01:01,8,1.71096
5010,LTDFGLSK,ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGG...,HLA-A*01:01,8,2.86887
5011,FTSFFYRY,ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGG...,HLA-A*01:01,8,3.10890
5012,KSVFNSLY,ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGG...,HLA-A*01:01,8,3.16613
5013,RDWAHNSL,ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGG...,HLA-A*01:01,8,3.25640
...,...,...,...,...,...
162160,YVRTNGASY,ATGCGGGTCATGGCGCCCCGAACCCTCCTCCTGCTGCTCTCGGGAG...,HLA-C*15:02,9,4.60206
162161,YWMGGTTYF,ATGCGGGTCATGGCGCCCCGAACCCTCCTCCTGCTGCTCTCGGGAG...,HLA-C*15:02,9,4.60206
162162,YYGRWVHEF,ATGCGGGTCATGGCGCCCCGAACCCTCCTCCTGCTGCTCTCGGGAG...,HLA-C*15:02,9,4.60206
162163,YYKKTFSAL,ATGCGGGTCATGGCGCCCCGAACCCTCCTCCTGCTGCTCTCGGGAG...,HLA-C*15:02,9,4.60206


In [71]:
# Split the dataset into 80/20 train/test
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df.reset_index(inplace=True, drop=True)
test_df.reset_index(inplace=True, drop=True)

# Convert dataframes to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Create a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

# Push the dataset to Hugging Face Hub
dataset_dict.push_to_hub('vladak/iedb-2013')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/126 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/32 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/820 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


CommitInfo(commit_url='https://huggingface.co/datasets/vladak/iedb-2013/commit/a7169b4db1b70f21990b2d4a42715e7ffb4e01b8', commit_message='Upload dataset', commit_description='', oid='a7169b4db1b70f21990b2d4a42715e7ffb4e01b8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/vladak/iedb-2013', endpoint='https://huggingface.co', repo_type='dataset', repo_id='vladak/iedb-2013'), pr_revision=None, pr_num=None)

In [55]:
# NOTE(review): this cell's execution count (55) predates the split/push cell (71),
# and its recorded output lists a 'peptide' feature while the frame built above
# uses 'Peptide' - the output looks stale; re-run after the split cell.
dataset_dict['train']

Dataset({
    features: ['HLA', 'peptide', 'Log_meas', 'Length', 'Sequence'],
    num_rows: 125536
})

In [51]:
# NOTE(review): this cell's execution count (51) predates the split cell (71); the
# recorded output still has an 'index' column and a lowercase 'peptide' column,
# neither of which the current split produces - re-run to refresh.
train_df

Unnamed: 0,index,HLA,peptide,Log_meas,Length,Sequence
0,40236,HLA-A*02:12,RKLTNPANK,4.60206,9,ATGGCCGTCATGGCGCCCCGAACCCTCGTCCTGCTACTCTCGGGGG...
1,14629,HLA-A*02:01,NVLLYNRLL,3.78010,9,ATGGCCGTCATGGCGCCCCGAACCCTCGTCCTGCTACTCTCGGGGG...
2,52483,HLA-A*11:01,AAATSAGTR,3.61142,9,ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGG...
3,81394,HLA-A*31:01,ISDYDYYRY,4.60206,9,ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTTGGGGG...
4,123674,HLA-B*18:01,RRARSLSAERY,5.10220,11,ATGCGGGTCACGGCGCCCCGAACCCTCCTCCTGCTGCTCTGGGGGG...
...,...,...,...,...,...,...
125531,125040,HLA-B*27:05,RRLAATTEK,1.56544,9,ATGCGGGTCACGGCGCCCCGAACCCTCCTCCTGCTGCTCTGGGGGG...
125532,108855,HLA-B*08:01,CSIMRAPFA,4.03104,9,ATGCTGGTCATGGCGCCCCGAACCGTCCTCCTGCTGCTCTCGGCGG...
125533,137093,HLA-B*40:01,SPAIFQSSM,5.18293,9,ATGCGGGTCACGGCACCCCGAACCGTCCTCCTGCTGCTCTCGGCGG...
125534,152028,HLA-B*54:01,LPTNASLSF,4.69897,9,ATGCGGGTCACGGCACCCCGAACCCTCCTCCTGCTGCTCTGGGGGG...
