In [None]:
!pip install datasets huggingface_hub pandas biopython

In [None]:
import glob
import re
from collections import Counter
import pandas as pd
from Bio import SeqIO
from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import notebook_login

# Load IEDB dataset
# url_train = "https://raw.githubusercontent.com/s7776d/CapsNet-MHC/refs/heads/main/dataset/IEDB_dataset/training_set.txt"
# df_train_raw = pd.read_csv(url_train, sep="\t")  # Adjust delimiter if needed
# url_test = "https://raw.githubusercontent.com/s7776d/CapsNet-MHC/refs/heads/main/dataset/IEDB_dataset/testing_set.txt"
# df_test_raw = pd.read_csv(url_test, sep="\t")  # Adjust delimiter if needed
# dfs = []
# for df_raw in [df_train_raw, df_test_raw]:
#     df = df_raw.copy()
#     df = df[df['species'] == 'human'] if 'species' in df else df
#     # Convert "inequality" column
#     def convert_meas(row):
#         if row["inequality"] == "<":
#             return row["meas"] / 2  # Assign a smaller estimate
#         elif row["inequality"] == ">":
#             return row["meas"] * 2  # Assign a larger estimate
#         else:
#             return row["meas"]

#     df["processed_meas"] = df.apply(convert_meas, axis=1) if 'inequality' in df else df["Measurement value"]
#     print(len(df))
#     dfs.append(df.copy())

# Anthem dataset: tab-separated train and test files, listed in that order.
urls = [
    'https://raw.githubusercontent.com/s7776d/CapsNet-MHC/refs/heads/main/dataset/Anthem_dataset/train_data.txt',
    'https://raw.githubusercontent.com/s7776d/CapsNet-MHC/refs/heads/main/dataset/Anthem_dataset/test_data.txt',
]
# One raw DataFrame per URL, in the same order as `urls` (index 0 = train, 1 = test).
dfs_raw = [
    pd.read_csv(source_url, sep="\t")
    for source_url in urls
]

157325
8855


In [27]:
# Preview the raw frame read from the second URL in `urls` (test_data.txt).
dfs_raw[1]

Unnamed: 0,HLA,peptide,Label,Length
0,HLA-A*01:01,LFGRDLSY,1,8
1,HLA-A*01:01,TDKKTHLY,1,8
2,HLA-A*01:01,RSDTPLIY,1,8
3,HLA-A*01:01,NSDLVQKY,1,8
4,HLA-A*01:01,LSDLLDWK,1,8
...,...,...,...,...
172575,HLA-C*06:02,AAEQLVKTGWRSWH,0,14
172576,HLA-C*06:02,GCCMTGAAFGAMNG,0,14
172577,HLA-C*06:02,QYQLRNDSAEYKVI,0,14
172578,HLA-C*06:02,FFAGLVKYMHSGPV,0,14


In [58]:
# Helpers to extract an HLA-name -> sequence mapping from FASTA files
def _normalize_hla_name(hla_id):
    """Return `hla_id` reduced towards two-field resolution with an "HLA-" prefix.

    Trailing purely numeric ":NN" fields are removed, at most twice, while the
    name still contains more than one colon (e.g. "A*01:01:01:01" becomes
    "HLA-A*01:01"). A name whose last field is not purely numeric (for example
    one ending in a letter) is not matched by the pattern and is only prefixed.
    """
    normalized = hla_id
    # Two passes are enough to take a four-field name down to two fields.
    for _ in range(2):
        if normalized.count(":") <= 1:
            break
        normalized = re.sub(r":\d+$", "", normalized)
    return "HLA-" + normalized


def parse_fasta(fasta_files):
    """Build a mapping from normalized HLA allele name to sequence string.

    Args:
        fasta_files: iterable of FASTA file paths. Each record's description is
            split on whitespace and its second token is taken as the allele
            name (e.g. "A*01:01:01:01"); records with fewer than two tokens
            are skipped.

    Returns:
        dict mapping names such as "HLA-A*01:01" to `str(record.seq)`.

    Several full-resolution alleles normalize to the same key. The first record
    seen for a key is kept; later ones are ignored instead of silently
    overwriting it, so the mapping no longer depends on whichever variant
    happens to be listed last.
    NOTE(review): this presumes the first-listed allele of each group is the
    preferred representative - confirm against the FASTA source.
    """
    hla_dict = {}

    for file in fasta_files:
        for record in SeqIO.parse(file, "fasta"):
            header_parts = record.description.split()
            if len(header_parts) < 2:
                continue  # no allele name in the header
            # setdefault keeps the first sequence stored under this key.
            hla_dict.setdefault(_normalize_hla_name(header_parts[1]), str(record.seq))

    return hla_dict

# Load HLA nucleotide sequences from FASTA files https://github.com/ANHIG/IMGTHLA/blob/Latest/fasta/A_nuc.fasta
# glob returns paths in arbitrary order; sorting makes the parse order reproducible.
fasta_files = sorted(glob.glob("/root/nucl/nucl-protein/hla/*_nuc.fasta"))  # Load all relevant FASTA files
if not fasta_files:
    # Without this check an empty mapping would drop every row below and the
    # notebook would carry on with empty train/test frames.
    raise FileNotFoundError("No *_nuc.fasta files found in /root/nucl/nucl-protein/hla/")
hla_sequences = parse_fasta(fasta_files)

dfs = []  # one processed frame per split, same order as dfs_raw
unknown_hla = set()  # HLA names with no entry in hla_sequences, across both splits
for df_raw, ds in zip(dfs_raw, ['train', 'test']):
    # Map sequences to HLA types; assign() returns a new frame, leaving df_raw untouched.
    df = df_raw.assign(Sequence=df_raw["HLA"].map(hla_sequences))
    print(f'Sequences in {ds}: {len(df)}')
    # Names missing from the mapping come back as NaN from map().
    un_hla = set(df.loc[df['Sequence'].isna(), 'HLA'])
    unknown_hla |= un_hla
    df = df.dropna(subset=['Sequence'])
    print(f'Sequences in {ds} after removing unmatched HLA: {len(df)}', Counter(df['Label']))
    # dropna=False keeps the count identical to len(series.unique()).
    unique_hla = df['HLA'].nunique(dropna=False)
    unique_pep = df['peptide'].nunique(dropna=False)
    print(f'Unique HLA: {unique_hla}, Unique peptide: {unique_pep}')
    dfs.append(df)

print('Unmatched HLA types: ', unknown_hla)

Sequences in train: 539019
Sequences in train after removing unmatched HLA: 539019 Counter({0: 269645, 1: 269374})
Unique HLA: 112, Unique peptide: 453277
Sequences in test: 172580
Sequences in test after removing unmatched HLA: 172580 Counter({0: 86470, 1: 86110})
Unique HLA: 112, Unique peptide: 159082
Unmatched HLA types:  set()


In [None]:
# Log in to Hugging Face
notebook_login()

# Create a dataset dictionary (train/test split).
# preserve_index=False: rows may have been removed by dropna, leaving a
# non-contiguous pandas index that from_pandas would otherwise store as an
# extra "__index_level_0__" column in the published dataset.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(split_df, preserve_index=False)
    for split_name, split_df in zip(("train", "test"), dfs)
})
# Push dataset to Hugging Face Hub
dataset.push_to_hub("vladak/anthem_hla_seq")