### This script takes a csv file and adds the full HLA sequence from IMGT
- Inputs: csv files
- outputs: csv file with additional column for creation of Boltz yaml files

In [16]:
import pandas as pd
from pathlib import Path


raw_dir = Path("/home/natasha/multimodal_model") / "data" / "raw"
df1 = pd.read_excel(raw_dir / "IEDB" / "IEDB_Negatives_HLA_class_I.xlsx")

# Keep only first entry if HLA column contains multiple entries
df1['HLA'] = df1['HLA'].str.split(',').str[0]
df1 = df1[df1['HLA'].notna()]
df1 = df1[~df1['HLA'].str.contains('mutant', case=False, na=False)]


In [22]:
HLA_list = list(df1['HLA'].unique())
print(len(HLA_list))

with open(raw_dir / "HLA" / "hla_list.txt", 'w') as f:
    for hla in HLA_list:
        f.write(f"{hla}\n")

# test for one HLA
HLA_list = [hla.replace('HLA-', '') for hla in HLA_list]  # Ensure no 'HLA-' prefix

filtered_records = []
header = []
sequence_list = []

with open(raw_dir / "HLA" / "hla_prot.fasta", 'r') as fasta_file:
    fasta_lines = fasta_file.readlines()

for hla in HLA_list:
    found = False
    record = ''
    sequence = '' #needs to be empty for each HLA
    for i, line in enumerate(fasta_lines):
        if line.startswith('>') and 'N ' not in line:
            if found:
                break
            if hla in line:
                # Replace header with just the HLA tag
                record = f'>{hla}\n'
                # Collect the sequence lines
                for seq_line in fasta_lines[i+1:]:
                    if seq_line.startswith('>'):
                        break
                    record += seq_line
                    sequence += seq_line
                filtered_records.append(record)
                header.append(hla)
                sequence_list.append(sequence)
                found = True
    if not found:
        print(f"HLA {hla} not found in fasta file")

print(filtered_records)
print(header)
print(sequence_list)


# Write filtered records to a new file
with open(raw_dir / "HLA" / "hla_prot_filtered_firsthits.fasta", 'w') as out_file:
    out_file.writelines(filtered_records)

45
HLA B7 not found in fasta file
HLA B8 not found in fasta file
HLA Cw3 not found in fasta file
HLA B35 not found in fasta file
['>A*02:01\nMAVMAPRTLVLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRF\nDSDAASQRMEPRAPWIEQEGPEYWDGETRKVKAHSQTHRVDLGTLRGYYNQSEAGSHTVQ\nRMYGCDVGSDWRFLRGYHQYAYDGKDYIALKEDLRSWTAADMAAQTTKHKWEAAHVAEQL\nRAYLEGTCVEWLRRYLENGKETLQRTDAPKTHMTHHAVSDHEATLRCWALSFYPAEITLT\nWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGQEQRYTCHVQHEGLPKPLTLRWEP\nSSQPTIPIVGIIAGLVLFGAVITGAVVAAVMWRRKSSDRKGGSYSQAASSDSAQGSDVSL\nTACKV\n', '>B*08:01\nMLVMAPRTVLLLLSAALALTETWAGSHSMRYFDTAMSRPGRGEPRFISVGYVDDTQFVRF\nDSDAASPREEPRAPWIEQEGPEYWDRNTQIFKTNTQTDRESLRNLRGYYNQSEAGSHTLQ\nSMYGCDVGPDGRLLRGHNQYAYDGKDYIALNEDLRSWTAADTAAQITQRKWEAARVAEQD\nRAYLEGTCVEWLRRYLENGKDTLERADPPKTHVTHHPISDHEATLRCWALGFYPAEITLT\nWQRDGEDQTQDTELVETRPAGDRTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWEP\nSSQSTVPIVGIVAGLAVLAVVVIGAVVAAVMCRRKSSGGKGGSYSQAACSDSAQGSDVSL\nTA\n', '>B*44:02\nMRVTAPRTLLLLLWGAVALTETWAGSHSMRYFYTAMSRPGRGEPRFITVGYVDDTLFVRF\nDSDATSPRKEPRAP

In [24]:
df1['HLA'].dtype

dtype('O')

In [None]:
df_filtered = pd.DataFrame({'HLA': header, 'HLA_sequence': sequence_list})
df_filtered['HLA'] = 'HLA-' + df_filtered['HLA']

df2 = df1.merge(df_filtered, on='HLA', how='left')
df2.to_csv(raw_dir / "HLA" / "IEDB_Negatives_HLA_class_I_with_HLA_seq.csv", index=False) 



object
           HLA                                       HLA_sequence
0  HLA-A*02:01  MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
1  HLA-B*08:01  MLVMAPRTVLLLLSAALALTETWAGSHSMRYFDTAMSRPGRGEPRF...
2  HLA-B*44:02  MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFYTAMSRPGRGEPRF...
3  HLA-B*44:05  MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFYTAMSRPGRGEPRF...
4      HLA-A11  SHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQKMEP...
      Peptide                           Source Molecule  \
0   SLLMWITQC                   Cancer/testis antigen 1   
1   AAGIGILTV  Melanoma antigen recognized by T-cells 1   
2  ELAGIGILTV                                       NaN   
3   AAGIGILTV  Melanoma antigen recognized by T-cells 1   
4   NLVPMVATV                                      pp65   

                               Source Organism    Type  \
0                         Homo sapiens (human)  T cell   
1                         Homo sapiens (human)  T cell   
2                                          NaN  T cell   
3         