In [65]:
import itertools
import requests
import pandas as pd
from io import StringIO
import os
import csv

In [66]:
# Endpoint of the IEDB tools API for MHC class I predictions; binding-affinity
# requests are POSTed to this URL.
# NOTE(review): this is plain HTTP — confirm whether an HTTPS endpoint should be used.
IEDB_API_URL_CLASSI = "http://tools-cluster-interface.iedb.org/tools_api/mhci/"

In [67]:
# Helper Function: Determine the Best Binding Affinity
def get_best_binding(peptide_bindings):
    """
    Determines the best (lowest) binding affinity for a given peptide.

    Entries whose ic50 is NaN are ignored: NaN compares False against every
    number, so leaving it in would make the result of ``min`` depend on the
    order of the input instead of on the values.

    Args:
        peptide_bindings (list of tuples): Each tuple contains (allele, ic50)

    Returns:
        tuple: (allele, ic50) with the lowest ic50, or ("", "") when there is
        no usable entry (empty/None input, or every ic50 is NaN).
    """
    # NaN is the only value that is not equal to itself, so `x == x` filters
    # it out without needing an extra import.
    usable = [
        binding for binding in (peptide_bindings or ())
        if binding[1] == binding[1]
    ]
    if not usable:
        return ("", "")
    return min(usable, key=lambda binding: binding[1])

In [68]:
# Step 1: Split Protein Sequence into Peptides
def split_protein_sequence_classI(protein_sequence, min_length=8, max_length=11):
    """
    Enumerates every contiguous window of the protein sequence for each
    window size from min_length to max_length (both inclusive).

    Windows are grouped by size, shortest first; within one size they are
    ordered by start position. Sizes longer than the sequence contribute
    nothing.
    """
    sequence_length = len(protein_sequence)
    windows = []
    for size in range(min_length, max_length + 1):
        # Last valid start index is sequence_length - size.
        for start in range(sequence_length - size + 1):
            windows.append(protein_sequence[start:start + size])
    return windows

# Step 2: Create Initial CSV File
def create_initial_csv(peptides, csv_file="classI_peptides.csv"):
    """
    Writes a fresh CSV with one row per peptide.

    The first column holds the peptide sequence; the remaining result columns
    are created as empty pandas 'string' columns so they can be filled in
    later. Any existing file at csv_file is overwritten.
    """
    placeholder_columns = (
        "ClassI_TCR_Recognition",
        "ClassI_MHC_Binding_Affinity",
        "ClassI_pMHC_Stability",
        "Best_Binding_Affinity",
        "Best_pMHC_Stability",
    )

    table = pd.DataFrame(peptides, columns=["Peptide_Sequence"])
    for name in placeholder_columns:
        # Empty-string column, explicitly typed as pandas 'string'.
        table[name] = pd.Series("", index=table.index).astype("string")

    # Quote every non-numeric field when writing.
    table.to_csv(csv_file, index=False, quoting=csv.QUOTE_NONNUMERIC)
    print(f"Initial CSV file '{csv_file}' created with {len(peptides)} peptides.")

# Step 3: Run NetMHC Binding Affinity Predictions
def run_netmhci_binding_affinity_classI(peptides, alleles, method="netmhcpan-4.1", timeout=300):
    """
    Uses IEDB API to generate binding affinity for each peptide and HLA interaction.

    One POST request is sent per (allele, peptide length) pair, because the
    payload carries a single "length" value. A failed request does not abort
    the run; it is recorded as an error entry instead.

    Args:
        peptides (list): A list of peptide sequences.
        alleles (list): A list of HLA alleles for which to make predictions.
        method (str): Prediction method to use.
        timeout (float or tuple or None): Passed to ``requests.post``. Without
            a timeout a stalled server would block the call indefinitely;
            pass None to restore that unbounded wait.

    Returns:
        list: A list of dictionaries containing the binding affinity results or errors.
        Every entry has "allele", "length" and "peptides"; successful entries
        add "result" (raw response text), failed ones add "error" (message).
    """
    results = []

    # Group peptides by their length
    peptides_by_length = {}
    for peptide in peptides:
        peptides_by_length.setdefault(len(peptide), []).append(peptide)

    # The FASTA text depends only on the length group, so build it once
    # instead of once per allele.
    fasta_by_length = {
        length: "".join(
            f">peptide{i}\n{peptide}\n" for i, peptide in enumerate(subset)
        )
        for length, subset in peptides_by_length.items()
    }

    # Iterate over each allele and each peptide length
    for allele in alleles:
        for length, peptides_subset in peptides_by_length.items():
            payload = {
                "method": method,
                "sequence_text": fasta_by_length[length],
                "allele": allele,
                "length": str(length),
                "species": "human"  # Assuming all are human; modify if needed
            }

            # Keep only the network call inside the try block; timeouts are a
            # subclass of RequestException and land in the same handler.
            try:
                response = requests.post(IEDB_API_URL_CLASSI, data=payload, timeout=timeout)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                results.append({
                    "allele": allele,
                    "length": length,
                    "peptides": peptides_subset,
                    "error": str(e)
                })
                print(f"Error occurred for allele {allele} and length {length}: {e}")
            else:
                results.append({
                    "allele": allele,
                    "length": length,
                    "peptides": peptides_subset,
                    "result": response.text
                })
                print(f"Successfully retrieved data for allele {allele} and length {length}.")

    return results

# Step 4: Process the Class I Results
def process_classI_results(results):
    """
    Processes the Class I results returned by the IEDB API.

    Each successful entry's "result" text is parsed as tab-separated data and
    its 'peptide', 'allele' and 'ic50' columns are collected per peptide.
    Responses that are empty, unparseable, or lack those columns are reported
    and skipped. Rows whose ic50 is missing or not numeric are reported and
    skipped as well, so one bad row does not abort the whole run. Peptides
    from failed requests still get a row, with empty result fields.

    Args:
        results (list): Dictionaries with "allele", "length", "peptides" and
            either "result" (raw response text) or "error" (message).

    Returns:
        pandas.DataFrame: One row per peptide with the columns
        Peptide_Sequence, ClassI_TCR_Recognition, ClassI_MHC_Binding_Affinity,
        ClassI_pMHC_Stability, Best_Binding_Affinity and Best_pMHC_Stability.
        The columns are present even when there are no rows.
    """
    output_columns = [
        "Peptide_Sequence",
        "ClassI_TCR_Recognition",
        "ClassI_MHC_Binding_Affinity",
        "ClassI_pMHC_Stability",
        "Best_Binding_Affinity",
        "Best_pMHC_Stability",
    ]
    required_columns = {'peptide', 'allele', 'ic50'}

    # peptide -> list of (allele, ic50) tuples, in order of first appearance
    affinities_by_peptide = {}

    for res in results:
        if "result" not in res:
            # Failed request: keep the peptides so they still appear in the output.
            error_msg = res["error"]
            for peptide in res["peptides"]:
                affinities_by_peptide.setdefault(peptide, [])
                print(f"Error for peptide {peptide}: {error_msg}")
            continue

        try:
            df = pd.read_csv(StringIO(res["result"]), sep="\t")
        except pd.errors.EmptyDataError:
            print(f"Received empty data from API for allele {res['allele']} and length {res['length']}.")
            continue
        except pd.errors.ParserError as e:
            # e.g. a plain-text error body that is not a well-formed table
            print(f"Could not parse API response for allele {res['allele']} and length {res['length']}: {e}")
            continue

        # Check if the columns we need are present in the API response
        if not required_columns.issubset(df.columns):
            print(f"Unexpected columns in API response: {df.columns}")
            continue

        for peptide, allele, ic50 in zip(df['peptide'], df['allele'], df['ic50']):
            bindings = affinities_by_peptide.setdefault(peptide, [])
            try:
                ic50_value = float(ic50)
            except (TypeError, ValueError):
                print(f"Skipping non-numeric ic50 {ic50!r} for peptide {peptide} and allele {allele}.")
                continue
            if ic50_value != ic50_value:  # NaN: the value was missing in the response
                print(f"Skipping missing ic50 for peptide {peptide} and allele {allele}.")
                continue
            bindings.append((allele, ic50_value))

    processed_results = []
    for peptide, binding_affinities in affinities_by_peptide.items():
        # Format ClassI_MHC_Binding_Affinity
        binding_affinity_str = "|".join(
            f"{allele}={ic50} nM" for allele, ic50 in binding_affinities
        )

        # Determine Best_Binding_Affinity
        if binding_affinities:
            best_binding = get_best_binding(binding_affinities)
            best_binding_affinity = f"{best_binding[0]}={best_binding[1]} nM"
        else:
            best_binding_affinity = ""

        processed_results.append({
            "Peptide_Sequence": peptide,
            "ClassI_TCR_Recognition": "",  # Assuming to be filled separately
            "ClassI_MHC_Binding_Affinity": binding_affinity_str,
            "ClassI_pMHC_Stability": "",  # Assuming no stability data in the response
            "Best_Binding_Affinity": best_binding_affinity,
            "Best_pMHC_Stability": "",  # Assuming no stability data in the response
        })

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(processed_results, columns=output_columns)

# Step 5: Update CSV with Class I Results
def update_csv_with_classI_results(df, csv_file="classI_peptides.csv"):
    """
    Updates the CSV file with Class I MHC binding results.

    Rows are matched on Peptide_Sequence. A non-empty value in ``df``
    replaces the value stored in the CSV; an empty string in ``df`` is
    treated as "no data" and leaves the stored value alone. Peptides that
    are only in ``df`` are appended after the existing rows. The row and
    column order of the existing file is preserved.

    Args:
        df (pandas.DataFrame): New results; must contain a Peptide_Sequence column.
        csv_file (str): Path of the CSV file to update in place.

    Raises:
        FileNotFoundError: If csv_file does not exist.
    """
    if not os.path.exists(csv_file):
        raise FileNotFoundError(f"The CSV file {csv_file} does not exist. Please create it first.")

    key = "Peptide_Sequence"

    # Read existing CSV with all columns as strings
    existing = pd.read_csv(csv_file, dtype=str).set_index(key)
    # Alignment needs unique labels; duplicate peptide rows are restored by
    # the reindex below.
    existing_unique = existing[~existing.index.duplicated(keep="first")]

    # Empty-string placeholders must not overwrite values already stored.
    incoming = df.set_index(key).replace("", pd.NA)
    incoming = incoming[~incoming.index.duplicated(keep="last")]

    # New results take priority; stored values only fill the gaps.
    merged = incoming.combine_first(existing_unique)

    # combine_first may sort the row and column labels, so restore the
    # file's own order and append anything that is new.
    new_peptides = incoming.index[~incoming.index.isin(existing.index)]
    row_order = existing.index.append(new_peptides)
    column_order = list(existing.columns) + [
        col for col in incoming.columns if col not in existing.columns
    ]
    merged_df = (
        merged.reindex(index=row_order, columns=column_order)
        .rename_axis(key)
        .reset_index()
    )

    # Drop any unwanted columns that were created by mistake, e.g., Errors, _new columns
    columns_to_drop = [col for col in merged_df.columns if '_new' in col or col == "Errors"]
    merged_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

    # Save the updated DataFrame back to CSV
    merged_df.to_csv(csv_file, index=False, quoting=csv.QUOTE_NONNUMERIC)
    print(f"CSV file '{csv_file}' has been updated with Class I binding affinity results.")

In [69]:
# Main function for execution
if __name__ == "__main__":
    # Short demo input. A longer alternative sequence is:
    # MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAG
    protein_sequence = "MTEYKLVVVGAGG"

    # Step 1: Split the protein sequence into peptides
    classI_peptides = split_protein_sequence_classI(protein_sequence)
    print(f"Generated {len(classI_peptides)} peptides.")

    # Step 2: Create initial CSV file
    create_initial_csv(classI_peptides)

    # Step 3: Define alleles
    hla_alleles_classI = ["HLA-A*01:01", "HLA-A*02:01"]

    # Step 4: Run NetMHC binding affinity predictions
    result_classI = run_netmhci_binding_affinity_classI(classI_peptides, hla_alleles_classI)
    print("Class I Results:")
    for res in result_classI:
        print(res)

    # Step 5: Process the results
    processed_df = process_classI_results(result_classI)
    print("Processed Results:")
    print(processed_df.head())

    # Step 6: Update the CSV with results
    if not processed_df.empty:
        # update_csv_with_classI_results prints its own confirmation message,
        # so it is not repeated here.
        update_csv_with_classI_results(processed_df)
    else:
        print("No valid results were returned for updating the CSV.")

Generated 18 peptides.
Initial CSV file 'classI_peptides.csv' created with 18 peptides.
Successfully retrieved data for allele HLA-A*01:01 and length 8.
Successfully retrieved data for allele HLA-A*01:01 and length 9.
Successfully retrieved data for allele HLA-A*01:01 and length 10.
Successfully retrieved data for allele HLA-A*01:01 and length 11.
Successfully retrieved data for allele HLA-A*02:01 and length 8.
Successfully retrieved data for allele HLA-A*02:01 and length 9.
Successfully retrieved data for allele HLA-A*02:01 and length 10.
Successfully retrieved data for allele HLA-A*02:01 and length 11.
Class I Results:
{'allele': 'HLA-A*01:01', 'length': 8, 'peptides': ['MTEYKLVV', 'TEYKLVVV', 'EYKLVVVG', 'YKLVVVGA', 'KLVVVGAG', 'LVVVGAGG'], 'result': 'allele\tseq_num\tstart\tend\tlength\tpeptide\tcore\ticore\tic50\tpercentile_rank\nHLA-A*01:01\t1\t1\t8\t8\tMTEYKLVV\tMTEYK-LVV\tMTEYKLVV\t11573.32\t3.2\nHLA-A*01:01\t2\t1\t8\t8\tTEYKLVVV\t-TEYKLVVV\tTEYKLVVV\t31110.44\t29\nHLA-A*01:01\