In [1]:
import pandas as pd
import numpy as np

my_path = 'C:/Users/sebas/Desktop/School/Ma 2/Stage/Internship/'

def structure_netMHCpan(filename, path, separator=','):
    ''' 
    Takes a cleaned up netCTLpan file as input.
    The default separator is a ',' (comma).
    The function returns a cleaned up pandas dataframe.
    
    Parameter description:
    
    - Index in protein: Residue number (starting from 0)
    - HLA: Molecule/allele name
    - Peptide: Amino acid sequence of the potential ligand
    - Core: The minimal 9 amino acid binding core directly in contact with the MHC
    - Of: The starting position of the Core within the Peptide (if > 0, the method predicts a N-terminal protrusion)
    - Gp: Position of the deletion, if any.
    - Gl: Length of the deletion.
    - Ip: Position of the insertions, if any.
    - Il: Length of the insertion.
    - Icore: Interaction core. This is the sequence of the binding core including eventual insertions of deletions.
    - Identity: Protein identifier, i.e. the name of the Fasta entry. -> will be split into 'Uniprot ID' and 'Protein name'
    - Score: The raw prediction score
    - Aff(nM): Predicted binding affinity in nanoMolar units (if binding affinity predictions is selected).
    - Rank: Rank of the predicted affinity compared to a set of random natural peptides. 
    This measure is not affected by inherent bias of certain molecules towards higher or lower mean predicted affinities. 
    Strong binders are defined as having %rank<0.5, and weak binders with %rank<2. We advise to select candidate binders based on %Rank rather than nM Affinity
    
    BindLevel: (SB: strong binder, WB: weak binder). 
    The peptide will be identified as a strong binder if the % Rank is below the specified threshold for the strong binders, by default 0.5%. 
    The peptide will be identified as a weak binder if the % Rank is above the threshold of the strong binders but below the specified threshold for the weak binders, by default 2%.
    '''
    
    my_path = path
    
    header = ['A', 'Index in protein' ,'HLA', 'B', 'Peptide', 'Core', 'Of', 'Gp', 'Gl', 'Ip', 'Il', 'Icore', 'Identity', 'Score', 'Rank']
    df = pd.read_csv(filename, sep=separator, 
                     names=header)
    new = df['Identity'] = df['Identity'].str.split('_', expand=True) # Splits the protein column in 3 new columns
    df['Uniprot ID'] = new[1] # Uniprot ID of proteins from which the peptides originate
    df['Protein name'] = new[2] # Canonical name of proteins from which the peptides originate
    df.drop(columns=['Identity', 'A', 'B'], inplace=True)
    
df = structure_netMHCpan('affinity_clean.txt', my_path)
df

FileNotFoundError: [Errno 2] File b'affinity_clean.txt' does not exist: b'affinity_clean.txt'