In [None]:
import oktoberfest as ok
%load_ext autoreload 
%autoreload 2
import pandas as pd 


In [None]:
# Settings dictionary for this run; later cells read the prediction server
# and the model names from it.
config = dict(
    type="Rescoring",
    tag="",
    output="./out",
    inputs=dict(
        search_results="./msms.txt",
        search_results_type="Maxquant",
        spectra="./",
        spectra_type="raw",
    ),
    models=dict(
        intensity="Prosit_2020_intensity_HCD",
        irt="Prosit_2019_irt",
    ),
    prediction_server="koina.proteomicsdb.org:443",
    numThreads=1,
    fdr_estimation_method="mokapot",
    allFeatures=False,
    regressionMethod="spline",
    ssl=False,
    thermoExe="ThermoRawFileParser.exe",
    massTolerance=20,
    unitMassTolerance="ppm",
)

    Pre-Processing : 

Spectral preprocessing

In [None]:
# Output directory (the same path the conversion cell uses as `output_directory`).
output = '/home/karim/projects/oktoberfest/tutorials/out/'
# ThermoRawFileParser executable; passed as `thermo_exe` when converting raw files.
therm = '/home/karim/non_git/essentials/therm/ThermoRawFileParser.exe'


Getting a list of raw files

In [None]:
# Ask Oktoberfest for the spectra of type 'raw' in the tutorial input directory;
# the result is iterated file by file in the conversion cell.
list_of_spectra = ok.pp.list_spectra('/home/karim/projects/oktoberfest/tutorials/Oktoberfest_input','raw')
print(list_of_spectra)

Converting each raw file to its own mzML file in the output directory

In [None]:
from pyteomics import mzml  # NOTE(review): not referenced anywhere in this cell
import os

# Directory that receives one mzML file per converted raw file.
output_directory = '/home/karim/projects/oktoberfest/tutorials/out/' 

# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(output_directory, exist_ok=True)

# Paths of the mzML files requested from the converter, in input order.
conversion_results = []

for file in list_of_spectra:
    # Keep the raw file's base name and swap its extension for .mzML.
    base_filename = os.path.splitext(os.path.basename(file))[0]
    output_mzml_file = os.path.join(output_directory, f"{base_filename}.mzML")

    # Convert the current raw file, writing the result to the path built above.
    ok.pp.convert_spectra(file, output_file=output_mzml_file, thermo_exe=therm)

    conversion_results.append(output_mzml_file)




Path of the directory containing the converted mzML files

In [None]:
# NOTE(review): despite the name, this is the output directory rather than a single
# file, and rebinding `mzml` hides the pyteomics `mzml` module imported earlier.
mzml = '/home/karim/projects/oktoberfest/tutorials/out/'

Loading the spectra from mzml file

In [None]:
# Load the converted spectra from the path stored in `mzml`, using the 'pyteomics' parser.
spectra = ok.pp.load_spectra(mzml_file= mzml, parser= 'pyteomics')

In [None]:
# Quick look at the first entries of the loaded spectra (presumably a DataFrame — confirm).
spectra.head()

Peptide processing

Converting search results to Oktoberfest format.

In [None]:
# Destination of the converted search results; written by convert_search and
# read back by load_search further down.
peptide_out_file = '/home/karim/projects/oktoberfest/tutorials/out/peptide_output.tsv'
# NOTE(review): not referenced by any cell shown here — confirm it is still needed.
split_search = '/home/karim/projects/oktoberfest/tutorials/out/split_search'


In [None]:
# Convert the 'maxquant' search results in msms.txt and write them to peptide_out_file.
ok.pp.convert_search(
    input_path='/home/karim/projects/oktoberfest/tutorials/Oktoberfest_input/msms.txt',
    output_file=peptide_out_file,
    search_engine='maxquant',
)

In [None]:
# Read back the file written by convert_search. Reusing peptide_out_file keeps the
# path defined in one place instead of repeating the literal here.
peptide_csv = pd.read_csv(peptide_out_file)

In [None]:
# Preview the first rows of the converted search results.
peptide_csv.head()

Loading search results

In [None]:
# Load the converted search results through Oktoberfest's own loader.
peptide_df = ok.pp.load_search(peptide_out_file)

In [None]:
# Summary of the loaded search results (presumably a DataFrame — confirm).
peptide_df.info()

Filtering search results using given constraints.

In [None]:
# Filter the loaded peptides for the 'prosit' model; the result feeds the PSM merge below.
filtered_peptides = ok.pp.filter_peptides_for_model(peptides = peptide_df , model='prosit')

In [None]:
# Summary of the filtered peptides, for comparison with the unfiltered summary above.
filtered_peptides.info()

Generating PSMs

In [None]:
# Merge the loaded spectra with the filtered search results; `psms` is the library
# object that the annotation, prediction and feature-generation cells operate on.
psms = ok.pp.merge_spectra_and_peptides(spectra= spectra , search= filtered_peptides)

In [None]:
# Display the merged table held by the library object.
psms.spectra_data

    Rescoring

Generating original features 

In [None]:
# Output paths for the two feature tables: each is written by generate_features
# and then passed to rescore_with_percolator.
original_features = '/home/karim/projects/oktoberfest/tutorials/out/original.tab'
rescore_features = '/home/karim/projects/oktoberfest/tutorials/out/rescore.tab'


Annotate the PSMs

In [None]:
# The result is not assigned: per the original note, this function has no return value.
# The mass tolerance and its unit are read from `config` instead of being repeated as
# literals, so the settings dictionary stays the single source for these values.
ok.pp.annotate_spectral_library(
    psms=psms,
    mass_tol=config["massTolerance"],
    unit_mass_tol=config["unitMassTolerance"],
)

Generating features for the original

Predictions
First perform the CE calibration and assign the result to a variable, because the following cells read `spectra_data` from the returned object.

In [None]:
# Run the CE calibration against the configured prediction server and models;
# the returned object's `spectra_data` is aggregated in the next cell.
alignment_library = ok.pr.ce_calibration(
    psms,
    url=config["prediction_server"],
    intensity_model=config["models"]["intensity"],
    irt_model=config["models"]["irt"],
)

In [None]:
# Mean SPECTRAL_ANGLE per COLLISION_ENERGY value in the calibration results.
ce_alignment = (
    alignment_library.spectra_data
    .groupby(by=["COLLISION_ENERGY"])["SPECTRAL_ANGLE"]
    .mean()
)

In [None]:
# Collision energy whose mean spectral angle is the highest.
best_ce = ce_alignment.idxmax()
# Overwrite COLLISION_ENERGY for every row of the PSM table with that single value
# (this mutates `psms.spectra_data` in place).
psms.spectra_data['COLLISION_ENERGY']= best_ce

You have to understand this part to do the calibration: you need the collision energy, and in particular the best collision energy for the file, before the features for rescoring can be predicted. Also, apparently you need to split the PSMs file.

In [None]:
# Request predictions for the PSMs from the configured server and models.
# NOTE(review): `ssl` is hard-coded to True here while config["ssl"] is False —
# confirm which of the two is intended.
gprc = ok.pr.grpc_predict(
    library=psms,
    url=config["prediction_server"],
    intensity_model=config["models"]["intensity"],
    irt_model=config["models"]["irt"],
    ssl=True,
    alignment=False,
)

In [None]:
# Inspect the PREDICTED_IRT column (presumably filled in by the prediction call above — confirm).
psms.spectra_data['PREDICTED_IRT']

In [None]:
# Write the 'rescore' feature table to `rescore_features`. The feature-set flag and the
# regression method come from `config` rather than repeated literals, so this call and
# the 'original' one cannot drift apart from the settings dictionary.
ok.re.generate_features(
    library=psms,
    search_type='rescore',
    output_file=rescore_features,
    all_features=config["allFeatures"],
    regression_method=config["regressionMethod"],
)

In [None]:
# Write the 'original' feature table to `original_features`. The feature-set flag and the
# regression method come from `config` rather than repeated literals, so this call and
# the 'rescore' one cannot drift apart from the settings dictionary.
ok.re.generate_features(
    library=psms,
    search_type='original',
    output_file=original_features,
    all_features=config["allFeatures"],
    regression_method=config["regressionMethod"],
)

Merging the input 

Generating a list of Tab files 

In [None]:
# Output locations passed to rescore_with_percolator; both names point at the same directory.
rescore_percolator = '/home/karim/projects/oktoberfest/tutorials/out/percolator/'
original_percolator = '/home/karim/projects/oktoberfest/tutorials/out/percolator/'
# NOTE(review): not referenced by any cell shown here (the plotting cell defines `plots`) — confirm it is needed.
plot = '/home/karim/projects/oktoberfest/tutorials/out/' 

In [None]:
# Disabled: `tab_list` and `combined_tab` are not defined in the cells above.
#ok.re.merge_input(tab_list,combined_tab)

In [None]:
# Run percolator on the 'rescore' feature table, using the percolator output location defined above.
ok.re.rescore_with_percolator(rescore_features,rescore_percolator)

In [None]:
# Run percolator on the 'original' feature table, using the percolator output location defined above.
ok.re.rescore_with_percolator(original_features, original_percolator)

# Visualisations

In [None]:
# Path handed to the score-distribution plot as its `filename`.
plots = '/home/karim/projects/oktoberfest/tutorials/out/plots'

from pathlib import Path

# Folder holding the percolator result tables read below.
perc_output_path = Path('/home/karim/projects/oktoberfest/tutorials/out/percolator')

# Peptide-level percolator tables of the rescoring run (tab-separated): decoys and targets.
peptide_decoy_df = pd.read_csv(perc_output_path / 'rescore.percolator.decoy.peptides.txt', sep='\t')
peptide_target_df = pd.read_csv(perc_output_path / 'rescore.percolator.peptides.txt', sep='\t')

# Show only the first rows instead of dumping the whole table into the notebook output.
display(peptide_decoy_df.head())

# Histogram of the score distribution for targets and decoys at peptide level.
ok.pl.plot_score_distribution(peptide_target_df, peptide_decoy_df, level = 'peptide', filename=plots)

In [None]:
# Generate the full set of plots from the percolator output folder.
ok.pl.plot_all(perc_output_path)